{ "best_global_step": 90000, "best_metric": 0.80835962, "best_model_checkpoint": "/mnt/hdd2/yansc/qwenomni/sft/v1-20250413-095545/checkpoint-90000", "epoch": 1.1678144737226255, "eval_steps": 10000, "global_step": 90000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.2975680331139362e-05, "grad_norm": 5.700944423675537, "learning_rate": 1.2975217334890361e-08, "loss": 1.041358232498169, "memory(GiB)": 88.34, "step": 1, "token_acc": 0.7537619699042407, "train_speed(iter/s)": 0.030396 }, { "epoch": 6.48784016556968e-05, "grad_norm": 6.273989677429199, "learning_rate": 6.487608667445181e-08, "loss": 1.0705899000167847, "memory(GiB)": 88.34, "step": 5, "token_acc": 0.7133769538349691, "train_speed(iter/s)": 0.075205 }, { "epoch": 0.0001297568033113936, "grad_norm": 5.625618934631348, "learning_rate": 1.2975217334890362e-07, "loss": 1.0776777267456055, "memory(GiB)": 88.96, "step": 10, "token_acc": 0.7090220507556719, "train_speed(iter/s)": 0.091638 }, { "epoch": 0.00019463520496709044, "grad_norm": 6.4206061363220215, "learning_rate": 1.946282600233554e-07, "loss": 1.0783297538757324, "memory(GiB)": 88.96, "step": 15, "token_acc": 0.7194785487653469, "train_speed(iter/s)": 0.099606 }, { "epoch": 0.0002595136066227872, "grad_norm": 5.6823601722717285, "learning_rate": 2.5950434669780724e-07, "loss": 1.0434499740600587, "memory(GiB)": 88.99, "step": 20, "token_acc": 0.716674837936482, "train_speed(iter/s)": 0.103072 }, { "epoch": 0.000324392008278484, "grad_norm": 5.914145469665527, "learning_rate": 3.24380433372259e-07, "loss": 1.0831805229187013, "memory(GiB)": 88.99, "step": 25, "token_acc": 0.7241424889300112, "train_speed(iter/s)": 0.106743 }, { "epoch": 0.0003892704099341809, "grad_norm": 4.041904926300049, "learning_rate": 3.892565200467108e-07, "loss": 1.0192447662353517, "memory(GiB)": 88.99, "step": 30, "token_acc": 0.7181571815718157, "train_speed(iter/s)": 0.108914 }, { "epoch": 0.0004541488115898777, "grad_norm": 4.361569404602051, "learning_rate": 4.541326067211626e-07, "loss": 1.0296567916870116, "memory(GiB)": 88.99, "step": 35, "token_acc": 0.7249411049471844, "train_speed(iter/s)": 0.110658 }, { "epoch": 0.0005190272132455744, "grad_norm": 3.21584153175354, "learning_rate": 5.190086933956145e-07, "loss": 0.9840399742126464, "memory(GiB)": 88.99, "step": 40, "token_acc": 0.7402076934327921, "train_speed(iter/s)": 0.112156 }, { "epoch": 0.0005839056149012713, "grad_norm": 2.6856861114501953, "learning_rate": 5.838847800700661e-07, "loss": 0.9612736701965332, "memory(GiB)": 88.99, "step": 45, "token_acc": 0.7209634619310064, "train_speed(iter/s)": 0.112872 }, { "epoch": 0.000648784016556968, "grad_norm": 2.421036958694458, "learning_rate": 6.48760866744518e-07, "loss": 0.9069182395935058, "memory(GiB)": 88.99, "step": 50, "token_acc": 0.7295835966575381, "train_speed(iter/s)": 0.113732 }, { "epoch": 0.0007136624182126649, "grad_norm": 2.468371629714966, "learning_rate": 7.136369534189699e-07, "loss": 0.9195474624633789, "memory(GiB)": 88.99, "step": 55, "token_acc": 0.7347648261758691, "train_speed(iter/s)": 0.114713 }, { "epoch": 0.0007785408198683618, "grad_norm": 2.2073981761932373, "learning_rate": 7.785130400934216e-07, "loss": 0.8823270797729492, "memory(GiB)": 88.99, "step": 60, "token_acc": 0.7541250693288963, "train_speed(iter/s)": 0.115168 }, { "epoch": 0.0008434192215240585, "grad_norm": 1.6704285144805908, "learning_rate": 8.433891267678735e-07, "loss": 0.8640064239501953, "memory(GiB)": 88.99, "step": 65, "token_acc": 0.7588807239354363, "train_speed(iter/s)": 0.115649 }, { "epoch": 0.0009082976231797554, "grad_norm": 1.5314635038375854, "learning_rate": 9.082652134423252e-07, "loss": 0.8488792419433594, "memory(GiB)": 88.99, "step": 70, "token_acc": 0.7623102425405688, "train_speed(iter/s)": 0.115748 }, { "epoch": 0.0009731760248354521, "grad_norm": 2.0286080837249756, "learning_rate": 9.73141300116777e-07, "loss": 0.8506875991821289, "memory(GiB)": 88.99, "step": 75, "token_acc": 0.7558329996724056, "train_speed(iter/s)": 0.116103 }, { "epoch": 0.0010380544264911489, "grad_norm": 1.7463434934616089, "learning_rate": 1.038017386791229e-06, "loss": 0.8515713691711426, "memory(GiB)": 88.99, "step": 80, "token_acc": 0.7542338184389772, "train_speed(iter/s)": 0.116457 }, { "epoch": 0.0011029328281468458, "grad_norm": 1.6576606035232544, "learning_rate": 1.1028934734656805e-06, "loss": 0.8009872436523438, "memory(GiB)": 88.99, "step": 85, "token_acc": 0.7552264508351989, "train_speed(iter/s)": 0.116823 }, { "epoch": 0.0011678112298025426, "grad_norm": 1.5042972564697266, "learning_rate": 1.1677695601401323e-06, "loss": 0.8383214950561524, "memory(GiB)": 88.99, "step": 90, "token_acc": 0.7417153996101364, "train_speed(iter/s)": 0.117013 }, { "epoch": 0.0012326896314582394, "grad_norm": 1.647212028503418, "learning_rate": 1.2326456468145842e-06, "loss": 0.8437481880187988, "memory(GiB)": 88.99, "step": 95, "token_acc": 0.742150405881452, "train_speed(iter/s)": 0.117287 }, { "epoch": 0.001297568033113936, "grad_norm": 1.596248984336853, "learning_rate": 1.297521733489036e-06, "loss": 0.7994852542877198, "memory(GiB)": 88.99, "step": 100, "token_acc": 0.7656600790513834, "train_speed(iter/s)": 0.117496 }, { "epoch": 0.001362446434769633, "grad_norm": 1.5298240184783936, "learning_rate": 1.3623978201634877e-06, "loss": 0.8329332351684571, "memory(GiB)": 88.99, "step": 105, "token_acc": 0.7445796958891758, "train_speed(iter/s)": 0.117603 }, { "epoch": 0.0014273248364253298, "grad_norm": 1.666501522064209, "learning_rate": 1.4272739068379397e-06, "loss": 0.828717041015625, "memory(GiB)": 88.99, "step": 110, "token_acc": 0.7691705233179067, "train_speed(iter/s)": 0.117874 }, { "epoch": 0.0014922032380810266, "grad_norm": 1.7198013067245483, "learning_rate": 1.4921499935123915e-06, "loss": 0.8542831420898438, "memory(GiB)": 88.99, "step": 115, "token_acc": 0.763012001043569, "train_speed(iter/s)": 0.118285 }, { "epoch": 0.0015570816397367235, "grad_norm": 1.8853005170822144, "learning_rate": 1.5570260801868432e-06, "loss": 0.8229326248168946, "memory(GiB)": 88.99, "step": 120, "token_acc": 0.7560686358077308, "train_speed(iter/s)": 0.118393 }, { "epoch": 0.0016219600413924203, "grad_norm": 1.711550235748291, "learning_rate": 1.621902166861295e-06, "loss": 0.8550407409667968, "memory(GiB)": 88.99, "step": 125, "token_acc": 0.7439443568280157, "train_speed(iter/s)": 0.118358 }, { "epoch": 0.001686838443048117, "grad_norm": 1.6034162044525146, "learning_rate": 1.686778253535747e-06, "loss": 0.8363868713378906, "memory(GiB)": 88.99, "step": 130, "token_acc": 0.7610404703308723, "train_speed(iter/s)": 0.118443 }, { "epoch": 0.0017517168447038138, "grad_norm": 1.5253584384918213, "learning_rate": 1.7516543402101987e-06, "loss": 0.8248476028442383, "memory(GiB)": 88.99, "step": 135, "token_acc": 0.7420135369322827, "train_speed(iter/s)": 0.118645 }, { "epoch": 0.0018165952463595108, "grad_norm": 1.5756032466888428, "learning_rate": 1.8165304268846505e-06, "loss": 0.793170690536499, "memory(GiB)": 88.99, "step": 140, "token_acc": 0.784899923992906, "train_speed(iter/s)": 0.118495 }, { "epoch": 0.0018814736480152075, "grad_norm": 1.4896162748336792, "learning_rate": 1.8814065135591024e-06, "loss": 0.8247398376464844, "memory(GiB)": 88.99, "step": 145, "token_acc": 0.7522339005101167, "train_speed(iter/s)": 0.118345 }, { "epoch": 0.0019463520496709043, "grad_norm": 1.5199918746948242, "learning_rate": 1.946282600233554e-06, "loss": 0.7645186424255371, "memory(GiB)": 88.99, "step": 150, "token_acc": 0.7698362050627526, "train_speed(iter/s)": 0.118214 }, { "epoch": 0.0020112304513266012, "grad_norm": 1.706283688545227, "learning_rate": 2.011158686908006e-06, "loss": 0.8147221565246582, "memory(GiB)": 88.99, "step": 155, "token_acc": 0.7604058893752487, "train_speed(iter/s)": 0.118196 }, { "epoch": 0.0020761088529822978, "grad_norm": 1.5332438945770264, "learning_rate": 2.076034773582458e-06, "loss": 0.8186124801635742, "memory(GiB)": 88.99, "step": 160, "token_acc": 0.7672207424390572, "train_speed(iter/s)": 0.118209 }, { "epoch": 0.0021409872546379947, "grad_norm": 1.4400845766067505, "learning_rate": 2.1409108602569095e-06, "loss": 0.8224897384643555, "memory(GiB)": 88.99, "step": 165, "token_acc": 0.7622574955908289, "train_speed(iter/s)": 0.118174 }, { "epoch": 0.0022058656562936917, "grad_norm": 1.8300663232803345, "learning_rate": 2.205786946931361e-06, "loss": 0.8577741622924805, "memory(GiB)": 88.99, "step": 170, "token_acc": 0.7516731734523145, "train_speed(iter/s)": 0.118249 }, { "epoch": 0.0022707440579493882, "grad_norm": 1.527734398841858, "learning_rate": 2.270663033605813e-06, "loss": 0.7476388931274414, "memory(GiB)": 88.99, "step": 175, "token_acc": 0.7786770583533174, "train_speed(iter/s)": 0.118375 }, { "epoch": 0.002335622459605085, "grad_norm": 1.5682142972946167, "learning_rate": 2.3355391202802645e-06, "loss": 0.8055953979492188, "memory(GiB)": 88.99, "step": 180, "token_acc": 0.7599508776999205, "train_speed(iter/s)": 0.118528 }, { "epoch": 0.002400500861260782, "grad_norm": 1.8021656274795532, "learning_rate": 2.4004152069547165e-06, "loss": 0.8100423812866211, "memory(GiB)": 88.99, "step": 185, "token_acc": 0.7784101628828376, "train_speed(iter/s)": 0.118603 }, { "epoch": 0.0024653792629164787, "grad_norm": 1.571658730506897, "learning_rate": 2.4652912936291685e-06, "loss": 0.7944746971130371, "memory(GiB)": 88.99, "step": 190, "token_acc": 0.7728953824746421, "train_speed(iter/s)": 0.11875 }, { "epoch": 0.0025302576645721757, "grad_norm": 1.6411762237548828, "learning_rate": 2.53016738030362e-06, "loss": 0.7777754783630371, "memory(GiB)": 88.99, "step": 195, "token_acc": 0.7860903116110257, "train_speed(iter/s)": 0.118823 }, { "epoch": 0.002595136066227872, "grad_norm": 1.6627740859985352, "learning_rate": 2.595043466978072e-06, "loss": 0.8061195373535156, "memory(GiB)": 88.99, "step": 200, "token_acc": 0.7808260363035063, "train_speed(iter/s)": 0.118802 }, { "epoch": 0.002660014467883569, "grad_norm": 1.6121529340744019, "learning_rate": 2.659919553652524e-06, "loss": 0.7770388126373291, "memory(GiB)": 88.99, "step": 205, "token_acc": 0.7706188194515241, "train_speed(iter/s)": 0.118886 }, { "epoch": 0.002724892869539266, "grad_norm": 1.5919891595840454, "learning_rate": 2.7247956403269755e-06, "loss": 0.8116676330566406, "memory(GiB)": 88.99, "step": 210, "token_acc": 0.75459455667789, "train_speed(iter/s)": 0.119057 }, { "epoch": 0.0027897712711949627, "grad_norm": 1.6965458393096924, "learning_rate": 2.7896717270014274e-06, "loss": 0.8515666961669922, "memory(GiB)": 88.99, "step": 215, "token_acc": 0.7462025076707451, "train_speed(iter/s)": 0.11917 }, { "epoch": 0.0028546496728506596, "grad_norm": 1.7966089248657227, "learning_rate": 2.8545478136758794e-06, "loss": 0.8133340835571289, "memory(GiB)": 88.99, "step": 220, "token_acc": 0.7424694884445598, "train_speed(iter/s)": 0.119337 }, { "epoch": 0.0029195280745063566, "grad_norm": 1.5558422803878784, "learning_rate": 2.919423900350331e-06, "loss": 0.7785086154937744, "memory(GiB)": 88.99, "step": 225, "token_acc": 0.7603693499143644, "train_speed(iter/s)": 0.119453 }, { "epoch": 0.002984406476162053, "grad_norm": 1.620419979095459, "learning_rate": 2.984299987024783e-06, "loss": 0.7743879795074463, "memory(GiB)": 88.99, "step": 230, "token_acc": 0.7768516564676092, "train_speed(iter/s)": 0.119483 }, { "epoch": 0.00304928487781775, "grad_norm": 1.6542143821716309, "learning_rate": 3.0491760736992345e-06, "loss": 0.8363024711608886, "memory(GiB)": 88.99, "step": 235, "token_acc": 0.7411748717426019, "train_speed(iter/s)": 0.119584 }, { "epoch": 0.003114163279473447, "grad_norm": 1.6566967964172363, "learning_rate": 3.1140521603736864e-06, "loss": 0.7716734886169434, "memory(GiB)": 88.99, "step": 240, "token_acc": 0.7768668631785802, "train_speed(iter/s)": 0.11962 }, { "epoch": 0.0031790416811291436, "grad_norm": 1.5472670793533325, "learning_rate": 3.178928247048138e-06, "loss": 0.8218975067138672, "memory(GiB)": 88.99, "step": 245, "token_acc": 0.7777815910494886, "train_speed(iter/s)": 0.119868 }, { "epoch": 0.0032439200827848406, "grad_norm": 1.6825416088104248, "learning_rate": 3.24380433372259e-06, "loss": 0.8180648803710937, "memory(GiB)": 88.99, "step": 250, "token_acc": 0.7696312520196761, "train_speed(iter/s)": 0.120053 }, { "epoch": 0.003308798484440537, "grad_norm": 1.7405505180358887, "learning_rate": 3.3086804203970415e-06, "loss": 0.7569472312927246, "memory(GiB)": 88.99, "step": 255, "token_acc": 0.7583772940040122, "train_speed(iter/s)": 0.120109 }, { "epoch": 0.003373676886096234, "grad_norm": 1.5726145505905151, "learning_rate": 3.373556507071494e-06, "loss": 0.790015172958374, "memory(GiB)": 88.99, "step": 260, "token_acc": 0.751433356954197, "train_speed(iter/s)": 0.120096 }, { "epoch": 0.003438555287751931, "grad_norm": 1.484437108039856, "learning_rate": 3.4384325937459454e-06, "loss": 0.7567539215087891, "memory(GiB)": 88.99, "step": 265, "token_acc": 0.7854355543819499, "train_speed(iter/s)": 0.120107 }, { "epoch": 0.0035034336894076276, "grad_norm": 1.5922520160675049, "learning_rate": 3.5033086804203974e-06, "loss": 0.7565516948699951, "memory(GiB)": 88.99, "step": 270, "token_acc": 0.7594816894053137, "train_speed(iter/s)": 0.120136 }, { "epoch": 0.0035683120910633245, "grad_norm": 1.6169992685317993, "learning_rate": 3.568184767094849e-06, "loss": 0.7563276290893555, "memory(GiB)": 88.99, "step": 275, "token_acc": 0.7719404473571734, "train_speed(iter/s)": 0.120178 }, { "epoch": 0.0036331904927190215, "grad_norm": 1.3504871129989624, "learning_rate": 3.633060853769301e-06, "loss": 0.7971472263336181, "memory(GiB)": 88.99, "step": 280, "token_acc": 0.7571265678449259, "train_speed(iter/s)": 0.120229 }, { "epoch": 0.003698068894374718, "grad_norm": 1.583593487739563, "learning_rate": 3.6979369404437525e-06, "loss": 0.7914379596710205, "memory(GiB)": 88.99, "step": 285, "token_acc": 0.7603770351328192, "train_speed(iter/s)": 0.120306 }, { "epoch": 0.003762947296030415, "grad_norm": 1.5785223245620728, "learning_rate": 3.762813027118205e-06, "loss": 0.7989423751831055, "memory(GiB)": 88.99, "step": 290, "token_acc": 0.7602706113292205, "train_speed(iter/s)": 0.120329 }, { "epoch": 0.003827825697686112, "grad_norm": 1.5213161706924438, "learning_rate": 3.827689113792656e-06, "loss": 0.7914615631103515, "memory(GiB)": 88.99, "step": 295, "token_acc": 0.7575522850503486, "train_speed(iter/s)": 0.120428 }, { "epoch": 0.0038927040993418085, "grad_norm": 1.5847012996673584, "learning_rate": 3.892565200467108e-06, "loss": 0.7716495513916015, "memory(GiB)": 88.99, "step": 300, "token_acc": 0.7755852727835264, "train_speed(iter/s)": 0.120326 }, { "epoch": 0.003957582500997505, "grad_norm": 1.6228224039077759, "learning_rate": 3.9574412871415595e-06, "loss": 0.7860604763031006, "memory(GiB)": 88.99, "step": 305, "token_acc": 0.7702477720105768, "train_speed(iter/s)": 0.12034 }, { "epoch": 0.0040224609026532025, "grad_norm": 1.599313497543335, "learning_rate": 4.022317373816012e-06, "loss": 0.755602216720581, "memory(GiB)": 88.99, "step": 310, "token_acc": 0.7941092670418121, "train_speed(iter/s)": 0.120327 }, { "epoch": 0.004087339304308899, "grad_norm": 1.5956838130950928, "learning_rate": 4.0871934604904634e-06, "loss": 0.77231764793396, "memory(GiB)": 88.99, "step": 315, "token_acc": 0.7847880105606461, "train_speed(iter/s)": 0.120409 }, { "epoch": 0.0041522177059645955, "grad_norm": 1.6035664081573486, "learning_rate": 4.152069547164916e-06, "loss": 0.8121329307556152, "memory(GiB)": 88.99, "step": 320, "token_acc": 0.7636363636363637, "train_speed(iter/s)": 0.120411 }, { "epoch": 0.004217096107620293, "grad_norm": 1.540673017501831, "learning_rate": 4.216945633839367e-06, "loss": 0.745725440979004, "memory(GiB)": 88.99, "step": 325, "token_acc": 0.7879506137899317, "train_speed(iter/s)": 0.120341 }, { "epoch": 0.0042819745092759895, "grad_norm": 1.4387177228927612, "learning_rate": 4.281821720513819e-06, "loss": 0.7484595298767089, "memory(GiB)": 88.99, "step": 330, "token_acc": 0.7883790809953071, "train_speed(iter/s)": 0.120319 }, { "epoch": 0.004346852910931686, "grad_norm": 1.5613651275634766, "learning_rate": 4.3466978071882705e-06, "loss": 0.8066761016845703, "memory(GiB)": 88.99, "step": 335, "token_acc": 0.7707006369426752, "train_speed(iter/s)": 0.120257 }, { "epoch": 0.004411731312587383, "grad_norm": 1.644921898841858, "learning_rate": 4.411573893862722e-06, "loss": 0.8112164497375488, "memory(GiB)": 88.99, "step": 340, "token_acc": 0.7571581687337314, "train_speed(iter/s)": 0.120351 }, { "epoch": 0.00447660971424308, "grad_norm": 1.4823637008666992, "learning_rate": 4.476449980537174e-06, "loss": 0.7602863788604737, "memory(GiB)": 88.99, "step": 345, "token_acc": 0.781716487017076, "train_speed(iter/s)": 0.120329 }, { "epoch": 0.0045414881158987765, "grad_norm": 1.63776433467865, "learning_rate": 4.541326067211626e-06, "loss": 0.7734726905822754, "memory(GiB)": 88.99, "step": 350, "token_acc": 0.7807982666982634, "train_speed(iter/s)": 0.120392 }, { "epoch": 0.004606366517554474, "grad_norm": 1.441038966178894, "learning_rate": 4.606202153886078e-06, "loss": 0.7632351875305176, "memory(GiB)": 88.99, "step": 355, "token_acc": 0.7716786895100691, "train_speed(iter/s)": 0.120461 }, { "epoch": 0.00467124491921017, "grad_norm": 1.5190823078155518, "learning_rate": 4.671078240560529e-06, "loss": 0.7766232013702392, "memory(GiB)": 88.99, "step": 360, "token_acc": 0.7857451403887689, "train_speed(iter/s)": 0.120385 }, { "epoch": 0.004736123320865867, "grad_norm": 1.4780420064926147, "learning_rate": 4.735954327234981e-06, "loss": 0.771604585647583, "memory(GiB)": 88.99, "step": 365, "token_acc": 0.7670655600591907, "train_speed(iter/s)": 0.120335 }, { "epoch": 0.004801001722521564, "grad_norm": 1.556571364402771, "learning_rate": 4.800830413909433e-06, "loss": 0.7955253601074219, "memory(GiB)": 88.99, "step": 370, "token_acc": 0.7570559134427163, "train_speed(iter/s)": 0.120416 }, { "epoch": 0.004865880124177261, "grad_norm": 1.7111777067184448, "learning_rate": 4.865706500583885e-06, "loss": 0.757494592666626, "memory(GiB)": 88.99, "step": 375, "token_acc": 0.755387640232854, "train_speed(iter/s)": 0.120489 }, { "epoch": 0.004930758525832957, "grad_norm": 1.777712106704712, "learning_rate": 4.930582587258337e-06, "loss": 0.7754056930541993, "memory(GiB)": 88.99, "step": 380, "token_acc": 0.7813418023025009, "train_speed(iter/s)": 0.12055 }, { "epoch": 0.004995636927488655, "grad_norm": 1.4869753122329712, "learning_rate": 4.9954586739327884e-06, "loss": 0.742713451385498, "memory(GiB)": 88.99, "step": 385, "token_acc": 0.7814744666404603, "train_speed(iter/s)": 0.120547 }, { "epoch": 0.005060515329144351, "grad_norm": 1.6478495597839355, "learning_rate": 5.06033476060724e-06, "loss": 0.7774733543395996, "memory(GiB)": 88.99, "step": 390, "token_acc": 0.782145255147717, "train_speed(iter/s)": 0.120511 }, { "epoch": 0.005125393730800048, "grad_norm": 1.534139633178711, "learning_rate": 5.125210847281692e-06, "loss": 0.7861319541931152, "memory(GiB)": 88.99, "step": 395, "token_acc": 0.767795813600376, "train_speed(iter/s)": 0.120493 }, { "epoch": 0.005190272132455744, "grad_norm": 1.6937205791473389, "learning_rate": 5.190086933956144e-06, "loss": 0.7431811809539794, "memory(GiB)": 88.99, "step": 400, "token_acc": 0.8024006160873144, "train_speed(iter/s)": 0.12043 }, { "epoch": 0.005255150534111442, "grad_norm": 1.5120518207550049, "learning_rate": 5.254963020630596e-06, "loss": 0.749376916885376, "memory(GiB)": 88.99, "step": 405, "token_acc": 0.757150408175251, "train_speed(iter/s)": 0.12036 }, { "epoch": 0.005320028935767138, "grad_norm": 1.7693002223968506, "learning_rate": 5.319839107305048e-06, "loss": 0.7528721809387207, "memory(GiB)": 88.99, "step": 410, "token_acc": 0.778224186420249, "train_speed(iter/s)": 0.120416 }, { "epoch": 0.005384907337422835, "grad_norm": 1.5337936878204346, "learning_rate": 5.384715193979499e-06, "loss": 0.7827159881591796, "memory(GiB)": 88.99, "step": 415, "token_acc": 0.7486437613019892, "train_speed(iter/s)": 0.12042 }, { "epoch": 0.005449785739078532, "grad_norm": 1.5827404260635376, "learning_rate": 5.449591280653951e-06, "loss": 0.7884416580200195, "memory(GiB)": 88.99, "step": 420, "token_acc": 0.7751951194574146, "train_speed(iter/s)": 0.120398 }, { "epoch": 0.005514664140734229, "grad_norm": 1.4603688716888428, "learning_rate": 5.514467367328403e-06, "loss": 0.7633646965026856, "memory(GiB)": 88.99, "step": 425, "token_acc": 0.7717600881421292, "train_speed(iter/s)": 0.120389 }, { "epoch": 0.005579542542389925, "grad_norm": 1.679620623588562, "learning_rate": 5.579343454002855e-06, "loss": 0.7838947296142578, "memory(GiB)": 88.99, "step": 430, "token_acc": 0.7714113187292428, "train_speed(iter/s)": 0.120452 }, { "epoch": 0.005644420944045623, "grad_norm": 1.6713002920150757, "learning_rate": 5.6442195406773064e-06, "loss": 0.7701513290405273, "memory(GiB)": 88.99, "step": 435, "token_acc": 0.7612957968423285, "train_speed(iter/s)": 0.120485 }, { "epoch": 0.005709299345701319, "grad_norm": 1.6575478315353394, "learning_rate": 5.709095627351759e-06, "loss": 0.7572904586791992, "memory(GiB)": 88.99, "step": 440, "token_acc": 0.7947555205047319, "train_speed(iter/s)": 0.120524 }, { "epoch": 0.005774177747357016, "grad_norm": 1.6258591413497925, "learning_rate": 5.7739717140262095e-06, "loss": 0.7807128429412842, "memory(GiB)": 88.99, "step": 445, "token_acc": 0.7578516940416467, "train_speed(iter/s)": 0.120567 }, { "epoch": 0.005839056149012713, "grad_norm": 1.5995073318481445, "learning_rate": 5.838847800700662e-06, "loss": 0.7817898750305176, "memory(GiB)": 88.99, "step": 450, "token_acc": 0.7454181359104959, "train_speed(iter/s)": 0.12064 }, { "epoch": 0.00590393455066841, "grad_norm": 1.4361919164657593, "learning_rate": 5.9037238873751135e-06, "loss": 0.7911847591400146, "memory(GiB)": 88.99, "step": 455, "token_acc": 0.772484038823486, "train_speed(iter/s)": 0.120631 }, { "epoch": 0.005968812952324106, "grad_norm": 1.5771868228912354, "learning_rate": 5.968599974049566e-06, "loss": 0.7834386825561523, "memory(GiB)": 88.99, "step": 460, "token_acc": 0.7799222132531005, "train_speed(iter/s)": 0.120665 }, { "epoch": 0.006033691353979804, "grad_norm": 1.5440179109573364, "learning_rate": 6.033476060724017e-06, "loss": 0.782247257232666, "memory(GiB)": 88.99, "step": 465, "token_acc": 0.7564348471723796, "train_speed(iter/s)": 0.120688 }, { "epoch": 0.0060985697556355, "grad_norm": 1.537164330482483, "learning_rate": 6.098352147398469e-06, "loss": 0.7851636409759521, "memory(GiB)": 88.99, "step": 470, "token_acc": 0.7715116279069767, "train_speed(iter/s)": 0.120605 }, { "epoch": 0.006163448157291197, "grad_norm": 1.676908016204834, "learning_rate": 6.1632282340729205e-06, "loss": 0.7744393348693848, "memory(GiB)": 88.99, "step": 475, "token_acc": 0.7823878069432684, "train_speed(iter/s)": 0.120604 }, { "epoch": 0.006228326558946894, "grad_norm": 1.4424726963043213, "learning_rate": 6.228104320747373e-06, "loss": 0.7507990837097168, "memory(GiB)": 88.99, "step": 480, "token_acc": 0.7757501744591766, "train_speed(iter/s)": 0.120593 }, { "epoch": 0.006293204960602591, "grad_norm": 1.7014158964157104, "learning_rate": 6.292980407421825e-06, "loss": 0.7857891082763672, "memory(GiB)": 88.99, "step": 485, "token_acc": 0.7820235793623715, "train_speed(iter/s)": 0.120624 }, { "epoch": 0.006358083362258287, "grad_norm": 1.5683897733688354, "learning_rate": 6.357856494096276e-06, "loss": 0.8094940185546875, "memory(GiB)": 88.99, "step": 490, "token_acc": 0.7630306622165383, "train_speed(iter/s)": 0.120649 }, { "epoch": 0.006422961763913985, "grad_norm": 1.6461645364761353, "learning_rate": 6.422732580770728e-06, "loss": 0.7418383121490478, "memory(GiB)": 88.99, "step": 495, "token_acc": 0.7581383235983482, "train_speed(iter/s)": 0.120671 }, { "epoch": 0.006487840165569681, "grad_norm": 1.7708282470703125, "learning_rate": 6.48760866744518e-06, "loss": 0.7552562713623047, "memory(GiB)": 88.99, "step": 500, "token_acc": 0.7757850335885018, "train_speed(iter/s)": 0.120643 }, { "epoch": 0.006552718567225378, "grad_norm": 1.511718511581421, "learning_rate": 6.552484754119632e-06, "loss": 0.7978401184082031, "memory(GiB)": 88.99, "step": 505, "token_acc": 0.7556167055627155, "train_speed(iter/s)": 0.120666 }, { "epoch": 0.006617596968881074, "grad_norm": 1.6851942539215088, "learning_rate": 6.617360840794083e-06, "loss": 0.7745277404785156, "memory(GiB)": 88.99, "step": 510, "token_acc": 0.7710567378241094, "train_speed(iter/s)": 0.120618 }, { "epoch": 0.006682475370536772, "grad_norm": 1.5629817247390747, "learning_rate": 6.682236927468535e-06, "loss": 0.7983939170837402, "memory(GiB)": 88.99, "step": 515, "token_acc": 0.7493489583333334, "train_speed(iter/s)": 0.120695 }, { "epoch": 0.006747353772192468, "grad_norm": 1.5985691547393799, "learning_rate": 6.747113014142988e-06, "loss": 0.7826771259307861, "memory(GiB)": 88.99, "step": 520, "token_acc": 0.7732952903785033, "train_speed(iter/s)": 0.120687 }, { "epoch": 0.006812232173848165, "grad_norm": 1.3125470876693726, "learning_rate": 6.811989100817439e-06, "loss": 0.7189431190490723, "memory(GiB)": 88.99, "step": 525, "token_acc": 0.7869453982360466, "train_speed(iter/s)": 0.120657 }, { "epoch": 0.006877110575503862, "grad_norm": 1.6405136585235596, "learning_rate": 6.876865187491891e-06, "loss": 0.7740489959716796, "memory(GiB)": 88.99, "step": 530, "token_acc": 0.7758620689655172, "train_speed(iter/s)": 0.120675 }, { "epoch": 0.006941988977159559, "grad_norm": 1.7218329906463623, "learning_rate": 6.941741274166342e-06, "loss": 0.7664342880249023, "memory(GiB)": 88.99, "step": 535, "token_acc": 0.7670311835851462, "train_speed(iter/s)": 0.120705 }, { "epoch": 0.007006867378815255, "grad_norm": 1.4869719743728638, "learning_rate": 7.006617360840795e-06, "loss": 0.7802762985229492, "memory(GiB)": 88.99, "step": 540, "token_acc": 0.7743733554909292, "train_speed(iter/s)": 0.120726 }, { "epoch": 0.0070717457804709526, "grad_norm": 1.7089225053787231, "learning_rate": 7.0714934475152455e-06, "loss": 0.7865973472595215, "memory(GiB)": 88.99, "step": 545, "token_acc": 0.7806748466257669, "train_speed(iter/s)": 0.120765 }, { "epoch": 0.007136624182126649, "grad_norm": 1.7331798076629639, "learning_rate": 7.136369534189698e-06, "loss": 0.7940951824188233, "memory(GiB)": 88.99, "step": 550, "token_acc": 0.7680044182621503, "train_speed(iter/s)": 0.120763 }, { "epoch": 0.007201502583782346, "grad_norm": 1.4751121997833252, "learning_rate": 7.20124562086415e-06, "loss": 0.7791842460632324, "memory(GiB)": 88.99, "step": 555, "token_acc": 0.7710298812641245, "train_speed(iter/s)": 0.12074 }, { "epoch": 0.007266380985438043, "grad_norm": 1.5206568241119385, "learning_rate": 7.266121707538602e-06, "loss": 0.7746591091156005, "memory(GiB)": 88.99, "step": 560, "token_acc": 0.7671073915183909, "train_speed(iter/s)": 0.12073 }, { "epoch": 0.00733125938709374, "grad_norm": 1.5347800254821777, "learning_rate": 7.3309977942130525e-06, "loss": 0.7881784439086914, "memory(GiB)": 88.99, "step": 565, "token_acc": 0.7763662662725614, "train_speed(iter/s)": 0.120725 }, { "epoch": 0.007396137788749436, "grad_norm": 1.729779839515686, "learning_rate": 7.395873880887505e-06, "loss": 0.750407361984253, "memory(GiB)": 88.99, "step": 570, "token_acc": 0.7725149086489156, "train_speed(iter/s)": 0.120708 }, { "epoch": 0.0074610161904051335, "grad_norm": 1.5035102367401123, "learning_rate": 7.460749967561957e-06, "loss": 0.7684652805328369, "memory(GiB)": 88.99, "step": 575, "token_acc": 0.7730566861908071, "train_speed(iter/s)": 0.120687 }, { "epoch": 0.00752589459206083, "grad_norm": 1.6906331777572632, "learning_rate": 7.52562605423641e-06, "loss": 0.7618125915527344, "memory(GiB)": 88.99, "step": 580, "token_acc": 0.7631311419857676, "train_speed(iter/s)": 0.1207 }, { "epoch": 0.007590772993716527, "grad_norm": 1.5615710020065308, "learning_rate": 7.59050214091086e-06, "loss": 0.7799367904663086, "memory(GiB)": 88.99, "step": 585, "token_acc": 0.7730421094057458, "train_speed(iter/s)": 0.120706 }, { "epoch": 0.007655651395372224, "grad_norm": 1.6343941688537598, "learning_rate": 7.655378227585313e-06, "loss": 0.7559982776641846, "memory(GiB)": 88.99, "step": 590, "token_acc": 0.7727013636164336, "train_speed(iter/s)": 0.120745 }, { "epoch": 0.0077205297970279205, "grad_norm": 1.7993617057800293, "learning_rate": 7.720254314259764e-06, "loss": 0.7837891578674316, "memory(GiB)": 88.99, "step": 595, "token_acc": 0.783558124598587, "train_speed(iter/s)": 0.120782 }, { "epoch": 0.007785408198683617, "grad_norm": 1.4809664487838745, "learning_rate": 7.785130400934216e-06, "loss": 0.8150426864624023, "memory(GiB)": 88.99, "step": 600, "token_acc": 0.7546378091872792, "train_speed(iter/s)": 0.12079 }, { "epoch": 0.007850286600339314, "grad_norm": 1.5885556936264038, "learning_rate": 7.850006487608667e-06, "loss": 0.8111782073974609, "memory(GiB)": 88.99, "step": 605, "token_acc": 0.7716290509259259, "train_speed(iter/s)": 0.120842 }, { "epoch": 0.00791516500199501, "grad_norm": 1.7796320915222168, "learning_rate": 7.914882574283119e-06, "loss": 0.7957969665527344, "memory(GiB)": 88.99, "step": 610, "token_acc": 0.780784469096672, "train_speed(iter/s)": 0.120871 }, { "epoch": 0.007980043403650708, "grad_norm": 1.6342053413391113, "learning_rate": 7.979758660957572e-06, "loss": 0.7749110221862793, "memory(GiB)": 88.99, "step": 615, "token_acc": 0.7889411150280715, "train_speed(iter/s)": 0.120853 }, { "epoch": 0.008044921805306405, "grad_norm": 1.444749355316162, "learning_rate": 8.044634747632024e-06, "loss": 0.7551668167114258, "memory(GiB)": 88.99, "step": 620, "token_acc": 0.7839343808403129, "train_speed(iter/s)": 0.120867 }, { "epoch": 0.0081098002069621, "grad_norm": 1.612693190574646, "learning_rate": 8.109510834306475e-06, "loss": 0.7908324241638184, "memory(GiB)": 88.99, "step": 625, "token_acc": 0.7742276518266181, "train_speed(iter/s)": 0.120883 }, { "epoch": 0.008174678608617798, "grad_norm": 1.5517915487289429, "learning_rate": 8.174386920980927e-06, "loss": 0.7711538314819336, "memory(GiB)": 88.99, "step": 630, "token_acc": 0.7718871782999576, "train_speed(iter/s)": 0.120895 }, { "epoch": 0.008239557010273495, "grad_norm": 1.3879389762878418, "learning_rate": 8.239263007655378e-06, "loss": 0.7749207496643067, "memory(GiB)": 88.99, "step": 635, "token_acc": 0.7604717845635401, "train_speed(iter/s)": 0.120899 }, { "epoch": 0.008304435411929191, "grad_norm": 1.3078559637069702, "learning_rate": 8.304139094329832e-06, "loss": 0.746762752532959, "memory(GiB)": 88.99, "step": 640, "token_acc": 0.7820720008252243, "train_speed(iter/s)": 0.120869 }, { "epoch": 0.008369313813584888, "grad_norm": 1.417127013206482, "learning_rate": 8.369015181004281e-06, "loss": 0.7307230472564697, "memory(GiB)": 88.99, "step": 645, "token_acc": 0.7788510944404481, "train_speed(iter/s)": 0.12083 }, { "epoch": 0.008434192215240586, "grad_norm": 1.7385598421096802, "learning_rate": 8.433891267678735e-06, "loss": 0.7855213165283204, "memory(GiB)": 88.99, "step": 650, "token_acc": 0.7748447743591522, "train_speed(iter/s)": 0.120845 }, { "epoch": 0.008499070616896282, "grad_norm": 1.6111338138580322, "learning_rate": 8.498767354353186e-06, "loss": 0.7760546207427979, "memory(GiB)": 88.99, "step": 655, "token_acc": 0.7880117516883514, "train_speed(iter/s)": 0.120857 }, { "epoch": 0.008563949018551979, "grad_norm": 1.5974353551864624, "learning_rate": 8.563643441027638e-06, "loss": 0.7892468929290771, "memory(GiB)": 88.99, "step": 660, "token_acc": 0.7689669421487604, "train_speed(iter/s)": 0.120856 }, { "epoch": 0.008628827420207676, "grad_norm": 1.4883291721343994, "learning_rate": 8.62851952770209e-06, "loss": 0.7537619590759277, "memory(GiB)": 88.99, "step": 665, "token_acc": 0.7911347795442086, "train_speed(iter/s)": 0.120884 }, { "epoch": 0.008693705821863372, "grad_norm": 1.645544409751892, "learning_rate": 8.693395614376541e-06, "loss": 0.7884732723236084, "memory(GiB)": 88.99, "step": 670, "token_acc": 0.7671790084353016, "train_speed(iter/s)": 0.120887 }, { "epoch": 0.00875858422351907, "grad_norm": 1.7396761178970337, "learning_rate": 8.758271701050994e-06, "loss": 0.7885085105895996, "memory(GiB)": 88.99, "step": 675, "token_acc": 0.7702906684212564, "train_speed(iter/s)": 0.120883 }, { "epoch": 0.008823462625174767, "grad_norm": 1.6165481805801392, "learning_rate": 8.823147787725444e-06, "loss": 0.7960989475250244, "memory(GiB)": 88.99, "step": 680, "token_acc": 0.767362360561353, "train_speed(iter/s)": 0.120869 }, { "epoch": 0.008888341026830462, "grad_norm": 1.394339919090271, "learning_rate": 8.888023874399897e-06, "loss": 0.7779340744018555, "memory(GiB)": 88.99, "step": 685, "token_acc": 0.7650054437537316, "train_speed(iter/s)": 0.120862 }, { "epoch": 0.00895321942848616, "grad_norm": 1.627441167831421, "learning_rate": 8.952899961074349e-06, "loss": 0.8120803833007812, "memory(GiB)": 88.99, "step": 690, "token_acc": 0.7545879812882331, "train_speed(iter/s)": 0.120901 }, { "epoch": 0.009018097830141857, "grad_norm": 1.5875228643417358, "learning_rate": 9.0177760477488e-06, "loss": 0.7872458457946777, "memory(GiB)": 88.99, "step": 695, "token_acc": 0.7787601591457817, "train_speed(iter/s)": 0.120914 }, { "epoch": 0.009082976231797553, "grad_norm": 1.456844687461853, "learning_rate": 9.082652134423252e-06, "loss": 0.7619693756103516, "memory(GiB)": 88.99, "step": 700, "token_acc": 0.7865318580694391, "train_speed(iter/s)": 0.120932 }, { "epoch": 0.00914785463345325, "grad_norm": 1.5902259349822998, "learning_rate": 9.147528221097703e-06, "loss": 0.7843690872192383, "memory(GiB)": 88.99, "step": 705, "token_acc": 0.7713069731912262, "train_speed(iter/s)": 0.120896 }, { "epoch": 0.009212733035108948, "grad_norm": 1.7407634258270264, "learning_rate": 9.212404307772157e-06, "loss": 0.7747676849365235, "memory(GiB)": 88.99, "step": 710, "token_acc": 0.776157450796626, "train_speed(iter/s)": 0.120924 }, { "epoch": 0.009277611436764643, "grad_norm": 1.513055443763733, "learning_rate": 9.277280394446608e-06, "loss": 0.7524981498718262, "memory(GiB)": 88.99, "step": 715, "token_acc": 0.7773573431534849, "train_speed(iter/s)": 0.120952 }, { "epoch": 0.00934248983842034, "grad_norm": 1.5682921409606934, "learning_rate": 9.342156481121058e-06, "loss": 0.7798657417297363, "memory(GiB)": 88.99, "step": 720, "token_acc": 0.7618179316039358, "train_speed(iter/s)": 0.12094 }, { "epoch": 0.009407368240076038, "grad_norm": 1.3596172332763672, "learning_rate": 9.407032567795511e-06, "loss": 0.7656975269317627, "memory(GiB)": 88.99, "step": 725, "token_acc": 0.7669680062151282, "train_speed(iter/s)": 0.120946 }, { "epoch": 0.009472246641731734, "grad_norm": 1.5097339153289795, "learning_rate": 9.471908654469963e-06, "loss": 0.7964097499847412, "memory(GiB)": 88.99, "step": 730, "token_acc": 0.7622796785899429, "train_speed(iter/s)": 0.120943 }, { "epoch": 0.009537125043387431, "grad_norm": 1.5011403560638428, "learning_rate": 9.536784741144416e-06, "loss": 0.8019659042358398, "memory(GiB)": 88.99, "step": 735, "token_acc": 0.7752509731612375, "train_speed(iter/s)": 0.120928 }, { "epoch": 0.009602003445043129, "grad_norm": 1.536797046661377, "learning_rate": 9.601660827818866e-06, "loss": 0.7569317817687988, "memory(GiB)": 88.99, "step": 740, "token_acc": 0.7782495228450123, "train_speed(iter/s)": 0.120938 }, { "epoch": 0.009666881846698824, "grad_norm": 1.3230431079864502, "learning_rate": 9.666536914493317e-06, "loss": 0.7365289211273194, "memory(GiB)": 88.99, "step": 745, "token_acc": 0.7896295321445022, "train_speed(iter/s)": 0.120926 }, { "epoch": 0.009731760248354522, "grad_norm": 1.325033187866211, "learning_rate": 9.73141300116777e-06, "loss": 0.7740601539611817, "memory(GiB)": 88.99, "step": 750, "token_acc": 0.7680759501987451, "train_speed(iter/s)": 0.120919 }, { "epoch": 0.00979663865001022, "grad_norm": 1.52169930934906, "learning_rate": 9.79628908784222e-06, "loss": 0.7559280395507812, "memory(GiB)": 88.99, "step": 755, "token_acc": 0.7638211903540277, "train_speed(iter/s)": 0.12093 }, { "epoch": 0.009861517051665915, "grad_norm": 1.550423502922058, "learning_rate": 9.861165174516674e-06, "loss": 0.7413440227508545, "memory(GiB)": 88.99, "step": 760, "token_acc": 0.7946013960873604, "train_speed(iter/s)": 0.120913 }, { "epoch": 0.009926395453321612, "grad_norm": 1.5364054441452026, "learning_rate": 9.926041261191125e-06, "loss": 0.7869215488433838, "memory(GiB)": 88.99, "step": 765, "token_acc": 0.7487464975667306, "train_speed(iter/s)": 0.120957 }, { "epoch": 0.00999127385497731, "grad_norm": 1.5263090133666992, "learning_rate": 9.990917347865577e-06, "loss": 0.8179900169372558, "memory(GiB)": 88.99, "step": 770, "token_acc": 0.7733578284343131, "train_speed(iter/s)": 0.120969 }, { "epoch": 0.010056152256633005, "grad_norm": 1.361249327659607, "learning_rate": 1.0055793434540028e-05, "loss": 0.7354475975036621, "memory(GiB)": 88.99, "step": 775, "token_acc": 0.780567081604426, "train_speed(iter/s)": 0.120957 }, { "epoch": 0.010121030658288703, "grad_norm": 1.501365303993225, "learning_rate": 1.012066952121448e-05, "loss": 0.7841170310974122, "memory(GiB)": 88.99, "step": 780, "token_acc": 0.7736553494341384, "train_speed(iter/s)": 0.120986 }, { "epoch": 0.0101859090599444, "grad_norm": 1.3933998346328735, "learning_rate": 1.0185545607888933e-05, "loss": 0.7463688850402832, "memory(GiB)": 88.99, "step": 785, "token_acc": 0.7405287644930212, "train_speed(iter/s)": 0.120972 }, { "epoch": 0.010250787461600096, "grad_norm": 1.429369330406189, "learning_rate": 1.0250421694563385e-05, "loss": 0.7584861278533935, "memory(GiB)": 88.99, "step": 790, "token_acc": 0.7771158735924446, "train_speed(iter/s)": 0.121011 }, { "epoch": 0.010315665863255793, "grad_norm": 1.4375731945037842, "learning_rate": 1.0315297781237836e-05, "loss": 0.7550110816955566, "memory(GiB)": 88.99, "step": 795, "token_acc": 0.7777160413125899, "train_speed(iter/s)": 0.12101 }, { "epoch": 0.010380544264911489, "grad_norm": 1.519633412361145, "learning_rate": 1.0380173867912288e-05, "loss": 0.8225497245788574, "memory(GiB)": 88.99, "step": 800, "token_acc": 0.7484290843806104, "train_speed(iter/s)": 0.121011 }, { "epoch": 0.010445422666567186, "grad_norm": 1.3774982690811157, "learning_rate": 1.044504995458674e-05, "loss": 0.7988626956939697, "memory(GiB)": 88.99, "step": 805, "token_acc": 0.7681908399805265, "train_speed(iter/s)": 0.121015 }, { "epoch": 0.010510301068222884, "grad_norm": 1.4984382390975952, "learning_rate": 1.0509926041261193e-05, "loss": 0.7920001983642578, "memory(GiB)": 88.99, "step": 810, "token_acc": 0.7691175439826172, "train_speed(iter/s)": 0.121036 }, { "epoch": 0.01057517946987858, "grad_norm": 1.4115475416183472, "learning_rate": 1.0574802127935642e-05, "loss": 0.8027165412902832, "memory(GiB)": 88.99, "step": 815, "token_acc": 0.7684788200994683, "train_speed(iter/s)": 0.12104 }, { "epoch": 0.010640057871534277, "grad_norm": 1.5469005107879639, "learning_rate": 1.0639678214610096e-05, "loss": 0.7772024631500244, "memory(GiB)": 88.99, "step": 820, "token_acc": 0.7629185139886867, "train_speed(iter/s)": 0.121077 }, { "epoch": 0.010704936273189974, "grad_norm": 1.4085545539855957, "learning_rate": 1.0704554301284547e-05, "loss": 0.7876359939575195, "memory(GiB)": 88.99, "step": 825, "token_acc": 0.7903298066650584, "train_speed(iter/s)": 0.121084 }, { "epoch": 0.01076981467484567, "grad_norm": 1.480093240737915, "learning_rate": 1.0769430387958999e-05, "loss": 0.7705525875091552, "memory(GiB)": 88.99, "step": 830, "token_acc": 0.761519805982215, "train_speed(iter/s)": 0.121048 }, { "epoch": 0.010834693076501367, "grad_norm": 1.3595157861709595, "learning_rate": 1.083430647463345e-05, "loss": 0.7666215419769287, "memory(GiB)": 88.99, "step": 835, "token_acc": 0.7708062196040825, "train_speed(iter/s)": 0.121018 }, { "epoch": 0.010899571478157065, "grad_norm": 1.447662353515625, "learning_rate": 1.0899182561307902e-05, "loss": 0.800776195526123, "memory(GiB)": 88.99, "step": 840, "token_acc": 0.7668604831939305, "train_speed(iter/s)": 0.121022 }, { "epoch": 0.01096444987981276, "grad_norm": 1.3211452960968018, "learning_rate": 1.0964058647982355e-05, "loss": 0.7278727531433106, "memory(GiB)": 88.99, "step": 845, "token_acc": 0.7803736959869817, "train_speed(iter/s)": 0.12101 }, { "epoch": 0.011029328281468458, "grad_norm": 1.3030983209609985, "learning_rate": 1.1028934734656807e-05, "loss": 0.7509611129760743, "memory(GiB)": 88.99, "step": 850, "token_acc": 0.7944630299720079, "train_speed(iter/s)": 0.120968 }, { "epoch": 0.011094206683124155, "grad_norm": 1.3583621978759766, "learning_rate": 1.1093810821331258e-05, "loss": 0.7491464138031005, "memory(GiB)": 88.99, "step": 855, "token_acc": 0.7822228244300667, "train_speed(iter/s)": 0.120974 }, { "epoch": 0.01115908508477985, "grad_norm": 1.622078776359558, "learning_rate": 1.115868690800571e-05, "loss": 0.7803543090820313, "memory(GiB)": 88.99, "step": 860, "token_acc": 0.7578767853336176, "train_speed(iter/s)": 0.120989 }, { "epoch": 0.011223963486435548, "grad_norm": 1.5720829963684082, "learning_rate": 1.1223562994680161e-05, "loss": 0.7556359767913818, "memory(GiB)": 88.99, "step": 865, "token_acc": 0.7549557288225188, "train_speed(iter/s)": 0.120989 }, { "epoch": 0.011288841888091245, "grad_norm": 1.3739575147628784, "learning_rate": 1.1288439081354613e-05, "loss": 0.7580018997192383, "memory(GiB)": 88.99, "step": 870, "token_acc": 0.7642188503261127, "train_speed(iter/s)": 0.120965 }, { "epoch": 0.011353720289746941, "grad_norm": 1.4265506267547607, "learning_rate": 1.1353315168029064e-05, "loss": 0.7915122985839844, "memory(GiB)": 88.99, "step": 875, "token_acc": 0.7490128331688055, "train_speed(iter/s)": 0.12097 }, { "epoch": 0.011418598691402639, "grad_norm": 1.4449727535247803, "learning_rate": 1.1418191254703518e-05, "loss": 0.7835105895996094, "memory(GiB)": 88.99, "step": 880, "token_acc": 0.7461764251506257, "train_speed(iter/s)": 0.120972 }, { "epoch": 0.011483477093058336, "grad_norm": 1.4010093212127686, "learning_rate": 1.148306734137797e-05, "loss": 0.7685888767242431, "memory(GiB)": 88.99, "step": 885, "token_acc": 0.7718791946308725, "train_speed(iter/s)": 0.120959 }, { "epoch": 0.011548355494714032, "grad_norm": 1.3375827074050903, "learning_rate": 1.1547943428052419e-05, "loss": 0.7718112945556641, "memory(GiB)": 88.99, "step": 890, "token_acc": 0.7584947165809012, "train_speed(iter/s)": 0.120991 }, { "epoch": 0.011613233896369729, "grad_norm": 1.733007788658142, "learning_rate": 1.1612819514726872e-05, "loss": 0.7982136726379394, "memory(GiB)": 88.99, "step": 895, "token_acc": 0.7812229212975172, "train_speed(iter/s)": 0.120988 }, { "epoch": 0.011678112298025426, "grad_norm": 1.406057357788086, "learning_rate": 1.1677695601401324e-05, "loss": 0.7965377807617188, "memory(GiB)": 88.99, "step": 900, "token_acc": 0.764695560624671, "train_speed(iter/s)": 0.120992 }, { "epoch": 0.011742990699681122, "grad_norm": 1.5243380069732666, "learning_rate": 1.1742571688075777e-05, "loss": 0.7783063411712646, "memory(GiB)": 88.99, "step": 905, "token_acc": 0.789471799462847, "train_speed(iter/s)": 0.121006 }, { "epoch": 0.01180786910133682, "grad_norm": 1.471895694732666, "learning_rate": 1.1807447774750227e-05, "loss": 0.792516040802002, "memory(GiB)": 88.99, "step": 910, "token_acc": 0.7699312884242364, "train_speed(iter/s)": 0.12101 }, { "epoch": 0.011872747502992517, "grad_norm": 1.4713935852050781, "learning_rate": 1.1872323861424678e-05, "loss": 0.7606297492980957, "memory(GiB)": 88.99, "step": 915, "token_acc": 0.7665484187485848, "train_speed(iter/s)": 0.121019 }, { "epoch": 0.011937625904648213, "grad_norm": 1.3422436714172363, "learning_rate": 1.1937199948099132e-05, "loss": 0.7942726135253906, "memory(GiB)": 88.99, "step": 920, "token_acc": 0.7822605965463109, "train_speed(iter/s)": 0.120999 }, { "epoch": 0.01200250430630391, "grad_norm": 1.379199504852295, "learning_rate": 1.2002076034773583e-05, "loss": 0.7308454513549805, "memory(GiB)": 88.99, "step": 925, "token_acc": 0.776418372412182, "train_speed(iter/s)": 0.120995 }, { "epoch": 0.012067382707959607, "grad_norm": 1.4105873107910156, "learning_rate": 1.2066952121448035e-05, "loss": 0.7801652908325195, "memory(GiB)": 88.99, "step": 930, "token_acc": 0.7593141405588484, "train_speed(iter/s)": 0.120993 }, { "epoch": 0.012132261109615303, "grad_norm": 1.4529753923416138, "learning_rate": 1.2131828208122486e-05, "loss": 0.7717767715454101, "memory(GiB)": 88.99, "step": 935, "token_acc": 0.7755362710845637, "train_speed(iter/s)": 0.121012 }, { "epoch": 0.012197139511271, "grad_norm": 1.5389188528060913, "learning_rate": 1.2196704294796938e-05, "loss": 0.7840795516967773, "memory(GiB)": 88.99, "step": 940, "token_acc": 0.7699959122496253, "train_speed(iter/s)": 0.120971 }, { "epoch": 0.012262017912926698, "grad_norm": 1.437314510345459, "learning_rate": 1.2261580381471391e-05, "loss": 0.7849497318267822, "memory(GiB)": 88.99, "step": 945, "token_acc": 0.7678724825808915, "train_speed(iter/s)": 0.120961 }, { "epoch": 0.012326896314582394, "grad_norm": 1.4155938625335693, "learning_rate": 1.2326456468145841e-05, "loss": 0.7867606163024903, "memory(GiB)": 88.99, "step": 950, "token_acc": 0.7840063570035397, "train_speed(iter/s)": 0.120979 }, { "epoch": 0.012391774716238091, "grad_norm": 1.3128910064697266, "learning_rate": 1.2391332554820294e-05, "loss": 0.7840981006622314, "memory(GiB)": 88.99, "step": 955, "token_acc": 0.7606411980662281, "train_speed(iter/s)": 0.120974 }, { "epoch": 0.012456653117893788, "grad_norm": 1.5400124788284302, "learning_rate": 1.2456208641494746e-05, "loss": 0.7999919891357422, "memory(GiB)": 88.99, "step": 960, "token_acc": 0.746576511969402, "train_speed(iter/s)": 0.120974 }, { "epoch": 0.012521531519549484, "grad_norm": 1.385090708732605, "learning_rate": 1.2521084728169197e-05, "loss": 0.7846032619476319, "memory(GiB)": 88.99, "step": 965, "token_acc": 0.7544318031485646, "train_speed(iter/s)": 0.120998 }, { "epoch": 0.012586409921205181, "grad_norm": 1.2454363107681274, "learning_rate": 1.258596081484365e-05, "loss": 0.8086328506469727, "memory(GiB)": 88.99, "step": 970, "token_acc": 0.7561565458459905, "train_speed(iter/s)": 0.120972 }, { "epoch": 0.012651288322860879, "grad_norm": 1.3294788599014282, "learning_rate": 1.26508369015181e-05, "loss": 0.7703170776367188, "memory(GiB)": 88.99, "step": 975, "token_acc": 0.7577779389368338, "train_speed(iter/s)": 0.120981 }, { "epoch": 0.012716166724516574, "grad_norm": 1.3650318384170532, "learning_rate": 1.2715712988192552e-05, "loss": 0.7638664245605469, "memory(GiB)": 88.99, "step": 980, "token_acc": 0.7676040237768633, "train_speed(iter/s)": 0.121001 }, { "epoch": 0.012781045126172272, "grad_norm": 1.3574118614196777, "learning_rate": 1.2780589074867005e-05, "loss": 0.7604038715362549, "memory(GiB)": 88.99, "step": 985, "token_acc": 0.7717529315109418, "train_speed(iter/s)": 0.121009 }, { "epoch": 0.01284592352782797, "grad_norm": 1.4957056045532227, "learning_rate": 1.2845465161541457e-05, "loss": 0.7843387603759766, "memory(GiB)": 88.99, "step": 990, "token_acc": 0.7742864539512005, "train_speed(iter/s)": 0.121004 }, { "epoch": 0.012910801929483665, "grad_norm": 1.483878254890442, "learning_rate": 1.2910341248215907e-05, "loss": 0.8171511650085449, "memory(GiB)": 88.99, "step": 995, "token_acc": 0.7657834816808115, "train_speed(iter/s)": 0.121009 }, { "epoch": 0.012975680331139362, "grad_norm": 1.42325758934021, "learning_rate": 1.297521733489036e-05, "loss": 0.7528001308441162, "memory(GiB)": 88.99, "step": 1000, "token_acc": 0.7520244017575075, "train_speed(iter/s)": 0.121033 }, { "epoch": 0.01304055873279506, "grad_norm": 1.2656631469726562, "learning_rate": 1.3040093421564811e-05, "loss": 0.7772601127624512, "memory(GiB)": 88.99, "step": 1005, "token_acc": 0.7638668255229017, "train_speed(iter/s)": 0.121041 }, { "epoch": 0.013105437134450755, "grad_norm": 1.3651313781738281, "learning_rate": 1.3104969508239265e-05, "loss": 0.7643068313598633, "memory(GiB)": 88.99, "step": 1010, "token_acc": 0.7624582805846473, "train_speed(iter/s)": 0.121049 }, { "epoch": 0.013170315536106453, "grad_norm": 1.4499164819717407, "learning_rate": 1.3169845594913716e-05, "loss": 0.802497673034668, "memory(GiB)": 88.99, "step": 1015, "token_acc": 0.7609862462260987, "train_speed(iter/s)": 0.121055 }, { "epoch": 0.013235193937762148, "grad_norm": 1.3379343748092651, "learning_rate": 1.3234721681588166e-05, "loss": 0.7739057540893555, "memory(GiB)": 88.99, "step": 1020, "token_acc": 0.774463651798117, "train_speed(iter/s)": 0.121039 }, { "epoch": 0.013300072339417846, "grad_norm": 1.2017782926559448, "learning_rate": 1.329959776826262e-05, "loss": 0.7882849693298339, "memory(GiB)": 88.99, "step": 1025, "token_acc": 0.7689829728485964, "train_speed(iter/s)": 0.121022 }, { "epoch": 0.013364950741073543, "grad_norm": 1.4153211116790771, "learning_rate": 1.336447385493707e-05, "loss": 0.7619176864624023, "memory(GiB)": 88.99, "step": 1030, "token_acc": 0.766272819202725, "train_speed(iter/s)": 0.120999 }, { "epoch": 0.013429829142729239, "grad_norm": 1.4137516021728516, "learning_rate": 1.3429349941611522e-05, "loss": 0.8119400978088379, "memory(GiB)": 88.99, "step": 1035, "token_acc": 0.7659113086848097, "train_speed(iter/s)": 0.120996 }, { "epoch": 0.013494707544384936, "grad_norm": 1.4864331483840942, "learning_rate": 1.3494226028285976e-05, "loss": 0.7880813598632812, "memory(GiB)": 88.99, "step": 1040, "token_acc": 0.7639299759858003, "train_speed(iter/s)": 0.120986 }, { "epoch": 0.013559585946040634, "grad_norm": 1.435581922531128, "learning_rate": 1.3559102114960425e-05, "loss": 0.7931936264038086, "memory(GiB)": 88.99, "step": 1045, "token_acc": 0.7642078295372887, "train_speed(iter/s)": 0.120969 }, { "epoch": 0.01362446434769633, "grad_norm": 1.372991681098938, "learning_rate": 1.3623978201634879e-05, "loss": 0.773499584197998, "memory(GiB)": 88.99, "step": 1050, "token_acc": 0.7782622681296687, "train_speed(iter/s)": 0.12097 }, { "epoch": 0.013689342749352027, "grad_norm": 1.3987115621566772, "learning_rate": 1.368885428830933e-05, "loss": 0.7554070472717285, "memory(GiB)": 88.99, "step": 1055, "token_acc": 0.7707226548286151, "train_speed(iter/s)": 0.120994 }, { "epoch": 0.013754221151007724, "grad_norm": 1.4154067039489746, "learning_rate": 1.3753730374983782e-05, "loss": 0.784373664855957, "memory(GiB)": 88.99, "step": 1060, "token_acc": 0.7847406914893617, "train_speed(iter/s)": 0.120967 }, { "epoch": 0.01381909955266342, "grad_norm": 1.2835272550582886, "learning_rate": 1.3818606461658235e-05, "loss": 0.7629591941833496, "memory(GiB)": 88.99, "step": 1065, "token_acc": 0.7510412206609318, "train_speed(iter/s)": 0.120942 }, { "epoch": 0.013883977954319117, "grad_norm": 1.2087140083312988, "learning_rate": 1.3883482548332685e-05, "loss": 0.7536805629730224, "memory(GiB)": 88.99, "step": 1070, "token_acc": 0.786323839077912, "train_speed(iter/s)": 0.120943 }, { "epoch": 0.013948856355974815, "grad_norm": 1.499936580657959, "learning_rate": 1.3948358635007136e-05, "loss": 0.7950594425201416, "memory(GiB)": 88.99, "step": 1075, "token_acc": 0.7650608825304412, "train_speed(iter/s)": 0.120946 }, { "epoch": 0.01401373475763051, "grad_norm": 1.3120145797729492, "learning_rate": 1.401323472168159e-05, "loss": 0.7935418605804443, "memory(GiB)": 88.99, "step": 1080, "token_acc": 0.7596114658334465, "train_speed(iter/s)": 0.120962 }, { "epoch": 0.014078613159286208, "grad_norm": 1.325702428817749, "learning_rate": 1.4078110808356041e-05, "loss": 0.7875230789184571, "memory(GiB)": 88.99, "step": 1085, "token_acc": 0.7727141739738997, "train_speed(iter/s)": 0.120957 }, { "epoch": 0.014143491560941905, "grad_norm": 1.206756830215454, "learning_rate": 1.4142986895030491e-05, "loss": 0.7929877281188965, "memory(GiB)": 88.99, "step": 1090, "token_acc": 0.7542499837165375, "train_speed(iter/s)": 0.120955 }, { "epoch": 0.0142083699625976, "grad_norm": 1.3708581924438477, "learning_rate": 1.4207862981704944e-05, "loss": 0.7602263450622558, "memory(GiB)": 88.99, "step": 1095, "token_acc": 0.7777568450138156, "train_speed(iter/s)": 0.120949 }, { "epoch": 0.014273248364253298, "grad_norm": 1.4624006748199463, "learning_rate": 1.4272739068379396e-05, "loss": 0.7810771465301514, "memory(GiB)": 88.99, "step": 1100, "token_acc": 0.7589705610072209, "train_speed(iter/s)": 0.120941 }, { "epoch": 0.014338126765908996, "grad_norm": 1.4321434497833252, "learning_rate": 1.4337615155053849e-05, "loss": 0.7443621158599854, "memory(GiB)": 88.99, "step": 1105, "token_acc": 0.7806929298619774, "train_speed(iter/s)": 0.120922 }, { "epoch": 0.014403005167564691, "grad_norm": 1.207082748413086, "learning_rate": 1.44024912417283e-05, "loss": 0.7780801773071289, "memory(GiB)": 88.99, "step": 1110, "token_acc": 0.7638664067919791, "train_speed(iter/s)": 0.120923 }, { "epoch": 0.014467883569220389, "grad_norm": 1.3135071992874146, "learning_rate": 1.446736732840275e-05, "loss": 0.8058554649353027, "memory(GiB)": 88.99, "step": 1115, "token_acc": 0.7554621848739496, "train_speed(iter/s)": 0.12093 }, { "epoch": 0.014532761970876086, "grad_norm": 1.3215055465698242, "learning_rate": 1.4532243415077204e-05, "loss": 0.7692514419555664, "memory(GiB)": 88.99, "step": 1120, "token_acc": 0.7700924415443176, "train_speed(iter/s)": 0.120938 }, { "epoch": 0.014597640372531782, "grad_norm": 1.4765125513076782, "learning_rate": 1.4597119501751655e-05, "loss": 0.7751624107360839, "memory(GiB)": 88.99, "step": 1125, "token_acc": 0.7669339127828563, "train_speed(iter/s)": 0.120956 }, { "epoch": 0.01466251877418748, "grad_norm": 1.2353060245513916, "learning_rate": 1.4661995588426105e-05, "loss": 0.7634161472320556, "memory(GiB)": 88.99, "step": 1130, "token_acc": 0.7630998254454434, "train_speed(iter/s)": 0.120956 }, { "epoch": 0.014727397175843177, "grad_norm": 1.2847663164138794, "learning_rate": 1.472687167510056e-05, "loss": 0.7396027565002441, "memory(GiB)": 88.99, "step": 1135, "token_acc": 0.7782121691644172, "train_speed(iter/s)": 0.12095 }, { "epoch": 0.014792275577498872, "grad_norm": 1.3910115957260132, "learning_rate": 1.479174776177501e-05, "loss": 0.8068326950073242, "memory(GiB)": 88.99, "step": 1140, "token_acc": 0.7665747860147977, "train_speed(iter/s)": 0.120977 }, { "epoch": 0.01485715397915457, "grad_norm": 1.318517804145813, "learning_rate": 1.4856623848449463e-05, "loss": 0.8108409881591797, "memory(GiB)": 88.99, "step": 1145, "token_acc": 0.7802239711541795, "train_speed(iter/s)": 0.120976 }, { "epoch": 0.014922032380810267, "grad_norm": 1.4316216707229614, "learning_rate": 1.4921499935123915e-05, "loss": 0.7926053047180176, "memory(GiB)": 88.99, "step": 1150, "token_acc": 0.7602514116698037, "train_speed(iter/s)": 0.120981 }, { "epoch": 0.014986910782465963, "grad_norm": 1.3846346139907837, "learning_rate": 1.4986376021798364e-05, "loss": 0.8049087524414062, "memory(GiB)": 88.99, "step": 1155, "token_acc": 0.7549708254357602, "train_speed(iter/s)": 0.120992 }, { "epoch": 0.01505178918412166, "grad_norm": 1.4106427431106567, "learning_rate": 1.505125210847282e-05, "loss": 0.8152707099914551, "memory(GiB)": 88.99, "step": 1160, "token_acc": 0.7713218433316074, "train_speed(iter/s)": 0.120978 }, { "epoch": 0.015116667585777357, "grad_norm": 1.376334309577942, "learning_rate": 1.511612819514727e-05, "loss": 0.8087575912475586, "memory(GiB)": 88.99, "step": 1165, "token_acc": 0.7521818550842297, "train_speed(iter/s)": 0.120985 }, { "epoch": 0.015181545987433053, "grad_norm": 1.3464103937149048, "learning_rate": 1.518100428182172e-05, "loss": 0.7526932716369629, "memory(GiB)": 88.99, "step": 1170, "token_acc": 0.7733970660952695, "train_speed(iter/s)": 0.120991 }, { "epoch": 0.01524642438908875, "grad_norm": 1.309433937072754, "learning_rate": 1.5245880368496174e-05, "loss": 0.7770566940307617, "memory(GiB)": 88.99, "step": 1175, "token_acc": 0.7641756221719457, "train_speed(iter/s)": 0.121029 }, { "epoch": 0.015311302790744448, "grad_norm": 1.4957537651062012, "learning_rate": 1.5310756455170626e-05, "loss": 0.7670279979705811, "memory(GiB)": 88.99, "step": 1180, "token_acc": 0.7579278000086055, "train_speed(iter/s)": 0.121035 }, { "epoch": 0.015376181192400144, "grad_norm": 1.3201152086257935, "learning_rate": 1.5375632541845075e-05, "loss": 0.7681041240692139, "memory(GiB)": 88.99, "step": 1185, "token_acc": 0.7793047452937992, "train_speed(iter/s)": 0.121019 }, { "epoch": 0.015441059594055841, "grad_norm": 1.3186240196228027, "learning_rate": 1.544050862851953e-05, "loss": 0.7992112636566162, "memory(GiB)": 88.99, "step": 1190, "token_acc": 0.7637014148642055, "train_speed(iter/s)": 0.121031 }, { "epoch": 0.015505937995711538, "grad_norm": 1.5308330059051514, "learning_rate": 1.550538471519398e-05, "loss": 0.7946316719055175, "memory(GiB)": 88.99, "step": 1195, "token_acc": 0.7713078016251868, "train_speed(iter/s)": 0.121058 }, { "epoch": 0.015570816397367234, "grad_norm": 1.2821868658065796, "learning_rate": 1.5570260801868432e-05, "loss": 0.7751747608184815, "memory(GiB)": 88.99, "step": 1200, "token_acc": 0.7565886336866903, "train_speed(iter/s)": 0.121087 }, { "epoch": 0.01563569479902293, "grad_norm": 1.374962329864502, "learning_rate": 1.5635136888542885e-05, "loss": 0.8180643081665039, "memory(GiB)": 88.99, "step": 1205, "token_acc": 0.7774783503906013, "train_speed(iter/s)": 0.121092 }, { "epoch": 0.01570057320067863, "grad_norm": 1.2265961170196533, "learning_rate": 1.5700012975217335e-05, "loss": 0.7834359169006347, "memory(GiB)": 88.99, "step": 1210, "token_acc": 0.7625341841385597, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.015765451602334325, "grad_norm": 1.2128089666366577, "learning_rate": 1.5764889061891788e-05, "loss": 0.7659594535827636, "memory(GiB)": 88.99, "step": 1215, "token_acc": 0.7778349101878513, "train_speed(iter/s)": 0.121063 }, { "epoch": 0.01583033000399002, "grad_norm": 1.3955893516540527, "learning_rate": 1.5829765148566238e-05, "loss": 0.7864768028259277, "memory(GiB)": 88.99, "step": 1220, "token_acc": 0.7731794829976352, "train_speed(iter/s)": 0.121058 }, { "epoch": 0.01589520840564572, "grad_norm": 1.3482273817062378, "learning_rate": 1.589464123524069e-05, "loss": 0.7956264972686767, "memory(GiB)": 88.99, "step": 1225, "token_acc": 0.740721616213774, "train_speed(iter/s)": 0.121084 }, { "epoch": 0.015960086807301415, "grad_norm": 1.2203502655029297, "learning_rate": 1.5959517321915144e-05, "loss": 0.7681220054626465, "memory(GiB)": 88.99, "step": 1230, "token_acc": 0.7709503162982444, "train_speed(iter/s)": 0.121101 }, { "epoch": 0.01602496520895711, "grad_norm": 1.3605225086212158, "learning_rate": 1.6024393408589594e-05, "loss": 0.7674593448638916, "memory(GiB)": 88.99, "step": 1235, "token_acc": 0.7616452409194371, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.01608984361061281, "grad_norm": 1.2495280504226685, "learning_rate": 1.6089269495264048e-05, "loss": 0.7579440593719482, "memory(GiB)": 88.99, "step": 1240, "token_acc": 0.7795187880588148, "train_speed(iter/s)": 0.121106 }, { "epoch": 0.016154722012268505, "grad_norm": 1.360642433166504, "learning_rate": 1.6154145581938497e-05, "loss": 0.7716471672058105, "memory(GiB)": 88.99, "step": 1245, "token_acc": 0.7683377308707124, "train_speed(iter/s)": 0.121137 }, { "epoch": 0.0162196004139242, "grad_norm": 1.47996187210083, "learning_rate": 1.621902166861295e-05, "loss": 0.7815244674682618, "memory(GiB)": 88.99, "step": 1250, "token_acc": 0.7683873750808828, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.0162844788155799, "grad_norm": 1.23853600025177, "learning_rate": 1.6283897755287404e-05, "loss": 0.7547944068908692, "memory(GiB)": 88.99, "step": 1255, "token_acc": 0.7591486674909894, "train_speed(iter/s)": 0.121107 }, { "epoch": 0.016349357217235596, "grad_norm": 1.4894394874572754, "learning_rate": 1.6348773841961854e-05, "loss": 0.7598337650299072, "memory(GiB)": 88.99, "step": 1260, "token_acc": 0.7733724914341654, "train_speed(iter/s)": 0.121083 }, { "epoch": 0.01641423561889129, "grad_norm": 1.3006147146224976, "learning_rate": 1.6413649928636304e-05, "loss": 0.7956934928894043, "memory(GiB)": 88.99, "step": 1265, "token_acc": 0.7715933207864494, "train_speed(iter/s)": 0.121096 }, { "epoch": 0.01647911402054699, "grad_norm": 1.3282811641693115, "learning_rate": 1.6478526015310757e-05, "loss": 0.8035645484924316, "memory(GiB)": 88.99, "step": 1270, "token_acc": 0.7674109330160286, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.016543992422202686, "grad_norm": 1.2799710035324097, "learning_rate": 1.654340210198521e-05, "loss": 0.798342227935791, "memory(GiB)": 88.99, "step": 1275, "token_acc": 0.7651376146788991, "train_speed(iter/s)": 0.121113 }, { "epoch": 0.016608870823858382, "grad_norm": 1.3710333108901978, "learning_rate": 1.6608278188659663e-05, "loss": 0.751163101196289, "memory(GiB)": 88.99, "step": 1280, "token_acc": 0.7804725286196458, "train_speed(iter/s)": 0.121087 }, { "epoch": 0.01667374922551408, "grad_norm": 1.5229240655899048, "learning_rate": 1.6673154275334113e-05, "loss": 0.8017669677734375, "memory(GiB)": 88.99, "step": 1285, "token_acc": 0.7636916835699797, "train_speed(iter/s)": 0.121113 }, { "epoch": 0.016738627627169777, "grad_norm": 1.2629148960113525, "learning_rate": 1.6738030362008563e-05, "loss": 0.8188319206237793, "memory(GiB)": 88.99, "step": 1290, "token_acc": 0.7538640893535407, "train_speed(iter/s)": 0.121114 }, { "epoch": 0.016803506028825473, "grad_norm": 1.205428957939148, "learning_rate": 1.6802906448683016e-05, "loss": 0.7835384368896484, "memory(GiB)": 88.99, "step": 1295, "token_acc": 0.7574808253213784, "train_speed(iter/s)": 0.121111 }, { "epoch": 0.01686838443048117, "grad_norm": 1.2718676328659058, "learning_rate": 1.686778253535747e-05, "loss": 0.7799610137939453, "memory(GiB)": 88.99, "step": 1300, "token_acc": 0.7688461215228171, "train_speed(iter/s)": 0.121114 }, { "epoch": 0.016933262832136867, "grad_norm": 1.2915140390396118, "learning_rate": 1.693265862203192e-05, "loss": 0.8337483406066895, "memory(GiB)": 88.99, "step": 1305, "token_acc": 0.7459247759508856, "train_speed(iter/s)": 0.121132 }, { "epoch": 0.016998141233792563, "grad_norm": 1.348522663116455, "learning_rate": 1.6997534708706373e-05, "loss": 0.7975125312805176, "memory(GiB)": 88.99, "step": 1310, "token_acc": 0.7746785138529599, "train_speed(iter/s)": 0.121148 }, { "epoch": 0.017063019635448262, "grad_norm": 1.4913171529769897, "learning_rate": 1.7062410795380822e-05, "loss": 0.7837538719177246, "memory(GiB)": 88.99, "step": 1315, "token_acc": 0.7684857801691007, "train_speed(iter/s)": 0.121167 }, { "epoch": 0.017127898037103958, "grad_norm": 1.308611273765564, "learning_rate": 1.7127286882055276e-05, "loss": 0.7372982501983643, "memory(GiB)": 88.99, "step": 1320, "token_acc": 0.784798008807199, "train_speed(iter/s)": 0.121172 }, { "epoch": 0.017192776438759654, "grad_norm": 1.4203530550003052, "learning_rate": 1.719216296872973e-05, "loss": 0.7720179557800293, "memory(GiB)": 88.99, "step": 1325, "token_acc": 0.7653901912945087, "train_speed(iter/s)": 0.121163 }, { "epoch": 0.017257654840415353, "grad_norm": 1.361214280128479, "learning_rate": 1.725703905540418e-05, "loss": 0.7861969947814942, "memory(GiB)": 88.99, "step": 1330, "token_acc": 0.7733642398061321, "train_speed(iter/s)": 0.121164 }, { "epoch": 0.01732253324207105, "grad_norm": 1.3854546546936035, "learning_rate": 1.7321915142078632e-05, "loss": 0.8030142784118652, "memory(GiB)": 88.99, "step": 1335, "token_acc": 0.7726259946949602, "train_speed(iter/s)": 0.121171 }, { "epoch": 0.017387411643726744, "grad_norm": 1.4125441312789917, "learning_rate": 1.7386791228753082e-05, "loss": 0.8090742111206055, "memory(GiB)": 88.99, "step": 1340, "token_acc": 0.7563230240549829, "train_speed(iter/s)": 0.12117 }, { "epoch": 0.017452290045382443, "grad_norm": 1.200348138809204, "learning_rate": 1.7451667315427535e-05, "loss": 0.7826071739196777, "memory(GiB)": 88.99, "step": 1345, "token_acc": 0.760561135023115, "train_speed(iter/s)": 0.121171 }, { "epoch": 0.01751716844703814, "grad_norm": 1.2351112365722656, "learning_rate": 1.7516543402101988e-05, "loss": 0.8143712043762207, "memory(GiB)": 88.99, "step": 1350, "token_acc": 0.766528565957595, "train_speed(iter/s)": 0.121184 }, { "epoch": 0.017582046848693834, "grad_norm": 1.4434330463409424, "learning_rate": 1.7581419488776438e-05, "loss": 0.7310038566589355, "memory(GiB)": 88.99, "step": 1355, "token_acc": 0.7801916534007554, "train_speed(iter/s)": 0.121182 }, { "epoch": 0.017646925250349534, "grad_norm": 1.4139667749404907, "learning_rate": 1.7646295575450888e-05, "loss": 0.7954696655273438, "memory(GiB)": 88.99, "step": 1360, "token_acc": 0.7600918157353174, "train_speed(iter/s)": 0.121189 }, { "epoch": 0.01771180365200523, "grad_norm": 1.3836840391159058, "learning_rate": 1.771117166212534e-05, "loss": 0.7958074569702148, "memory(GiB)": 88.99, "step": 1365, "token_acc": 0.7735705209656925, "train_speed(iter/s)": 0.121188 }, { "epoch": 0.017776682053660925, "grad_norm": 1.4292538166046143, "learning_rate": 1.7776047748799794e-05, "loss": 0.8389805793762207, "memory(GiB)": 88.99, "step": 1370, "token_acc": 0.7396052388858144, "train_speed(iter/s)": 0.121183 }, { "epoch": 0.017841560455316624, "grad_norm": 1.4032841920852661, "learning_rate": 1.7840923835474248e-05, "loss": 0.8107786178588867, "memory(GiB)": 88.99, "step": 1375, "token_acc": 0.7708408778156007, "train_speed(iter/s)": 0.121196 }, { "epoch": 0.01790643885697232, "grad_norm": 1.3896723985671997, "learning_rate": 1.7905799922148698e-05, "loss": 0.8191633224487305, "memory(GiB)": 88.99, "step": 1380, "token_acc": 0.7628137150936727, "train_speed(iter/s)": 0.121213 }, { "epoch": 0.017971317258628015, "grad_norm": 1.4399199485778809, "learning_rate": 1.7970676008823147e-05, "loss": 0.8483446121215821, "memory(GiB)": 88.99, "step": 1385, "token_acc": 0.7613959836225385, "train_speed(iter/s)": 0.121211 }, { "epoch": 0.018036195660283715, "grad_norm": 1.2841733694076538, "learning_rate": 1.80355520954976e-05, "loss": 0.81307373046875, "memory(GiB)": 88.99, "step": 1390, "token_acc": 0.7485281180212492, "train_speed(iter/s)": 0.121223 }, { "epoch": 0.01810107406193941, "grad_norm": 1.4121090173721313, "learning_rate": 1.8100428182172054e-05, "loss": 0.7864665985107422, "memory(GiB)": 88.99, "step": 1395, "token_acc": 0.7683898960680771, "train_speed(iter/s)": 0.121208 }, { "epoch": 0.018165952463595106, "grad_norm": 1.4357694387435913, "learning_rate": 1.8165304268846504e-05, "loss": 0.8318480491638184, "memory(GiB)": 88.99, "step": 1400, "token_acc": 0.7467476149176062, "train_speed(iter/s)": 0.121204 }, { "epoch": 0.018230830865250805, "grad_norm": 1.4196497201919556, "learning_rate": 1.8230180355520957e-05, "loss": 0.7704087257385254, "memory(GiB)": 88.99, "step": 1405, "token_acc": 0.7719472933054532, "train_speed(iter/s)": 0.121207 }, { "epoch": 0.0182957092669065, "grad_norm": 1.3428977727890015, "learning_rate": 1.8295056442195407e-05, "loss": 0.7300156593322754, "memory(GiB)": 88.99, "step": 1410, "token_acc": 0.795748500550818, "train_speed(iter/s)": 0.121207 }, { "epoch": 0.018360587668562196, "grad_norm": 1.1970882415771484, "learning_rate": 1.8359932528869857e-05, "loss": 0.8034570693969727, "memory(GiB)": 88.99, "step": 1415, "token_acc": 0.7567914896655125, "train_speed(iter/s)": 0.121193 }, { "epoch": 0.018425466070217895, "grad_norm": 1.3465750217437744, "learning_rate": 1.8424808615544313e-05, "loss": 0.7935601234436035, "memory(GiB)": 88.99, "step": 1420, "token_acc": 0.7609895644465934, "train_speed(iter/s)": 0.121218 }, { "epoch": 0.01849034447187359, "grad_norm": 1.4817829132080078, "learning_rate": 1.8489684702218763e-05, "loss": 0.8167617797851563, "memory(GiB)": 88.99, "step": 1425, "token_acc": 0.7736493700619261, "train_speed(iter/s)": 0.121232 }, { "epoch": 0.018555222873529287, "grad_norm": 1.4636294841766357, "learning_rate": 1.8554560788893216e-05, "loss": 0.8191740036010742, "memory(GiB)": 88.99, "step": 1430, "token_acc": 0.7583280604459002, "train_speed(iter/s)": 0.121246 }, { "epoch": 0.018620101275184986, "grad_norm": 1.2080475091934204, "learning_rate": 1.8619436875567666e-05, "loss": 0.766749095916748, "memory(GiB)": 88.99, "step": 1435, "token_acc": 0.7658229734967057, "train_speed(iter/s)": 0.121223 }, { "epoch": 0.01868497967684068, "grad_norm": 1.2866688966751099, "learning_rate": 1.8684312962242116e-05, "loss": 0.7813150405883789, "memory(GiB)": 88.99, "step": 1440, "token_acc": 0.7679389916232021, "train_speed(iter/s)": 0.121238 }, { "epoch": 0.018749858078496377, "grad_norm": 1.416496753692627, "learning_rate": 1.8749189048916573e-05, "loss": 0.7700918674468994, "memory(GiB)": 88.99, "step": 1445, "token_acc": 0.7834847346492726, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.018814736480152076, "grad_norm": 1.3211246728897095, "learning_rate": 1.8814065135591023e-05, "loss": 0.8015035629272461, "memory(GiB)": 88.99, "step": 1450, "token_acc": 0.7548956478921253, "train_speed(iter/s)": 0.121249 }, { "epoch": 0.018879614881807772, "grad_norm": 1.2986242771148682, "learning_rate": 1.8878941222265472e-05, "loss": 0.7897496223449707, "memory(GiB)": 88.99, "step": 1455, "token_acc": 0.7667993799957451, "train_speed(iter/s)": 0.121219 }, { "epoch": 0.018944493283463468, "grad_norm": 1.3209288120269775, "learning_rate": 1.8943817308939926e-05, "loss": 0.79403657913208, "memory(GiB)": 88.99, "step": 1460, "token_acc": 0.7776368709011197, "train_speed(iter/s)": 0.121224 }, { "epoch": 0.019009371685119167, "grad_norm": 1.2276666164398193, "learning_rate": 1.9008693395614376e-05, "loss": 0.7670764923095703, "memory(GiB)": 88.99, "step": 1465, "token_acc": 0.7623919171443956, "train_speed(iter/s)": 0.121204 }, { "epoch": 0.019074250086774863, "grad_norm": 1.241157054901123, "learning_rate": 1.9073569482288832e-05, "loss": 0.7876185417175293, "memory(GiB)": 88.99, "step": 1470, "token_acc": 0.7729533050333536, "train_speed(iter/s)": 0.121183 }, { "epoch": 0.019139128488430558, "grad_norm": 1.195490837097168, "learning_rate": 1.9138445568963282e-05, "loss": 0.7645451545715332, "memory(GiB)": 88.99, "step": 1475, "token_acc": 0.7701145875158292, "train_speed(iter/s)": 0.121186 }, { "epoch": 0.019204006890086257, "grad_norm": 1.2759939432144165, "learning_rate": 1.9203321655637732e-05, "loss": 0.7571821212768555, "memory(GiB)": 88.99, "step": 1480, "token_acc": 0.760476107124103, "train_speed(iter/s)": 0.121203 }, { "epoch": 0.019268885291741953, "grad_norm": 1.195304274559021, "learning_rate": 1.9268197742312185e-05, "loss": 0.7685189247131348, "memory(GiB)": 88.99, "step": 1485, "token_acc": 0.7721407716698584, "train_speed(iter/s)": 0.121176 }, { "epoch": 0.01933376369339765, "grad_norm": 1.1754822731018066, "learning_rate": 1.9333073828986635e-05, "loss": 0.7519008636474609, "memory(GiB)": 88.99, "step": 1490, "token_acc": 0.7673869754406667, "train_speed(iter/s)": 0.121183 }, { "epoch": 0.019398642095053348, "grad_norm": 1.460143804550171, "learning_rate": 1.9397949915661088e-05, "loss": 0.8049972534179688, "memory(GiB)": 88.99, "step": 1495, "token_acc": 0.7596064521577569, "train_speed(iter/s)": 0.121196 }, { "epoch": 0.019463520496709043, "grad_norm": 1.3253891468048096, "learning_rate": 1.946282600233554e-05, "loss": 0.7561993598937988, "memory(GiB)": 88.99, "step": 1500, "token_acc": 0.7724215246636771, "train_speed(iter/s)": 0.121181 }, { "epoch": 0.01952839889836474, "grad_norm": 1.3591121435165405, "learning_rate": 1.952770208900999e-05, "loss": 0.807155990600586, "memory(GiB)": 88.99, "step": 1505, "token_acc": 0.7617165653778756, "train_speed(iter/s)": 0.121183 }, { "epoch": 0.01959327730002044, "grad_norm": 1.2553672790527344, "learning_rate": 1.959257817568444e-05, "loss": 0.8066631317138672, "memory(GiB)": 88.99, "step": 1510, "token_acc": 0.7553732939994164, "train_speed(iter/s)": 0.12119 }, { "epoch": 0.019658155701676134, "grad_norm": 1.2202832698822021, "learning_rate": 1.9657454262358894e-05, "loss": 0.8169960975646973, "memory(GiB)": 88.99, "step": 1515, "token_acc": 0.7599446558284331, "train_speed(iter/s)": 0.1212 }, { "epoch": 0.01972303410333183, "grad_norm": 1.2183386087417603, "learning_rate": 1.9722330349033348e-05, "loss": 0.7852190017700196, "memory(GiB)": 88.99, "step": 1520, "token_acc": 0.7749095491686654, "train_speed(iter/s)": 0.121207 }, { "epoch": 0.01978791250498753, "grad_norm": 1.5267528295516968, "learning_rate": 1.97872064357078e-05, "loss": 0.8165067672729492, "memory(GiB)": 88.99, "step": 1525, "token_acc": 0.7718067152247291, "train_speed(iter/s)": 0.121214 }, { "epoch": 0.019852790906643224, "grad_norm": 1.4094427824020386, "learning_rate": 1.985208252238225e-05, "loss": 0.7776802062988282, "memory(GiB)": 88.99, "step": 1530, "token_acc": 0.7504900411002213, "train_speed(iter/s)": 0.121215 }, { "epoch": 0.01991766930829892, "grad_norm": 1.209794282913208, "learning_rate": 1.99169586090567e-05, "loss": 0.7729447841644287, "memory(GiB)": 88.99, "step": 1535, "token_acc": 0.7608066444346567, "train_speed(iter/s)": 0.12121 }, { "epoch": 0.01998254770995462, "grad_norm": 1.1936936378479004, "learning_rate": 1.9981834695731154e-05, "loss": 0.7976233482360839, "memory(GiB)": 88.99, "step": 1540, "token_acc": 0.7712080662226924, "train_speed(iter/s)": 0.1212 }, { "epoch": 0.020047426111610315, "grad_norm": 1.339784026145935, "learning_rate": 2.0046710782405607e-05, "loss": 0.8319385528564454, "memory(GiB)": 88.99, "step": 1545, "token_acc": 0.7760439472950917, "train_speed(iter/s)": 0.121202 }, { "epoch": 0.02011230451326601, "grad_norm": 1.4388386011123657, "learning_rate": 2.0111586869080057e-05, "loss": 0.7950892925262452, "memory(GiB)": 88.99, "step": 1550, "token_acc": 0.7727386351462343, "train_speed(iter/s)": 0.121217 }, { "epoch": 0.02017718291492171, "grad_norm": 1.3081508874893188, "learning_rate": 2.017646295575451e-05, "loss": 0.8242027282714843, "memory(GiB)": 88.99, "step": 1555, "token_acc": 0.760724794796345, "train_speed(iter/s)": 0.121208 }, { "epoch": 0.020242061316577405, "grad_norm": 1.3358588218688965, "learning_rate": 2.024133904242896e-05, "loss": 0.7991954326629639, "memory(GiB)": 88.99, "step": 1560, "token_acc": 0.756513891068906, "train_speed(iter/s)": 0.121202 }, { "epoch": 0.0203069397182331, "grad_norm": 1.301878809928894, "learning_rate": 2.0306215129103413e-05, "loss": 0.7962523460388183, "memory(GiB)": 88.99, "step": 1565, "token_acc": 0.7592756967867921, "train_speed(iter/s)": 0.121174 }, { "epoch": 0.0203718181198888, "grad_norm": 1.2973817586898804, "learning_rate": 2.0371091215777866e-05, "loss": 0.8134052276611328, "memory(GiB)": 88.99, "step": 1570, "token_acc": 0.7689188236123845, "train_speed(iter/s)": 0.12117 }, { "epoch": 0.020436696521544496, "grad_norm": 1.1837217807769775, "learning_rate": 2.0435967302452316e-05, "loss": 0.7994165420532227, "memory(GiB)": 88.99, "step": 1575, "token_acc": 0.7662086657061531, "train_speed(iter/s)": 0.121178 }, { "epoch": 0.02050157492320019, "grad_norm": 1.2940919399261475, "learning_rate": 2.050084338912677e-05, "loss": 0.8174277305603027, "memory(GiB)": 88.99, "step": 1580, "token_acc": 0.7611950741673663, "train_speed(iter/s)": 0.12118 }, { "epoch": 0.020566453324855887, "grad_norm": 1.3489259481430054, "learning_rate": 2.056571947580122e-05, "loss": 0.7809474468231201, "memory(GiB)": 88.99, "step": 1585, "token_acc": 0.758794126835364, "train_speed(iter/s)": 0.121133 }, { "epoch": 0.020631331726511586, "grad_norm": 1.1679209470748901, "learning_rate": 2.0630595562475673e-05, "loss": 0.7601633071899414, "memory(GiB)": 88.99, "step": 1590, "token_acc": 0.7866461051139916, "train_speed(iter/s)": 0.121152 }, { "epoch": 0.020696210128167282, "grad_norm": 1.2842274904251099, "learning_rate": 2.0695471649150126e-05, "loss": 0.7820738315582275, "memory(GiB)": 88.99, "step": 1595, "token_acc": 0.7684490359053741, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.020761088529822978, "grad_norm": 1.170737385749817, "learning_rate": 2.0760347735824576e-05, "loss": 0.8302230834960938, "memory(GiB)": 88.99, "step": 1600, "token_acc": 0.7587826738522724, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.020825966931478677, "grad_norm": 1.264345645904541, "learning_rate": 2.0825223822499026e-05, "loss": 0.8060140609741211, "memory(GiB)": 88.99, "step": 1605, "token_acc": 0.7601739560866951, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.020890845333134372, "grad_norm": 1.2071572542190552, "learning_rate": 2.089009990917348e-05, "loss": 0.792857551574707, "memory(GiB)": 88.99, "step": 1610, "token_acc": 0.7655487568739834, "train_speed(iter/s)": 0.12115 }, { "epoch": 0.020955723734790068, "grad_norm": 1.5548063516616821, "learning_rate": 2.0954975995847932e-05, "loss": 0.8017389297485351, "memory(GiB)": 88.99, "step": 1615, "token_acc": 0.7832936563506512, "train_speed(iter/s)": 0.121167 }, { "epoch": 0.021020602136445767, "grad_norm": 1.1260864734649658, "learning_rate": 2.1019852082522385e-05, "loss": 0.7740314483642579, "memory(GiB)": 88.99, "step": 1620, "token_acc": 0.7603969227338611, "train_speed(iter/s)": 0.121157 }, { "epoch": 0.021085480538101463, "grad_norm": 1.2657644748687744, "learning_rate": 2.1084728169196835e-05, "loss": 0.7357797622680664, "memory(GiB)": 88.99, "step": 1625, "token_acc": 0.7791249139447717, "train_speed(iter/s)": 0.121143 }, { "epoch": 0.02115035893975716, "grad_norm": 1.2750028371810913, "learning_rate": 2.1149604255871285e-05, "loss": 0.7853628635406494, "memory(GiB)": 88.99, "step": 1630, "token_acc": 0.7744446246148857, "train_speed(iter/s)": 0.121131 }, { "epoch": 0.021215237341412858, "grad_norm": 1.2528550624847412, "learning_rate": 2.1214480342545738e-05, "loss": 0.7995200634002686, "memory(GiB)": 88.99, "step": 1635, "token_acc": 0.7640860857868779, "train_speed(iter/s)": 0.121135 }, { "epoch": 0.021280115743068553, "grad_norm": 1.3026726245880127, "learning_rate": 2.127935642922019e-05, "loss": 0.7838740825653077, "memory(GiB)": 88.99, "step": 1640, "token_acc": 0.7798239797456146, "train_speed(iter/s)": 0.12113 }, { "epoch": 0.02134499414472425, "grad_norm": 1.1939988136291504, "learning_rate": 2.134423251589464e-05, "loss": 0.7928782939910889, "memory(GiB)": 88.99, "step": 1645, "token_acc": 0.7607081239216863, "train_speed(iter/s)": 0.121137 }, { "epoch": 0.021409872546379948, "grad_norm": 1.332716703414917, "learning_rate": 2.1409108602569095e-05, "loss": 0.7810121536254883, "memory(GiB)": 88.99, "step": 1650, "token_acc": 0.7677222175057306, "train_speed(iter/s)": 0.121139 }, { "epoch": 0.021474750948035644, "grad_norm": 1.2295795679092407, "learning_rate": 2.1473984689243544e-05, "loss": 0.803156852722168, "memory(GiB)": 88.99, "step": 1655, "token_acc": 0.7639601039544848, "train_speed(iter/s)": 0.121139 }, { "epoch": 0.02153962934969134, "grad_norm": 1.2823387384414673, "learning_rate": 2.1538860775917998e-05, "loss": 0.7698566913604736, "memory(GiB)": 88.99, "step": 1660, "token_acc": 0.7838070135228055, "train_speed(iter/s)": 0.121123 }, { "epoch": 0.02160450775134704, "grad_norm": 1.1708701848983765, "learning_rate": 2.160373686259245e-05, "loss": 0.7897217750549317, "memory(GiB)": 88.99, "step": 1665, "token_acc": 0.7612582286056254, "train_speed(iter/s)": 0.121132 }, { "epoch": 0.021669386153002734, "grad_norm": 1.1997865438461304, "learning_rate": 2.16686129492669e-05, "loss": 0.7482931613922119, "memory(GiB)": 88.99, "step": 1670, "token_acc": 0.7640257961038527, "train_speed(iter/s)": 0.121121 }, { "epoch": 0.02173426455465843, "grad_norm": 1.2309596538543701, "learning_rate": 2.1733489035941354e-05, "loss": 0.751674222946167, "memory(GiB)": 88.99, "step": 1675, "token_acc": 0.7747003563330094, "train_speed(iter/s)": 0.121123 }, { "epoch": 0.02179914295631413, "grad_norm": 1.2904269695281982, "learning_rate": 2.1798365122615804e-05, "loss": 0.8076250076293945, "memory(GiB)": 88.99, "step": 1680, "token_acc": 0.7782217446779934, "train_speed(iter/s)": 0.121133 }, { "epoch": 0.021864021357969825, "grad_norm": 1.3190960884094238, "learning_rate": 2.1863241209290257e-05, "loss": 0.76505446434021, "memory(GiB)": 88.99, "step": 1685, "token_acc": 0.7525690598111959, "train_speed(iter/s)": 0.12111 }, { "epoch": 0.02192889975962552, "grad_norm": 1.4495655298233032, "learning_rate": 2.192811729596471e-05, "loss": 0.7728255271911622, "memory(GiB)": 88.99, "step": 1690, "token_acc": 0.7592673735868899, "train_speed(iter/s)": 0.121116 }, { "epoch": 0.02199377816128122, "grad_norm": 1.373868465423584, "learning_rate": 2.199299338263916e-05, "loss": 0.8000710487365723, "memory(GiB)": 88.99, "step": 1695, "token_acc": 0.7494683935820607, "train_speed(iter/s)": 0.121128 }, { "epoch": 0.022058656562936915, "grad_norm": 1.1556038856506348, "learning_rate": 2.2057869469313613e-05, "loss": 0.7780329704284668, "memory(GiB)": 88.99, "step": 1700, "token_acc": 0.7631946098615033, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.02212353496459261, "grad_norm": 1.2478408813476562, "learning_rate": 2.2122745555988063e-05, "loss": 0.7938796997070312, "memory(GiB)": 88.99, "step": 1705, "token_acc": 0.7739610445327918, "train_speed(iter/s)": 0.121123 }, { "epoch": 0.02218841336624831, "grad_norm": 1.194457769393921, "learning_rate": 2.2187621642662516e-05, "loss": 0.7949272632598877, "memory(GiB)": 88.99, "step": 1710, "token_acc": 0.7765385120062984, "train_speed(iter/s)": 0.121118 }, { "epoch": 0.022253291767904006, "grad_norm": 1.3416295051574707, "learning_rate": 2.225249772933697e-05, "loss": 0.8049333572387696, "memory(GiB)": 88.99, "step": 1715, "token_acc": 0.7636369494474337, "train_speed(iter/s)": 0.121136 }, { "epoch": 0.0223181701695597, "grad_norm": 1.361832857131958, "learning_rate": 2.231737381601142e-05, "loss": 0.7911515712738038, "memory(GiB)": 88.99, "step": 1720, "token_acc": 0.7726236021188935, "train_speed(iter/s)": 0.121126 }, { "epoch": 0.0223830485712154, "grad_norm": 1.282758116722107, "learning_rate": 2.238224990268587e-05, "loss": 0.8182442665100098, "memory(GiB)": 88.99, "step": 1725, "token_acc": 0.7497448979591836, "train_speed(iter/s)": 0.12114 }, { "epoch": 0.022447926972871096, "grad_norm": 1.3666630983352661, "learning_rate": 2.2447125989360323e-05, "loss": 0.8312170028686523, "memory(GiB)": 88.99, "step": 1730, "token_acc": 0.7432776221299069, "train_speed(iter/s)": 0.12115 }, { "epoch": 0.022512805374526792, "grad_norm": 1.2202833890914917, "learning_rate": 2.2512002076034776e-05, "loss": 0.841131591796875, "memory(GiB)": 88.99, "step": 1735, "token_acc": 0.764978072447253, "train_speed(iter/s)": 0.121138 }, { "epoch": 0.02257768377618249, "grad_norm": 1.2845572233200073, "learning_rate": 2.2576878162709226e-05, "loss": 0.7819697380065918, "memory(GiB)": 88.99, "step": 1740, "token_acc": 0.7762620317152552, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.022642562177838187, "grad_norm": 1.2311683893203735, "learning_rate": 2.264175424938368e-05, "loss": 0.793233060836792, "memory(GiB)": 88.99, "step": 1745, "token_acc": 0.7636245322921751, "train_speed(iter/s)": 0.12114 }, { "epoch": 0.022707440579493882, "grad_norm": 1.2438453435897827, "learning_rate": 2.270663033605813e-05, "loss": 0.7657630920410157, "memory(GiB)": 88.99, "step": 1750, "token_acc": 0.7770156513063541, "train_speed(iter/s)": 0.121116 }, { "epoch": 0.02277231898114958, "grad_norm": 1.197200894355774, "learning_rate": 2.2771506422732582e-05, "loss": 0.7540364265441895, "memory(GiB)": 88.99, "step": 1755, "token_acc": 0.7701304847542424, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.022837197382805277, "grad_norm": 1.3811007738113403, "learning_rate": 2.2836382509407035e-05, "loss": 0.8142763137817383, "memory(GiB)": 88.99, "step": 1760, "token_acc": 0.7523124729007082, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.022902075784460973, "grad_norm": 1.242879867553711, "learning_rate": 2.2901258596081485e-05, "loss": 0.80398530960083, "memory(GiB)": 88.99, "step": 1765, "token_acc": 0.7690513502678669, "train_speed(iter/s)": 0.121124 }, { "epoch": 0.022966954186116672, "grad_norm": 1.2110538482666016, "learning_rate": 2.296613468275594e-05, "loss": 0.7987292766571045, "memory(GiB)": 88.99, "step": 1770, "token_acc": 0.7912062152568482, "train_speed(iter/s)": 0.121112 }, { "epoch": 0.023031832587772368, "grad_norm": 1.2611956596374512, "learning_rate": 2.3031010769430388e-05, "loss": 0.7918318271636963, "memory(GiB)": 88.99, "step": 1775, "token_acc": 0.7541551246537396, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.023096710989428063, "grad_norm": 1.4080229997634888, "learning_rate": 2.3095886856104838e-05, "loss": 0.8536518096923829, "memory(GiB)": 88.99, "step": 1780, "token_acc": 0.7440029857835986, "train_speed(iter/s)": 0.12113 }, { "epoch": 0.023161589391083762, "grad_norm": 1.7417330741882324, "learning_rate": 2.3160762942779295e-05, "loss": 0.8064004898071289, "memory(GiB)": 88.99, "step": 1785, "token_acc": 0.7516243908534299, "train_speed(iter/s)": 0.12112 }, { "epoch": 0.023226467792739458, "grad_norm": 1.3726067543029785, "learning_rate": 2.3225639029453745e-05, "loss": 0.8512574195861816, "memory(GiB)": 88.99, "step": 1790, "token_acc": 0.7666996680449595, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.023291346194395154, "grad_norm": 1.1748508214950562, "learning_rate": 2.3290515116128198e-05, "loss": 0.7980769157409668, "memory(GiB)": 88.99, "step": 1795, "token_acc": 0.7714575926866429, "train_speed(iter/s)": 0.121119 }, { "epoch": 0.023356224596050853, "grad_norm": 1.2650443315505981, "learning_rate": 2.3355391202802648e-05, "loss": 0.7959426879882813, "memory(GiB)": 88.99, "step": 1800, "token_acc": 0.7943757341256132, "train_speed(iter/s)": 0.121113 }, { "epoch": 0.02342110299770655, "grad_norm": 1.293664813041687, "learning_rate": 2.3420267289477098e-05, "loss": 0.8036101341247559, "memory(GiB)": 88.99, "step": 1805, "token_acc": 0.7624383857054837, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.023485981399362244, "grad_norm": 1.2595906257629395, "learning_rate": 2.3485143376151554e-05, "loss": 0.7963203430175781, "memory(GiB)": 88.99, "step": 1810, "token_acc": 0.7691145467322558, "train_speed(iter/s)": 0.121121 }, { "epoch": 0.023550859801017943, "grad_norm": 1.2592605352401733, "learning_rate": 2.3550019462826004e-05, "loss": 0.8024679183959961, "memory(GiB)": 88.99, "step": 1815, "token_acc": 0.7634731707317073, "train_speed(iter/s)": 0.12111 }, { "epoch": 0.02361573820267364, "grad_norm": 1.1926612854003906, "learning_rate": 2.3614895549500454e-05, "loss": 0.794999074935913, "memory(GiB)": 88.99, "step": 1820, "token_acc": 0.7614711729622267, "train_speed(iter/s)": 0.12111 }, { "epoch": 0.023680616604329335, "grad_norm": 1.3002806901931763, "learning_rate": 2.3679771636174907e-05, "loss": 0.7845052719116211, "memory(GiB)": 88.99, "step": 1825, "token_acc": 0.7647629833360298, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.023745495005985034, "grad_norm": 1.4474124908447266, "learning_rate": 2.3744647722849357e-05, "loss": 0.8277694702148437, "memory(GiB)": 88.99, "step": 1830, "token_acc": 0.7546965047526383, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.02381037340764073, "grad_norm": 1.1825186014175415, "learning_rate": 2.380952380952381e-05, "loss": 0.811428451538086, "memory(GiB)": 88.99, "step": 1835, "token_acc": 0.7762847998067516, "train_speed(iter/s)": 0.121095 }, { "epoch": 0.023875251809296425, "grad_norm": 1.2852789163589478, "learning_rate": 2.3874399896198263e-05, "loss": 0.8105386734008789, "memory(GiB)": 88.99, "step": 1840, "token_acc": 0.74784693523455, "train_speed(iter/s)": 0.121092 }, { "epoch": 0.023940130210952124, "grad_norm": 1.3069127798080444, "learning_rate": 2.3939275982872713e-05, "loss": 0.8131989479064942, "memory(GiB)": 88.99, "step": 1845, "token_acc": 0.757750717335013, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.02400500861260782, "grad_norm": 1.326335072517395, "learning_rate": 2.4004152069547167e-05, "loss": 0.8079988479614257, "memory(GiB)": 88.99, "step": 1850, "token_acc": 0.7513145451106488, "train_speed(iter/s)": 0.121113 }, { "epoch": 0.024069887014263516, "grad_norm": 1.0950180292129517, "learning_rate": 2.4069028156221616e-05, "loss": 0.7578811645507812, "memory(GiB)": 88.99, "step": 1855, "token_acc": 0.7732252012346427, "train_speed(iter/s)": 0.121091 }, { "epoch": 0.024134765415919215, "grad_norm": 1.2260973453521729, "learning_rate": 2.413390424289607e-05, "loss": 0.7865475654602051, "memory(GiB)": 88.99, "step": 1860, "token_acc": 0.7849212584975916, "train_speed(iter/s)": 0.121087 }, { "epoch": 0.02419964381757491, "grad_norm": 1.3556305170059204, "learning_rate": 2.4198780329570523e-05, "loss": 0.8292794227600098, "memory(GiB)": 88.99, "step": 1865, "token_acc": 0.7519861609431061, "train_speed(iter/s)": 0.121092 }, { "epoch": 0.024264522219230606, "grad_norm": 1.2087979316711426, "learning_rate": 2.4263656416244973e-05, "loss": 0.7929844856262207, "memory(GiB)": 88.99, "step": 1870, "token_acc": 0.7782005141388175, "train_speed(iter/s)": 0.121102 }, { "epoch": 0.024329400620886305, "grad_norm": 1.334384799003601, "learning_rate": 2.4328532502919423e-05, "loss": 0.8015927314758301, "memory(GiB)": 88.99, "step": 1875, "token_acc": 0.754624184753555, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.024394279022542, "grad_norm": 1.2659754753112793, "learning_rate": 2.4393408589593876e-05, "loss": 0.8162626266479492, "memory(GiB)": 88.99, "step": 1880, "token_acc": 0.7654198577680525, "train_speed(iter/s)": 0.121094 }, { "epoch": 0.024459157424197697, "grad_norm": 1.4270050525665283, "learning_rate": 2.445828467626833e-05, "loss": 0.82205810546875, "memory(GiB)": 88.99, "step": 1885, "token_acc": 0.7655429071803853, "train_speed(iter/s)": 0.121108 }, { "epoch": 0.024524035825853396, "grad_norm": 1.173128366470337, "learning_rate": 2.4523160762942782e-05, "loss": 0.8034621238708496, "memory(GiB)": 88.99, "step": 1890, "token_acc": 0.7628803245436105, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.02458891422750909, "grad_norm": 1.2840015888214111, "learning_rate": 2.4588036849617232e-05, "loss": 0.8182846069335937, "memory(GiB)": 88.99, "step": 1895, "token_acc": 0.7836689264393403, "train_speed(iter/s)": 0.1211 }, { "epoch": 0.024653792629164787, "grad_norm": 1.1736111640930176, "learning_rate": 2.4652912936291682e-05, "loss": 0.8144903182983398, "memory(GiB)": 88.99, "step": 1900, "token_acc": 0.7576257780528904, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.024718671030820486, "grad_norm": 1.3881394863128662, "learning_rate": 2.4717789022966135e-05, "loss": 0.820223045349121, "memory(GiB)": 88.99, "step": 1905, "token_acc": 0.7601698298073085, "train_speed(iter/s)": 0.121107 }, { "epoch": 0.024783549432476182, "grad_norm": 1.3060158491134644, "learning_rate": 2.478266510964059e-05, "loss": 0.8184785842895508, "memory(GiB)": 88.99, "step": 1910, "token_acc": 0.754880694143167, "train_speed(iter/s)": 0.121109 }, { "epoch": 0.024848427834131877, "grad_norm": 1.1845195293426514, "learning_rate": 2.4847541196315038e-05, "loss": 0.7805597305297851, "memory(GiB)": 88.99, "step": 1915, "token_acc": 0.7865452503209243, "train_speed(iter/s)": 0.121099 }, { "epoch": 0.024913306235787577, "grad_norm": 1.2502285242080688, "learning_rate": 2.491241728298949e-05, "loss": 0.803799057006836, "memory(GiB)": 88.99, "step": 1920, "token_acc": 0.7645584646837698, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.024978184637443272, "grad_norm": 1.4374942779541016, "learning_rate": 2.497729336966394e-05, "loss": 0.8096616744995118, "memory(GiB)": 88.99, "step": 1925, "token_acc": 0.7604275700193485, "train_speed(iter/s)": 0.121095 }, { "epoch": 0.025043063039098968, "grad_norm": 1.0919809341430664, "learning_rate": 2.5042169456338395e-05, "loss": 0.8197728157043457, "memory(GiB)": 88.99, "step": 1930, "token_acc": 0.7658335200179217, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.025107941440754667, "grad_norm": 1.2625163793563843, "learning_rate": 2.5107045543012848e-05, "loss": 0.8418947219848633, "memory(GiB)": 88.99, "step": 1935, "token_acc": 0.7561249561249561, "train_speed(iter/s)": 0.121089 }, { "epoch": 0.025172819842410363, "grad_norm": 1.192050576210022, "learning_rate": 2.51719216296873e-05, "loss": 0.8175794601440429, "memory(GiB)": 88.99, "step": 1940, "token_acc": 0.7649950135346916, "train_speed(iter/s)": 0.121089 }, { "epoch": 0.02523769824406606, "grad_norm": 1.2323068380355835, "learning_rate": 2.5236797716361748e-05, "loss": 0.7943767070770263, "memory(GiB)": 88.99, "step": 1945, "token_acc": 0.7649799142558147, "train_speed(iter/s)": 0.121078 }, { "epoch": 0.025302576645721758, "grad_norm": 1.4256775379180908, "learning_rate": 2.53016738030362e-05, "loss": 0.8564391136169434, "memory(GiB)": 88.99, "step": 1950, "token_acc": 0.7557135978071637, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.025367455047377453, "grad_norm": 1.1356807947158813, "learning_rate": 2.5366549889710654e-05, "loss": 0.8046245574951172, "memory(GiB)": 88.99, "step": 1955, "token_acc": 0.7726152287692047, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.02543233344903315, "grad_norm": 1.4085049629211426, "learning_rate": 2.5431425976385104e-05, "loss": 0.8171615600585938, "memory(GiB)": 88.99, "step": 1960, "token_acc": 0.7836399986445732, "train_speed(iter/s)": 0.121108 }, { "epoch": 0.025497211850688848, "grad_norm": 1.256885290145874, "learning_rate": 2.5496302063059557e-05, "loss": 0.8159847259521484, "memory(GiB)": 88.99, "step": 1965, "token_acc": 0.7829807093963908, "train_speed(iter/s)": 0.121105 }, { "epoch": 0.025562090252344544, "grad_norm": 1.2502089738845825, "learning_rate": 2.556117814973401e-05, "loss": 0.828580665588379, "memory(GiB)": 88.99, "step": 1970, "token_acc": 0.7383414781982801, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.02562696865400024, "grad_norm": 1.1520627737045288, "learning_rate": 2.562605423640846e-05, "loss": 0.7858683109283447, "memory(GiB)": 88.99, "step": 1975, "token_acc": 0.7731062495546213, "train_speed(iter/s)": 0.121089 }, { "epoch": 0.02569184705565594, "grad_norm": 1.2281907796859741, "learning_rate": 2.5690930323082913e-05, "loss": 0.7970191955566406, "memory(GiB)": 88.99, "step": 1980, "token_acc": 0.7720403022670025, "train_speed(iter/s)": 0.121095 }, { "epoch": 0.025756725457311634, "grad_norm": 1.174708366394043, "learning_rate": 2.5755806409757367e-05, "loss": 0.8239808082580566, "memory(GiB)": 88.99, "step": 1985, "token_acc": 0.7631354542472942, "train_speed(iter/s)": 0.121081 }, { "epoch": 0.02582160385896733, "grad_norm": 1.251980185508728, "learning_rate": 2.5820682496431813e-05, "loss": 0.7959480285644531, "memory(GiB)": 88.99, "step": 1990, "token_acc": 0.7713201392591663, "train_speed(iter/s)": 0.121081 }, { "epoch": 0.02588648226062303, "grad_norm": 1.2579920291900635, "learning_rate": 2.5885558583106266e-05, "loss": 0.8236524581909179, "memory(GiB)": 88.99, "step": 1995, "token_acc": 0.7644229105381554, "train_speed(iter/s)": 0.12108 }, { "epoch": 0.025951360662278725, "grad_norm": 1.3382580280303955, "learning_rate": 2.595043466978072e-05, "loss": 0.7840917587280274, "memory(GiB)": 88.99, "step": 2000, "token_acc": 0.7683293621342916, "train_speed(iter/s)": 0.121088 }, { "epoch": 0.02601623906393442, "grad_norm": 1.12234365940094, "learning_rate": 2.6015310756455176e-05, "loss": 0.81167631149292, "memory(GiB)": 88.99, "step": 2005, "token_acc": 0.7600130723700933, "train_speed(iter/s)": 0.121094 }, { "epoch": 0.02608111746559012, "grad_norm": 1.2292848825454712, "learning_rate": 2.6080186843129623e-05, "loss": 0.7967352867126465, "memory(GiB)": 88.99, "step": 2010, "token_acc": 0.7717461157524667, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.026145995867245815, "grad_norm": 1.2416126728057861, "learning_rate": 2.6145062929804076e-05, "loss": 0.8538772583007812, "memory(GiB)": 88.99, "step": 2015, "token_acc": 0.7683937974573066, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.02621087426890151, "grad_norm": 1.1926004886627197, "learning_rate": 2.620993901647853e-05, "loss": 0.823642349243164, "memory(GiB)": 88.99, "step": 2020, "token_acc": 0.7481950788271696, "train_speed(iter/s)": 0.121092 }, { "epoch": 0.026275752670557206, "grad_norm": 1.2453880310058594, "learning_rate": 2.627481510315298e-05, "loss": 0.7933976173400878, "memory(GiB)": 88.99, "step": 2025, "token_acc": 0.775756076879593, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.026340631072212906, "grad_norm": 1.0735855102539062, "learning_rate": 2.6339691189827432e-05, "loss": 0.7677554130554199, "memory(GiB)": 88.99, "step": 2030, "token_acc": 0.7664680407671518, "train_speed(iter/s)": 0.121093 }, { "epoch": 0.0264055094738686, "grad_norm": 1.2413488626480103, "learning_rate": 2.6404567276501886e-05, "loss": 0.7899690628051758, "memory(GiB)": 88.99, "step": 2035, "token_acc": 0.7755978381579921, "train_speed(iter/s)": 0.121109 }, { "epoch": 0.026470387875524297, "grad_norm": 1.1678577661514282, "learning_rate": 2.6469443363176332e-05, "loss": 0.8443717956542969, "memory(GiB)": 88.99, "step": 2040, "token_acc": 0.7408229276365278, "train_speed(iter/s)": 0.121111 }, { "epoch": 0.026535266277179996, "grad_norm": 1.3856693506240845, "learning_rate": 2.6534319449850785e-05, "loss": 0.8481722831726074, "memory(GiB)": 88.99, "step": 2045, "token_acc": 0.7485904664274731, "train_speed(iter/s)": 0.121114 }, { "epoch": 0.02660014467883569, "grad_norm": 1.083650827407837, "learning_rate": 2.659919553652524e-05, "loss": 0.810784912109375, "memory(GiB)": 88.99, "step": 2050, "token_acc": 0.7655007677870671, "train_speed(iter/s)": 0.121112 }, { "epoch": 0.026665023080491387, "grad_norm": 1.2705154418945312, "learning_rate": 2.666407162319969e-05, "loss": 0.81783447265625, "memory(GiB)": 88.99, "step": 2055, "token_acc": 0.746022170273735, "train_speed(iter/s)": 0.121116 }, { "epoch": 0.026729901482147086, "grad_norm": 1.2476625442504883, "learning_rate": 2.672894770987414e-05, "loss": 0.8214866638183593, "memory(GiB)": 88.99, "step": 2060, "token_acc": 0.7550984774409834, "train_speed(iter/s)": 0.121112 }, { "epoch": 0.026794779883802782, "grad_norm": 1.3501343727111816, "learning_rate": 2.6793823796548595e-05, "loss": 0.7888534069061279, "memory(GiB)": 88.99, "step": 2065, "token_acc": 0.7815676530381749, "train_speed(iter/s)": 0.121108 }, { "epoch": 0.026859658285458478, "grad_norm": 1.27423894405365, "learning_rate": 2.6858699883223045e-05, "loss": 0.8081683158874512, "memory(GiB)": 88.99, "step": 2070, "token_acc": 0.7669765897393326, "train_speed(iter/s)": 0.121122 }, { "epoch": 0.026924536687114177, "grad_norm": 1.2231727838516235, "learning_rate": 2.6923575969897498e-05, "loss": 0.7873109817504883, "memory(GiB)": 88.99, "step": 2075, "token_acc": 0.7899874751344581, "train_speed(iter/s)": 0.12111 }, { "epoch": 0.026989415088769873, "grad_norm": 1.1377887725830078, "learning_rate": 2.698845205657195e-05, "loss": 0.823614501953125, "memory(GiB)": 88.99, "step": 2080, "token_acc": 0.7554426991063344, "train_speed(iter/s)": 0.121099 }, { "epoch": 0.02705429349042557, "grad_norm": 1.3921066522598267, "learning_rate": 2.7053328143246398e-05, "loss": 0.8348123550415039, "memory(GiB)": 88.99, "step": 2085, "token_acc": 0.7545530667783128, "train_speed(iter/s)": 0.121098 }, { "epoch": 0.027119171892081267, "grad_norm": 1.215801477432251, "learning_rate": 2.711820422992085e-05, "loss": 0.7738768577575683, "memory(GiB)": 88.99, "step": 2090, "token_acc": 0.7722833477562121, "train_speed(iter/s)": 0.121104 }, { "epoch": 0.027184050293736963, "grad_norm": 1.1372390985488892, "learning_rate": 2.7183080316595304e-05, "loss": 0.8408920288085937, "memory(GiB)": 88.99, "step": 2095, "token_acc": 0.744872961433471, "train_speed(iter/s)": 0.121107 }, { "epoch": 0.02724892869539266, "grad_norm": 1.172135353088379, "learning_rate": 2.7247956403269757e-05, "loss": 0.8199501991271972, "memory(GiB)": 88.99, "step": 2100, "token_acc": 0.7599595692028859, "train_speed(iter/s)": 0.1211 }, { "epoch": 0.027313807097048358, "grad_norm": 1.2689186334609985, "learning_rate": 2.7312832489944207e-05, "loss": 0.8360169410705567, "memory(GiB)": 88.99, "step": 2105, "token_acc": 0.7829869326247588, "train_speed(iter/s)": 0.121097 }, { "epoch": 0.027378685498704054, "grad_norm": 1.2405879497528076, "learning_rate": 2.737770857661866e-05, "loss": 0.8134943008422851, "memory(GiB)": 88.99, "step": 2110, "token_acc": 0.7692070348657821, "train_speed(iter/s)": 0.121101 }, { "epoch": 0.02744356390035975, "grad_norm": 1.2493927478790283, "learning_rate": 2.7442584663293114e-05, "loss": 0.7673440933227539, "memory(GiB)": 88.99, "step": 2115, "token_acc": 0.7922365706426224, "train_speed(iter/s)": 0.121107 }, { "epoch": 0.02750844230201545, "grad_norm": 1.209351658821106, "learning_rate": 2.7507460749967563e-05, "loss": 0.8269542694091797, "memory(GiB)": 88.99, "step": 2120, "token_acc": 0.7596244789294847, "train_speed(iter/s)": 0.121117 }, { "epoch": 0.027573320703671144, "grad_norm": 1.129157543182373, "learning_rate": 2.7572336836642017e-05, "loss": 0.8172142982482911, "memory(GiB)": 88.99, "step": 2125, "token_acc": 0.7660267333106138, "train_speed(iter/s)": 0.12112 }, { "epoch": 0.02763819910532684, "grad_norm": 1.294763445854187, "learning_rate": 2.763721292331647e-05, "loss": 0.8029541969299316, "memory(GiB)": 88.99, "step": 2130, "token_acc": 0.7527324519305199, "train_speed(iter/s)": 0.121108 }, { "epoch": 0.02770307750698254, "grad_norm": 1.2286314964294434, "learning_rate": 2.7702089009990916e-05, "loss": 0.7850248336791992, "memory(GiB)": 88.99, "step": 2135, "token_acc": 0.7709115739160123, "train_speed(iter/s)": 0.121109 }, { "epoch": 0.027767955908638235, "grad_norm": 1.2159881591796875, "learning_rate": 2.776696509666537e-05, "loss": 0.8415531158447266, "memory(GiB)": 88.99, "step": 2140, "token_acc": 0.7461693236341124, "train_speed(iter/s)": 0.121109 }, { "epoch": 0.02783283431029393, "grad_norm": 1.1316908597946167, "learning_rate": 2.7831841183339823e-05, "loss": 0.7863127708435058, "memory(GiB)": 88.99, "step": 2145, "token_acc": 0.765039337616381, "train_speed(iter/s)": 0.121111 }, { "epoch": 0.02789771271194963, "grad_norm": 1.1082152128219604, "learning_rate": 2.7896717270014273e-05, "loss": 0.7960594177246094, "memory(GiB)": 88.99, "step": 2150, "token_acc": 0.774797801197437, "train_speed(iter/s)": 0.12112 }, { "epoch": 0.027962591113605325, "grad_norm": 1.1472798585891724, "learning_rate": 2.7961593356688726e-05, "loss": 0.8192025184631347, "memory(GiB)": 88.99, "step": 2155, "token_acc": 0.7566598544333297, "train_speed(iter/s)": 0.12112 }, { "epoch": 0.02802746951526102, "grad_norm": 1.1996523141860962, "learning_rate": 2.802646944336318e-05, "loss": 0.7936247825622559, "memory(GiB)": 88.99, "step": 2160, "token_acc": 0.7633765370555001, "train_speed(iter/s)": 0.121129 }, { "epoch": 0.02809234791691672, "grad_norm": 1.1802107095718384, "learning_rate": 2.8091345530037626e-05, "loss": 0.8031808853149414, "memory(GiB)": 88.99, "step": 2165, "token_acc": 0.7618043160798312, "train_speed(iter/s)": 0.121134 }, { "epoch": 0.028157226318572415, "grad_norm": 1.1128746271133423, "learning_rate": 2.8156221616712082e-05, "loss": 0.7796154499053956, "memory(GiB)": 88.99, "step": 2170, "token_acc": 0.7688317269322856, "train_speed(iter/s)": 0.121138 }, { "epoch": 0.02822210472022811, "grad_norm": 1.2575429677963257, "learning_rate": 2.8221097703386536e-05, "loss": 0.8040067672729492, "memory(GiB)": 88.99, "step": 2175, "token_acc": 0.7661846098910908, "train_speed(iter/s)": 0.121136 }, { "epoch": 0.02828698312188381, "grad_norm": 1.1473419666290283, "learning_rate": 2.8285973790060982e-05, "loss": 0.8276853561401367, "memory(GiB)": 88.99, "step": 2180, "token_acc": 0.7875354107648725, "train_speed(iter/s)": 0.121132 }, { "epoch": 0.028351861523539506, "grad_norm": 1.351109266281128, "learning_rate": 2.8350849876735435e-05, "loss": 0.8072605133056641, "memory(GiB)": 88.99, "step": 2185, "token_acc": 0.7642501776830135, "train_speed(iter/s)": 0.121133 }, { "epoch": 0.0284167399251952, "grad_norm": 1.2238560914993286, "learning_rate": 2.841572596340989e-05, "loss": 0.7869336128234863, "memory(GiB)": 88.99, "step": 2190, "token_acc": 0.7721474916917234, "train_speed(iter/s)": 0.121144 }, { "epoch": 0.0284816183268509, "grad_norm": 1.1798787117004395, "learning_rate": 2.8480602050084342e-05, "loss": 0.7654293060302735, "memory(GiB)": 88.99, "step": 2195, "token_acc": 0.7696071991592223, "train_speed(iter/s)": 0.121142 }, { "epoch": 0.028546496728506596, "grad_norm": 1.2333637475967407, "learning_rate": 2.854547813675879e-05, "loss": 0.8038791656494141, "memory(GiB)": 88.99, "step": 2200, "token_acc": 0.7661785728084071, "train_speed(iter/s)": 0.121153 }, { "epoch": 0.028611375130162292, "grad_norm": 1.3615326881408691, "learning_rate": 2.8610354223433245e-05, "loss": 0.821125602722168, "memory(GiB)": 88.99, "step": 2205, "token_acc": 0.7608599366290757, "train_speed(iter/s)": 0.121154 }, { "epoch": 0.02867625353181799, "grad_norm": 1.2522703409194946, "learning_rate": 2.8675230310107698e-05, "loss": 0.8250799179077148, "memory(GiB)": 88.99, "step": 2210, "token_acc": 0.7542826885940322, "train_speed(iter/s)": 0.121155 }, { "epoch": 0.028741131933473687, "grad_norm": 1.159858226776123, "learning_rate": 2.8740106396782145e-05, "loss": 0.8004854202270508, "memory(GiB)": 88.99, "step": 2215, "token_acc": 0.7606983499535831, "train_speed(iter/s)": 0.121142 }, { "epoch": 0.028806010335129383, "grad_norm": 1.2889277935028076, "learning_rate": 2.88049824834566e-05, "loss": 0.8375580787658692, "memory(GiB)": 88.99, "step": 2220, "token_acc": 0.7576224734498116, "train_speed(iter/s)": 0.12114 }, { "epoch": 0.02887088873678508, "grad_norm": 1.3108104467391968, "learning_rate": 2.8869858570131054e-05, "loss": 0.8123729705810547, "memory(GiB)": 88.99, "step": 2225, "token_acc": 0.7560933671495508, "train_speed(iter/s)": 0.121149 }, { "epoch": 0.028935767138440777, "grad_norm": 1.2162882089614868, "learning_rate": 2.89347346568055e-05, "loss": 0.8037592887878418, "memory(GiB)": 88.99, "step": 2230, "token_acc": 0.7648563701483809, "train_speed(iter/s)": 0.121157 }, { "epoch": 0.029000645540096473, "grad_norm": 1.3043875694274902, "learning_rate": 2.8999610743479954e-05, "loss": 0.8310636520385742, "memory(GiB)": 88.99, "step": 2235, "token_acc": 0.7567712045616536, "train_speed(iter/s)": 0.121163 }, { "epoch": 0.029065523941752172, "grad_norm": 1.233794093132019, "learning_rate": 2.9064486830154407e-05, "loss": 0.8217221260070801, "memory(GiB)": 88.99, "step": 2240, "token_acc": 0.7517439156719888, "train_speed(iter/s)": 0.121162 }, { "epoch": 0.029130402343407868, "grad_norm": 1.1363722085952759, "learning_rate": 2.9129362916828857e-05, "loss": 0.8239664077758789, "memory(GiB)": 88.99, "step": 2245, "token_acc": 0.7662624731590248, "train_speed(iter/s)": 0.121162 }, { "epoch": 0.029195280745063563, "grad_norm": 1.2021563053131104, "learning_rate": 2.919423900350331e-05, "loss": 0.808903694152832, "memory(GiB)": 88.99, "step": 2250, "token_acc": 0.7656575175717595, "train_speed(iter/s)": 0.121179 }, { "epoch": 0.029260159146719263, "grad_norm": 1.3527460098266602, "learning_rate": 2.9259115090177764e-05, "loss": 0.8516548156738282, "memory(GiB)": 88.99, "step": 2255, "token_acc": 0.7654549492855557, "train_speed(iter/s)": 0.121163 }, { "epoch": 0.02932503754837496, "grad_norm": 1.2220326662063599, "learning_rate": 2.932399117685221e-05, "loss": 0.8312301635742188, "memory(GiB)": 88.99, "step": 2260, "token_acc": 0.7569728963126379, "train_speed(iter/s)": 0.121166 }, { "epoch": 0.029389915950030654, "grad_norm": 1.2515267133712769, "learning_rate": 2.9388867263526663e-05, "loss": 0.8372406005859375, "memory(GiB)": 88.99, "step": 2265, "token_acc": 0.7680972168872437, "train_speed(iter/s)": 0.121168 }, { "epoch": 0.029454794351686353, "grad_norm": 1.3270807266235352, "learning_rate": 2.945374335020112e-05, "loss": 0.8197408676147461, "memory(GiB)": 88.99, "step": 2270, "token_acc": 0.7517080745341614, "train_speed(iter/s)": 0.121179 }, { "epoch": 0.02951967275334205, "grad_norm": 1.3045392036437988, "learning_rate": 2.9518619436875566e-05, "loss": 0.8549169540405274, "memory(GiB)": 88.99, "step": 2275, "token_acc": 0.7535966418622401, "train_speed(iter/s)": 0.121189 }, { "epoch": 0.029584551154997744, "grad_norm": 1.1714587211608887, "learning_rate": 2.958349552355002e-05, "loss": 0.8123679161071777, "memory(GiB)": 88.99, "step": 2280, "token_acc": 0.7351778656126482, "train_speed(iter/s)": 0.121191 }, { "epoch": 0.029649429556653444, "grad_norm": 1.1782195568084717, "learning_rate": 2.9648371610224473e-05, "loss": 0.8532892227172851, "memory(GiB)": 88.99, "step": 2285, "token_acc": 0.7512030306132896, "train_speed(iter/s)": 0.121201 }, { "epoch": 0.02971430795830914, "grad_norm": 1.1895625591278076, "learning_rate": 2.9713247696898926e-05, "loss": 0.8012405395507812, "memory(GiB)": 88.99, "step": 2290, "token_acc": 0.7498059608817138, "train_speed(iter/s)": 0.121216 }, { "epoch": 0.029779186359964835, "grad_norm": 1.2063812017440796, "learning_rate": 2.9778123783573376e-05, "loss": 0.8006423950195313, "memory(GiB)": 88.99, "step": 2295, "token_acc": 0.7564468137862633, "train_speed(iter/s)": 0.12121 }, { "epoch": 0.029844064761620534, "grad_norm": 1.1972386837005615, "learning_rate": 2.984299987024783e-05, "loss": 0.830437183380127, "memory(GiB)": 88.99, "step": 2300, "token_acc": 0.7612697220135237, "train_speed(iter/s)": 0.121212 }, { "epoch": 0.02990894316327623, "grad_norm": 1.1284048557281494, "learning_rate": 2.9907875956922283e-05, "loss": 0.828968620300293, "memory(GiB)": 88.99, "step": 2305, "token_acc": 0.7608641898992397, "train_speed(iter/s)": 0.121197 }, { "epoch": 0.029973821564931925, "grad_norm": 1.2025707960128784, "learning_rate": 2.997275204359673e-05, "loss": 0.8265801429748535, "memory(GiB)": 88.99, "step": 2310, "token_acc": 0.7555169942283753, "train_speed(iter/s)": 0.121203 }, { "epoch": 0.030038699966587624, "grad_norm": 1.3624169826507568, "learning_rate": 3.0037628130271182e-05, "loss": 0.8480856895446778, "memory(GiB)": 88.99, "step": 2315, "token_acc": 0.7526784820266347, "train_speed(iter/s)": 0.121195 }, { "epoch": 0.03010357836824332, "grad_norm": 1.1238447427749634, "learning_rate": 3.010250421694564e-05, "loss": 0.8293070793151855, "memory(GiB)": 88.99, "step": 2320, "token_acc": 0.7544483985765125, "train_speed(iter/s)": 0.121183 }, { "epoch": 0.030168456769899016, "grad_norm": 1.2505385875701904, "learning_rate": 3.0167380303620085e-05, "loss": 0.7940178871154785, "memory(GiB)": 88.99, "step": 2325, "token_acc": 0.759357679750968, "train_speed(iter/s)": 0.121191 }, { "epoch": 0.030233335171554715, "grad_norm": 1.139037847518921, "learning_rate": 3.023225639029454e-05, "loss": 0.7909708976745605, "memory(GiB)": 88.99, "step": 2330, "token_acc": 0.77689586217505, "train_speed(iter/s)": 0.12118 }, { "epoch": 0.03029821357321041, "grad_norm": 1.1261494159698486, "learning_rate": 3.0297132476968992e-05, "loss": 0.8335762977600097, "memory(GiB)": 88.99, "step": 2335, "token_acc": 0.7614345629272481, "train_speed(iter/s)": 0.121176 }, { "epoch": 0.030363091974866106, "grad_norm": 1.1973204612731934, "learning_rate": 3.036200856364344e-05, "loss": 0.7957068920135498, "memory(GiB)": 88.99, "step": 2340, "token_acc": 0.7703488372093024, "train_speed(iter/s)": 0.121172 }, { "epoch": 0.030427970376521805, "grad_norm": 1.1365902423858643, "learning_rate": 3.0426884650317895e-05, "loss": 0.8259085655212403, "memory(GiB)": 88.99, "step": 2345, "token_acc": 0.7622818171309675, "train_speed(iter/s)": 0.121172 }, { "epoch": 0.0304928487781775, "grad_norm": 1.1524794101715088, "learning_rate": 3.0491760736992348e-05, "loss": 0.8277429580688477, "memory(GiB)": 88.99, "step": 2350, "token_acc": 0.7645685377955054, "train_speed(iter/s)": 0.12117 }, { "epoch": 0.030557727179833197, "grad_norm": 1.233309030532837, "learning_rate": 3.0556636823666795e-05, "loss": 0.7865606784820557, "memory(GiB)": 88.99, "step": 2355, "token_acc": 0.7783874580067189, "train_speed(iter/s)": 0.121158 }, { "epoch": 0.030622605581488896, "grad_norm": 1.0584903955459595, "learning_rate": 3.062151291034125e-05, "loss": 0.8368178367614746, "memory(GiB)": 88.99, "step": 2360, "token_acc": 0.7418409731493844, "train_speed(iter/s)": 0.121158 }, { "epoch": 0.03068748398314459, "grad_norm": 1.0939773321151733, "learning_rate": 3.06863889970157e-05, "loss": 0.8396332740783692, "memory(GiB)": 88.99, "step": 2365, "token_acc": 0.7710934668731262, "train_speed(iter/s)": 0.121156 }, { "epoch": 0.030752362384800287, "grad_norm": 1.2073169946670532, "learning_rate": 3.075126508369015e-05, "loss": 0.8241506576538086, "memory(GiB)": 88.99, "step": 2370, "token_acc": 0.767402499189744, "train_speed(iter/s)": 0.121162 }, { "epoch": 0.030817240786455986, "grad_norm": 1.0877091884613037, "learning_rate": 3.08161411703646e-05, "loss": 0.7554728507995605, "memory(GiB)": 88.99, "step": 2375, "token_acc": 0.7725345779985441, "train_speed(iter/s)": 0.121158 }, { "epoch": 0.030882119188111682, "grad_norm": 1.108596682548523, "learning_rate": 3.088101725703906e-05, "loss": 0.8140241622924804, "memory(GiB)": 88.99, "step": 2380, "token_acc": 0.7478371098082712, "train_speed(iter/s)": 0.12115 }, { "epoch": 0.030946997589767378, "grad_norm": 1.1770998239517212, "learning_rate": 3.0945893343713514e-05, "loss": 0.8555922508239746, "memory(GiB)": 88.99, "step": 2385, "token_acc": 0.7653577451216574, "train_speed(iter/s)": 0.121157 }, { "epoch": 0.031011875991423077, "grad_norm": 1.1476207971572876, "learning_rate": 3.101076943038796e-05, "loss": 0.8325518608093262, "memory(GiB)": 88.99, "step": 2390, "token_acc": 0.7619442429267281, "train_speed(iter/s)": 0.121167 }, { "epoch": 0.031076754393078772, "grad_norm": 1.232899785041809, "learning_rate": 3.1075645517062414e-05, "loss": 0.8218043327331543, "memory(GiB)": 88.99, "step": 2395, "token_acc": 0.7591911107935261, "train_speed(iter/s)": 0.121162 }, { "epoch": 0.031141632794734468, "grad_norm": 1.28532075881958, "learning_rate": 3.1140521603736864e-05, "loss": 0.8318622589111329, "memory(GiB)": 88.99, "step": 2400, "token_acc": 0.7437906220984215, "train_speed(iter/s)": 0.121158 }, { "epoch": 0.031206511196390167, "grad_norm": 1.136709451675415, "learning_rate": 3.1205397690411313e-05, "loss": 0.8366928100585938, "memory(GiB)": 88.99, "step": 2405, "token_acc": 0.7566222455060411, "train_speed(iter/s)": 0.121165 }, { "epoch": 0.03127138959804586, "grad_norm": 1.17936372756958, "learning_rate": 3.127027377708577e-05, "loss": 0.8260658264160157, "memory(GiB)": 88.99, "step": 2410, "token_acc": 0.7586727892164123, "train_speed(iter/s)": 0.121162 }, { "epoch": 0.03133626799970156, "grad_norm": 1.1161108016967773, "learning_rate": 3.133514986376022e-05, "loss": 0.8082788467407227, "memory(GiB)": 88.99, "step": 2415, "token_acc": 0.7613239094087247, "train_speed(iter/s)": 0.121167 }, { "epoch": 0.03140114640135726, "grad_norm": 1.1336753368377686, "learning_rate": 3.140002595043467e-05, "loss": 0.7935325145721436, "memory(GiB)": 88.99, "step": 2420, "token_acc": 0.776058394160584, "train_speed(iter/s)": 0.121174 }, { "epoch": 0.03146602480301295, "grad_norm": 1.3563612699508667, "learning_rate": 3.1464902037109126e-05, "loss": 0.8456561088562011, "memory(GiB)": 88.99, "step": 2425, "token_acc": 0.7741420294161343, "train_speed(iter/s)": 0.12118 }, { "epoch": 0.03153090320466865, "grad_norm": 1.1788748502731323, "learning_rate": 3.1529778123783576e-05, "loss": 0.8411725997924805, "memory(GiB)": 88.99, "step": 2430, "token_acc": 0.7277215895516445, "train_speed(iter/s)": 0.121191 }, { "epoch": 0.03159578160632435, "grad_norm": 1.0347676277160645, "learning_rate": 3.1594654210458026e-05, "loss": 0.813409423828125, "memory(GiB)": 88.99, "step": 2435, "token_acc": 0.7603243385201234, "train_speed(iter/s)": 0.121179 }, { "epoch": 0.03166066000798004, "grad_norm": 1.153407335281372, "learning_rate": 3.1659530297132476e-05, "loss": 0.7964404106140137, "memory(GiB)": 88.99, "step": 2440, "token_acc": 0.7939819157318125, "train_speed(iter/s)": 0.121182 }, { "epoch": 0.03172553840963574, "grad_norm": 1.2696373462677002, "learning_rate": 3.172440638380693e-05, "loss": 0.8641361236572266, "memory(GiB)": 88.99, "step": 2445, "token_acc": 0.7853176215630503, "train_speed(iter/s)": 0.121191 }, { "epoch": 0.03179041681129144, "grad_norm": 1.4281604290008545, "learning_rate": 3.178928247048138e-05, "loss": 0.8207902908325195, "memory(GiB)": 88.99, "step": 2450, "token_acc": 0.7590895136178237, "train_speed(iter/s)": 0.121202 }, { "epoch": 0.03185529521294713, "grad_norm": 1.4027645587921143, "learning_rate": 3.185415855715583e-05, "loss": 0.8196238517761231, "memory(GiB)": 88.99, "step": 2455, "token_acc": 0.7512840700333671, "train_speed(iter/s)": 0.1212 }, { "epoch": 0.03192017361460283, "grad_norm": 1.3835705518722534, "learning_rate": 3.191903464383029e-05, "loss": 0.8138736724853516, "memory(GiB)": 88.99, "step": 2460, "token_acc": 0.7789358200767965, "train_speed(iter/s)": 0.121201 }, { "epoch": 0.03198505201625853, "grad_norm": 1.208853006362915, "learning_rate": 3.198391073050473e-05, "loss": 0.82504243850708, "memory(GiB)": 88.99, "step": 2465, "token_acc": 0.7601375007957222, "train_speed(iter/s)": 0.121205 }, { "epoch": 0.03204993041791422, "grad_norm": 1.2112867832183838, "learning_rate": 3.204878681717919e-05, "loss": 0.851987075805664, "memory(GiB)": 88.99, "step": 2470, "token_acc": 0.7438129990245312, "train_speed(iter/s)": 0.121207 }, { "epoch": 0.03211480881956992, "grad_norm": 1.1634634733200073, "learning_rate": 3.2113662903853645e-05, "loss": 0.8158859252929688, "memory(GiB)": 88.99, "step": 2475, "token_acc": 0.769029097004341, "train_speed(iter/s)": 0.121205 }, { "epoch": 0.03217968722122562, "grad_norm": 1.1959149837493896, "learning_rate": 3.2178538990528095e-05, "loss": 0.8367803573608399, "memory(GiB)": 88.99, "step": 2480, "token_acc": 0.7636526811756787, "train_speed(iter/s)": 0.121212 }, { "epoch": 0.03224456562288131, "grad_norm": 1.1092256307601929, "learning_rate": 3.2243415077202545e-05, "loss": 0.8179047584533692, "memory(GiB)": 88.99, "step": 2485, "token_acc": 0.7512827721937162, "train_speed(iter/s)": 0.121218 }, { "epoch": 0.03230944402453701, "grad_norm": 1.1681865453720093, "learning_rate": 3.2308291163876995e-05, "loss": 0.7875986099243164, "memory(GiB)": 88.99, "step": 2490, "token_acc": 0.7474452554744525, "train_speed(iter/s)": 0.121221 }, { "epoch": 0.03237432242619271, "grad_norm": 1.2513362169265747, "learning_rate": 3.237316725055145e-05, "loss": 0.839387321472168, "memory(GiB)": 88.99, "step": 2495, "token_acc": 0.749674576950035, "train_speed(iter/s)": 0.121229 }, { "epoch": 0.0324392008278484, "grad_norm": 1.0485541820526123, "learning_rate": 3.24380433372259e-05, "loss": 0.8085195541381835, "memory(GiB)": 88.99, "step": 2500, "token_acc": 0.7597619578016103, "train_speed(iter/s)": 0.121226 }, { "epoch": 0.0325040792295041, "grad_norm": 1.193089485168457, "learning_rate": 3.250291942390035e-05, "loss": 0.7944709300994873, "memory(GiB)": 88.99, "step": 2505, "token_acc": 0.7744180207324509, "train_speed(iter/s)": 0.121226 }, { "epoch": 0.0325689576311598, "grad_norm": 1.186428189277649, "learning_rate": 3.256779551057481e-05, "loss": 0.811585807800293, "memory(GiB)": 88.99, "step": 2510, "token_acc": 0.7632986015903482, "train_speed(iter/s)": 0.121223 }, { "epoch": 0.03263383603281549, "grad_norm": 1.0790234804153442, "learning_rate": 3.263267159724925e-05, "loss": 0.8496809005737305, "memory(GiB)": 88.99, "step": 2515, "token_acc": 0.754048757005811, "train_speed(iter/s)": 0.121221 }, { "epoch": 0.03269871443447119, "grad_norm": 1.2959442138671875, "learning_rate": 3.269754768392371e-05, "loss": 0.8321864128112793, "memory(GiB)": 88.99, "step": 2520, "token_acc": 0.7628125524416849, "train_speed(iter/s)": 0.121234 }, { "epoch": 0.03276359283612689, "grad_norm": 1.1624484062194824, "learning_rate": 3.2762423770598164e-05, "loss": 0.846843147277832, "memory(GiB)": 88.99, "step": 2525, "token_acc": 0.7422467509346626, "train_speed(iter/s)": 0.12123 }, { "epoch": 0.03282847123778258, "grad_norm": 1.0946743488311768, "learning_rate": 3.282729985727261e-05, "loss": 0.813871955871582, "memory(GiB)": 88.99, "step": 2530, "token_acc": 0.7634895559066175, "train_speed(iter/s)": 0.121223 }, { "epoch": 0.03289334963943828, "grad_norm": 1.3029567003250122, "learning_rate": 3.2892175943947064e-05, "loss": 0.8459506034851074, "memory(GiB)": 88.99, "step": 2535, "token_acc": 0.7554489639293938, "train_speed(iter/s)": 0.121229 }, { "epoch": 0.03295822804109398, "grad_norm": 1.2145838737487793, "learning_rate": 3.2957052030621514e-05, "loss": 0.7962131500244141, "memory(GiB)": 88.99, "step": 2540, "token_acc": 0.7663207231335788, "train_speed(iter/s)": 0.12124 }, { "epoch": 0.033023106442749674, "grad_norm": 1.1022638082504272, "learning_rate": 3.3021928117295963e-05, "loss": 0.8379992485046387, "memory(GiB)": 88.99, "step": 2545, "token_acc": 0.7603449580149785, "train_speed(iter/s)": 0.121245 }, { "epoch": 0.03308798484440537, "grad_norm": 1.0685572624206543, "learning_rate": 3.308680420397042e-05, "loss": 0.8385686874389648, "memory(GiB)": 88.99, "step": 2550, "token_acc": 0.7792132595085586, "train_speed(iter/s)": 0.121245 }, { "epoch": 0.03315286324606107, "grad_norm": 1.1475025415420532, "learning_rate": 3.315168029064487e-05, "loss": 0.8206699371337891, "memory(GiB)": 88.99, "step": 2555, "token_acc": 0.7425536656020204, "train_speed(iter/s)": 0.121245 }, { "epoch": 0.033217741647716764, "grad_norm": 1.282378911972046, "learning_rate": 3.3216556377319327e-05, "loss": 0.7753420829772949, "memory(GiB)": 88.99, "step": 2560, "token_acc": 0.7561309373762108, "train_speed(iter/s)": 0.121244 }, { "epoch": 0.03328262004937246, "grad_norm": 1.1852844953536987, "learning_rate": 3.328143246399377e-05, "loss": 0.8417375564575196, "memory(GiB)": 88.99, "step": 2565, "token_acc": 0.768084711127624, "train_speed(iter/s)": 0.121244 }, { "epoch": 0.03334749845102816, "grad_norm": 1.167872428894043, "learning_rate": 3.3346308550668226e-05, "loss": 0.853665542602539, "memory(GiB)": 88.99, "step": 2570, "token_acc": 0.7753792075803309, "train_speed(iter/s)": 0.12125 }, { "epoch": 0.033412376852683855, "grad_norm": 1.1893254518508911, "learning_rate": 3.341118463734268e-05, "loss": 0.797910213470459, "memory(GiB)": 88.99, "step": 2575, "token_acc": 0.7739216807633266, "train_speed(iter/s)": 0.121242 }, { "epoch": 0.033477255254339554, "grad_norm": 1.1603277921676636, "learning_rate": 3.3476060724017126e-05, "loss": 0.8266533851623535, "memory(GiB)": 88.99, "step": 2580, "token_acc": 0.7645194670805847, "train_speed(iter/s)": 0.121229 }, { "epoch": 0.03354213365599525, "grad_norm": 1.2171142101287842, "learning_rate": 3.354093681069158e-05, "loss": 0.8551794052124023, "memory(GiB)": 88.99, "step": 2585, "token_acc": 0.7530091783910821, "train_speed(iter/s)": 0.121229 }, { "epoch": 0.033607012057650945, "grad_norm": 1.1556283235549927, "learning_rate": 3.360581289736603e-05, "loss": 0.8534928321838379, "memory(GiB)": 88.99, "step": 2590, "token_acc": 0.7777531855692921, "train_speed(iter/s)": 0.121232 }, { "epoch": 0.033671890459306644, "grad_norm": 1.258975625038147, "learning_rate": 3.367068898404048e-05, "loss": 0.8093547821044922, "memory(GiB)": 88.99, "step": 2595, "token_acc": 0.7861805959537199, "train_speed(iter/s)": 0.121227 }, { "epoch": 0.03373676886096234, "grad_norm": 1.1549276113510132, "learning_rate": 3.373556507071494e-05, "loss": 0.8540534973144531, "memory(GiB)": 88.99, "step": 2600, "token_acc": 0.7476883352524513, "train_speed(iter/s)": 0.121225 }, { "epoch": 0.033801647262618036, "grad_norm": 1.147618293762207, "learning_rate": 3.380044115738939e-05, "loss": 0.8539454460144043, "memory(GiB)": 88.99, "step": 2605, "token_acc": 0.767472876040803, "train_speed(iter/s)": 0.121231 }, { "epoch": 0.033866525664273735, "grad_norm": 1.2452045679092407, "learning_rate": 3.386531724406384e-05, "loss": 0.8210935592651367, "memory(GiB)": 88.99, "step": 2610, "token_acc": 0.7630883339251193, "train_speed(iter/s)": 0.12123 }, { "epoch": 0.033931404065929434, "grad_norm": 1.0605741739273071, "learning_rate": 3.393019333073829e-05, "loss": 0.7982230186462402, "memory(GiB)": 88.99, "step": 2615, "token_acc": 0.7604958849880196, "train_speed(iter/s)": 0.121233 }, { "epoch": 0.033996282467585126, "grad_norm": 1.229340672492981, "learning_rate": 3.3995069417412745e-05, "loss": 0.8198140144348145, "memory(GiB)": 88.99, "step": 2620, "token_acc": 0.7589821368385574, "train_speed(iter/s)": 0.121233 }, { "epoch": 0.034061160869240825, "grad_norm": 1.2001196146011353, "learning_rate": 3.4059945504087195e-05, "loss": 0.8463268280029297, "memory(GiB)": 88.99, "step": 2625, "token_acc": 0.7771444175171519, "train_speed(iter/s)": 0.121233 }, { "epoch": 0.034126039270896524, "grad_norm": 1.3354601860046387, "learning_rate": 3.4124821590761645e-05, "loss": 0.8319104194641114, "memory(GiB)": 88.99, "step": 2630, "token_acc": 0.7522401991288115, "train_speed(iter/s)": 0.12124 }, { "epoch": 0.03419091767255222, "grad_norm": 1.1444107294082642, "learning_rate": 3.41896976774361e-05, "loss": 0.7934362411499023, "memory(GiB)": 88.99, "step": 2635, "token_acc": 0.7646017699115044, "train_speed(iter/s)": 0.121232 }, { "epoch": 0.034255796074207916, "grad_norm": 1.1558629274368286, "learning_rate": 3.425457376411055e-05, "loss": 0.8011257171630859, "memory(GiB)": 88.99, "step": 2640, "token_acc": 0.7784386891787047, "train_speed(iter/s)": 0.121236 }, { "epoch": 0.034320674475863615, "grad_norm": 1.167937994003296, "learning_rate": 3.4319449850785e-05, "loss": 0.8428247451782227, "memory(GiB)": 88.99, "step": 2645, "token_acc": 0.7516170763260026, "train_speed(iter/s)": 0.121229 }, { "epoch": 0.03438555287751931, "grad_norm": 1.152483582496643, "learning_rate": 3.438432593745946e-05, "loss": 0.8704240798950196, "memory(GiB)": 88.99, "step": 2650, "token_acc": 0.7512293382309378, "train_speed(iter/s)": 0.121233 }, { "epoch": 0.034450431279175006, "grad_norm": 1.2148951292037964, "learning_rate": 3.444920202413391e-05, "loss": 0.8242805480957032, "memory(GiB)": 88.99, "step": 2655, "token_acc": 0.7657060518731988, "train_speed(iter/s)": 0.121234 }, { "epoch": 0.034515309680830705, "grad_norm": 1.1116348505020142, "learning_rate": 3.451407811080836e-05, "loss": 0.8479061126708984, "memory(GiB)": 88.99, "step": 2660, "token_acc": 0.7620515349525336, "train_speed(iter/s)": 0.121224 }, { "epoch": 0.0345801880824864, "grad_norm": 1.2974567413330078, "learning_rate": 3.457895419748281e-05, "loss": 0.7991674900054931, "memory(GiB)": 88.99, "step": 2665, "token_acc": 0.7856015595989603, "train_speed(iter/s)": 0.121233 }, { "epoch": 0.0346450664841421, "grad_norm": 1.112959861755371, "learning_rate": 3.4643830284157264e-05, "loss": 0.8671939849853516, "memory(GiB)": 88.99, "step": 2670, "token_acc": 0.7774050354105249, "train_speed(iter/s)": 0.121243 }, { "epoch": 0.034709944885797796, "grad_norm": 1.1431288719177246, "learning_rate": 3.4708706370831714e-05, "loss": 0.8076612472534179, "memory(GiB)": 88.99, "step": 2675, "token_acc": 0.760720732994993, "train_speed(iter/s)": 0.121243 }, { "epoch": 0.03477482328745349, "grad_norm": 1.1141619682312012, "learning_rate": 3.4773582457506164e-05, "loss": 0.8295097351074219, "memory(GiB)": 88.99, "step": 2680, "token_acc": 0.7536838464199239, "train_speed(iter/s)": 0.121245 }, { "epoch": 0.03483970168910919, "grad_norm": 1.1178258657455444, "learning_rate": 3.483845854418062e-05, "loss": 0.8251153945922851, "memory(GiB)": 88.99, "step": 2685, "token_acc": 0.7590745358825159, "train_speed(iter/s)": 0.121249 }, { "epoch": 0.034904580090764886, "grad_norm": 1.1005855798721313, "learning_rate": 3.490333463085507e-05, "loss": 0.8147114753723145, "memory(GiB)": 88.99, "step": 2690, "token_acc": 0.7738262088297126, "train_speed(iter/s)": 0.121256 }, { "epoch": 0.03496945849242058, "grad_norm": 1.1604021787643433, "learning_rate": 3.496821071752952e-05, "loss": 0.8077749252319336, "memory(GiB)": 88.99, "step": 2695, "token_acc": 0.7710130221545747, "train_speed(iter/s)": 0.121255 }, { "epoch": 0.03503433689407628, "grad_norm": 1.1356070041656494, "learning_rate": 3.5033086804203977e-05, "loss": 0.8308780670166016, "memory(GiB)": 88.99, "step": 2700, "token_acc": 0.74699210535388, "train_speed(iter/s)": 0.12126 }, { "epoch": 0.03509921529573198, "grad_norm": 1.0373420715332031, "learning_rate": 3.509796289087842e-05, "loss": 0.849609661102295, "memory(GiB)": 88.99, "step": 2705, "token_acc": 0.7626885033241446, "train_speed(iter/s)": 0.121258 }, { "epoch": 0.03516409369738767, "grad_norm": 1.2454980611801147, "learning_rate": 3.5162838977552876e-05, "loss": 0.8377523422241211, "memory(GiB)": 88.99, "step": 2710, "token_acc": 0.7639458884328926, "train_speed(iter/s)": 0.121249 }, { "epoch": 0.03522897209904337, "grad_norm": 1.084794282913208, "learning_rate": 3.5227715064227326e-05, "loss": 0.8498483657836914, "memory(GiB)": 88.99, "step": 2715, "token_acc": 0.7557013220949598, "train_speed(iter/s)": 0.121244 }, { "epoch": 0.03529385050069907, "grad_norm": 1.2725317478179932, "learning_rate": 3.5292591150901776e-05, "loss": 0.8325004577636719, "memory(GiB)": 88.99, "step": 2720, "token_acc": 0.7596057209122535, "train_speed(iter/s)": 0.121253 }, { "epoch": 0.03535872890235476, "grad_norm": 1.1281828880310059, "learning_rate": 3.535746723757623e-05, "loss": 0.8211791038513183, "memory(GiB)": 88.99, "step": 2725, "token_acc": 0.7682209144409234, "train_speed(iter/s)": 0.121264 }, { "epoch": 0.03542360730401046, "grad_norm": 1.149043321609497, "learning_rate": 3.542234332425068e-05, "loss": 0.7988929748535156, "memory(GiB)": 88.99, "step": 2730, "token_acc": 0.7583910857453459, "train_speed(iter/s)": 0.121271 }, { "epoch": 0.03548848570566616, "grad_norm": 1.1053017377853394, "learning_rate": 3.548721941092513e-05, "loss": 0.8386638641357422, "memory(GiB)": 88.99, "step": 2735, "token_acc": 0.746278179498086, "train_speed(iter/s)": 0.121282 }, { "epoch": 0.03555336410732185, "grad_norm": 1.1845366954803467, "learning_rate": 3.555209549759959e-05, "loss": 0.8183986663818359, "memory(GiB)": 88.99, "step": 2740, "token_acc": 0.7646592709984152, "train_speed(iter/s)": 0.121278 }, { "epoch": 0.03561824250897755, "grad_norm": 1.2997535467147827, "learning_rate": 3.561697158427404e-05, "loss": 0.8256820678710938, "memory(GiB)": 88.99, "step": 2745, "token_acc": 0.7634138006027278, "train_speed(iter/s)": 0.121277 }, { "epoch": 0.03568312091063325, "grad_norm": 1.1458073854446411, "learning_rate": 3.5681847670948495e-05, "loss": 0.8413201332092285, "memory(GiB)": 88.99, "step": 2750, "token_acc": 0.7665466629742453, "train_speed(iter/s)": 0.121276 }, { "epoch": 0.03574799931228894, "grad_norm": 1.2225732803344727, "learning_rate": 3.574672375762294e-05, "loss": 0.8493804931640625, "memory(GiB)": 88.99, "step": 2755, "token_acc": 0.7423258659958527, "train_speed(iter/s)": 0.121286 }, { "epoch": 0.03581287771394464, "grad_norm": 1.3367308378219604, "learning_rate": 3.5811599844297395e-05, "loss": 0.8361062049865723, "memory(GiB)": 88.99, "step": 2760, "token_acc": 0.7378830544972669, "train_speed(iter/s)": 0.121289 }, { "epoch": 0.03587775611560034, "grad_norm": 1.1282069683074951, "learning_rate": 3.5876475930971845e-05, "loss": 0.7715101718902588, "memory(GiB)": 88.99, "step": 2765, "token_acc": 0.7650504393101204, "train_speed(iter/s)": 0.121292 }, { "epoch": 0.03594263451725603, "grad_norm": 1.0880389213562012, "learning_rate": 3.5941352017646295e-05, "loss": 0.8209485054016114, "memory(GiB)": 88.99, "step": 2770, "token_acc": 0.7687493516821686, "train_speed(iter/s)": 0.12129 }, { "epoch": 0.03600751291891173, "grad_norm": 1.1201281547546387, "learning_rate": 3.600622810432075e-05, "loss": 0.8388846397399903, "memory(GiB)": 88.99, "step": 2775, "token_acc": 0.7676588953649737, "train_speed(iter/s)": 0.121292 }, { "epoch": 0.03607239132056743, "grad_norm": 1.0840579271316528, "learning_rate": 3.60711041909952e-05, "loss": 0.8026821136474609, "memory(GiB)": 88.99, "step": 2780, "token_acc": 0.763955989153134, "train_speed(iter/s)": 0.121284 }, { "epoch": 0.03613726972222312, "grad_norm": 1.1011638641357422, "learning_rate": 3.613598027766965e-05, "loss": 0.8228658676147461, "memory(GiB)": 88.99, "step": 2785, "token_acc": 0.7818284583863219, "train_speed(iter/s)": 0.121279 }, { "epoch": 0.03620214812387882, "grad_norm": 1.1723895072937012, "learning_rate": 3.620085636434411e-05, "loss": 0.8499242782592773, "memory(GiB)": 88.99, "step": 2790, "token_acc": 0.7406828345998618, "train_speed(iter/s)": 0.121279 }, { "epoch": 0.03626702652553452, "grad_norm": 1.1274687051773071, "learning_rate": 3.626573245101856e-05, "loss": 0.853207015991211, "memory(GiB)": 88.99, "step": 2795, "token_acc": 0.7471996006115635, "train_speed(iter/s)": 0.121287 }, { "epoch": 0.03633190492719021, "grad_norm": 1.1812845468521118, "learning_rate": 3.633060853769301e-05, "loss": 0.855527400970459, "memory(GiB)": 88.99, "step": 2800, "token_acc": 0.7459195308682196, "train_speed(iter/s)": 0.121287 }, { "epoch": 0.03639678332884591, "grad_norm": 1.1330982446670532, "learning_rate": 3.639548462436746e-05, "loss": 0.8630107879638672, "memory(GiB)": 88.99, "step": 2805, "token_acc": 0.7445142197199365, "train_speed(iter/s)": 0.121294 }, { "epoch": 0.03646166173050161, "grad_norm": 1.1683281660079956, "learning_rate": 3.6460360711041914e-05, "loss": 0.8227935791015625, "memory(GiB)": 88.99, "step": 2810, "token_acc": 0.7487147784212924, "train_speed(iter/s)": 0.121295 }, { "epoch": 0.0365265401321573, "grad_norm": 1.1870572566986084, "learning_rate": 3.6525236797716364e-05, "loss": 0.828979778289795, "memory(GiB)": 88.99, "step": 2815, "token_acc": 0.7826609593057348, "train_speed(iter/s)": 0.121275 }, { "epoch": 0.036591418533813, "grad_norm": 1.2129628658294678, "learning_rate": 3.6590112884390814e-05, "loss": 0.859401798248291, "memory(GiB)": 88.99, "step": 2820, "token_acc": 0.7380561419970287, "train_speed(iter/s)": 0.12128 }, { "epoch": 0.0366562969354687, "grad_norm": 1.2223378419876099, "learning_rate": 3.665498897106527e-05, "loss": 0.8964414596557617, "memory(GiB)": 88.99, "step": 2825, "token_acc": 0.7274368823664599, "train_speed(iter/s)": 0.121282 }, { "epoch": 0.03672117533712439, "grad_norm": 1.1566162109375, "learning_rate": 3.671986505773971e-05, "loss": 0.8305459022521973, "memory(GiB)": 88.99, "step": 2830, "token_acc": 0.7552763206050994, "train_speed(iter/s)": 0.121286 }, { "epoch": 0.03678605373878009, "grad_norm": 1.2256956100463867, "learning_rate": 3.678474114441417e-05, "loss": 0.8022218704223633, "memory(GiB)": 88.99, "step": 2835, "token_acc": 0.7688263170638902, "train_speed(iter/s)": 0.121291 }, { "epoch": 0.03685093214043579, "grad_norm": 1.1754697561264038, "learning_rate": 3.684961723108863e-05, "loss": 0.7779945373535156, "memory(GiB)": 88.99, "step": 2840, "token_acc": 0.7755200607168731, "train_speed(iter/s)": 0.121291 }, { "epoch": 0.03691581054209148, "grad_norm": 1.089287519454956, "learning_rate": 3.6914493317763076e-05, "loss": 0.8406197547912597, "memory(GiB)": 88.99, "step": 2845, "token_acc": 0.7636124476444321, "train_speed(iter/s)": 0.121294 }, { "epoch": 0.03698068894374718, "grad_norm": 1.1180239915847778, "learning_rate": 3.6979369404437526e-05, "loss": 0.8001043319702148, "memory(GiB)": 88.99, "step": 2850, "token_acc": 0.7436017421979051, "train_speed(iter/s)": 0.121292 }, { "epoch": 0.03704556734540288, "grad_norm": 1.234092116355896, "learning_rate": 3.7044245491111976e-05, "loss": 0.8561000823974609, "memory(GiB)": 88.99, "step": 2855, "token_acc": 0.7524110966300502, "train_speed(iter/s)": 0.121298 }, { "epoch": 0.037110445747058574, "grad_norm": 1.175598382949829, "learning_rate": 3.710912157778643e-05, "loss": 0.830511474609375, "memory(GiB)": 88.99, "step": 2860, "token_acc": 0.7532607561476122, "train_speed(iter/s)": 0.121284 }, { "epoch": 0.03717532414871427, "grad_norm": 1.1137361526489258, "learning_rate": 3.717399766446088e-05, "loss": 0.8253331184387207, "memory(GiB)": 88.99, "step": 2865, "token_acc": 0.7704718663856007, "train_speed(iter/s)": 0.121291 }, { "epoch": 0.03724020255036997, "grad_norm": 1.1156286001205444, "learning_rate": 3.723887375113533e-05, "loss": 0.808352279663086, "memory(GiB)": 88.99, "step": 2870, "token_acc": 0.7679326773231221, "train_speed(iter/s)": 0.121286 }, { "epoch": 0.037305080952025664, "grad_norm": 1.087266445159912, "learning_rate": 3.730374983780979e-05, "loss": 0.8368012428283691, "memory(GiB)": 88.99, "step": 2875, "token_acc": 0.7549758199822351, "train_speed(iter/s)": 0.121283 }, { "epoch": 0.03736995935368136, "grad_norm": 1.033640742301941, "learning_rate": 3.736862592448423e-05, "loss": 0.8096643447875976, "memory(GiB)": 88.99, "step": 2880, "token_acc": 0.7663004220050945, "train_speed(iter/s)": 0.121282 }, { "epoch": 0.03743483775533706, "grad_norm": 1.1905407905578613, "learning_rate": 3.743350201115869e-05, "loss": 0.8569784164428711, "memory(GiB)": 88.99, "step": 2885, "token_acc": 0.7444143534190928, "train_speed(iter/s)": 0.121291 }, { "epoch": 0.037499716156992755, "grad_norm": 1.2121214866638184, "learning_rate": 3.7498378097833145e-05, "loss": 0.8326489448547363, "memory(GiB)": 88.99, "step": 2890, "token_acc": 0.77374588451871, "train_speed(iter/s)": 0.12129 }, { "epoch": 0.037564594558648454, "grad_norm": 1.1586523056030273, "learning_rate": 3.756325418450759e-05, "loss": 0.8623594284057617, "memory(GiB)": 88.99, "step": 2895, "token_acc": 0.7514584891548243, "train_speed(iter/s)": 0.121293 }, { "epoch": 0.03762947296030415, "grad_norm": 1.2099865674972534, "learning_rate": 3.7628130271182045e-05, "loss": 0.8281295776367188, "memory(GiB)": 88.99, "step": 2900, "token_acc": 0.7709083134376224, "train_speed(iter/s)": 0.121294 }, { "epoch": 0.037694351361959845, "grad_norm": 1.219686508178711, "learning_rate": 3.7693006357856495e-05, "loss": 0.85509033203125, "memory(GiB)": 88.99, "step": 2905, "token_acc": 0.7591529944233412, "train_speed(iter/s)": 0.121301 }, { "epoch": 0.037759229763615544, "grad_norm": 1.1790496110916138, "learning_rate": 3.7757882444530945e-05, "loss": 0.8355900764465332, "memory(GiB)": 88.99, "step": 2910, "token_acc": 0.7602063082721682, "train_speed(iter/s)": 0.121308 }, { "epoch": 0.03782410816527124, "grad_norm": 1.2304004430770874, "learning_rate": 3.78227585312054e-05, "loss": 0.8554562568664551, "memory(GiB)": 88.99, "step": 2915, "token_acc": 0.7651752021563343, "train_speed(iter/s)": 0.121308 }, { "epoch": 0.037888986566926935, "grad_norm": 1.2874937057495117, "learning_rate": 3.788763461787985e-05, "loss": 0.8527189254760742, "memory(GiB)": 88.99, "step": 2920, "token_acc": 0.7490464699929945, "train_speed(iter/s)": 0.121307 }, { "epoch": 0.037953864968582635, "grad_norm": 1.3061619997024536, "learning_rate": 3.79525107045543e-05, "loss": 0.8514667510986328, "memory(GiB)": 88.99, "step": 2925, "token_acc": 0.7545355020331561, "train_speed(iter/s)": 0.121307 }, { "epoch": 0.038018743370238334, "grad_norm": 1.1892809867858887, "learning_rate": 3.801738679122875e-05, "loss": 0.8363459587097168, "memory(GiB)": 88.99, "step": 2930, "token_acc": 0.7484247623794098, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.038083621771894026, "grad_norm": 1.0784138441085815, "learning_rate": 3.808226287790321e-05, "loss": 0.7883004188537598, "memory(GiB)": 88.99, "step": 2935, "token_acc": 0.7776698243057242, "train_speed(iter/s)": 0.121313 }, { "epoch": 0.038148500173549725, "grad_norm": 1.157482624053955, "learning_rate": 3.8147138964577664e-05, "loss": 0.8506643295288085, "memory(GiB)": 88.99, "step": 2940, "token_acc": 0.7513268275980141, "train_speed(iter/s)": 0.121307 }, { "epoch": 0.038213378575205424, "grad_norm": 1.2483108043670654, "learning_rate": 3.821201505125211e-05, "loss": 0.874788475036621, "memory(GiB)": 88.99, "step": 2945, "token_acc": 0.7431882022471911, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.038278256976861116, "grad_norm": 1.182938814163208, "learning_rate": 3.8276891137926564e-05, "loss": 0.8336369514465332, "memory(GiB)": 88.99, "step": 2950, "token_acc": 0.7753634230335538, "train_speed(iter/s)": 0.121305 }, { "epoch": 0.038343135378516816, "grad_norm": 1.211816430091858, "learning_rate": 3.8341767224601014e-05, "loss": 0.858939266204834, "memory(GiB)": 88.99, "step": 2955, "token_acc": 0.7545263007534907, "train_speed(iter/s)": 0.121306 }, { "epoch": 0.038408013780172515, "grad_norm": 1.072881817817688, "learning_rate": 3.8406643311275464e-05, "loss": 0.8548770904541015, "memory(GiB)": 88.99, "step": 2960, "token_acc": 0.7553804823568475, "train_speed(iter/s)": 0.121302 }, { "epoch": 0.03847289218182821, "grad_norm": 1.1583964824676514, "learning_rate": 3.847151939794992e-05, "loss": 0.8806078910827637, "memory(GiB)": 88.99, "step": 2965, "token_acc": 0.7367584363032011, "train_speed(iter/s)": 0.121297 }, { "epoch": 0.038537770583483906, "grad_norm": 1.2315986156463623, "learning_rate": 3.853639548462437e-05, "loss": 0.8538886070251465, "memory(GiB)": 88.99, "step": 2970, "token_acc": 0.769458429718329, "train_speed(iter/s)": 0.121302 }, { "epoch": 0.038602648985139605, "grad_norm": 1.2765642404556274, "learning_rate": 3.860127157129882e-05, "loss": 0.8270244598388672, "memory(GiB)": 88.99, "step": 2975, "token_acc": 0.7662148760330578, "train_speed(iter/s)": 0.121296 }, { "epoch": 0.0386675273867953, "grad_norm": 1.057145357131958, "learning_rate": 3.866614765797327e-05, "loss": 0.8652256011962891, "memory(GiB)": 88.99, "step": 2980, "token_acc": 0.7587116154873165, "train_speed(iter/s)": 0.121289 }, { "epoch": 0.038732405788450996, "grad_norm": 1.169481635093689, "learning_rate": 3.8731023744647727e-05, "loss": 0.8518313407897949, "memory(GiB)": 88.99, "step": 2985, "token_acc": 0.7719848566792861, "train_speed(iter/s)": 0.121271 }, { "epoch": 0.038797284190106696, "grad_norm": 1.174521565437317, "learning_rate": 3.8795899831322176e-05, "loss": 0.8546106338500976, "memory(GiB)": 88.99, "step": 2990, "token_acc": 0.7577260981912145, "train_speed(iter/s)": 0.12127 }, { "epoch": 0.03886216259176239, "grad_norm": 6.352973937988281, "learning_rate": 3.8860775917996626e-05, "loss": 0.8150294303894043, "memory(GiB)": 88.99, "step": 2995, "token_acc": 0.7435839542409874, "train_speed(iter/s)": 0.121276 }, { "epoch": 0.03892704099341809, "grad_norm": 1.1586354970932007, "learning_rate": 3.892565200467108e-05, "loss": 0.7898270130157471, "memory(GiB)": 88.99, "step": 3000, "token_acc": 0.7647664003873155, "train_speed(iter/s)": 0.121274 }, { "epoch": 0.038991919395073786, "grad_norm": 0.9927857518196106, "learning_rate": 3.899052809134553e-05, "loss": 0.8018699645996094, "memory(GiB)": 88.99, "step": 3005, "token_acc": 0.766985779347523, "train_speed(iter/s)": 0.12127 }, { "epoch": 0.03905679779672948, "grad_norm": 1.1698987483978271, "learning_rate": 3.905540417801998e-05, "loss": 0.8530485153198242, "memory(GiB)": 88.99, "step": 3010, "token_acc": 0.745018856589794, "train_speed(iter/s)": 0.121263 }, { "epoch": 0.03912167619838518, "grad_norm": 1.0038048028945923, "learning_rate": 3.912028026469444e-05, "loss": 0.8282352447509765, "memory(GiB)": 88.99, "step": 3015, "token_acc": 0.7489062976905076, "train_speed(iter/s)": 0.12126 }, { "epoch": 0.03918655460004088, "grad_norm": 1.134984016418457, "learning_rate": 3.918515635136888e-05, "loss": 0.8253629684448243, "memory(GiB)": 88.99, "step": 3020, "token_acc": 0.7548076923076923, "train_speed(iter/s)": 0.121263 }, { "epoch": 0.03925143300169657, "grad_norm": 1.117533564567566, "learning_rate": 3.925003243804334e-05, "loss": 0.8590721130371094, "memory(GiB)": 88.99, "step": 3025, "token_acc": 0.7526097880321314, "train_speed(iter/s)": 0.121266 }, { "epoch": 0.03931631140335227, "grad_norm": 1.1216456890106201, "learning_rate": 3.931490852471779e-05, "loss": 0.8630590438842773, "memory(GiB)": 88.99, "step": 3030, "token_acc": 0.7432870203729033, "train_speed(iter/s)": 0.121261 }, { "epoch": 0.03938118980500797, "grad_norm": 1.1699209213256836, "learning_rate": 3.9379784611392245e-05, "loss": 0.9092922210693359, "memory(GiB)": 88.99, "step": 3035, "token_acc": 0.7444713989382273, "train_speed(iter/s)": 0.121261 }, { "epoch": 0.03944606820666366, "grad_norm": 1.1854370832443237, "learning_rate": 3.9444660698066695e-05, "loss": 0.8271703720092773, "memory(GiB)": 88.99, "step": 3040, "token_acc": 0.7695036706593137, "train_speed(iter/s)": 0.121263 }, { "epoch": 0.03951094660831936, "grad_norm": 1.2008676528930664, "learning_rate": 3.9509536784741145e-05, "loss": 0.8556760787963867, "memory(GiB)": 88.99, "step": 3045, "token_acc": 0.7817903596021423, "train_speed(iter/s)": 0.121256 }, { "epoch": 0.03957582500997506, "grad_norm": 1.0257322788238525, "learning_rate": 3.95744128714156e-05, "loss": 0.8433998107910157, "memory(GiB)": 88.99, "step": 3050, "token_acc": 0.7537981412256937, "train_speed(iter/s)": 0.121252 }, { "epoch": 0.03964070341163075, "grad_norm": 1.07415771484375, "learning_rate": 3.963928895809005e-05, "loss": 0.8413286209106445, "memory(GiB)": 88.99, "step": 3055, "token_acc": 0.7644835868694956, "train_speed(iter/s)": 0.121251 }, { "epoch": 0.03970558181328645, "grad_norm": 1.191710352897644, "learning_rate": 3.97041650447645e-05, "loss": 0.8481435775756836, "memory(GiB)": 88.99, "step": 3060, "token_acc": 0.7597464342313788, "train_speed(iter/s)": 0.121255 }, { "epoch": 0.03977046021494215, "grad_norm": 1.0487600564956665, "learning_rate": 3.976904113143896e-05, "loss": 0.8537849426269531, "memory(GiB)": 88.99, "step": 3065, "token_acc": 0.7474309893209752, "train_speed(iter/s)": 0.121252 }, { "epoch": 0.03983533861659784, "grad_norm": 1.146193504333496, "learning_rate": 3.98339172181134e-05, "loss": 0.8376871109008789, "memory(GiB)": 88.99, "step": 3070, "token_acc": 0.787616741611899, "train_speed(iter/s)": 0.121245 }, { "epoch": 0.03990021701825354, "grad_norm": 1.0848790407180786, "learning_rate": 3.989879330478786e-05, "loss": 0.809360122680664, "memory(GiB)": 88.99, "step": 3075, "token_acc": 0.7747020666767235, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.03996509541990924, "grad_norm": 1.146222710609436, "learning_rate": 3.996366939146231e-05, "loss": 0.8615243911743165, "memory(GiB)": 88.99, "step": 3080, "token_acc": 0.7515927306890579, "train_speed(iter/s)": 0.121243 }, { "epoch": 0.04002997382156493, "grad_norm": 0.9529266357421875, "learning_rate": 4.002854547813676e-05, "loss": 0.8331600189208984, "memory(GiB)": 88.99, "step": 3085, "token_acc": 0.7669724770642202, "train_speed(iter/s)": 0.121237 }, { "epoch": 0.04009485222322063, "grad_norm": 1.0441770553588867, "learning_rate": 4.0093421564811214e-05, "loss": 0.8425878524780274, "memory(GiB)": 88.99, "step": 3090, "token_acc": 0.7500747980452778, "train_speed(iter/s)": 0.121242 }, { "epoch": 0.04015973062487633, "grad_norm": 1.2737964391708374, "learning_rate": 4.0158297651485664e-05, "loss": 0.8445523262023926, "memory(GiB)": 88.99, "step": 3095, "token_acc": 0.7406303076052085, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.04022460902653202, "grad_norm": 1.1594935655593872, "learning_rate": 4.0223173738160114e-05, "loss": 0.834408187866211, "memory(GiB)": 88.99, "step": 3100, "token_acc": 0.7617975390079919, "train_speed(iter/s)": 0.121252 }, { "epoch": 0.04028948742818772, "grad_norm": 1.0628690719604492, "learning_rate": 4.028804982483457e-05, "loss": 0.8219572067260742, "memory(GiB)": 88.99, "step": 3105, "token_acc": 0.7630621066201562, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.04035436582984342, "grad_norm": 1.0705697536468506, "learning_rate": 4.035292591150902e-05, "loss": 0.8234344482421875, "memory(GiB)": 88.99, "step": 3110, "token_acc": 0.7424928013163308, "train_speed(iter/s)": 0.121255 }, { "epoch": 0.04041924423149911, "grad_norm": 1.2490147352218628, "learning_rate": 4.041780199818347e-05, "loss": 0.8920690536499023, "memory(GiB)": 88.99, "step": 3115, "token_acc": 0.7513265550648728, "train_speed(iter/s)": 0.121252 }, { "epoch": 0.04048412263315481, "grad_norm": 1.1630606651306152, "learning_rate": 4.048267808485792e-05, "loss": 0.8452714920043946, "memory(GiB)": 88.99, "step": 3120, "token_acc": 0.7643952408220398, "train_speed(iter/s)": 0.121259 }, { "epoch": 0.04054900103481051, "grad_norm": 1.03049898147583, "learning_rate": 4.0547554171532377e-05, "loss": 0.7904005527496338, "memory(GiB)": 88.99, "step": 3125, "token_acc": 0.7936911630731855, "train_speed(iter/s)": 0.121261 }, { "epoch": 0.0406138794364662, "grad_norm": 1.1235450506210327, "learning_rate": 4.0612430258206826e-05, "loss": 0.8480341911315918, "memory(GiB)": 88.99, "step": 3130, "token_acc": 0.7578286440947385, "train_speed(iter/s)": 0.12126 }, { "epoch": 0.0406787578381219, "grad_norm": 1.138994574546814, "learning_rate": 4.0677306344881276e-05, "loss": 0.8701135635375976, "memory(GiB)": 88.99, "step": 3135, "token_acc": 0.7527829476419566, "train_speed(iter/s)": 0.12126 }, { "epoch": 0.0407436362397776, "grad_norm": 1.1431965827941895, "learning_rate": 4.074218243155573e-05, "loss": 0.8389938354492188, "memory(GiB)": 88.99, "step": 3140, "token_acc": 0.7720844349322254, "train_speed(iter/s)": 0.121256 }, { "epoch": 0.04080851464143329, "grad_norm": 1.1036192178726196, "learning_rate": 4.080705851823018e-05, "loss": 0.8522863388061523, "memory(GiB)": 88.99, "step": 3145, "token_acc": 0.7579439252336448, "train_speed(iter/s)": 0.121253 }, { "epoch": 0.04087339304308899, "grad_norm": 1.032967448234558, "learning_rate": 4.087193460490463e-05, "loss": 0.8310722351074219, "memory(GiB)": 88.99, "step": 3150, "token_acc": 0.7658233890214797, "train_speed(iter/s)": 0.12125 }, { "epoch": 0.040938271444744684, "grad_norm": 1.1436101198196411, "learning_rate": 4.093681069157909e-05, "loss": 0.8484845161437988, "memory(GiB)": 88.99, "step": 3155, "token_acc": 0.770194398468179, "train_speed(iter/s)": 0.121248 }, { "epoch": 0.04100314984640038, "grad_norm": 1.1990364789962769, "learning_rate": 4.100168677825354e-05, "loss": 0.805626106262207, "memory(GiB)": 88.99, "step": 3160, "token_acc": 0.7802538322070216, "train_speed(iter/s)": 0.121253 }, { "epoch": 0.04106802824805608, "grad_norm": 1.1197059154510498, "learning_rate": 4.106656286492799e-05, "loss": 0.8707280158996582, "memory(GiB)": 88.99, "step": 3165, "token_acc": 0.7547415396058014, "train_speed(iter/s)": 0.121253 }, { "epoch": 0.041132906649711774, "grad_norm": 1.2361149787902832, "learning_rate": 4.113143895160244e-05, "loss": 0.8549275398254395, "memory(GiB)": 88.99, "step": 3170, "token_acc": 0.7643865363735071, "train_speed(iter/s)": 0.12126 }, { "epoch": 0.04119778505136747, "grad_norm": 1.046288013458252, "learning_rate": 4.1196315038276895e-05, "loss": 0.8527419090270996, "memory(GiB)": 88.99, "step": 3175, "token_acc": 0.7593485382285678, "train_speed(iter/s)": 0.121263 }, { "epoch": 0.04126266345302317, "grad_norm": 1.053492784500122, "learning_rate": 4.1261191124951345e-05, "loss": 0.8062458038330078, "memory(GiB)": 88.99, "step": 3180, "token_acc": 0.7699510070767556, "train_speed(iter/s)": 0.121262 }, { "epoch": 0.041327541854678865, "grad_norm": 1.0556546449661255, "learning_rate": 4.1326067211625795e-05, "loss": 0.8197366714477539, "memory(GiB)": 88.99, "step": 3185, "token_acc": 0.7661096234025965, "train_speed(iter/s)": 0.121265 }, { "epoch": 0.041392420256334564, "grad_norm": 1.1224976778030396, "learning_rate": 4.139094329830025e-05, "loss": 0.8464120864868164, "memory(GiB)": 88.99, "step": 3190, "token_acc": 0.7454175879038585, "train_speed(iter/s)": 0.121264 }, { "epoch": 0.04145729865799026, "grad_norm": 1.182681918144226, "learning_rate": 4.1455819384974695e-05, "loss": 0.8569733619689941, "memory(GiB)": 88.99, "step": 3195, "token_acc": 0.7688247602985727, "train_speed(iter/s)": 0.121264 }, { "epoch": 0.041522177059645955, "grad_norm": 1.1363822221755981, "learning_rate": 4.152069547164915e-05, "loss": 0.8646528244018554, "memory(GiB)": 88.99, "step": 3200, "token_acc": 0.7527123082678638, "train_speed(iter/s)": 0.121283 }, { "epoch": 0.041587055461301654, "grad_norm": 1.1206389665603638, "learning_rate": 4.158557155832361e-05, "loss": 0.8505621910095215, "memory(GiB)": 88.99, "step": 3205, "token_acc": 0.7309682449673944, "train_speed(iter/s)": 0.121281 }, { "epoch": 0.041651933862957354, "grad_norm": 1.129743218421936, "learning_rate": 4.165044764499805e-05, "loss": 0.8756380081176758, "memory(GiB)": 88.99, "step": 3210, "token_acc": 0.7515689093012656, "train_speed(iter/s)": 0.121294 }, { "epoch": 0.041716812264613046, "grad_norm": 1.0379151105880737, "learning_rate": 4.171532373167251e-05, "loss": 0.8737659454345703, "memory(GiB)": 88.99, "step": 3215, "token_acc": 0.7646551724137931, "train_speed(iter/s)": 0.1213 }, { "epoch": 0.041781690666268745, "grad_norm": 1.0077993869781494, "learning_rate": 4.178019981834696e-05, "loss": 0.8381379127502442, "memory(GiB)": 88.99, "step": 3220, "token_acc": 0.7720695759254983, "train_speed(iter/s)": 0.121298 }, { "epoch": 0.041846569067924444, "grad_norm": 1.1338307857513428, "learning_rate": 4.1845075905021414e-05, "loss": 0.8690978050231933, "memory(GiB)": 88.99, "step": 3225, "token_acc": 0.7539891722467588, "train_speed(iter/s)": 0.121304 }, { "epoch": 0.041911447469580136, "grad_norm": 0.9270058274269104, "learning_rate": 4.1909951991695864e-05, "loss": 0.8650070190429687, "memory(GiB)": 88.99, "step": 3230, "token_acc": 0.7489368165249088, "train_speed(iter/s)": 0.121305 }, { "epoch": 0.041976325871235835, "grad_norm": 1.1266932487487793, "learning_rate": 4.1974828078370314e-05, "loss": 0.8374746322631836, "memory(GiB)": 88.99, "step": 3235, "token_acc": 0.7699224900084777, "train_speed(iter/s)": 0.121302 }, { "epoch": 0.042041204272891534, "grad_norm": 0.95892733335495, "learning_rate": 4.203970416504477e-05, "loss": 0.8653931617736816, "memory(GiB)": 88.99, "step": 3240, "token_acc": 0.7532654083848338, "train_speed(iter/s)": 0.121305 }, { "epoch": 0.04210608267454723, "grad_norm": 1.0223349332809448, "learning_rate": 4.2104580251719214e-05, "loss": 0.8618358612060547, "memory(GiB)": 88.99, "step": 3245, "token_acc": 0.7669387895697702, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.042170961076202926, "grad_norm": 1.1576615571975708, "learning_rate": 4.216945633839367e-05, "loss": 0.8477663040161133, "memory(GiB)": 88.99, "step": 3250, "token_acc": 0.7527266095203355, "train_speed(iter/s)": 0.121324 }, { "epoch": 0.042235839477858625, "grad_norm": 1.0596920251846313, "learning_rate": 4.223433242506813e-05, "loss": 0.8949674606323242, "memory(GiB)": 88.99, "step": 3255, "token_acc": 0.7523923444976076, "train_speed(iter/s)": 0.121324 }, { "epoch": 0.04230071787951432, "grad_norm": 1.1215587854385376, "learning_rate": 4.229920851174257e-05, "loss": 0.8278717994689941, "memory(GiB)": 88.99, "step": 3260, "token_acc": 0.7644878131924322, "train_speed(iter/s)": 0.121326 }, { "epoch": 0.042365596281170016, "grad_norm": 1.261680006980896, "learning_rate": 4.2364084598417027e-05, "loss": 0.8518632888793946, "memory(GiB)": 88.99, "step": 3265, "token_acc": 0.7742174531657576, "train_speed(iter/s)": 0.121317 }, { "epoch": 0.042430474682825715, "grad_norm": 1.0815942287445068, "learning_rate": 4.2428960685091476e-05, "loss": 0.8104162216186523, "memory(GiB)": 88.99, "step": 3270, "token_acc": 0.7689623632236864, "train_speed(iter/s)": 0.121314 }, { "epoch": 0.04249535308448141, "grad_norm": 1.2137978076934814, "learning_rate": 4.2493836771765926e-05, "loss": 0.8341232299804687, "memory(GiB)": 88.99, "step": 3275, "token_acc": 0.7709236412137379, "train_speed(iter/s)": 0.121312 }, { "epoch": 0.04256023148613711, "grad_norm": 1.2221285104751587, "learning_rate": 4.255871285844038e-05, "loss": 0.8837153434753418, "memory(GiB)": 88.99, "step": 3280, "token_acc": 0.7598291274424491, "train_speed(iter/s)": 0.121316 }, { "epoch": 0.042625109887792806, "grad_norm": 1.0256322622299194, "learning_rate": 4.262358894511483e-05, "loss": 0.8416971206665039, "memory(GiB)": 88.99, "step": 3285, "token_acc": 0.7520476545048399, "train_speed(iter/s)": 0.121306 }, { "epoch": 0.0426899882894485, "grad_norm": 1.101143479347229, "learning_rate": 4.268846503178928e-05, "loss": 0.8676305770874023, "memory(GiB)": 88.99, "step": 3290, "token_acc": 0.7594872196957817, "train_speed(iter/s)": 0.121306 }, { "epoch": 0.0427548666911042, "grad_norm": 1.006192684173584, "learning_rate": 4.275334111846373e-05, "loss": 0.8330129623413086, "memory(GiB)": 88.99, "step": 3295, "token_acc": 0.7475691974116611, "train_speed(iter/s)": 0.121295 }, { "epoch": 0.042819745092759896, "grad_norm": 1.1435184478759766, "learning_rate": 4.281821720513819e-05, "loss": 0.891502571105957, "memory(GiB)": 88.99, "step": 3300, "token_acc": 0.7500825718374986, "train_speed(iter/s)": 0.121297 }, { "epoch": 0.04288462349441559, "grad_norm": 1.0521929264068604, "learning_rate": 4.2883093291812646e-05, "loss": 0.8777050018310547, "memory(GiB)": 88.99, "step": 3305, "token_acc": 0.7428345590762252, "train_speed(iter/s)": 0.121296 }, { "epoch": 0.04294950189607129, "grad_norm": 1.0814698934555054, "learning_rate": 4.294796937848709e-05, "loss": 0.8757349967956543, "memory(GiB)": 88.99, "step": 3310, "token_acc": 0.7525545244396797, "train_speed(iter/s)": 0.121291 }, { "epoch": 0.04301438029772699, "grad_norm": 1.1236903667449951, "learning_rate": 4.3012845465161545e-05, "loss": 0.8804971694946289, "memory(GiB)": 88.99, "step": 3315, "token_acc": 0.7574172971369233, "train_speed(iter/s)": 0.121301 }, { "epoch": 0.04307925869938268, "grad_norm": 1.1917444467544556, "learning_rate": 4.3077721551835995e-05, "loss": 0.891969108581543, "memory(GiB)": 88.99, "step": 3320, "token_acc": 0.7748456790123457, "train_speed(iter/s)": 0.121304 }, { "epoch": 0.04314413710103838, "grad_norm": 1.0644373893737793, "learning_rate": 4.3142597638510445e-05, "loss": 0.850184440612793, "memory(GiB)": 88.99, "step": 3325, "token_acc": 0.7591630488933172, "train_speed(iter/s)": 0.121308 }, { "epoch": 0.04320901550269408, "grad_norm": 1.0309118032455444, "learning_rate": 4.32074737251849e-05, "loss": 0.8787730216979981, "memory(GiB)": 88.99, "step": 3330, "token_acc": 0.7644543461856337, "train_speed(iter/s)": 0.121307 }, { "epoch": 0.04327389390434977, "grad_norm": 1.1485127210617065, "learning_rate": 4.327234981185935e-05, "loss": 0.87722806930542, "memory(GiB)": 88.99, "step": 3335, "token_acc": 0.7399044876746963, "train_speed(iter/s)": 0.121317 }, { "epoch": 0.04333877230600547, "grad_norm": 1.2192211151123047, "learning_rate": 4.33372258985338e-05, "loss": 0.8777950286865235, "memory(GiB)": 88.99, "step": 3340, "token_acc": 0.7476887209582764, "train_speed(iter/s)": 0.12132 }, { "epoch": 0.04340365070766117, "grad_norm": 1.0507454872131348, "learning_rate": 4.340210198520825e-05, "loss": 0.8541275024414062, "memory(GiB)": 88.99, "step": 3345, "token_acc": 0.7617974058060532, "train_speed(iter/s)": 0.121321 }, { "epoch": 0.04346852910931686, "grad_norm": 1.0216301679611206, "learning_rate": 4.346697807188271e-05, "loss": 0.8388168334960937, "memory(GiB)": 88.99, "step": 3350, "token_acc": 0.7708515017267241, "train_speed(iter/s)": 0.12132 }, { "epoch": 0.04353340751097256, "grad_norm": 1.1821393966674805, "learning_rate": 4.353185415855716e-05, "loss": 0.8431890487670899, "memory(GiB)": 88.99, "step": 3355, "token_acc": 0.7427928230629662, "train_speed(iter/s)": 0.121312 }, { "epoch": 0.04359828591262826, "grad_norm": 1.100927710533142, "learning_rate": 4.359673024523161e-05, "loss": 0.8169103622436523, "memory(GiB)": 88.99, "step": 3360, "token_acc": 0.7571234249713631, "train_speed(iter/s)": 0.121309 }, { "epoch": 0.04366316431428395, "grad_norm": 1.0612592697143555, "learning_rate": 4.3661606331906064e-05, "loss": 0.8336977005004883, "memory(GiB)": 88.99, "step": 3365, "token_acc": 0.7762553722954855, "train_speed(iter/s)": 0.121316 }, { "epoch": 0.04372804271593965, "grad_norm": 1.1616038084030151, "learning_rate": 4.3726482418580514e-05, "loss": 0.8757875442504883, "memory(GiB)": 88.99, "step": 3370, "token_acc": 0.7400971363693705, "train_speed(iter/s)": 0.121314 }, { "epoch": 0.04379292111759535, "grad_norm": 1.1404575109481812, "learning_rate": 4.3791358505254964e-05, "loss": 0.8655940055847168, "memory(GiB)": 88.99, "step": 3375, "token_acc": 0.7468683066138768, "train_speed(iter/s)": 0.121319 }, { "epoch": 0.04385779951925104, "grad_norm": 1.1910552978515625, "learning_rate": 4.385623459192942e-05, "loss": 0.8781116485595704, "memory(GiB)": 88.99, "step": 3380, "token_acc": 0.7480470748763431, "train_speed(iter/s)": 0.121321 }, { "epoch": 0.04392267792090674, "grad_norm": 1.04073965549469, "learning_rate": 4.3921110678603864e-05, "loss": 0.9007955551147461, "memory(GiB)": 88.99, "step": 3385, "token_acc": 0.746033105022831, "train_speed(iter/s)": 0.121319 }, { "epoch": 0.04398755632256244, "grad_norm": 1.0246447324752808, "learning_rate": 4.398598676527832e-05, "loss": 0.8556685447692871, "memory(GiB)": 88.99, "step": 3390, "token_acc": 0.7564540281521414, "train_speed(iter/s)": 0.12132 }, { "epoch": 0.04405243472421813, "grad_norm": 1.0074397325515747, "learning_rate": 4.405086285195277e-05, "loss": 0.8589007377624511, "memory(GiB)": 88.99, "step": 3395, "token_acc": 0.7706857632132015, "train_speed(iter/s)": 0.121321 }, { "epoch": 0.04411731312587383, "grad_norm": 1.0864336490631104, "learning_rate": 4.411573893862723e-05, "loss": 0.8469520568847656, "memory(GiB)": 88.99, "step": 3400, "token_acc": 0.7711579013627923, "train_speed(iter/s)": 0.121315 }, { "epoch": 0.04418219152752953, "grad_norm": 1.210617184638977, "learning_rate": 4.418061502530168e-05, "loss": 0.8621655464172363, "memory(GiB)": 88.99, "step": 3405, "token_acc": 0.7579047551905249, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.04424706992918522, "grad_norm": 1.1667252779006958, "learning_rate": 4.4245491111976126e-05, "loss": 0.8716836929321289, "memory(GiB)": 88.99, "step": 3410, "token_acc": 0.769670900458548, "train_speed(iter/s)": 0.121317 }, { "epoch": 0.04431194833084092, "grad_norm": 0.9579612016677856, "learning_rate": 4.431036719865058e-05, "loss": 0.8888240814208984, "memory(GiB)": 88.99, "step": 3415, "token_acc": 0.7496455537604273, "train_speed(iter/s)": 0.121316 }, { "epoch": 0.04437682673249662, "grad_norm": 1.119112491607666, "learning_rate": 4.437524328532503e-05, "loss": 0.8679189682006836, "memory(GiB)": 88.99, "step": 3420, "token_acc": 0.7401753935346568, "train_speed(iter/s)": 0.121311 }, { "epoch": 0.04444170513415231, "grad_norm": 1.1413586139678955, "learning_rate": 4.444011937199948e-05, "loss": 0.8500272750854492, "memory(GiB)": 88.99, "step": 3425, "token_acc": 0.7672377448162202, "train_speed(iter/s)": 0.12131 }, { "epoch": 0.04450658353580801, "grad_norm": 1.1780349016189575, "learning_rate": 4.450499545867394e-05, "loss": 0.8651642799377441, "memory(GiB)": 88.99, "step": 3430, "token_acc": 0.7679922195807218, "train_speed(iter/s)": 0.121317 }, { "epoch": 0.04457146193746371, "grad_norm": 1.245763897895813, "learning_rate": 4.456987154534838e-05, "loss": 0.8754039764404297, "memory(GiB)": 88.99, "step": 3435, "token_acc": 0.7441719107551488, "train_speed(iter/s)": 0.121324 }, { "epoch": 0.0446363403391194, "grad_norm": 1.0753480195999146, "learning_rate": 4.463474763202284e-05, "loss": 0.8270726203918457, "memory(GiB)": 88.99, "step": 3440, "token_acc": 0.7437134454614546, "train_speed(iter/s)": 0.121332 }, { "epoch": 0.0447012187407751, "grad_norm": 1.0528489351272583, "learning_rate": 4.469962371869729e-05, "loss": 0.8740915298461914, "memory(GiB)": 88.99, "step": 3445, "token_acc": 0.7620376597175521, "train_speed(iter/s)": 0.121328 }, { "epoch": 0.0447660971424308, "grad_norm": 1.1458173990249634, "learning_rate": 4.476449980537174e-05, "loss": 0.874017333984375, "memory(GiB)": 88.99, "step": 3450, "token_acc": 0.7398006134969325, "train_speed(iter/s)": 0.121329 }, { "epoch": 0.04483097554408649, "grad_norm": 1.1899895668029785, "learning_rate": 4.4829375892046195e-05, "loss": 0.9255914688110352, "memory(GiB)": 88.99, "step": 3455, "token_acc": 0.7403846153846154, "train_speed(iter/s)": 0.121327 }, { "epoch": 0.04489585394574219, "grad_norm": 1.1967103481292725, "learning_rate": 4.4894251978720645e-05, "loss": 0.8856023788452149, "memory(GiB)": 88.99, "step": 3460, "token_acc": 0.7405168045938186, "train_speed(iter/s)": 0.121324 }, { "epoch": 0.04496073234739789, "grad_norm": 1.1723886728286743, "learning_rate": 4.4959128065395095e-05, "loss": 0.8698581695556641, "memory(GiB)": 88.99, "step": 3465, "token_acc": 0.7623078791243104, "train_speed(iter/s)": 0.121327 }, { "epoch": 0.045025610749053584, "grad_norm": 1.0166722536087036, "learning_rate": 4.502400415206955e-05, "loss": 0.8867829322814942, "memory(GiB)": 88.99, "step": 3470, "token_acc": 0.7580508318270684, "train_speed(iter/s)": 0.121333 }, { "epoch": 0.04509048915070928, "grad_norm": 1.1294459104537964, "learning_rate": 4.5088880238744e-05, "loss": 0.8724224090576171, "memory(GiB)": 88.99, "step": 3475, "token_acc": 0.7637746498262037, "train_speed(iter/s)": 0.121329 }, { "epoch": 0.04515536755236498, "grad_norm": 1.0993449687957764, "learning_rate": 4.515375632541845e-05, "loss": 0.8473785400390625, "memory(GiB)": 88.99, "step": 3480, "token_acc": 0.7740132486889318, "train_speed(iter/s)": 0.121327 }, { "epoch": 0.045220245954020674, "grad_norm": 1.1031107902526855, "learning_rate": 4.52186324120929e-05, "loss": 0.8503690719604492, "memory(GiB)": 88.99, "step": 3485, "token_acc": 0.7598464434683105, "train_speed(iter/s)": 0.121332 }, { "epoch": 0.04528512435567637, "grad_norm": 1.0757265090942383, "learning_rate": 4.528350849876736e-05, "loss": 0.8659072875976562, "memory(GiB)": 88.99, "step": 3490, "token_acc": 0.7781980135025273, "train_speed(iter/s)": 0.121337 }, { "epoch": 0.04535000275733207, "grad_norm": 1.027161955833435, "learning_rate": 4.534838458544181e-05, "loss": 0.8387907981872559, "memory(GiB)": 88.99, "step": 3495, "token_acc": 0.7570308739599807, "train_speed(iter/s)": 0.12134 }, { "epoch": 0.045414881158987765, "grad_norm": 1.016628384590149, "learning_rate": 4.541326067211626e-05, "loss": 0.8568641662597656, "memory(GiB)": 88.99, "step": 3500, "token_acc": 0.7697803369588399, "train_speed(iter/s)": 0.121342 }, { "epoch": 0.045479759560643464, "grad_norm": 1.1339365243911743, "learning_rate": 4.5478136758790714e-05, "loss": 0.8852500915527344, "memory(GiB)": 88.99, "step": 3505, "token_acc": 0.7574092760772881, "train_speed(iter/s)": 0.121339 }, { "epoch": 0.04554463796229916, "grad_norm": 1.2198054790496826, "learning_rate": 4.5543012845465164e-05, "loss": 0.9101188659667969, "memory(GiB)": 88.99, "step": 3510, "token_acc": 0.7296141964221567, "train_speed(iter/s)": 0.12134 }, { "epoch": 0.045609516363954855, "grad_norm": 1.0774222612380981, "learning_rate": 4.5607888932139614e-05, "loss": 0.8237579345703125, "memory(GiB)": 88.99, "step": 3515, "token_acc": 0.7796362360371951, "train_speed(iter/s)": 0.121335 }, { "epoch": 0.045674394765610554, "grad_norm": 1.1622086763381958, "learning_rate": 4.567276501881407e-05, "loss": 0.8989505767822266, "memory(GiB)": 88.99, "step": 3520, "token_acc": 0.759676137868412, "train_speed(iter/s)": 0.121336 }, { "epoch": 0.04573927316726625, "grad_norm": 1.1747782230377197, "learning_rate": 4.573764110548852e-05, "loss": 0.8903154373168946, "memory(GiB)": 88.99, "step": 3525, "token_acc": 0.7480882989467609, "train_speed(iter/s)": 0.121333 }, { "epoch": 0.045804151568921946, "grad_norm": 1.0217602252960205, "learning_rate": 4.580251719216297e-05, "loss": 0.8747045516967773, "memory(GiB)": 88.99, "step": 3530, "token_acc": 0.762840680123016, "train_speed(iter/s)": 0.121337 }, { "epoch": 0.045869029970577645, "grad_norm": 1.0288293361663818, "learning_rate": 4.586739327883742e-05, "loss": 0.8367735862731933, "memory(GiB)": 88.99, "step": 3535, "token_acc": 0.7661023947151114, "train_speed(iter/s)": 0.121342 }, { "epoch": 0.045933908372233344, "grad_norm": 1.0489609241485596, "learning_rate": 4.593226936551188e-05, "loss": 0.8608413696289062, "memory(GiB)": 88.99, "step": 3540, "token_acc": 0.7399213033273505, "train_speed(iter/s)": 0.121346 }, { "epoch": 0.045998786773889036, "grad_norm": 1.2054945230484009, "learning_rate": 4.599714545218633e-05, "loss": 0.8558273315429688, "memory(GiB)": 88.99, "step": 3545, "token_acc": 0.7684419957147229, "train_speed(iter/s)": 0.12134 }, { "epoch": 0.046063665175544735, "grad_norm": 1.2024065256118774, "learning_rate": 4.6062021538860777e-05, "loss": 0.8649160385131835, "memory(GiB)": 88.99, "step": 3550, "token_acc": 0.7526264076789358, "train_speed(iter/s)": 0.121343 }, { "epoch": 0.046128543577200434, "grad_norm": 1.3526197671890259, "learning_rate": 4.612689762553523e-05, "loss": 0.939394187927246, "memory(GiB)": 88.99, "step": 3555, "token_acc": 0.7398065617295657, "train_speed(iter/s)": 0.121351 }, { "epoch": 0.046193421978856126, "grad_norm": 1.2311183214187622, "learning_rate": 4.6191773712209676e-05, "loss": 0.8170141220092774, "memory(GiB)": 88.99, "step": 3560, "token_acc": 0.7699140401146132, "train_speed(iter/s)": 0.121348 }, { "epoch": 0.046258300380511826, "grad_norm": 0.9421373009681702, "learning_rate": 4.625664979888413e-05, "loss": 0.847776222229004, "memory(GiB)": 88.99, "step": 3565, "token_acc": 0.7640997830802603, "train_speed(iter/s)": 0.121346 }, { "epoch": 0.046323178782167525, "grad_norm": 1.1235793828964233, "learning_rate": 4.632152588555859e-05, "loss": 0.8274619102478027, "memory(GiB)": 88.99, "step": 3570, "token_acc": 0.765597091094667, "train_speed(iter/s)": 0.121339 }, { "epoch": 0.04638805718382322, "grad_norm": 1.104063630104065, "learning_rate": 4.638640197223303e-05, "loss": 0.8951224327087403, "memory(GiB)": 88.99, "step": 3575, "token_acc": 0.7444700685354207, "train_speed(iter/s)": 0.121334 }, { "epoch": 0.046452935585478916, "grad_norm": 1.0184437036514282, "learning_rate": 4.645127805890749e-05, "loss": 0.8309938430786132, "memory(GiB)": 88.99, "step": 3580, "token_acc": 0.7531822912496998, "train_speed(iter/s)": 0.12133 }, { "epoch": 0.046517813987134615, "grad_norm": 1.0627853870391846, "learning_rate": 4.651615414558194e-05, "loss": 0.8833137512207031, "memory(GiB)": 88.99, "step": 3585, "token_acc": 0.7704729707552433, "train_speed(iter/s)": 0.121326 }, { "epoch": 0.04658269238879031, "grad_norm": 1.0655032396316528, "learning_rate": 4.6581030232256396e-05, "loss": 0.8881101608276367, "memory(GiB)": 88.99, "step": 3590, "token_acc": 0.7185494847365351, "train_speed(iter/s)": 0.121329 }, { "epoch": 0.04664757079044601, "grad_norm": 1.132448673248291, "learning_rate": 4.6645906318930845e-05, "loss": 0.8907249450683594, "memory(GiB)": 88.99, "step": 3595, "token_acc": 0.7332987910189983, "train_speed(iter/s)": 0.121334 }, { "epoch": 0.046712449192101706, "grad_norm": 1.0505914688110352, "learning_rate": 4.6710782405605295e-05, "loss": 0.8614205360412598, "memory(GiB)": 88.99, "step": 3600, "token_acc": 0.7496156474205672, "train_speed(iter/s)": 0.121341 }, { "epoch": 0.0467773275937574, "grad_norm": 1.1436893939971924, "learning_rate": 4.677565849227975e-05, "loss": 0.9053877830505371, "memory(GiB)": 88.99, "step": 3605, "token_acc": 0.7720145597088058, "train_speed(iter/s)": 0.121346 }, { "epoch": 0.0468422059954131, "grad_norm": 1.2241144180297852, "learning_rate": 4.6840534578954195e-05, "loss": 0.9039193153381347, "memory(GiB)": 88.99, "step": 3610, "token_acc": 0.732324020602466, "train_speed(iter/s)": 0.121346 }, { "epoch": 0.046907084397068796, "grad_norm": 1.0847550630569458, "learning_rate": 4.690541066562865e-05, "loss": 0.8391927719116211, "memory(GiB)": 88.99, "step": 3615, "token_acc": 0.7705502868864643, "train_speed(iter/s)": 0.121343 }, { "epoch": 0.04697196279872449, "grad_norm": 1.1680978536605835, "learning_rate": 4.697028675230311e-05, "loss": 0.8639381408691407, "memory(GiB)": 88.99, "step": 3620, "token_acc": 0.738932439905019, "train_speed(iter/s)": 0.12135 }, { "epoch": 0.04703684120038019, "grad_norm": 0.9997826218605042, "learning_rate": 4.703516283897755e-05, "loss": 0.864140796661377, "memory(GiB)": 88.99, "step": 3625, "token_acc": 0.754680767724246, "train_speed(iter/s)": 0.121353 }, { "epoch": 0.04710171960203589, "grad_norm": 1.062212586402893, "learning_rate": 4.710003892565201e-05, "loss": 0.9337656021118164, "memory(GiB)": 88.99, "step": 3630, "token_acc": 0.7086104366796862, "train_speed(iter/s)": 0.121356 }, { "epoch": 0.04716659800369158, "grad_norm": 1.0767498016357422, "learning_rate": 4.716491501232646e-05, "loss": 0.8464045524597168, "memory(GiB)": 88.99, "step": 3635, "token_acc": 0.7709308497804372, "train_speed(iter/s)": 0.121366 }, { "epoch": 0.04723147640534728, "grad_norm": 1.0985846519470215, "learning_rate": 4.722979109900091e-05, "loss": 0.8506279945373535, "memory(GiB)": 88.99, "step": 3640, "token_acc": 0.7474911258162058, "train_speed(iter/s)": 0.121368 }, { "epoch": 0.04729635480700298, "grad_norm": 1.0767234563827515, "learning_rate": 4.7294667185675364e-05, "loss": 0.8373927116394043, "memory(GiB)": 88.99, "step": 3645, "token_acc": 0.7636749204765172, "train_speed(iter/s)": 0.121366 }, { "epoch": 0.04736123320865867, "grad_norm": 1.0255409479141235, "learning_rate": 4.7359543272349814e-05, "loss": 0.8315879821777343, "memory(GiB)": 88.99, "step": 3650, "token_acc": 0.7713569664390527, "train_speed(iter/s)": 0.121368 }, { "epoch": 0.04742611161031437, "grad_norm": 1.0171962976455688, "learning_rate": 4.7424419359024264e-05, "loss": 0.844172477722168, "memory(GiB)": 88.99, "step": 3655, "token_acc": 0.7481618287625884, "train_speed(iter/s)": 0.121369 }, { "epoch": 0.04749099001197007, "grad_norm": 1.0760407447814941, "learning_rate": 4.7489295445698714e-05, "loss": 0.8510231018066406, "memory(GiB)": 88.99, "step": 3660, "token_acc": 0.7666232073011734, "train_speed(iter/s)": 0.121372 }, { "epoch": 0.04755586841362576, "grad_norm": 1.0636693239212036, "learning_rate": 4.755417153237317e-05, "loss": 0.8770817756652832, "memory(GiB)": 88.99, "step": 3665, "token_acc": 0.7395053246307111, "train_speed(iter/s)": 0.121373 }, { "epoch": 0.04762074681528146, "grad_norm": 1.175316572189331, "learning_rate": 4.761904761904762e-05, "loss": 0.8184633255004883, "memory(GiB)": 88.99, "step": 3670, "token_acc": 0.7767731329262565, "train_speed(iter/s)": 0.121376 }, { "epoch": 0.04768562521693716, "grad_norm": 0.924289345741272, "learning_rate": 4.768392370572207e-05, "loss": 0.8506599426269531, "memory(GiB)": 88.99, "step": 3675, "token_acc": 0.7433769869039288, "train_speed(iter/s)": 0.121373 }, { "epoch": 0.04775050361859285, "grad_norm": 1.1640737056732178, "learning_rate": 4.774879979239653e-05, "loss": 0.8751785278320312, "memory(GiB)": 88.99, "step": 3680, "token_acc": 0.7429979347077793, "train_speed(iter/s)": 0.121375 }, { "epoch": 0.04781538202024855, "grad_norm": 1.1235886812210083, "learning_rate": 4.781367587907098e-05, "loss": 0.8581063270568847, "memory(GiB)": 88.99, "step": 3685, "token_acc": 0.7452390205985231, "train_speed(iter/s)": 0.121383 }, { "epoch": 0.04788026042190425, "grad_norm": 1.175262212753296, "learning_rate": 4.7878551965745427e-05, "loss": 0.8703173637390137, "memory(GiB)": 88.99, "step": 3690, "token_acc": 0.7409352254976902, "train_speed(iter/s)": 0.121386 }, { "epoch": 0.04794513882355994, "grad_norm": 1.028231143951416, "learning_rate": 4.794342805241988e-05, "loss": 0.8745731353759766, "memory(GiB)": 88.99, "step": 3695, "token_acc": 0.7686844661820375, "train_speed(iter/s)": 0.121385 }, { "epoch": 0.04801001722521564, "grad_norm": 1.1707334518432617, "learning_rate": 4.800830413909433e-05, "loss": 0.880648422241211, "memory(GiB)": 88.99, "step": 3700, "token_acc": 0.7572638824541664, "train_speed(iter/s)": 0.121384 }, { "epoch": 0.04807489562687134, "grad_norm": 1.1176173686981201, "learning_rate": 4.807318022576878e-05, "loss": 0.8829469680786133, "memory(GiB)": 88.99, "step": 3705, "token_acc": 0.7614208021753909, "train_speed(iter/s)": 0.121388 }, { "epoch": 0.04813977402852703, "grad_norm": 1.0960705280303955, "learning_rate": 4.813805631244323e-05, "loss": 0.9165363311767578, "memory(GiB)": 88.99, "step": 3710, "token_acc": 0.7323539949838768, "train_speed(iter/s)": 0.121391 }, { "epoch": 0.04820465243018273, "grad_norm": 1.0444626808166504, "learning_rate": 4.820293239911769e-05, "loss": 0.8999433517456055, "memory(GiB)": 88.99, "step": 3715, "token_acc": 0.7530645285637191, "train_speed(iter/s)": 0.121397 }, { "epoch": 0.04826953083183843, "grad_norm": 1.019433856010437, "learning_rate": 4.826780848579214e-05, "loss": 0.897585105895996, "memory(GiB)": 88.99, "step": 3720, "token_acc": 0.7586507300414678, "train_speed(iter/s)": 0.121402 }, { "epoch": 0.04833440923349412, "grad_norm": 1.1071964502334595, "learning_rate": 4.833268457246659e-05, "loss": 0.8428084373474121, "memory(GiB)": 88.99, "step": 3725, "token_acc": 0.776877957741785, "train_speed(iter/s)": 0.121396 }, { "epoch": 0.04839928763514982, "grad_norm": 1.2039052248001099, "learning_rate": 4.8397560659141046e-05, "loss": 0.84732666015625, "memory(GiB)": 88.99, "step": 3730, "token_acc": 0.751068201833067, "train_speed(iter/s)": 0.121398 }, { "epoch": 0.04846416603680552, "grad_norm": 1.177490472793579, "learning_rate": 4.8462436745815496e-05, "loss": 0.8703075408935547, "memory(GiB)": 88.99, "step": 3735, "token_acc": 0.7735743644997806, "train_speed(iter/s)": 0.121394 }, { "epoch": 0.04852904443846121, "grad_norm": 1.1622949838638306, "learning_rate": 4.8527312832489945e-05, "loss": 0.8538652420043945, "memory(GiB)": 88.99, "step": 3740, "token_acc": 0.7638916418896597, "train_speed(iter/s)": 0.121389 }, { "epoch": 0.04859392284011691, "grad_norm": 1.0450705289840698, "learning_rate": 4.85921889191644e-05, "loss": 0.8733566284179688, "memory(GiB)": 88.99, "step": 3745, "token_acc": 0.7512948336138806, "train_speed(iter/s)": 0.121389 }, { "epoch": 0.04865880124177261, "grad_norm": 0.9825257658958435, "learning_rate": 4.8657065005838845e-05, "loss": 0.8524513244628906, "memory(GiB)": 88.99, "step": 3750, "token_acc": 0.7651647879408945, "train_speed(iter/s)": 0.12139 }, { "epoch": 0.0487236796434283, "grad_norm": 1.0753564834594727, "learning_rate": 4.87219410925133e-05, "loss": 0.8951984405517578, "memory(GiB)": 88.99, "step": 3755, "token_acc": 0.7520483119084617, "train_speed(iter/s)": 0.121385 }, { "epoch": 0.048788558045084, "grad_norm": 1.0617663860321045, "learning_rate": 4.878681717918775e-05, "loss": 0.8754723548889161, "memory(GiB)": 88.99, "step": 3760, "token_acc": 0.7476888067131275, "train_speed(iter/s)": 0.121393 }, { "epoch": 0.0488534364467397, "grad_norm": 1.1245148181915283, "learning_rate": 4.88516932658622e-05, "loss": 0.8748747825622558, "memory(GiB)": 88.99, "step": 3765, "token_acc": 0.7647868811416411, "train_speed(iter/s)": 0.1214 }, { "epoch": 0.04891831484839539, "grad_norm": 1.144425392150879, "learning_rate": 4.891656935253666e-05, "loss": 0.8848743438720703, "memory(GiB)": 88.99, "step": 3770, "token_acc": 0.745864526830143, "train_speed(iter/s)": 0.121411 }, { "epoch": 0.04898319325005109, "grad_norm": 1.0350043773651123, "learning_rate": 4.898144543921111e-05, "loss": 0.9022315979003906, "memory(GiB)": 88.99, "step": 3775, "token_acc": 0.7288776402949632, "train_speed(iter/s)": 0.121411 }, { "epoch": 0.04904807165170679, "grad_norm": 0.9642390012741089, "learning_rate": 4.9046321525885565e-05, "loss": 0.8431899070739746, "memory(GiB)": 88.99, "step": 3780, "token_acc": 0.769391974963675, "train_speed(iter/s)": 0.121407 }, { "epoch": 0.049112950053362484, "grad_norm": 1.0599443912506104, "learning_rate": 4.9111197612560014e-05, "loss": 0.8750453948974609, "memory(GiB)": 88.99, "step": 3785, "token_acc": 0.7674368309123022, "train_speed(iter/s)": 0.121406 }, { "epoch": 0.04917782845501818, "grad_norm": 1.0368101596832275, "learning_rate": 4.9176073699234464e-05, "loss": 0.8713481903076172, "memory(GiB)": 88.99, "step": 3790, "token_acc": 0.7542702737478689, "train_speed(iter/s)": 0.121403 }, { "epoch": 0.04924270685667388, "grad_norm": 0.9810357689857483, "learning_rate": 4.924094978590892e-05, "loss": 0.817537498474121, "memory(GiB)": 88.99, "step": 3795, "token_acc": 0.7795567704773992, "train_speed(iter/s)": 0.121401 }, { "epoch": 0.049307585258329574, "grad_norm": 1.0592726469039917, "learning_rate": 4.9305825872583364e-05, "loss": 0.9318370819091797, "memory(GiB)": 88.99, "step": 3800, "token_acc": 0.7365982565721738, "train_speed(iter/s)": 0.121406 }, { "epoch": 0.04937246365998527, "grad_norm": 1.068558692932129, "learning_rate": 4.937070195925782e-05, "loss": 0.8644952774047852, "memory(GiB)": 88.99, "step": 3805, "token_acc": 0.7726726536807488, "train_speed(iter/s)": 0.121407 }, { "epoch": 0.04943734206164097, "grad_norm": 1.1001850366592407, "learning_rate": 4.943557804593227e-05, "loss": 0.8673966407775879, "memory(GiB)": 88.99, "step": 3810, "token_acc": 0.7265009096422074, "train_speed(iter/s)": 0.121408 }, { "epoch": 0.049502220463296664, "grad_norm": 1.0550938844680786, "learning_rate": 4.950045413260672e-05, "loss": 0.9031328201293946, "memory(GiB)": 88.99, "step": 3815, "token_acc": 0.7356916007838368, "train_speed(iter/s)": 0.121399 }, { "epoch": 0.049567098864952364, "grad_norm": 1.1148065328598022, "learning_rate": 4.956533021928118e-05, "loss": 0.8418621063232422, "memory(GiB)": 88.99, "step": 3820, "token_acc": 0.7710260115606936, "train_speed(iter/s)": 0.121393 }, { "epoch": 0.04963197726660806, "grad_norm": 0.9976794123649597, "learning_rate": 4.963020630595563e-05, "loss": 0.8361377716064453, "memory(GiB)": 88.99, "step": 3825, "token_acc": 0.7719479886720059, "train_speed(iter/s)": 0.121397 }, { "epoch": 0.049696855668263755, "grad_norm": 1.150429129600525, "learning_rate": 4.9695082392630077e-05, "loss": 0.8864994049072266, "memory(GiB)": 88.99, "step": 3830, "token_acc": 0.7630472675074336, "train_speed(iter/s)": 0.121406 }, { "epoch": 0.049761734069919454, "grad_norm": 1.16764235496521, "learning_rate": 4.975995847930453e-05, "loss": 0.9083925247192383, "memory(GiB)": 88.99, "step": 3835, "token_acc": 0.7623151319008016, "train_speed(iter/s)": 0.121407 }, { "epoch": 0.04982661247157515, "grad_norm": 1.0741658210754395, "learning_rate": 4.982483456597898e-05, "loss": 0.8752026557922363, "memory(GiB)": 88.99, "step": 3840, "token_acc": 0.7571272959471134, "train_speed(iter/s)": 0.121407 }, { "epoch": 0.049891490873230845, "grad_norm": 1.0398939847946167, "learning_rate": 4.988971065265343e-05, "loss": 0.8465102195739747, "memory(GiB)": 88.99, "step": 3845, "token_acc": 0.7649058936667125, "train_speed(iter/s)": 0.121405 }, { "epoch": 0.049956369274886545, "grad_norm": 1.197089672088623, "learning_rate": 4.995458673932788e-05, "loss": 0.8601234436035157, "memory(GiB)": 88.99, "step": 3850, "token_acc": 0.7378988884904983, "train_speed(iter/s)": 0.12141 }, { "epoch": 0.050021247676542244, "grad_norm": 0.9632752537727356, "learning_rate": 5.001946282600234e-05, "loss": 0.844362449645996, "memory(GiB)": 88.99, "step": 3855, "token_acc": 0.7621395119546462, "train_speed(iter/s)": 0.121408 }, { "epoch": 0.050086126078197936, "grad_norm": 1.0680267810821533, "learning_rate": 5.008433891267679e-05, "loss": 0.8989805221557617, "memory(GiB)": 88.99, "step": 3860, "token_acc": 0.7505948364643631, "train_speed(iter/s)": 0.12141 }, { "epoch": 0.050151004479853635, "grad_norm": 1.0509519577026367, "learning_rate": 5.0149214999351246e-05, "loss": 0.8868295669555664, "memory(GiB)": 88.99, "step": 3865, "token_acc": 0.7345928611266035, "train_speed(iter/s)": 0.121413 }, { "epoch": 0.050215882881509334, "grad_norm": 1.1259859800338745, "learning_rate": 5.0214091086025696e-05, "loss": 0.9018037796020508, "memory(GiB)": 88.99, "step": 3870, "token_acc": 0.751245510369598, "train_speed(iter/s)": 0.121412 }, { "epoch": 0.050280761283165026, "grad_norm": 1.0288639068603516, "learning_rate": 5.027896717270014e-05, "loss": 0.8837043762207031, "memory(GiB)": 88.99, "step": 3875, "token_acc": 0.7271153217902646, "train_speed(iter/s)": 0.121408 }, { "epoch": 0.050345639684820725, "grad_norm": 1.0680232048034668, "learning_rate": 5.03438432593746e-05, "loss": 0.8940746307373046, "memory(GiB)": 88.99, "step": 3880, "token_acc": 0.7347705442902882, "train_speed(iter/s)": 0.121409 }, { "epoch": 0.050410518086476425, "grad_norm": 1.0828708410263062, "learning_rate": 5.040871934604905e-05, "loss": 0.8441499710083008, "memory(GiB)": 88.99, "step": 3885, "token_acc": 0.7614989265499967, "train_speed(iter/s)": 0.121408 }, { "epoch": 0.05047539648813212, "grad_norm": 1.0244914293289185, "learning_rate": 5.0473595432723495e-05, "loss": 0.879810905456543, "memory(GiB)": 88.99, "step": 3890, "token_acc": 0.7455499344200862, "train_speed(iter/s)": 0.121412 }, { "epoch": 0.050540274889787816, "grad_norm": 1.035622477531433, "learning_rate": 5.053847151939796e-05, "loss": 0.8518998146057128, "memory(GiB)": 88.99, "step": 3895, "token_acc": 0.7643968270320387, "train_speed(iter/s)": 0.121411 }, { "epoch": 0.050605153291443515, "grad_norm": 1.1037485599517822, "learning_rate": 5.06033476060724e-05, "loss": 0.8963409423828125, "memory(GiB)": 88.99, "step": 3900, "token_acc": 0.7438992682836236, "train_speed(iter/s)": 0.121412 }, { "epoch": 0.05067003169309921, "grad_norm": 1.0701954364776611, "learning_rate": 5.066822369274685e-05, "loss": 0.8813566207885742, "memory(GiB)": 88.99, "step": 3905, "token_acc": 0.7404129793510325, "train_speed(iter/s)": 0.12142 }, { "epoch": 0.050734910094754906, "grad_norm": 1.057366967201233, "learning_rate": 5.073309977942131e-05, "loss": 0.9251584053039551, "memory(GiB)": 88.99, "step": 3910, "token_acc": 0.7342693174830908, "train_speed(iter/s)": 0.121423 }, { "epoch": 0.050799788496410606, "grad_norm": 1.0500699281692505, "learning_rate": 5.079797586609576e-05, "loss": 0.9046125411987305, "memory(GiB)": 88.99, "step": 3915, "token_acc": 0.7478512859353904, "train_speed(iter/s)": 0.121429 }, { "epoch": 0.0508646668980663, "grad_norm": 1.0727722644805908, "learning_rate": 5.086285195277021e-05, "loss": 0.8805021286010742, "memory(GiB)": 88.99, "step": 3920, "token_acc": 0.7404954697330002, "train_speed(iter/s)": 0.121438 }, { "epoch": 0.050929545299722, "grad_norm": 1.1149433851242065, "learning_rate": 5.0927728039444664e-05, "loss": 0.8699479103088379, "memory(GiB)": 88.99, "step": 3925, "token_acc": 0.7617500251247864, "train_speed(iter/s)": 0.121438 }, { "epoch": 0.050994423701377696, "grad_norm": 1.059652328491211, "learning_rate": 5.0992604126119114e-05, "loss": 0.8602596282958984, "memory(GiB)": 88.99, "step": 3930, "token_acc": 0.7550905824225184, "train_speed(iter/s)": 0.121434 }, { "epoch": 0.05105930210303339, "grad_norm": 1.093222737312317, "learning_rate": 5.1057480212793564e-05, "loss": 0.8896757125854492, "memory(GiB)": 88.99, "step": 3935, "token_acc": 0.7664036457234482, "train_speed(iter/s)": 0.121433 }, { "epoch": 0.05112418050468909, "grad_norm": 1.184313178062439, "learning_rate": 5.112235629946802e-05, "loss": 0.8442479133605957, "memory(GiB)": 88.99, "step": 3940, "token_acc": 0.7569512708889201, "train_speed(iter/s)": 0.121428 }, { "epoch": 0.051189058906344787, "grad_norm": 1.0233560800552368, "learning_rate": 5.118723238614247e-05, "loss": 0.8896115303039551, "memory(GiB)": 88.99, "step": 3945, "token_acc": 0.7426504162129887, "train_speed(iter/s)": 0.121429 }, { "epoch": 0.05125393730800048, "grad_norm": 1.1166985034942627, "learning_rate": 5.125210847281692e-05, "loss": 0.8880014419555664, "memory(GiB)": 88.99, "step": 3950, "token_acc": 0.7297997180974858, "train_speed(iter/s)": 0.121431 }, { "epoch": 0.05131881570965618, "grad_norm": 1.0226432085037231, "learning_rate": 5.131698455949138e-05, "loss": 0.8643945693969727, "memory(GiB)": 88.99, "step": 3955, "token_acc": 0.761184317553278, "train_speed(iter/s)": 0.121438 }, { "epoch": 0.05138369411131188, "grad_norm": 1.124886155128479, "learning_rate": 5.138186064616583e-05, "loss": 0.8992156982421875, "memory(GiB)": 88.99, "step": 3960, "token_acc": 0.7331901037295646, "train_speed(iter/s)": 0.121434 }, { "epoch": 0.05144857251296757, "grad_norm": 1.07474684715271, "learning_rate": 5.144673673284027e-05, "loss": 0.8683936119079589, "memory(GiB)": 88.99, "step": 3965, "token_acc": 0.7452458895795785, "train_speed(iter/s)": 0.121432 }, { "epoch": 0.05151345091462327, "grad_norm": 1.141882300376892, "learning_rate": 5.1511612819514733e-05, "loss": 0.9004226684570312, "memory(GiB)": 88.99, "step": 3970, "token_acc": 0.7360975402523472, "train_speed(iter/s)": 0.121433 }, { "epoch": 0.05157832931627897, "grad_norm": 1.0313211679458618, "learning_rate": 5.1576488906189176e-05, "loss": 0.8818660736083984, "memory(GiB)": 88.99, "step": 3975, "token_acc": 0.74634166392634, "train_speed(iter/s)": 0.121437 }, { "epoch": 0.05164320771793466, "grad_norm": 1.0166290998458862, "learning_rate": 5.1641364992863626e-05, "loss": 0.8715008735656739, "memory(GiB)": 88.99, "step": 3980, "token_acc": 0.7670442531339676, "train_speed(iter/s)": 0.121443 }, { "epoch": 0.05170808611959036, "grad_norm": 1.0230557918548584, "learning_rate": 5.170624107953809e-05, "loss": 0.858043098449707, "memory(GiB)": 88.99, "step": 3985, "token_acc": 0.7282801566841877, "train_speed(iter/s)": 0.121436 }, { "epoch": 0.05177296452124606, "grad_norm": 1.0540602207183838, "learning_rate": 5.177111716621253e-05, "loss": 0.9208920478820801, "memory(GiB)": 88.99, "step": 3990, "token_acc": 0.767650397275823, "train_speed(iter/s)": 0.121433 }, { "epoch": 0.05183784292290175, "grad_norm": 1.0167254209518433, "learning_rate": 5.183599325288698e-05, "loss": 0.8983295440673829, "memory(GiB)": 88.99, "step": 3995, "token_acc": 0.7290924047162051, "train_speed(iter/s)": 0.121437 }, { "epoch": 0.05190272132455745, "grad_norm": 1.0528998374938965, "learning_rate": 5.190086933956144e-05, "loss": 0.8901880264282227, "memory(GiB)": 88.99, "step": 4000, "token_acc": 0.7567799814388341, "train_speed(iter/s)": 0.121432 }, { "epoch": 0.05196759972621315, "grad_norm": 1.1286598443984985, "learning_rate": 5.196574542623589e-05, "loss": 0.867437744140625, "memory(GiB)": 88.99, "step": 4005, "token_acc": 0.7641256495978286, "train_speed(iter/s)": 0.121427 }, { "epoch": 0.05203247812786884, "grad_norm": 1.0650904178619385, "learning_rate": 5.203062151291035e-05, "loss": 0.8698780059814453, "memory(GiB)": 88.99, "step": 4010, "token_acc": 0.7369755459229574, "train_speed(iter/s)": 0.121431 }, { "epoch": 0.05209735652952454, "grad_norm": 0.9969974756240845, "learning_rate": 5.2095497599584796e-05, "loss": 0.883572769165039, "memory(GiB)": 88.99, "step": 4015, "token_acc": 0.7636473714304934, "train_speed(iter/s)": 0.121432 }, { "epoch": 0.05216223493118024, "grad_norm": 1.0714102983474731, "learning_rate": 5.2160373686259245e-05, "loss": 0.8910327911376953, "memory(GiB)": 88.99, "step": 4020, "token_acc": 0.733737493956187, "train_speed(iter/s)": 0.121434 }, { "epoch": 0.05222711333283593, "grad_norm": 1.2967652082443237, "learning_rate": 5.22252497729337e-05, "loss": 0.8974685668945312, "memory(GiB)": 88.99, "step": 4025, "token_acc": 0.7330981561624904, "train_speed(iter/s)": 0.121438 }, { "epoch": 0.05229199173449163, "grad_norm": 1.1144036054611206, "learning_rate": 5.229012585960815e-05, "loss": 0.8504452705383301, "memory(GiB)": 88.99, "step": 4030, "token_acc": 0.7544984488107549, "train_speed(iter/s)": 0.12143 }, { "epoch": 0.05235687013614733, "grad_norm": 1.2086724042892456, "learning_rate": 5.23550019462826e-05, "loss": 0.8805629730224609, "memory(GiB)": 88.99, "step": 4035, "token_acc": 0.7682729778538162, "train_speed(iter/s)": 0.12143 }, { "epoch": 0.05242174853780302, "grad_norm": 1.0491836071014404, "learning_rate": 5.241987803295706e-05, "loss": 0.9121086120605468, "memory(GiB)": 88.99, "step": 4040, "token_acc": 0.7538430076008044, "train_speed(iter/s)": 0.121427 }, { "epoch": 0.05248662693945872, "grad_norm": 1.112954020500183, "learning_rate": 5.248475411963151e-05, "loss": 0.9060359954833984, "memory(GiB)": 88.99, "step": 4045, "token_acc": 0.7432237369687249, "train_speed(iter/s)": 0.121431 }, { "epoch": 0.05255150534111441, "grad_norm": 1.0776828527450562, "learning_rate": 5.254963020630596e-05, "loss": 0.898502254486084, "memory(GiB)": 88.99, "step": 4050, "token_acc": 0.7404032125881003, "train_speed(iter/s)": 0.121437 }, { "epoch": 0.05261638374277011, "grad_norm": 1.1156024932861328, "learning_rate": 5.2614506292980415e-05, "loss": 0.8796446800231934, "memory(GiB)": 88.99, "step": 4055, "token_acc": 0.7545278022947925, "train_speed(iter/s)": 0.12144 }, { "epoch": 0.05268126214442581, "grad_norm": 1.0400569438934326, "learning_rate": 5.2679382379654865e-05, "loss": 0.8880088806152344, "memory(GiB)": 88.99, "step": 4060, "token_acc": 0.7619269340974212, "train_speed(iter/s)": 0.121441 }, { "epoch": 0.0527461405460815, "grad_norm": 1.0179418325424194, "learning_rate": 5.274425846632931e-05, "loss": 0.8895782470703125, "memory(GiB)": 88.99, "step": 4065, "token_acc": 0.7477979922304704, "train_speed(iter/s)": 0.121447 }, { "epoch": 0.0528110189477372, "grad_norm": 1.2834666967391968, "learning_rate": 5.280913455300377e-05, "loss": 0.8833581924438476, "memory(GiB)": 88.99, "step": 4070, "token_acc": 0.7424761398024675, "train_speed(iter/s)": 0.121445 }, { "epoch": 0.0528758973493929, "grad_norm": 1.04867422580719, "learning_rate": 5.287401063967822e-05, "loss": 0.9162228584289551, "memory(GiB)": 88.99, "step": 4075, "token_acc": 0.7173186670214968, "train_speed(iter/s)": 0.121445 }, { "epoch": 0.052940775751048594, "grad_norm": 1.347717046737671, "learning_rate": 5.2938886726352664e-05, "loss": 0.9118390083312988, "memory(GiB)": 88.99, "step": 4080, "token_acc": 0.7390441839495041, "train_speed(iter/s)": 0.121451 }, { "epoch": 0.05300565415270429, "grad_norm": 1.1476253271102905, "learning_rate": 5.300376281302713e-05, "loss": 0.914058780670166, "memory(GiB)": 88.99, "step": 4085, "token_acc": 0.7594186046511628, "train_speed(iter/s)": 0.121453 }, { "epoch": 0.05307053255435999, "grad_norm": 1.0740365982055664, "learning_rate": 5.306863889970157e-05, "loss": 0.9172357559204102, "memory(GiB)": 88.99, "step": 4090, "token_acc": 0.7302684810592129, "train_speed(iter/s)": 0.121455 }, { "epoch": 0.053135410956015684, "grad_norm": 1.0357213020324707, "learning_rate": 5.313351498637602e-05, "loss": 0.9067340850830078, "memory(GiB)": 88.99, "step": 4095, "token_acc": 0.7263087398147114, "train_speed(iter/s)": 0.12146 }, { "epoch": 0.05320028935767138, "grad_norm": 1.0972449779510498, "learning_rate": 5.319839107305048e-05, "loss": 0.8516384124755859, "memory(GiB)": 88.99, "step": 4100, "token_acc": 0.7408965426580781, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.05326516775932708, "grad_norm": 1.116982340812683, "learning_rate": 5.326326715972493e-05, "loss": 0.8856666564941407, "memory(GiB)": 88.99, "step": 4105, "token_acc": 0.7567417395169654, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.053330046160982775, "grad_norm": 1.1578959226608276, "learning_rate": 5.332814324639938e-05, "loss": 0.8666315078735352, "memory(GiB)": 88.99, "step": 4110, "token_acc": 0.7592288311972833, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.053394924562638474, "grad_norm": 1.1033297777175903, "learning_rate": 5.339301933307383e-05, "loss": 0.9175241470336915, "memory(GiB)": 88.99, "step": 4115, "token_acc": 0.7262906088432749, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.05345980296429417, "grad_norm": 1.103288173675537, "learning_rate": 5.345789541974828e-05, "loss": 0.8495519638061524, "memory(GiB)": 88.99, "step": 4120, "token_acc": 0.7447579171235835, "train_speed(iter/s)": 0.121456 }, { "epoch": 0.053524681365949865, "grad_norm": 0.9664837718009949, "learning_rate": 5.352277150642273e-05, "loss": 0.9202404022216797, "memory(GiB)": 88.99, "step": 4125, "token_acc": 0.7474724696508438, "train_speed(iter/s)": 0.121461 }, { "epoch": 0.053589559767605564, "grad_norm": 1.1790894269943237, "learning_rate": 5.358764759309719e-05, "loss": 0.8929853439331055, "memory(GiB)": 88.99, "step": 4130, "token_acc": 0.7540475637152374, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.05365443816926126, "grad_norm": 1.0680872201919556, "learning_rate": 5.365252367977164e-05, "loss": 0.8584230422973633, "memory(GiB)": 88.99, "step": 4135, "token_acc": 0.7617901339210036, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.053719316570916956, "grad_norm": 1.2932499647140503, "learning_rate": 5.371739976644609e-05, "loss": 0.8964194297790528, "memory(GiB)": 88.99, "step": 4140, "token_acc": 0.760864146240133, "train_speed(iter/s)": 0.12147 }, { "epoch": 0.053784194972572655, "grad_norm": 1.0825086832046509, "learning_rate": 5.3782275853120546e-05, "loss": 0.8849531173706054, "memory(GiB)": 88.99, "step": 4145, "token_acc": 0.7478336221837089, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.053849073374228354, "grad_norm": 1.1885833740234375, "learning_rate": 5.3847151939794996e-05, "loss": 0.9499628067016601, "memory(GiB)": 88.99, "step": 4150, "token_acc": 0.7326282130119296, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.053913951775884046, "grad_norm": 1.0974698066711426, "learning_rate": 5.391202802646944e-05, "loss": 0.8818546295166015, "memory(GiB)": 88.99, "step": 4155, "token_acc": 0.7352752988718639, "train_speed(iter/s)": 0.121477 }, { "epoch": 0.053978830177539745, "grad_norm": 1.051100730895996, "learning_rate": 5.39769041131439e-05, "loss": 0.8876958847045898, "memory(GiB)": 88.99, "step": 4160, "token_acc": 0.7604800656500034, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.054043708579195444, "grad_norm": 1.2161130905151367, "learning_rate": 5.4041780199818345e-05, "loss": 0.9444015502929688, "memory(GiB)": 88.99, "step": 4165, "token_acc": 0.7480610535459453, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.05410858698085114, "grad_norm": 1.0324853658676147, "learning_rate": 5.4106656286492795e-05, "loss": 0.9083621978759766, "memory(GiB)": 88.99, "step": 4170, "token_acc": 0.7476742352570167, "train_speed(iter/s)": 0.121477 }, { "epoch": 0.054173465382506836, "grad_norm": 1.0662552118301392, "learning_rate": 5.417153237316726e-05, "loss": 0.8606383323669433, "memory(GiB)": 88.99, "step": 4175, "token_acc": 0.7577828397873956, "train_speed(iter/s)": 0.121482 }, { "epoch": 0.054238343784162535, "grad_norm": 1.0186442136764526, "learning_rate": 5.42364084598417e-05, "loss": 0.9038043022155762, "memory(GiB)": 88.99, "step": 4180, "token_acc": 0.7616404762673034, "train_speed(iter/s)": 0.121484 }, { "epoch": 0.05430322218581823, "grad_norm": 1.0589656829833984, "learning_rate": 5.430128454651615e-05, "loss": 0.8478694915771484, "memory(GiB)": 88.99, "step": 4185, "token_acc": 0.7667424440055156, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.054368100587473926, "grad_norm": 1.067389726638794, "learning_rate": 5.436616063319061e-05, "loss": 0.9184314727783203, "memory(GiB)": 88.99, "step": 4190, "token_acc": 0.7492127601314348, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.054432978989129625, "grad_norm": 1.11065673828125, "learning_rate": 5.443103671986506e-05, "loss": 0.8864557266235351, "memory(GiB)": 88.99, "step": 4195, "token_acc": 0.7585933186085035, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.05449785739078532, "grad_norm": 1.0558812618255615, "learning_rate": 5.4495912806539515e-05, "loss": 0.8505370140075683, "memory(GiB)": 88.99, "step": 4200, "token_acc": 0.7865480638867762, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05456273579244102, "grad_norm": 1.0188313722610474, "learning_rate": 5.4560788893213964e-05, "loss": 0.9007729530334473, "memory(GiB)": 88.99, "step": 4205, "token_acc": 0.753865297869031, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.054627614194096716, "grad_norm": 1.0630801916122437, "learning_rate": 5.4625664979888414e-05, "loss": 0.8863635063171387, "memory(GiB)": 88.99, "step": 4210, "token_acc": 0.7563646574316736, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05469249259575241, "grad_norm": 1.1210182905197144, "learning_rate": 5.469054106656287e-05, "loss": 0.8679141998291016, "memory(GiB)": 88.99, "step": 4215, "token_acc": 0.7695900751093009, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.05475737099740811, "grad_norm": 1.0544261932373047, "learning_rate": 5.475541715323732e-05, "loss": 0.9021738052368165, "memory(GiB)": 88.99, "step": 4220, "token_acc": 0.7568568755846585, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.054822249399063806, "grad_norm": 1.1828657388687134, "learning_rate": 5.482029323991177e-05, "loss": 0.9155467033386231, "memory(GiB)": 88.99, "step": 4225, "token_acc": 0.7513628724843219, "train_speed(iter/s)": 0.121481 }, { "epoch": 0.0548871278007195, "grad_norm": 1.047580361366272, "learning_rate": 5.488516932658623e-05, "loss": 0.8993667602539063, "memory(GiB)": 88.99, "step": 4230, "token_acc": 0.7673181083906117, "train_speed(iter/s)": 0.121483 }, { "epoch": 0.0549520062023752, "grad_norm": 1.151404619216919, "learning_rate": 5.495004541326068e-05, "loss": 0.8843463897705078, "memory(GiB)": 88.99, "step": 4235, "token_acc": 0.7537902559867877, "train_speed(iter/s)": 0.121487 }, { "epoch": 0.0550168846040309, "grad_norm": 1.0769221782684326, "learning_rate": 5.501492149993513e-05, "loss": 0.8630358695983886, "memory(GiB)": 88.99, "step": 4240, "token_acc": 0.7497527200791295, "train_speed(iter/s)": 0.121479 }, { "epoch": 0.05508176300568659, "grad_norm": 1.019917607307434, "learning_rate": 5.5079797586609584e-05, "loss": 0.8777495384216308, "memory(GiB)": 88.99, "step": 4245, "token_acc": 0.7551638126734116, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05514664140734229, "grad_norm": 1.057929515838623, "learning_rate": 5.5144673673284033e-05, "loss": 0.9109317779541015, "memory(GiB)": 88.99, "step": 4250, "token_acc": 0.7662372517444981, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05521151980899799, "grad_norm": 1.1338787078857422, "learning_rate": 5.5209549759958477e-05, "loss": 0.8893907546997071, "memory(GiB)": 88.99, "step": 4255, "token_acc": 0.762673781772223, "train_speed(iter/s)": 0.121479 }, { "epoch": 0.05527639821065368, "grad_norm": 0.9968667030334473, "learning_rate": 5.527442584663294e-05, "loss": 0.9074512481689453, "memory(GiB)": 88.99, "step": 4260, "token_acc": 0.7537524780515434, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05534127661230938, "grad_norm": 0.9641156196594238, "learning_rate": 5.533930193330738e-05, "loss": 0.8482259750366211, "memory(GiB)": 88.99, "step": 4265, "token_acc": 0.7597545719902306, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.05540615501396508, "grad_norm": 1.1984184980392456, "learning_rate": 5.540417801998183e-05, "loss": 0.8737648010253907, "memory(GiB)": 88.99, "step": 4270, "token_acc": 0.7731146064041443, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.05547103341562077, "grad_norm": 1.1064234972000122, "learning_rate": 5.5469054106656296e-05, "loss": 0.8814313888549805, "memory(GiB)": 88.99, "step": 4275, "token_acc": 0.7672964298712901, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.05553591181727647, "grad_norm": 0.9865818619728088, "learning_rate": 5.553393019333074e-05, "loss": 0.8967674255371094, "memory(GiB)": 88.99, "step": 4280, "token_acc": 0.7524312124531183, "train_speed(iter/s)": 0.121472 }, { "epoch": 0.05560079021893217, "grad_norm": 1.0544030666351318, "learning_rate": 5.559880628000519e-05, "loss": 0.9057550430297852, "memory(GiB)": 88.99, "step": 4285, "token_acc": 0.73840618336887, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.05566566862058786, "grad_norm": 1.176206350326538, "learning_rate": 5.5663682366679646e-05, "loss": 0.9163036346435547, "memory(GiB)": 88.99, "step": 4290, "token_acc": 0.7394605823990831, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.05573054702224356, "grad_norm": 0.9805709719657898, "learning_rate": 5.5728558453354096e-05, "loss": 0.8435017585754394, "memory(GiB)": 88.99, "step": 4295, "token_acc": 0.7531468295481389, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.05579542542389926, "grad_norm": 1.0014779567718506, "learning_rate": 5.5793434540028546e-05, "loss": 0.8825624465942383, "memory(GiB)": 88.99, "step": 4300, "token_acc": 0.7516439563419429, "train_speed(iter/s)": 0.121461 }, { "epoch": 0.05586030382555495, "grad_norm": 1.1688103675842285, "learning_rate": 5.5858310626703e-05, "loss": 0.8818703651428222, "memory(GiB)": 88.99, "step": 4305, "token_acc": 0.7353321186019546, "train_speed(iter/s)": 0.121462 }, { "epoch": 0.05592518222721065, "grad_norm": 1.0537948608398438, "learning_rate": 5.592318671337745e-05, "loss": 0.8907961845397949, "memory(GiB)": 88.99, "step": 4310, "token_acc": 0.7626819126819127, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.05599006062886635, "grad_norm": 1.2044677734375, "learning_rate": 5.59880628000519e-05, "loss": 0.8991384506225586, "memory(GiB)": 88.99, "step": 4315, "token_acc": 0.7635117321381493, "train_speed(iter/s)": 0.121471 }, { "epoch": 0.05605493903052204, "grad_norm": 1.139000654220581, "learning_rate": 5.605293888672636e-05, "loss": 0.8613277435302734, "memory(GiB)": 88.99, "step": 4320, "token_acc": 0.769931353225039, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.05611981743217774, "grad_norm": 1.1026594638824463, "learning_rate": 5.611781497340081e-05, "loss": 0.9300539016723632, "memory(GiB)": 88.99, "step": 4325, "token_acc": 0.7488808227465215, "train_speed(iter/s)": 0.121462 }, { "epoch": 0.05618469583383344, "grad_norm": 1.1470017433166504, "learning_rate": 5.618269106007525e-05, "loss": 0.9315913200378418, "memory(GiB)": 88.99, "step": 4330, "token_acc": 0.7474648126165847, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.05624957423548913, "grad_norm": 0.9917123913764954, "learning_rate": 5.6247567146749715e-05, "loss": 0.8596018791198731, "memory(GiB)": 88.99, "step": 4335, "token_acc": 0.7724581492501428, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.05631445263714483, "grad_norm": 0.9854516983032227, "learning_rate": 5.6312443233424165e-05, "loss": 0.8890085220336914, "memory(GiB)": 88.99, "step": 4340, "token_acc": 0.7526937556796054, "train_speed(iter/s)": 0.121462 }, { "epoch": 0.05637933103880053, "grad_norm": 1.0776125192642212, "learning_rate": 5.637731932009861e-05, "loss": 0.9234217643737793, "memory(GiB)": 88.99, "step": 4345, "token_acc": 0.7348282016956715, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.05644420944045622, "grad_norm": 1.0243138074874878, "learning_rate": 5.644219540677307e-05, "loss": 0.9174539566040039, "memory(GiB)": 88.99, "step": 4350, "token_acc": 0.7427920859103357, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.05650908784211192, "grad_norm": 1.0992423295974731, "learning_rate": 5.6507071493447514e-05, "loss": 0.8572198867797851, "memory(GiB)": 88.99, "step": 4355, "token_acc": 0.732686057171431, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.05657396624376762, "grad_norm": 1.0100955963134766, "learning_rate": 5.6571947580121964e-05, "loss": 0.8336917877197265, "memory(GiB)": 88.99, "step": 4360, "token_acc": 0.7600213382106387, "train_speed(iter/s)": 0.121453 }, { "epoch": 0.05663884464542331, "grad_norm": 1.0055806636810303, "learning_rate": 5.663682366679642e-05, "loss": 0.8630617141723633, "memory(GiB)": 88.99, "step": 4365, "token_acc": 0.757163701840986, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.05670372304707901, "grad_norm": 1.0462963581085205, "learning_rate": 5.670169975347087e-05, "loss": 0.8657988548278809, "memory(GiB)": 89.09, "step": 4370, "token_acc": 0.7826188881446752, "train_speed(iter/s)": 0.121453 }, { "epoch": 0.05676860144873471, "grad_norm": 1.1618350744247437, "learning_rate": 5.6766575840145334e-05, "loss": 0.9206674575805665, "memory(GiB)": 89.1, "step": 4375, "token_acc": 0.7477813718172016, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.0568334798503904, "grad_norm": 1.0614885091781616, "learning_rate": 5.683145192681978e-05, "loss": 0.875944709777832, "memory(GiB)": 89.1, "step": 4380, "token_acc": 0.7683762389409965, "train_speed(iter/s)": 0.121452 }, { "epoch": 0.0568983582520461, "grad_norm": 1.0742466449737549, "learning_rate": 5.689632801349423e-05, "loss": 0.9227449417114257, "memory(GiB)": 89.1, "step": 4385, "token_acc": 0.7493448333056891, "train_speed(iter/s)": 0.121451 }, { "epoch": 0.0569632366537018, "grad_norm": 0.9941864013671875, "learning_rate": 5.6961204100168683e-05, "loss": 0.9380968093872071, "memory(GiB)": 89.1, "step": 4390, "token_acc": 0.7505417118093174, "train_speed(iter/s)": 0.12145 }, { "epoch": 0.057028115055357494, "grad_norm": 1.0907520055770874, "learning_rate": 5.702608018684313e-05, "loss": 0.8509528160095214, "memory(GiB)": 89.1, "step": 4395, "token_acc": 0.7476197505350949, "train_speed(iter/s)": 0.12145 }, { "epoch": 0.05709299345701319, "grad_norm": 1.113349199295044, "learning_rate": 5.709095627351758e-05, "loss": 0.8688566207885742, "memory(GiB)": 89.1, "step": 4400, "token_acc": 0.7578082747431044, "train_speed(iter/s)": 0.121447 }, { "epoch": 0.05715787185866889, "grad_norm": 1.1880605220794678, "learning_rate": 5.715583236019204e-05, "loss": 0.8599849700927734, "memory(GiB)": 89.13, "step": 4405, "token_acc": 0.7786878821131838, "train_speed(iter/s)": 0.121446 }, { "epoch": 0.057222750260324584, "grad_norm": 1.0345345735549927, "learning_rate": 5.722070844686649e-05, "loss": 0.9139204978942871, "memory(GiB)": 89.13, "step": 4410, "token_acc": 0.7361179060665362, "train_speed(iter/s)": 0.121451 }, { "epoch": 0.05728762866198028, "grad_norm": 1.0746026039123535, "learning_rate": 5.728558453354094e-05, "loss": 0.9123399734497071, "memory(GiB)": 89.13, "step": 4415, "token_acc": 0.7307855027499648, "train_speed(iter/s)": 0.121445 }, { "epoch": 0.05735250706363598, "grad_norm": 1.0018784999847412, "learning_rate": 5.7350460620215396e-05, "loss": 0.9309816360473633, "memory(GiB)": 89.13, "step": 4420, "token_acc": 0.7504973003694231, "train_speed(iter/s)": 0.121446 }, { "epoch": 0.057417385465291675, "grad_norm": 1.0937045812606812, "learning_rate": 5.7415336706889846e-05, "loss": 0.9212053298950196, "memory(GiB)": 89.13, "step": 4425, "token_acc": 0.7433582219135266, "train_speed(iter/s)": 0.121444 }, { "epoch": 0.057482263866947374, "grad_norm": 1.1581405401229858, "learning_rate": 5.748021279356429e-05, "loss": 0.9179599761962891, "memory(GiB)": 89.13, "step": 4430, "token_acc": 0.7270467989779131, "train_speed(iter/s)": 0.121444 }, { "epoch": 0.05754714226860307, "grad_norm": 1.0523457527160645, "learning_rate": 5.754508888023875e-05, "loss": 0.8847829818725585, "memory(GiB)": 89.13, "step": 4435, "token_acc": 0.7710839424432134, "train_speed(iter/s)": 0.121448 }, { "epoch": 0.057612020670258765, "grad_norm": 1.1496217250823975, "learning_rate": 5.76099649669132e-05, "loss": 0.9231597900390625, "memory(GiB)": 89.13, "step": 4440, "token_acc": 0.7543756865616991, "train_speed(iter/s)": 0.121455 }, { "epoch": 0.057676899071914464, "grad_norm": 1.0330754518508911, "learning_rate": 5.7674841053587645e-05, "loss": 0.8776414871215821, "memory(GiB)": 89.13, "step": 4445, "token_acc": 0.7813040293040293, "train_speed(iter/s)": 0.121453 }, { "epoch": 0.05774177747357016, "grad_norm": 0.9874274730682373, "learning_rate": 5.773971714026211e-05, "loss": 0.9057252883911133, "memory(GiB)": 89.13, "step": 4450, "token_acc": 0.7460068454078722, "train_speed(iter/s)": 0.121449 }, { "epoch": 0.057806655875225856, "grad_norm": 0.9229827523231506, "learning_rate": 5.780459322693655e-05, "loss": 0.8649269104003906, "memory(GiB)": 89.13, "step": 4455, "token_acc": 0.7539014791694938, "train_speed(iter/s)": 0.12145 }, { "epoch": 0.057871534276881555, "grad_norm": 0.9828157424926758, "learning_rate": 5.7869469313611e-05, "loss": 0.913149356842041, "memory(GiB)": 89.13, "step": 4460, "token_acc": 0.7518056749785039, "train_speed(iter/s)": 0.121448 }, { "epoch": 0.057936412678537254, "grad_norm": 1.15060293674469, "learning_rate": 5.793434540028546e-05, "loss": 0.9028247833251953, "memory(GiB)": 89.13, "step": 4465, "token_acc": 0.7586184339634984, "train_speed(iter/s)": 0.121452 }, { "epoch": 0.058001291080192946, "grad_norm": 1.1915390491485596, "learning_rate": 5.799922148695991e-05, "loss": 0.9651427268981934, "memory(GiB)": 89.13, "step": 4470, "token_acc": 0.7201194227898491, "train_speed(iter/s)": 0.121457 }, { "epoch": 0.058066169481848645, "grad_norm": 1.189487099647522, "learning_rate": 5.806409757363436e-05, "loss": 0.9213666915893555, "memory(GiB)": 89.13, "step": 4475, "token_acc": 0.7477761604399159, "train_speed(iter/s)": 0.12146 }, { "epoch": 0.058131047883504344, "grad_norm": 1.0276052951812744, "learning_rate": 5.8128973660308815e-05, "loss": 0.8937381744384766, "memory(GiB)": 89.13, "step": 4480, "token_acc": 0.7435064935064936, "train_speed(iter/s)": 0.121462 }, { "epoch": 0.058195926285160036, "grad_norm": 1.0996359586715698, "learning_rate": 5.8193849746983265e-05, "loss": 0.9017370223999024, "memory(GiB)": 89.13, "step": 4485, "token_acc": 0.760153937240971, "train_speed(iter/s)": 0.121456 }, { "epoch": 0.058260804686815736, "grad_norm": 1.0230764150619507, "learning_rate": 5.8258725833657714e-05, "loss": 0.9267766952514649, "memory(GiB)": 89.13, "step": 4490, "token_acc": 0.7534444793960365, "train_speed(iter/s)": 0.121453 }, { "epoch": 0.058325683088471435, "grad_norm": 1.0448706150054932, "learning_rate": 5.832360192033217e-05, "loss": 0.8840984344482422, "memory(GiB)": 89.13, "step": 4495, "token_acc": 0.7459784892289361, "train_speed(iter/s)": 0.121451 }, { "epoch": 0.05839056149012713, "grad_norm": 1.2826629877090454, "learning_rate": 5.838847800700662e-05, "loss": 0.9351221084594726, "memory(GiB)": 89.13, "step": 4500, "token_acc": 0.7387058356490671, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.058455439891782826, "grad_norm": 1.103111743927002, "learning_rate": 5.845335409368107e-05, "loss": 0.9440142631530761, "memory(GiB)": 89.13, "step": 4505, "token_acc": 0.7360748550700071, "train_speed(iter/s)": 0.121459 }, { "epoch": 0.058520318293438525, "grad_norm": 1.0346348285675049, "learning_rate": 5.851823018035553e-05, "loss": 0.9098918914794922, "memory(GiB)": 89.13, "step": 4510, "token_acc": 0.7658306775668561, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.05858519669509422, "grad_norm": 1.0158817768096924, "learning_rate": 5.858310626702998e-05, "loss": 0.9106743812561036, "memory(GiB)": 89.13, "step": 4515, "token_acc": 0.7504795588058986, "train_speed(iter/s)": 0.121467 }, { "epoch": 0.05865007509674992, "grad_norm": 1.1577554941177368, "learning_rate": 5.864798235370442e-05, "loss": 0.8922378540039062, "memory(GiB)": 89.13, "step": 4520, "token_acc": 0.753594829375315, "train_speed(iter/s)": 0.121466 }, { "epoch": 0.058714953498405616, "grad_norm": 0.9719184041023254, "learning_rate": 5.8712858440378884e-05, "loss": 0.8968635559082031, "memory(GiB)": 89.13, "step": 4525, "token_acc": 0.7593553020027128, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.05877983190006131, "grad_norm": 1.1683846712112427, "learning_rate": 5.877773452705333e-05, "loss": 0.896724796295166, "memory(GiB)": 89.13, "step": 4530, "token_acc": 0.740396198634721, "train_speed(iter/s)": 0.121464 }, { "epoch": 0.05884471030171701, "grad_norm": 1.0050039291381836, "learning_rate": 5.8842610613727777e-05, "loss": 0.9325660705566406, "memory(GiB)": 89.13, "step": 4535, "token_acc": 0.7388325169115932, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.058909588703372706, "grad_norm": 1.037240982055664, "learning_rate": 5.890748670040224e-05, "loss": 0.9214866638183594, "memory(GiB)": 89.13, "step": 4540, "token_acc": 0.7425923667845384, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.0589744671050284, "grad_norm": 1.2823247909545898, "learning_rate": 5.897236278707668e-05, "loss": 0.9208980560302734, "memory(GiB)": 89.13, "step": 4545, "token_acc": 0.7387646668142572, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.0590393455066841, "grad_norm": 0.9976404309272766, "learning_rate": 5.903723887375113e-05, "loss": 0.8732906341552734, "memory(GiB)": 89.13, "step": 4550, "token_acc": 0.7608672547560846, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.0591042239083398, "grad_norm": 1.0664249658584595, "learning_rate": 5.910211496042559e-05, "loss": 0.9228727340698242, "memory(GiB)": 89.13, "step": 4555, "token_acc": 0.7403882494416767, "train_speed(iter/s)": 0.12147 }, { "epoch": 0.05916910230999549, "grad_norm": 1.0844244956970215, "learning_rate": 5.916699104710004e-05, "loss": 0.9232674598693847, "memory(GiB)": 89.13, "step": 4560, "token_acc": 0.7585147962032384, "train_speed(iter/s)": 0.12147 }, { "epoch": 0.05923398071165119, "grad_norm": 1.0143802165985107, "learning_rate": 5.9231867133774496e-05, "loss": 0.9023469924926758, "memory(GiB)": 89.13, "step": 4565, "token_acc": 0.7535586907805448, "train_speed(iter/s)": 0.121467 }, { "epoch": 0.05929885911330689, "grad_norm": 1.0022164583206177, "learning_rate": 5.9296743220448946e-05, "loss": 0.9238370895385742, "memory(GiB)": 89.13, "step": 4570, "token_acc": 0.7455936373133826, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.05936373751496258, "grad_norm": 0.9201042652130127, "learning_rate": 5.9361619307123396e-05, "loss": 0.8912809371948243, "memory(GiB)": 89.13, "step": 4575, "token_acc": 0.7622042319585838, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.05942861591661828, "grad_norm": 1.0226789712905884, "learning_rate": 5.942649539379785e-05, "loss": 0.9037838935852051, "memory(GiB)": 89.13, "step": 4580, "token_acc": 0.7548161120840631, "train_speed(iter/s)": 0.121464 }, { "epoch": 0.05949349431827398, "grad_norm": 1.0728262662887573, "learning_rate": 5.94913714804723e-05, "loss": 0.8979576110839844, "memory(GiB)": 89.13, "step": 4585, "token_acc": 0.7581999936124685, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.05955837271992967, "grad_norm": 1.1242141723632812, "learning_rate": 5.955624756714675e-05, "loss": 0.9055076599121094, "memory(GiB)": 89.13, "step": 4590, "token_acc": 0.7217949648117246, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.05962325112158537, "grad_norm": 0.9721839427947998, "learning_rate": 5.962112365382121e-05, "loss": 0.8950437545776367, "memory(GiB)": 89.13, "step": 4595, "token_acc": 0.752228394264307, "train_speed(iter/s)": 0.121467 }, { "epoch": 0.05968812952324107, "grad_norm": 1.1027333736419678, "learning_rate": 5.968599974049566e-05, "loss": 0.889886474609375, "memory(GiB)": 89.13, "step": 4600, "token_acc": 0.7373969140728918, "train_speed(iter/s)": 0.121471 }, { "epoch": 0.05975300792489676, "grad_norm": 1.0022823810577393, "learning_rate": 5.975087582717011e-05, "loss": 0.8883626937866211, "memory(GiB)": 89.13, "step": 4605, "token_acc": 0.7659222179437812, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.05981788632655246, "grad_norm": 0.9880639910697937, "learning_rate": 5.9815751913844565e-05, "loss": 0.9030046463012695, "memory(GiB)": 89.13, "step": 4610, "token_acc": 0.7562457798784605, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.05988276472820816, "grad_norm": 1.1354228258132935, "learning_rate": 5.9880628000519015e-05, "loss": 0.9107509613037109, "memory(GiB)": 89.13, "step": 4615, "token_acc": 0.7394750423947504, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.05994764312986385, "grad_norm": 1.2331513166427612, "learning_rate": 5.994550408719346e-05, "loss": 0.9283709526062012, "memory(GiB)": 89.13, "step": 4620, "token_acc": 0.7426122826357526, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.06001252153151955, "grad_norm": 0.9484540224075317, "learning_rate": 6.001038017386792e-05, "loss": 0.9336590766906738, "memory(GiB)": 89.13, "step": 4625, "token_acc": 0.7322760313414519, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.06007739993317525, "grad_norm": 1.1691261529922485, "learning_rate": 6.0075256260542364e-05, "loss": 0.9302196502685547, "memory(GiB)": 89.13, "step": 4630, "token_acc": 0.7326797838437024, "train_speed(iter/s)": 0.121477 }, { "epoch": 0.06014227833483094, "grad_norm": 1.2012214660644531, "learning_rate": 6.0140132347216814e-05, "loss": 0.9597198486328125, "memory(GiB)": 89.13, "step": 4635, "token_acc": 0.7476657565645329, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.06020715673648664, "grad_norm": 1.0380396842956543, "learning_rate": 6.020500843389128e-05, "loss": 0.9150201797485351, "memory(GiB)": 89.13, "step": 4640, "token_acc": 0.7526403209125951, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.06027203513814234, "grad_norm": 0.990198016166687, "learning_rate": 6.026988452056572e-05, "loss": 0.9020153045654297, "memory(GiB)": 89.13, "step": 4645, "token_acc": 0.7567031942177663, "train_speed(iter/s)": 0.121478 }, { "epoch": 0.06033691353979803, "grad_norm": 1.213624358177185, "learning_rate": 6.033476060724017e-05, "loss": 0.9351103782653809, "memory(GiB)": 89.13, "step": 4650, "token_acc": 0.7395473989236926, "train_speed(iter/s)": 0.12148 }, { "epoch": 0.06040179194145373, "grad_norm": 1.0035096406936646, "learning_rate": 6.039963669391463e-05, "loss": 0.8694374084472656, "memory(GiB)": 89.13, "step": 4655, "token_acc": 0.7759528631927822, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.06046667034310943, "grad_norm": 1.1903101205825806, "learning_rate": 6.046451278058908e-05, "loss": 0.9010452270507813, "memory(GiB)": 89.13, "step": 4660, "token_acc": 0.7404572147651006, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.06053154874476512, "grad_norm": 1.0073224306106567, "learning_rate": 6.052938886726353e-05, "loss": 0.909182357788086, "memory(GiB)": 89.13, "step": 4665, "token_acc": 0.7538924877827026, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.06059642714642082, "grad_norm": 0.9586397409439087, "learning_rate": 6.0594264953937984e-05, "loss": 0.916801643371582, "memory(GiB)": 89.13, "step": 4670, "token_acc": 0.7619975486344637, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.06066130554807652, "grad_norm": 1.0364741086959839, "learning_rate": 6.0659141040612433e-05, "loss": 0.8804557800292969, "memory(GiB)": 89.13, "step": 4675, "token_acc": 0.7422467722614136, "train_speed(iter/s)": 0.121464 }, { "epoch": 0.06072618394973221, "grad_norm": 1.0517722368240356, "learning_rate": 6.072401712728688e-05, "loss": 0.8844018936157226, "memory(GiB)": 89.13, "step": 4680, "token_acc": 0.7535693194844896, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.06079106235138791, "grad_norm": 1.0502971410751343, "learning_rate": 6.078889321396134e-05, "loss": 0.9221364974975585, "memory(GiB)": 89.13, "step": 4685, "token_acc": 0.7545282843607766, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.06085594075304361, "grad_norm": 0.9734050035476685, "learning_rate": 6.085376930063579e-05, "loss": 0.9101134300231933, "memory(GiB)": 89.13, "step": 4690, "token_acc": 0.7480583799197088, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.0609208191546993, "grad_norm": 1.091198205947876, "learning_rate": 6.091864538731023e-05, "loss": 0.8798023223876953, "memory(GiB)": 89.13, "step": 4695, "token_acc": 0.7531660774959673, "train_speed(iter/s)": 0.121465 }, { "epoch": 0.060985697556355, "grad_norm": 0.915611743927002, "learning_rate": 6.0983521473984696e-05, "loss": 0.9315989494323731, "memory(GiB)": 89.13, "step": 4700, "token_acc": 0.7548223180556949, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.0610505759580107, "grad_norm": 1.0391405820846558, "learning_rate": 6.104839756065915e-05, "loss": 0.9190689086914062, "memory(GiB)": 89.13, "step": 4705, "token_acc": 0.7535209097352769, "train_speed(iter/s)": 0.12146 }, { "epoch": 0.061115454359666394, "grad_norm": 1.1865092515945435, "learning_rate": 6.111327364733359e-05, "loss": 0.9417662620544434, "memory(GiB)": 89.13, "step": 4710, "token_acc": 0.7574881500655528, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.06118033276132209, "grad_norm": 1.1332039833068848, "learning_rate": 6.117814973400805e-05, "loss": 0.9135602951049805, "memory(GiB)": 89.13, "step": 4715, "token_acc": 0.7458685273595299, "train_speed(iter/s)": 0.121468 }, { "epoch": 0.06124521116297779, "grad_norm": 1.0099492073059082, "learning_rate": 6.12430258206825e-05, "loss": 0.8639090538024903, "memory(GiB)": 89.13, "step": 4720, "token_acc": 0.7473911747167561, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.061310089564633484, "grad_norm": 1.0087194442749023, "learning_rate": 6.130790190735695e-05, "loss": 0.8761437416076661, "memory(GiB)": 89.13, "step": 4725, "token_acc": 0.7628216410292513, "train_speed(iter/s)": 0.121466 }, { "epoch": 0.06137496796628918, "grad_norm": 1.2465740442276, "learning_rate": 6.13727779940314e-05, "loss": 0.923908805847168, "memory(GiB)": 89.13, "step": 4730, "token_acc": 0.7590918281868339, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.06143984636794488, "grad_norm": 1.1543749570846558, "learning_rate": 6.143765408070586e-05, "loss": 0.8899727821350097, "memory(GiB)": 89.13, "step": 4735, "token_acc": 0.7419749532172161, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.061504724769600574, "grad_norm": 2.170117139816284, "learning_rate": 6.15025301673803e-05, "loss": 0.8736867904663086, "memory(GiB)": 89.13, "step": 4740, "token_acc": 0.7456489308801592, "train_speed(iter/s)": 0.121472 }, { "epoch": 0.061569603171256274, "grad_norm": 1.0569711923599243, "learning_rate": 6.156740625405476e-05, "loss": 0.902427864074707, "memory(GiB)": 89.13, "step": 4745, "token_acc": 0.7340725875029763, "train_speed(iter/s)": 0.121472 }, { "epoch": 0.06163448157291197, "grad_norm": 1.0130361318588257, "learning_rate": 6.16322823407292e-05, "loss": 0.8965718269348144, "memory(GiB)": 89.13, "step": 4750, "token_acc": 0.7673485116084904, "train_speed(iter/s)": 0.121467 }, { "epoch": 0.061699359974567665, "grad_norm": 0.9773524403572083, "learning_rate": 6.169715842740367e-05, "loss": 0.9018101692199707, "memory(GiB)": 89.13, "step": 4755, "token_acc": 0.7442748091603053, "train_speed(iter/s)": 0.12146 }, { "epoch": 0.061764238376223364, "grad_norm": 1.089308738708496, "learning_rate": 6.176203451407811e-05, "loss": 0.9104293823242188, "memory(GiB)": 89.13, "step": 4760, "token_acc": 0.7474352229383138, "train_speed(iter/s)": 0.121462 }, { "epoch": 0.06182911677787906, "grad_norm": 1.0047681331634521, "learning_rate": 6.182691060075256e-05, "loss": 0.9021093368530273, "memory(GiB)": 89.13, "step": 4765, "token_acc": 0.7608958837772397, "train_speed(iter/s)": 0.121463 }, { "epoch": 0.061893995179534755, "grad_norm": 1.0581772327423096, "learning_rate": 6.189178668742703e-05, "loss": 0.8536214828491211, "memory(GiB)": 89.13, "step": 4770, "token_acc": 0.7734760164723624, "train_speed(iter/s)": 0.121458 }, { "epoch": 0.061958873581190455, "grad_norm": 1.0880358219146729, "learning_rate": 6.195666277410147e-05, "loss": 0.914603328704834, "memory(GiB)": 89.13, "step": 4775, "token_acc": 0.7315964476619387, "train_speed(iter/s)": 0.121464 }, { "epoch": 0.062023751982846154, "grad_norm": 1.2182490825653076, "learning_rate": 6.202153886077591e-05, "loss": 0.9033641815185547, "memory(GiB)": 89.13, "step": 4780, "token_acc": 0.7450039306549713, "train_speed(iter/s)": 0.121469 }, { "epoch": 0.062088630384501846, "grad_norm": 1.1287198066711426, "learning_rate": 6.208641494745037e-05, "loss": 0.9341679573059082, "memory(GiB)": 89.13, "step": 4785, "token_acc": 0.7449211418787659, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.062153508786157545, "grad_norm": 0.9711542129516602, "learning_rate": 6.215129103412483e-05, "loss": 0.8813105583190918, "memory(GiB)": 89.13, "step": 4790, "token_acc": 0.7458216489063376, "train_speed(iter/s)": 0.121475 }, { "epoch": 0.062218387187813244, "grad_norm": 1.0678625106811523, "learning_rate": 6.221616712079927e-05, "loss": 0.8815884590148926, "memory(GiB)": 89.13, "step": 4795, "token_acc": 0.7526869682042095, "train_speed(iter/s)": 0.12148 }, { "epoch": 0.062283265589468936, "grad_norm": 0.9936701655387878, "learning_rate": 6.228104320747373e-05, "loss": 0.9298961639404297, "memory(GiB)": 89.13, "step": 4800, "token_acc": 0.7557451443093121, "train_speed(iter/s)": 0.12148 }, { "epoch": 0.062348143991124635, "grad_norm": 1.0189580917358398, "learning_rate": 6.234591929414818e-05, "loss": 0.9238823890686035, "memory(GiB)": 89.13, "step": 4805, "token_acc": 0.7385456223160733, "train_speed(iter/s)": 0.12148 }, { "epoch": 0.062413022392780335, "grad_norm": 1.1402579545974731, "learning_rate": 6.241079538082263e-05, "loss": 0.9314916610717774, "memory(GiB)": 89.13, "step": 4810, "token_acc": 0.7472272635612018, "train_speed(iter/s)": 0.121485 }, { "epoch": 0.06247790079443603, "grad_norm": 1.1405739784240723, "learning_rate": 6.247567146749708e-05, "loss": 0.9202508926391602, "memory(GiB)": 89.13, "step": 4815, "token_acc": 0.7560222974318137, "train_speed(iter/s)": 0.12149 }, { "epoch": 0.06254277919609172, "grad_norm": 0.9781814813613892, "learning_rate": 6.254054755417154e-05, "loss": 0.8993735313415527, "memory(GiB)": 89.13, "step": 4820, "token_acc": 0.7657095297544736, "train_speed(iter/s)": 0.121487 }, { "epoch": 0.06260765759774742, "grad_norm": 1.0642772912979126, "learning_rate": 6.260542364084598e-05, "loss": 0.9382001876831054, "memory(GiB)": 89.13, "step": 4825, "token_acc": 0.761605521249546, "train_speed(iter/s)": 0.121487 }, { "epoch": 0.06267253599940312, "grad_norm": 0.9625418186187744, "learning_rate": 6.267029972752044e-05, "loss": 0.9082439422607422, "memory(GiB)": 89.13, "step": 4830, "token_acc": 0.7511016517281693, "train_speed(iter/s)": 0.121483 }, { "epoch": 0.06273741440105882, "grad_norm": 1.008692741394043, "learning_rate": 6.27351758141949e-05, "loss": 0.9233821868896485, "memory(GiB)": 89.13, "step": 4835, "token_acc": 0.7400064881231302, "train_speed(iter/s)": 0.121479 }, { "epoch": 0.06280229280271452, "grad_norm": 1.1114509105682373, "learning_rate": 6.280005190086934e-05, "loss": 0.9248848915100097, "memory(GiB)": 89.13, "step": 4840, "token_acc": 0.7393499410377359, "train_speed(iter/s)": 0.121482 }, { "epoch": 0.06286717120437021, "grad_norm": 1.0264514684677124, "learning_rate": 6.28649279875438e-05, "loss": 0.9083274841308594, "memory(GiB)": 89.13, "step": 4845, "token_acc": 0.7401300862199365, "train_speed(iter/s)": 0.121483 }, { "epoch": 0.0629320496060259, "grad_norm": 1.0873587131500244, "learning_rate": 6.292980407421825e-05, "loss": 0.9194437980651855, "memory(GiB)": 89.13, "step": 4850, "token_acc": 0.7328738366804283, "train_speed(iter/s)": 0.12148 }, { "epoch": 0.0629969280076816, "grad_norm": 0.9940730333328247, "learning_rate": 6.29946801608927e-05, "loss": 0.9204539299011231, "memory(GiB)": 89.13, "step": 4855, "token_acc": 0.7573661598464304, "train_speed(iter/s)": 0.121477 }, { "epoch": 0.0630618064093373, "grad_norm": 1.0581135749816895, "learning_rate": 6.305955624756715e-05, "loss": 0.9221965789794921, "memory(GiB)": 89.13, "step": 4860, "token_acc": 0.7570750164086328, "train_speed(iter/s)": 0.121482 }, { "epoch": 0.063126684810993, "grad_norm": 1.0865108966827393, "learning_rate": 6.31244323342416e-05, "loss": 0.8740836143493652, "memory(GiB)": 89.13, "step": 4865, "token_acc": 0.7441937077911382, "train_speed(iter/s)": 0.121482 }, { "epoch": 0.0631915632126487, "grad_norm": 1.0807243585586548, "learning_rate": 6.318930842091605e-05, "loss": 0.8995683670043946, "memory(GiB)": 89.13, "step": 4870, "token_acc": 0.765606230867594, "train_speed(iter/s)": 0.121484 }, { "epoch": 0.0632564416143044, "grad_norm": 0.9238796234130859, "learning_rate": 6.325418450759051e-05, "loss": 0.9189419746398926, "memory(GiB)": 89.13, "step": 4875, "token_acc": 0.7409023040665164, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.06332132001596008, "grad_norm": 1.0592297315597534, "learning_rate": 6.331906059426495e-05, "loss": 0.8953393936157227, "memory(GiB)": 89.13, "step": 4880, "token_acc": 0.7685466377440348, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.06338619841761578, "grad_norm": 1.2207541465759277, "learning_rate": 6.338393668093941e-05, "loss": 0.9054292678833008, "memory(GiB)": 89.13, "step": 4885, "token_acc": 0.7434224598930481, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.06345107681927148, "grad_norm": 1.0733221769332886, "learning_rate": 6.344881276761387e-05, "loss": 0.9064373016357422, "memory(GiB)": 89.13, "step": 4890, "token_acc": 0.7423575370496682, "train_speed(iter/s)": 0.121474 }, { "epoch": 0.06351595522092718, "grad_norm": 0.9153972268104553, "learning_rate": 6.351368885428831e-05, "loss": 0.8933368682861328, "memory(GiB)": 89.13, "step": 4895, "token_acc": 0.7652824517267292, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.06358083362258288, "grad_norm": 1.016236424446106, "learning_rate": 6.357856494096276e-05, "loss": 0.9363498687744141, "memory(GiB)": 89.13, "step": 4900, "token_acc": 0.7576484905851729, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.06364571202423858, "grad_norm": 0.92414391040802, "learning_rate": 6.364344102763722e-05, "loss": 0.8670162200927735, "memory(GiB)": 89.13, "step": 4905, "token_acc": 0.7662131909764078, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.06371059042589426, "grad_norm": 0.9933594465255737, "learning_rate": 6.370831711431166e-05, "loss": 0.8822515487670899, "memory(GiB)": 89.13, "step": 4910, "token_acc": 0.7547053616104076, "train_speed(iter/s)": 0.121472 }, { "epoch": 0.06377546882754996, "grad_norm": 0.9672484993934631, "learning_rate": 6.377319320098612e-05, "loss": 0.9255411148071289, "memory(GiB)": 89.13, "step": 4915, "token_acc": 0.7363398882519411, "train_speed(iter/s)": 0.121473 }, { "epoch": 0.06384034722920566, "grad_norm": 1.1095560789108276, "learning_rate": 6.383806928766058e-05, "loss": 0.9307450294494629, "memory(GiB)": 89.13, "step": 4920, "token_acc": 0.7439748649373539, "train_speed(iter/s)": 0.121476 }, { "epoch": 0.06390522563086136, "grad_norm": 1.167967677116394, "learning_rate": 6.390294537433502e-05, "loss": 0.9258286476135253, "memory(GiB)": 89.13, "step": 4925, "token_acc": 0.7728003765739615, "train_speed(iter/s)": 0.121482 }, { "epoch": 0.06397010403251706, "grad_norm": 0.9936275482177734, "learning_rate": 6.396782146100946e-05, "loss": 0.9391070365905761, "memory(GiB)": 89.13, "step": 4930, "token_acc": 0.730125160350865, "train_speed(iter/s)": 0.121487 }, { "epoch": 0.06403498243417276, "grad_norm": 1.2510733604431152, "learning_rate": 6.403269754768393e-05, "loss": 0.9098219871520996, "memory(GiB)": 89.13, "step": 4935, "token_acc": 0.7326812227074235, "train_speed(iter/s)": 0.121488 }, { "epoch": 0.06409986083582844, "grad_norm": 0.9704006910324097, "learning_rate": 6.409757363435838e-05, "loss": 0.9061588287353516, "memory(GiB)": 89.13, "step": 4940, "token_acc": 0.7535617148037819, "train_speed(iter/s)": 0.121489 }, { "epoch": 0.06416473923748414, "grad_norm": 1.1966700553894043, "learning_rate": 6.416244972103283e-05, "loss": 0.8993711471557617, "memory(GiB)": 89.13, "step": 4945, "token_acc": 0.740018703690382, "train_speed(iter/s)": 0.12149 }, { "epoch": 0.06422961763913984, "grad_norm": 1.1017775535583496, "learning_rate": 6.422732580770729e-05, "loss": 0.9450142860412598, "memory(GiB)": 89.13, "step": 4950, "token_acc": 0.7361790958825223, "train_speed(iter/s)": 0.121494 }, { "epoch": 0.06429449604079554, "grad_norm": 0.9858067631721497, "learning_rate": 6.429220189438173e-05, "loss": 0.9639873504638672, "memory(GiB)": 89.13, "step": 4955, "token_acc": 0.7515348358220779, "train_speed(iter/s)": 0.12149 }, { "epoch": 0.06435937444245124, "grad_norm": 1.0174654722213745, "learning_rate": 6.435707798105619e-05, "loss": 0.9351696014404297, "memory(GiB)": 89.13, "step": 4960, "token_acc": 0.7315719947159841, "train_speed(iter/s)": 0.12149 }, { "epoch": 0.06442425284410694, "grad_norm": 1.1244608163833618, "learning_rate": 6.442195406773063e-05, "loss": 0.9293685913085937, "memory(GiB)": 89.13, "step": 4965, "token_acc": 0.7373775591709757, "train_speed(iter/s)": 0.121496 }, { "epoch": 0.06448913124576262, "grad_norm": 0.981587290763855, "learning_rate": 6.448683015440509e-05, "loss": 0.8731257438659668, "memory(GiB)": 89.13, "step": 4970, "token_acc": 0.7581288614298323, "train_speed(iter/s)": 0.1215 }, { "epoch": 0.06455400964741832, "grad_norm": 0.9737399816513062, "learning_rate": 6.455170624107955e-05, "loss": 0.9067255020141601, "memory(GiB)": 89.13, "step": 4975, "token_acc": 0.7603876135307158, "train_speed(iter/s)": 0.121497 }, { "epoch": 0.06461888804907402, "grad_norm": 0.9746440649032593, "learning_rate": 6.461658232775399e-05, "loss": 0.9216492652893067, "memory(GiB)": 89.13, "step": 4980, "token_acc": 0.7592160673793327, "train_speed(iter/s)": 0.121503 }, { "epoch": 0.06468376645072972, "grad_norm": 1.050279140472412, "learning_rate": 6.468145841442845e-05, "loss": 0.9299222946166992, "memory(GiB)": 89.13, "step": 4985, "token_acc": 0.7463749110120275, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.06474864485238542, "grad_norm": 0.9887217879295349, "learning_rate": 6.47463345011029e-05, "loss": 0.8897027015686035, "memory(GiB)": 89.13, "step": 4990, "token_acc": 0.7460763090244893, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.06481352325404112, "grad_norm": 1.0642507076263428, "learning_rate": 6.481121058777735e-05, "loss": 0.8854730606079102, "memory(GiB)": 89.13, "step": 4995, "token_acc": 0.7569210547933952, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.0648784016556968, "grad_norm": 1.058918833732605, "learning_rate": 6.48760866744518e-05, "loss": 0.9478899955749511, "memory(GiB)": 89.13, "step": 5000, "token_acc": 0.748680618744313, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.0649432800573525, "grad_norm": 1.103624939918518, "learning_rate": 6.494096276112626e-05, "loss": 0.9307183265686035, "memory(GiB)": 89.13, "step": 5005, "token_acc": 0.7367057938928109, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.0650081584590082, "grad_norm": 0.9880870580673218, "learning_rate": 6.50058388478007e-05, "loss": 0.9222283363342285, "memory(GiB)": 89.13, "step": 5010, "token_acc": 0.7502235674485794, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.0650730368606639, "grad_norm": 1.0368711948394775, "learning_rate": 6.507071493447516e-05, "loss": 0.8947784423828125, "memory(GiB)": 89.13, "step": 5015, "token_acc": 0.7440548166062072, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.0651379152623196, "grad_norm": 1.0202633142471313, "learning_rate": 6.513559102114962e-05, "loss": 0.9232219696044922, "memory(GiB)": 89.13, "step": 5020, "token_acc": 0.7366140255843994, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.0652027936639753, "grad_norm": 1.294553518295288, "learning_rate": 6.520046710782406e-05, "loss": 0.930865478515625, "memory(GiB)": 89.13, "step": 5025, "token_acc": 0.7271671826625387, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.06526767206563099, "grad_norm": 0.9746602177619934, "learning_rate": 6.52653431944985e-05, "loss": 0.9028520584106445, "memory(GiB)": 89.13, "step": 5030, "token_acc": 0.727463638484872, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.06533255046728668, "grad_norm": 0.9946491122245789, "learning_rate": 6.533021928117297e-05, "loss": 0.9096492767333985, "memory(GiB)": 89.13, "step": 5035, "token_acc": 0.7486835139728623, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.06539742886894238, "grad_norm": 1.1069202423095703, "learning_rate": 6.539509536784741e-05, "loss": 0.9516725540161133, "memory(GiB)": 89.13, "step": 5040, "token_acc": 0.7247577710552852, "train_speed(iter/s)": 0.121505 }, { "epoch": 0.06546230727059808, "grad_norm": 0.9777161478996277, "learning_rate": 6.545997145452186e-05, "loss": 0.919002628326416, "memory(GiB)": 89.13, "step": 5045, "token_acc": 0.7414027074438155, "train_speed(iter/s)": 0.121506 }, { "epoch": 0.06552718567225378, "grad_norm": 1.1019026041030884, "learning_rate": 6.552484754119633e-05, "loss": 0.9332873344421386, "memory(GiB)": 89.13, "step": 5050, "token_acc": 0.7417789523850786, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.06559206407390948, "grad_norm": 1.0337270498275757, "learning_rate": 6.558972362787077e-05, "loss": 0.9482204437255859, "memory(GiB)": 89.13, "step": 5055, "token_acc": 0.7204039246580248, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.06565694247556517, "grad_norm": 0.9489880204200745, "learning_rate": 6.565459971454521e-05, "loss": 0.9713048934936523, "memory(GiB)": 89.13, "step": 5060, "token_acc": 0.71129656014655, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.06572182087722087, "grad_norm": 1.070723533630371, "learning_rate": 6.571947580121967e-05, "loss": 0.9092426300048828, "memory(GiB)": 89.13, "step": 5065, "token_acc": 0.7555900830912653, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.06578669927887656, "grad_norm": 1.1718484163284302, "learning_rate": 6.578435188789413e-05, "loss": 0.9172409057617188, "memory(GiB)": 89.13, "step": 5070, "token_acc": 0.7305451058493767, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.06585157768053226, "grad_norm": 1.05293607711792, "learning_rate": 6.584922797456857e-05, "loss": 0.8985922813415528, "memory(GiB)": 89.13, "step": 5075, "token_acc": 0.7663328378061185, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.06591645608218796, "grad_norm": 1.1562997102737427, "learning_rate": 6.591410406124303e-05, "loss": 0.9686059951782227, "memory(GiB)": 89.13, "step": 5080, "token_acc": 0.740755082284608, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.06598133448384366, "grad_norm": 1.0622502565383911, "learning_rate": 6.597898014791748e-05, "loss": 0.9421125411987304, "memory(GiB)": 89.13, "step": 5085, "token_acc": 0.745593237810535, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.06604621288549935, "grad_norm": 1.0485626459121704, "learning_rate": 6.604385623459193e-05, "loss": 0.935035514831543, "memory(GiB)": 89.13, "step": 5090, "token_acc": 0.732289336316182, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.06611109128715505, "grad_norm": 1.054581642150879, "learning_rate": 6.610873232126638e-05, "loss": 0.8976299285888671, "memory(GiB)": 89.13, "step": 5095, "token_acc": 0.7545189504373178, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.06617596968881075, "grad_norm": 0.9608541131019592, "learning_rate": 6.617360840794084e-05, "loss": 0.8979265213012695, "memory(GiB)": 89.13, "step": 5100, "token_acc": 0.7583422359077484, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.06624084809046644, "grad_norm": 1.1504567861557007, "learning_rate": 6.623848449461528e-05, "loss": 0.9229921340942383, "memory(GiB)": 89.13, "step": 5105, "token_acc": 0.7427220438265288, "train_speed(iter/s)": 0.121508 }, { "epoch": 0.06630572649212214, "grad_norm": 1.071410059928894, "learning_rate": 6.630336058128974e-05, "loss": 0.9423105239868164, "memory(GiB)": 89.13, "step": 5110, "token_acc": 0.7517243655324907, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.06637060489377784, "grad_norm": 1.0804998874664307, "learning_rate": 6.63682366679642e-05, "loss": 0.9342281341552734, "memory(GiB)": 89.13, "step": 5115, "token_acc": 0.7536287242169595, "train_speed(iter/s)": 0.121507 }, { "epoch": 0.06643548329543353, "grad_norm": 1.009093999862671, "learning_rate": 6.643311275463865e-05, "loss": 0.9181487083435058, "memory(GiB)": 89.13, "step": 5120, "token_acc": 0.7281560088202866, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.06650036169708923, "grad_norm": 1.132123589515686, "learning_rate": 6.64979888413131e-05, "loss": 0.9303798675537109, "memory(GiB)": 89.13, "step": 5125, "token_acc": 0.7392009046362609, "train_speed(iter/s)": 0.121509 }, { "epoch": 0.06656524009874493, "grad_norm": 0.935897171497345, "learning_rate": 6.656286492798754e-05, "loss": 0.8837145805358887, "memory(GiB)": 89.13, "step": 5130, "token_acc": 0.7618521078092605, "train_speed(iter/s)": 0.121507 }, { "epoch": 0.06663011850040063, "grad_norm": 1.0280932188034058, "learning_rate": 6.662774101466201e-05, "loss": 0.9458621978759766, "memory(GiB)": 89.13, "step": 5135, "token_acc": 0.7336519882029899, "train_speed(iter/s)": 0.121507 }, { "epoch": 0.06669499690205632, "grad_norm": 1.0440022945404053, "learning_rate": 6.669261710133645e-05, "loss": 0.9194658279418946, "memory(GiB)": 89.13, "step": 5140, "token_acc": 0.7514073336858147, "train_speed(iter/s)": 0.121507 }, { "epoch": 0.06675987530371202, "grad_norm": 1.0891461372375488, "learning_rate": 6.67574931880109e-05, "loss": 0.9390763282775879, "memory(GiB)": 89.13, "step": 5145, "token_acc": 0.7242153228823069, "train_speed(iter/s)": 0.121505 }, { "epoch": 0.06682475370536771, "grad_norm": 1.0804049968719482, "learning_rate": 6.682236927468537e-05, "loss": 0.9693042755126953, "memory(GiB)": 89.13, "step": 5150, "token_acc": 0.7512222891969728, "train_speed(iter/s)": 0.121507 }, { "epoch": 0.06688963210702341, "grad_norm": 1.0359652042388916, "learning_rate": 6.688724536135981e-05, "loss": 0.8851705551147461, "memory(GiB)": 89.13, "step": 5155, "token_acc": 0.7527245518722144, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.06695451050867911, "grad_norm": 1.070901870727539, "learning_rate": 6.695212144803425e-05, "loss": 0.9418327331542968, "memory(GiB)": 89.13, "step": 5160, "token_acc": 0.7402883609741521, "train_speed(iter/s)": 0.121511 }, { "epoch": 0.0670193889103348, "grad_norm": 1.0741757154464722, "learning_rate": 6.701699753470871e-05, "loss": 0.9076757431030273, "memory(GiB)": 89.13, "step": 5165, "token_acc": 0.753298969072165, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.0670842673119905, "grad_norm": 1.0970572233200073, "learning_rate": 6.708187362138317e-05, "loss": 0.9582727432250977, "memory(GiB)": 89.13, "step": 5170, "token_acc": 0.7256037049288786, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.0671491457136462, "grad_norm": 1.2013864517211914, "learning_rate": 6.714674970805761e-05, "loss": 0.8988889694213867, "memory(GiB)": 89.13, "step": 5175, "token_acc": 0.7603768685782114, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.06721402411530189, "grad_norm": 0.993699312210083, "learning_rate": 6.721162579473206e-05, "loss": 0.916262435913086, "memory(GiB)": 89.13, "step": 5180, "token_acc": 0.7479571710340941, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.06727890251695759, "grad_norm": 1.016563057899475, "learning_rate": 6.727650188140652e-05, "loss": 0.951176643371582, "memory(GiB)": 89.13, "step": 5185, "token_acc": 0.7481863230921705, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.06734378091861329, "grad_norm": 1.009920358657837, "learning_rate": 6.734137796808096e-05, "loss": 0.930506420135498, "memory(GiB)": 89.13, "step": 5190, "token_acc": 0.7547582436357424, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.06740865932026899, "grad_norm": 1.0946837663650513, "learning_rate": 6.740625405475542e-05, "loss": 0.8896492004394532, "memory(GiB)": 89.13, "step": 5195, "token_acc": 0.7652166234989404, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.06747353772192469, "grad_norm": 1.0577083826065063, "learning_rate": 6.747113014142988e-05, "loss": 0.9249481201171875, "memory(GiB)": 89.13, "step": 5200, "token_acc": 0.7516017534985668, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.06753841612358039, "grad_norm": 1.0535693168640137, "learning_rate": 6.753600622810432e-05, "loss": 0.9122654914855957, "memory(GiB)": 89.13, "step": 5205, "token_acc": 0.7290145683987975, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.06760329452523607, "grad_norm": 1.039925456047058, "learning_rate": 6.760088231477878e-05, "loss": 0.9545976638793945, "memory(GiB)": 89.13, "step": 5210, "token_acc": 0.7506397410205026, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.06766817292689177, "grad_norm": 1.0343570709228516, "learning_rate": 6.766575840145323e-05, "loss": 0.9407179832458497, "memory(GiB)": 89.13, "step": 5215, "token_acc": 0.7349133502294678, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.06773305132854747, "grad_norm": 1.0509949922561646, "learning_rate": 6.773063448812768e-05, "loss": 0.9406131744384766, "memory(GiB)": 89.13, "step": 5220, "token_acc": 0.7273903952739703, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.06779792973020317, "grad_norm": 0.943897008895874, "learning_rate": 6.779551057480213e-05, "loss": 0.9385345458984375, "memory(GiB)": 89.13, "step": 5225, "token_acc": 0.7431025596976465, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.06786280813185887, "grad_norm": 1.121387004852295, "learning_rate": 6.786038666147658e-05, "loss": 0.9124290466308593, "memory(GiB)": 89.13, "step": 5230, "token_acc": 0.7449918986595964, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.06792768653351457, "grad_norm": 1.0212600231170654, "learning_rate": 6.792526274815103e-05, "loss": 0.9511608123779297, "memory(GiB)": 89.13, "step": 5235, "token_acc": 0.7390080379301404, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.06799256493517025, "grad_norm": 1.2054508924484253, "learning_rate": 6.799013883482549e-05, "loss": 0.9155834197998047, "memory(GiB)": 89.13, "step": 5240, "token_acc": 0.7507225222595242, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.06805744333682595, "grad_norm": 0.9439337849617004, "learning_rate": 6.805501492149993e-05, "loss": 0.9487483978271485, "memory(GiB)": 89.13, "step": 5245, "token_acc": 0.7410755750952042, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.06812232173848165, "grad_norm": 1.0265179872512817, "learning_rate": 6.811989100817439e-05, "loss": 0.9032848358154297, "memory(GiB)": 89.13, "step": 5250, "token_acc": 0.7636225115108152, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.06818720014013735, "grad_norm": 1.1093782186508179, "learning_rate": 6.818476709484885e-05, "loss": 0.9253065109252929, "memory(GiB)": 89.13, "step": 5255, "token_acc": 0.7546918817861509, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.06825207854179305, "grad_norm": 1.0422639846801758, "learning_rate": 6.824964318152329e-05, "loss": 0.9542154312133789, "memory(GiB)": 89.13, "step": 5260, "token_acc": 0.73834628190899, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.06831695694344875, "grad_norm": 0.9767162203788757, "learning_rate": 6.831451926819775e-05, "loss": 0.8862668037414551, "memory(GiB)": 89.13, "step": 5265, "token_acc": 0.7540023053278688, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.06838183534510443, "grad_norm": 1.17781662940979, "learning_rate": 6.83793953548722e-05, "loss": 1.0046197891235351, "memory(GiB)": 89.13, "step": 5270, "token_acc": 0.7235613941355794, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.06844671374676013, "grad_norm": 1.1401870250701904, "learning_rate": 6.844427144154665e-05, "loss": 0.9573972702026368, "memory(GiB)": 89.13, "step": 5275, "token_acc": 0.7304093389009113, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.06851159214841583, "grad_norm": 0.907983124256134, "learning_rate": 6.85091475282211e-05, "loss": 0.9024090766906738, "memory(GiB)": 89.13, "step": 5280, "token_acc": 0.763508897008585, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.06857647055007153, "grad_norm": 1.061275839805603, "learning_rate": 6.857402361489556e-05, "loss": 0.9061775207519531, "memory(GiB)": 89.13, "step": 5285, "token_acc": 0.7372872396659267, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.06864134895172723, "grad_norm": 1.0319205522537231, "learning_rate": 6.863889970157e-05, "loss": 0.9033396720886231, "memory(GiB)": 89.13, "step": 5290, "token_acc": 0.7567644545713472, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.06870622735338293, "grad_norm": 0.9804749488830566, "learning_rate": 6.870377578824445e-05, "loss": 0.9476816177368164, "memory(GiB)": 89.13, "step": 5295, "token_acc": 0.728016102654422, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.06877110575503861, "grad_norm": 1.034327745437622, "learning_rate": 6.876865187491892e-05, "loss": 0.925117301940918, "memory(GiB)": 89.13, "step": 5300, "token_acc": 0.7243687454437007, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.06883598415669431, "grad_norm": 0.9254536628723145, "learning_rate": 6.883352796159336e-05, "loss": 0.8768438339233399, "memory(GiB)": 89.13, "step": 5305, "token_acc": 0.7577259095058025, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.06890086255835001, "grad_norm": 0.9365211725234985, "learning_rate": 6.889840404826782e-05, "loss": 0.8785717010498046, "memory(GiB)": 89.13, "step": 5310, "token_acc": 0.7562413184667943, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.06896574096000571, "grad_norm": 1.0425803661346436, "learning_rate": 6.896328013494227e-05, "loss": 0.8829265594482422, "memory(GiB)": 89.13, "step": 5315, "token_acc": 0.7440580903544559, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.06903061936166141, "grad_norm": 0.9716393351554871, "learning_rate": 6.902815622161671e-05, "loss": 0.9086591720581054, "memory(GiB)": 89.13, "step": 5320, "token_acc": 0.7516525694789964, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.06909549776331711, "grad_norm": 1.0456448793411255, "learning_rate": 6.909303230829117e-05, "loss": 0.9499615669250489, "memory(GiB)": 89.13, "step": 5325, "token_acc": 0.7623956968484932, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.0691603761649728, "grad_norm": 1.0474071502685547, "learning_rate": 6.915790839496561e-05, "loss": 0.9380897521972656, "memory(GiB)": 89.13, "step": 5330, "token_acc": 0.7430475410446372, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.0692252545666285, "grad_norm": 1.060974359512329, "learning_rate": 6.922278448164007e-05, "loss": 0.9023077011108398, "memory(GiB)": 89.13, "step": 5335, "token_acc": 0.7545652824701706, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.0692901329682842, "grad_norm": 1.1755248308181763, "learning_rate": 6.928766056831453e-05, "loss": 0.9593148231506348, "memory(GiB)": 89.13, "step": 5340, "token_acc": 0.715796845053811, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.06935501136993989, "grad_norm": 1.03705632686615, "learning_rate": 6.935253665498897e-05, "loss": 0.9404752731323243, "memory(GiB)": 89.13, "step": 5345, "token_acc": 0.744951534733441, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.06941988977159559, "grad_norm": 0.9529651999473572, "learning_rate": 6.941741274166343e-05, "loss": 0.952173900604248, "memory(GiB)": 89.13, "step": 5350, "token_acc": 0.7168871116579514, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.06948476817325129, "grad_norm": 1.0280667543411255, "learning_rate": 6.948228882833788e-05, "loss": 0.9310410499572754, "memory(GiB)": 89.13, "step": 5355, "token_acc": 0.7533134248230043, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.06954964657490698, "grad_norm": 1.0115230083465576, "learning_rate": 6.954716491501233e-05, "loss": 0.9522171020507812, "memory(GiB)": 89.13, "step": 5360, "token_acc": 0.7523430731668899, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.06961452497656268, "grad_norm": 0.9407137632369995, "learning_rate": 6.961204100168678e-05, "loss": 0.9279073715209961, "memory(GiB)": 89.13, "step": 5365, "token_acc": 0.7394784995425434, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.06967940337821837, "grad_norm": 1.1572344303131104, "learning_rate": 6.967691708836124e-05, "loss": 0.9374675750732422, "memory(GiB)": 89.13, "step": 5370, "token_acc": 0.7500084365403435, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.06974428177987407, "grad_norm": 1.1176848411560059, "learning_rate": 6.974179317503568e-05, "loss": 0.9485801696777344, "memory(GiB)": 89.13, "step": 5375, "token_acc": 0.7366196695188277, "train_speed(iter/s)": 0.121537 }, { "epoch": 0.06980916018152977, "grad_norm": 1.0309666395187378, "learning_rate": 6.980666926171014e-05, "loss": 0.9375093460083008, "memory(GiB)": 89.13, "step": 5380, "token_acc": 0.7361746361746362, "train_speed(iter/s)": 0.121537 }, { "epoch": 0.06987403858318547, "grad_norm": 1.0901654958724976, "learning_rate": 6.98715453483846e-05, "loss": 0.9057474136352539, "memory(GiB)": 89.13, "step": 5385, "token_acc": 0.7511641830761043, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.06993891698484116, "grad_norm": 1.1034226417541504, "learning_rate": 6.993642143505904e-05, "loss": 0.9256601333618164, "memory(GiB)": 89.13, "step": 5390, "token_acc": 0.7354785078452426, "train_speed(iter/s)": 0.121537 }, { "epoch": 0.07000379538649686, "grad_norm": 0.9560639262199402, "learning_rate": 7.000129752173348e-05, "loss": 0.9481281280517578, "memory(GiB)": 89.13, "step": 5395, "token_acc": 0.7240198785201546, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.07006867378815256, "grad_norm": 1.1256650686264038, "learning_rate": 7.006617360840795e-05, "loss": 0.9110755920410156, "memory(GiB)": 89.13, "step": 5400, "token_acc": 0.74941625588377, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.07013355218980825, "grad_norm": 1.0619361400604248, "learning_rate": 7.01310496950824e-05, "loss": 0.9047711372375489, "memory(GiB)": 89.13, "step": 5405, "token_acc": 0.7442294067196181, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.07019843059146395, "grad_norm": 1.0509403944015503, "learning_rate": 7.019592578175684e-05, "loss": 0.9520758628845215, "memory(GiB)": 89.13, "step": 5410, "token_acc": 0.7380303151816505, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.07026330899311964, "grad_norm": 0.9449858069419861, "learning_rate": 7.026080186843131e-05, "loss": 0.9227798461914063, "memory(GiB)": 89.13, "step": 5415, "token_acc": 0.7368435951875443, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.07032818739477534, "grad_norm": 1.0253961086273193, "learning_rate": 7.032567795510575e-05, "loss": 0.9371507644653321, "memory(GiB)": 89.13, "step": 5420, "token_acc": 0.7299290099111767, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.07039306579643104, "grad_norm": 0.9937489032745361, "learning_rate": 7.03905540417802e-05, "loss": 0.9021520614624023, "memory(GiB)": 89.13, "step": 5425, "token_acc": 0.7488312216274815, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.07045794419808674, "grad_norm": 1.1577519178390503, "learning_rate": 7.045543012845465e-05, "loss": 0.9579016685485839, "memory(GiB)": 89.13, "step": 5430, "token_acc": 0.7204434516422603, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.07052282259974244, "grad_norm": 0.9286018013954163, "learning_rate": 7.052030621512911e-05, "loss": 0.8851480484008789, "memory(GiB)": 89.13, "step": 5435, "token_acc": 0.7595369943085679, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.07058770100139813, "grad_norm": 1.0856138467788696, "learning_rate": 7.058518230180355e-05, "loss": 0.9110952377319336, "memory(GiB)": 89.13, "step": 5440, "token_acc": 0.7601781909742398, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.07065257940305382, "grad_norm": 0.9981244802474976, "learning_rate": 7.065005838847801e-05, "loss": 0.9173169136047363, "memory(GiB)": 89.13, "step": 5445, "token_acc": 0.7507821698906644, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.07071745780470952, "grad_norm": 1.1900315284729004, "learning_rate": 7.071493447515247e-05, "loss": 0.944366455078125, "memory(GiB)": 89.13, "step": 5450, "token_acc": 0.7366414907884521, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.07078233620636522, "grad_norm": 0.944852352142334, "learning_rate": 7.077981056182691e-05, "loss": 0.9317392349243164, "memory(GiB)": 89.13, "step": 5455, "token_acc": 0.7608688180912122, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07084721460802092, "grad_norm": 0.9159605503082275, "learning_rate": 7.084468664850136e-05, "loss": 0.9082883834838867, "memory(GiB)": 89.13, "step": 5460, "token_acc": 0.7343308974210262, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07091209300967662, "grad_norm": 0.954649806022644, "learning_rate": 7.090956273517582e-05, "loss": 0.9008567810058594, "memory(GiB)": 89.13, "step": 5465, "token_acc": 0.7386726893676164, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07097697141133232, "grad_norm": 0.9949111342430115, "learning_rate": 7.097443882185026e-05, "loss": 0.9537612915039062, "memory(GiB)": 89.13, "step": 5470, "token_acc": 0.7295258620689655, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.071041849812988, "grad_norm": 1.0883857011795044, "learning_rate": 7.103931490852472e-05, "loss": 0.9504097938537598, "memory(GiB)": 89.13, "step": 5475, "token_acc": 0.7432103839172575, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.0711067282146437, "grad_norm": 1.0543768405914307, "learning_rate": 7.110419099519918e-05, "loss": 0.9704625129699707, "memory(GiB)": 89.13, "step": 5480, "token_acc": 0.7294284400858106, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.0711716066162994, "grad_norm": 1.118267297744751, "learning_rate": 7.116906708187362e-05, "loss": 0.9375407218933105, "memory(GiB)": 89.13, "step": 5485, "token_acc": 0.747293177756754, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.0712364850179551, "grad_norm": 0.9811697602272034, "learning_rate": 7.123394316854808e-05, "loss": 0.9322267532348633, "memory(GiB)": 89.13, "step": 5490, "token_acc": 0.7448952964968361, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.0713013634196108, "grad_norm": 0.9966520667076111, "learning_rate": 7.129881925522252e-05, "loss": 0.9329862594604492, "memory(GiB)": 89.13, "step": 5495, "token_acc": 0.7274319000639752, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.0713662418212665, "grad_norm": 1.0096662044525146, "learning_rate": 7.136369534189699e-05, "loss": 0.9653573036193848, "memory(GiB)": 89.13, "step": 5500, "token_acc": 0.7418715305313244, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07143112022292218, "grad_norm": 0.9711261987686157, "learning_rate": 7.142857142857143e-05, "loss": 0.8987648010253906, "memory(GiB)": 89.13, "step": 5505, "token_acc": 0.7362672548740572, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07149599862457788, "grad_norm": 0.9653379917144775, "learning_rate": 7.149344751524588e-05, "loss": 0.953924560546875, "memory(GiB)": 89.13, "step": 5510, "token_acc": 0.7441070647970678, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07156087702623358, "grad_norm": 0.9519029855728149, "learning_rate": 7.155832360192035e-05, "loss": 0.9282400131225585, "memory(GiB)": 89.13, "step": 5515, "token_acc": 0.7445028527469709, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07162575542788928, "grad_norm": 0.9919806122779846, "learning_rate": 7.162319968859479e-05, "loss": 0.9613370895385742, "memory(GiB)": 89.13, "step": 5520, "token_acc": 0.7308128187370144, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.07169063382954498, "grad_norm": 1.2243350744247437, "learning_rate": 7.168807577526923e-05, "loss": 0.962161922454834, "memory(GiB)": 89.13, "step": 5525, "token_acc": 0.7429936668057189, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07175551223120068, "grad_norm": 1.0102182626724243, "learning_rate": 7.175295186194369e-05, "loss": 0.898384952545166, "memory(GiB)": 89.13, "step": 5530, "token_acc": 0.7539375424304141, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.07182039063285636, "grad_norm": 1.0184355974197388, "learning_rate": 7.181782794861815e-05, "loss": 0.8835586547851563, "memory(GiB)": 89.13, "step": 5535, "token_acc": 0.7616230838593328, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.07188526903451206, "grad_norm": 1.0367306470870972, "learning_rate": 7.188270403529259e-05, "loss": 0.898548698425293, "memory(GiB)": 89.13, "step": 5540, "token_acc": 0.7420064874884152, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.07195014743616776, "grad_norm": 0.9889965057373047, "learning_rate": 7.194758012196705e-05, "loss": 0.9005525588989258, "memory(GiB)": 89.13, "step": 5545, "token_acc": 0.7609325376051373, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.07201502583782346, "grad_norm": 0.9444377422332764, "learning_rate": 7.20124562086415e-05, "loss": 0.9236432075500488, "memory(GiB)": 89.13, "step": 5550, "token_acc": 0.7279902442329189, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.07207990423947916, "grad_norm": 1.0055557489395142, "learning_rate": 7.207733229531595e-05, "loss": 0.9065372467041015, "memory(GiB)": 89.13, "step": 5555, "token_acc": 0.7430921155104202, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.07214478264113486, "grad_norm": 0.9579237103462219, "learning_rate": 7.21422083819904e-05, "loss": 0.9123955726623535, "memory(GiB)": 89.13, "step": 5560, "token_acc": 0.7629860031104199, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.07220966104279054, "grad_norm": 1.0087251663208008, "learning_rate": 7.220708446866486e-05, "loss": 0.9607258796691894, "memory(GiB)": 89.13, "step": 5565, "token_acc": 0.73079862437906, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07227453944444624, "grad_norm": 1.0255919694900513, "learning_rate": 7.22719605553393e-05, "loss": 0.9509201049804688, "memory(GiB)": 89.13, "step": 5570, "token_acc": 0.7347877605074231, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.07233941784610194, "grad_norm": 1.152217149734497, "learning_rate": 7.233683664201376e-05, "loss": 0.9591031074523926, "memory(GiB)": 89.13, "step": 5575, "token_acc": 0.7370941368839057, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07240429624775764, "grad_norm": 1.0949246883392334, "learning_rate": 7.240171272868822e-05, "loss": 0.9465948104858398, "memory(GiB)": 89.13, "step": 5580, "token_acc": 0.7474730169607675, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07246917464941334, "grad_norm": 1.0300935506820679, "learning_rate": 7.246658881536266e-05, "loss": 0.9470788955688476, "memory(GiB)": 89.13, "step": 5585, "token_acc": 0.7875421472937001, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.07253405305106904, "grad_norm": 1.0373594760894775, "learning_rate": 7.253146490203712e-05, "loss": 0.9124479293823242, "memory(GiB)": 89.13, "step": 5590, "token_acc": 0.7480358121688288, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.07259893145272472, "grad_norm": 1.024011492729187, "learning_rate": 7.259634098871156e-05, "loss": 0.9049945831298828, "memory(GiB)": 89.13, "step": 5595, "token_acc": 0.7581507215392838, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07266380985438042, "grad_norm": 1.0277132987976074, "learning_rate": 7.266121707538601e-05, "loss": 0.9625099182128907, "memory(GiB)": 89.13, "step": 5600, "token_acc": 0.7338788233002143, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07272868825603612, "grad_norm": 1.0420759916305542, "learning_rate": 7.272609316206047e-05, "loss": 0.9841618537902832, "memory(GiB)": 89.13, "step": 5605, "token_acc": 0.715102746558489, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07279356665769182, "grad_norm": 1.0079379081726074, "learning_rate": 7.279096924873491e-05, "loss": 0.93572998046875, "memory(GiB)": 89.13, "step": 5610, "token_acc": 0.738634576057889, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.07285844505934752, "grad_norm": 0.9667061567306519, "learning_rate": 7.285584533540937e-05, "loss": 0.9490789413452149, "memory(GiB)": 89.13, "step": 5615, "token_acc": 0.7112997444322745, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07292332346100322, "grad_norm": 1.4934027194976807, "learning_rate": 7.292072142208383e-05, "loss": 0.9771795272827148, "memory(GiB)": 89.13, "step": 5620, "token_acc": 0.7482444826597879, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.0729882018626589, "grad_norm": 1.21857750415802, "learning_rate": 7.298559750875827e-05, "loss": 0.945685863494873, "memory(GiB)": 89.13, "step": 5625, "token_acc": 0.7366639245712839, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.0730530802643146, "grad_norm": 1.0701770782470703, "learning_rate": 7.305047359543273e-05, "loss": 0.8770992279052734, "memory(GiB)": 89.13, "step": 5630, "token_acc": 0.763573252996267, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.0731179586659703, "grad_norm": 1.0040078163146973, "learning_rate": 7.311534968210718e-05, "loss": 0.945649242401123, "memory(GiB)": 89.13, "step": 5635, "token_acc": 0.7322380385025397, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.073182837067626, "grad_norm": 1.125838279724121, "learning_rate": 7.318022576878163e-05, "loss": 0.9958003044128418, "memory(GiB)": 89.13, "step": 5640, "token_acc": 0.7281880203167733, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.0732477154692817, "grad_norm": 1.0028164386749268, "learning_rate": 7.324510185545608e-05, "loss": 0.9038333892822266, "memory(GiB)": 89.13, "step": 5645, "token_acc": 0.7500576612593219, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.0733125938709374, "grad_norm": 0.9561190605163574, "learning_rate": 7.330997794213054e-05, "loss": 0.9454196929931641, "memory(GiB)": 89.13, "step": 5650, "token_acc": 0.7196209404099898, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.07337747227259309, "grad_norm": 0.9614904522895813, "learning_rate": 7.337485402880498e-05, "loss": 0.9274625778198242, "memory(GiB)": 89.13, "step": 5655, "token_acc": 0.7558301111648139, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.07344235067424879, "grad_norm": 0.9959623217582703, "learning_rate": 7.343973011547943e-05, "loss": 0.9340389251708985, "memory(GiB)": 89.13, "step": 5660, "token_acc": 0.7391033439159719, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.07350722907590448, "grad_norm": 1.0586875677108765, "learning_rate": 7.35046062021539e-05, "loss": 0.932529067993164, "memory(GiB)": 89.13, "step": 5665, "token_acc": 0.7544563279857398, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.07357210747756018, "grad_norm": 1.035825252532959, "learning_rate": 7.356948228882834e-05, "loss": 0.9458538055419922, "memory(GiB)": 89.13, "step": 5670, "token_acc": 0.7373985206566841, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.07363698587921588, "grad_norm": 1.0063728094100952, "learning_rate": 7.363435837550278e-05, "loss": 0.8929413795471192, "memory(GiB)": 89.13, "step": 5675, "token_acc": 0.7456891046110614, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.07370186428087158, "grad_norm": 1.0781432390213013, "learning_rate": 7.369923446217725e-05, "loss": 0.9427495956420898, "memory(GiB)": 89.13, "step": 5680, "token_acc": 0.752767110910323, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.07376674268252727, "grad_norm": 0.9567283391952515, "learning_rate": 7.37641105488517e-05, "loss": 0.9433568954467774, "memory(GiB)": 89.13, "step": 5685, "token_acc": 0.7254473858629966, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.07383162108418297, "grad_norm": 1.1392273902893066, "learning_rate": 7.382898663552615e-05, "loss": 0.9701902389526367, "memory(GiB)": 89.13, "step": 5690, "token_acc": 0.7369560937192361, "train_speed(iter/s)": 0.121593 }, { "epoch": 0.07389649948583867, "grad_norm": 2.3463242053985596, "learning_rate": 7.38938627222006e-05, "loss": 0.9768113136291504, "memory(GiB)": 89.13, "step": 5695, "token_acc": 0.7515962791300684, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.07396137788749436, "grad_norm": 1.1007773876190186, "learning_rate": 7.395873880887505e-05, "loss": 0.9371135711669922, "memory(GiB)": 89.13, "step": 5700, "token_acc": 0.72474453055346, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.07402625628915006, "grad_norm": 1.0371673107147217, "learning_rate": 7.402361489554951e-05, "loss": 0.9305502891540527, "memory(GiB)": 89.13, "step": 5705, "token_acc": 0.7761596958174904, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.07409113469080576, "grad_norm": 1.054200291633606, "learning_rate": 7.408849098222395e-05, "loss": 0.9109867095947266, "memory(GiB)": 89.13, "step": 5710, "token_acc": 0.7507303804694584, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.07415601309246145, "grad_norm": 1.0015331506729126, "learning_rate": 7.415336706889841e-05, "loss": 0.9104767799377441, "memory(GiB)": 89.13, "step": 5715, "token_acc": 0.7286867081991889, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.07422089149411715, "grad_norm": 1.1480000019073486, "learning_rate": 7.421824315557287e-05, "loss": 0.9309260368347168, "memory(GiB)": 89.13, "step": 5720, "token_acc": 0.7479987684729064, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.07428576989577285, "grad_norm": 1.0220534801483154, "learning_rate": 7.428311924224731e-05, "loss": 0.9682892799377442, "memory(GiB)": 89.13, "step": 5725, "token_acc": 0.7506982517844212, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.07435064829742855, "grad_norm": 0.8819116950035095, "learning_rate": 7.434799532892177e-05, "loss": 0.9246708869934082, "memory(GiB)": 89.13, "step": 5730, "token_acc": 0.7385950360492567, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.07441552669908424, "grad_norm": 1.0338391065597534, "learning_rate": 7.441287141559622e-05, "loss": 0.9194965362548828, "memory(GiB)": 89.13, "step": 5735, "token_acc": 0.7583997950097691, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07448040510073994, "grad_norm": 0.9785370826721191, "learning_rate": 7.447774750227067e-05, "loss": 0.9735174179077148, "memory(GiB)": 89.13, "step": 5740, "token_acc": 0.738755763852143, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07454528350239563, "grad_norm": 1.0573437213897705, "learning_rate": 7.454262358894512e-05, "loss": 0.9708963394165039, "memory(GiB)": 89.13, "step": 5745, "token_acc": 0.7506709218829332, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07461016190405133, "grad_norm": 0.9859198927879333, "learning_rate": 7.460749967561958e-05, "loss": 0.9227770805358887, "memory(GiB)": 89.13, "step": 5750, "token_acc": 0.7520060180541625, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07467504030570703, "grad_norm": 0.970070481300354, "learning_rate": 7.467237576229402e-05, "loss": 1.0027661323547363, "memory(GiB)": 89.13, "step": 5755, "token_acc": 0.7021432765707575, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07473991870736273, "grad_norm": 0.977689802646637, "learning_rate": 7.473725184896846e-05, "loss": 0.9080915451049805, "memory(GiB)": 89.13, "step": 5760, "token_acc": 0.7493719299561289, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07480479710901843, "grad_norm": 1.1302911043167114, "learning_rate": 7.480212793564293e-05, "loss": 0.9234691619873047, "memory(GiB)": 89.13, "step": 5765, "token_acc": 0.7298045198140987, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07486967551067412, "grad_norm": 0.9048647880554199, "learning_rate": 7.486700402231738e-05, "loss": 0.983701229095459, "memory(GiB)": 89.13, "step": 5770, "token_acc": 0.7298670147095135, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.07493455391232981, "grad_norm": 0.9256327748298645, "learning_rate": 7.493188010899182e-05, "loss": 0.9442910194396973, "memory(GiB)": 89.13, "step": 5775, "token_acc": 0.7455632841448352, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.07499943231398551, "grad_norm": 0.9095121622085571, "learning_rate": 7.499675619566629e-05, "loss": 0.9125276565551758, "memory(GiB)": 89.13, "step": 5780, "token_acc": 0.747158706534975, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.07506431071564121, "grad_norm": 1.0431199073791504, "learning_rate": 7.506163228234073e-05, "loss": 0.9762365341186523, "memory(GiB)": 89.13, "step": 5785, "token_acc": 0.7392523037034442, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.07512918911729691, "grad_norm": 1.1709686517715454, "learning_rate": 7.512650836901518e-05, "loss": 0.9089483261108399, "memory(GiB)": 89.13, "step": 5790, "token_acc": 0.7545650447998186, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.0751940675189526, "grad_norm": 1.0727897882461548, "learning_rate": 7.519138445568963e-05, "loss": 0.9896665573120117, "memory(GiB)": 89.13, "step": 5795, "token_acc": 0.7210789883432227, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.0752589459206083, "grad_norm": 1.0758150815963745, "learning_rate": 7.525626054236409e-05, "loss": 0.9691350936889649, "memory(GiB)": 89.13, "step": 5800, "token_acc": 0.7330914260062846, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07532382432226399, "grad_norm": 1.0273023843765259, "learning_rate": 7.532113662903853e-05, "loss": 0.9706596374511719, "memory(GiB)": 89.13, "step": 5805, "token_acc": 0.7436088095743928, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07538870272391969, "grad_norm": 1.0614521503448486, "learning_rate": 7.538601271571299e-05, "loss": 0.9860066413879395, "memory(GiB)": 89.13, "step": 5810, "token_acc": 0.7305627923035877, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07545358112557539, "grad_norm": 1.143231987953186, "learning_rate": 7.545088880238745e-05, "loss": 0.9399290084838867, "memory(GiB)": 89.13, "step": 5815, "token_acc": 0.7405783039006404, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.07551845952723109, "grad_norm": 0.936716616153717, "learning_rate": 7.551576488906189e-05, "loss": 0.9445330619812011, "memory(GiB)": 89.13, "step": 5820, "token_acc": 0.7432132863320856, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07558333792888679, "grad_norm": 0.9457675814628601, "learning_rate": 7.558064097573635e-05, "loss": 0.931805419921875, "memory(GiB)": 89.13, "step": 5825, "token_acc": 0.7407075682937752, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07564821633054249, "grad_norm": 1.2149876356124878, "learning_rate": 7.56455170624108e-05, "loss": 0.9702733039855957, "memory(GiB)": 89.13, "step": 5830, "token_acc": 0.7311813731975129, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.07571309473219817, "grad_norm": 0.9128324389457703, "learning_rate": 7.571039314908525e-05, "loss": 1.0036810874938964, "memory(GiB)": 89.13, "step": 5835, "token_acc": 0.7153070637849236, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.07577797313385387, "grad_norm": 1.0445126295089722, "learning_rate": 7.57752692357597e-05, "loss": 0.9741602897644043, "memory(GiB)": 89.13, "step": 5840, "token_acc": 0.7218666758916525, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07584285153550957, "grad_norm": 0.9190884828567505, "learning_rate": 7.584014532243416e-05, "loss": 0.9467218399047852, "memory(GiB)": 89.13, "step": 5845, "token_acc": 0.7699186738541115, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.07590772993716527, "grad_norm": 0.974137008190155, "learning_rate": 7.59050214091086e-05, "loss": 0.9687656402587891, "memory(GiB)": 89.13, "step": 5850, "token_acc": 0.7463781809019904, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07597260833882097, "grad_norm": 0.9547000527381897, "learning_rate": 7.596989749578306e-05, "loss": 0.9663289070129395, "memory(GiB)": 89.13, "step": 5855, "token_acc": 0.7335878434014522, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.07603748674047667, "grad_norm": 1.0290685892105103, "learning_rate": 7.60347735824575e-05, "loss": 0.9502239227294922, "memory(GiB)": 89.13, "step": 5860, "token_acc": 0.742147834307924, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.07610236514213235, "grad_norm": 1.091395378112793, "learning_rate": 7.609964966913197e-05, "loss": 0.9187831878662109, "memory(GiB)": 89.13, "step": 5865, "token_acc": 0.7588401987761304, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07616724354378805, "grad_norm": 1.0367704629898071, "learning_rate": 7.616452575580642e-05, "loss": 0.9679054260253906, "memory(GiB)": 89.13, "step": 5870, "token_acc": 0.7522859793267956, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.07623212194544375, "grad_norm": 0.9321190118789673, "learning_rate": 7.622940184248086e-05, "loss": 0.9211990356445312, "memory(GiB)": 89.13, "step": 5875, "token_acc": 0.7500221965728492, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07629700034709945, "grad_norm": 1.1238912343978882, "learning_rate": 7.629427792915533e-05, "loss": 0.9343162536621094, "memory(GiB)": 89.13, "step": 5880, "token_acc": 0.7434345406619155, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.07636187874875515, "grad_norm": 1.027833104133606, "learning_rate": 7.635915401582977e-05, "loss": 0.9401045799255371, "memory(GiB)": 89.13, "step": 5885, "token_acc": 0.764795029173297, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07642675715041085, "grad_norm": 1.0605329275131226, "learning_rate": 7.642403010250421e-05, "loss": 0.9580572128295899, "memory(GiB)": 89.13, "step": 5890, "token_acc": 0.7539221784504804, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.07649163555206653, "grad_norm": 1.0380980968475342, "learning_rate": 7.648890618917867e-05, "loss": 0.8996797561645508, "memory(GiB)": 89.13, "step": 5895, "token_acc": 0.7484293492746449, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.07655651395372223, "grad_norm": 0.9259305596351624, "learning_rate": 7.655378227585313e-05, "loss": 0.9650774002075195, "memory(GiB)": 89.13, "step": 5900, "token_acc": 0.7536607580746865, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.07662139235537793, "grad_norm": 0.8207322955131531, "learning_rate": 7.661865836252757e-05, "loss": 0.9122623443603516, "memory(GiB)": 89.13, "step": 5905, "token_acc": 0.7413372031288713, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.07668627075703363, "grad_norm": 1.1244863271713257, "learning_rate": 7.668353444920203e-05, "loss": 0.9546525001525878, "memory(GiB)": 89.13, "step": 5910, "token_acc": 0.729607250755287, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07675114915868933, "grad_norm": 1.0960036516189575, "learning_rate": 7.674841053587648e-05, "loss": 0.9715406417846679, "memory(GiB)": 89.13, "step": 5915, "token_acc": 0.7301815280271294, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.07681602756034503, "grad_norm": 1.048600196838379, "learning_rate": 7.681328662255093e-05, "loss": 0.9197031021118164, "memory(GiB)": 89.13, "step": 5920, "token_acc": 0.7731097598427615, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07688090596200071, "grad_norm": 1.0709279775619507, "learning_rate": 7.687816270922538e-05, "loss": 0.9398316383361817, "memory(GiB)": 89.13, "step": 5925, "token_acc": 0.7410369135147427, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07694578436365641, "grad_norm": 0.900253415107727, "learning_rate": 7.694303879589984e-05, "loss": 0.932871150970459, "memory(GiB)": 89.13, "step": 5930, "token_acc": 0.7418470180075996, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.07701066276531211, "grad_norm": 0.8792507648468018, "learning_rate": 7.700791488257428e-05, "loss": 0.9342117309570312, "memory(GiB)": 89.13, "step": 5935, "token_acc": 0.7263064658990257, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.07707554116696781, "grad_norm": 0.9575066566467285, "learning_rate": 7.707279096924874e-05, "loss": 0.9338895797729492, "memory(GiB)": 89.13, "step": 5940, "token_acc": 0.7487706750111757, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.07714041956862351, "grad_norm": 1.0584169626235962, "learning_rate": 7.71376670559232e-05, "loss": 0.9557902336120605, "memory(GiB)": 89.13, "step": 5945, "token_acc": 0.7326423230604898, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07720529797027921, "grad_norm": 0.9790886640548706, "learning_rate": 7.720254314259764e-05, "loss": 0.9137590408325196, "memory(GiB)": 89.13, "step": 5950, "token_acc": 0.7488370536461296, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.0772701763719349, "grad_norm": 1.0274072885513306, "learning_rate": 7.72674192292721e-05, "loss": 0.9284963607788086, "memory(GiB)": 89.13, "step": 5955, "token_acc": 0.7682999257609503, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.0773350547735906, "grad_norm": 0.9772322773933411, "learning_rate": 7.733229531594654e-05, "loss": 0.9779716491699219, "memory(GiB)": 89.13, "step": 5960, "token_acc": 0.7539669882784467, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.0773999331752463, "grad_norm": 1.0858007669448853, "learning_rate": 7.7397171402621e-05, "loss": 0.9520936965942383, "memory(GiB)": 89.13, "step": 5965, "token_acc": 0.7419966229022364, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.07746481157690199, "grad_norm": 1.0344605445861816, "learning_rate": 7.746204748929545e-05, "loss": 0.9558925628662109, "memory(GiB)": 89.13, "step": 5970, "token_acc": 0.7308768283910159, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.07752968997855769, "grad_norm": 1.0386704206466675, "learning_rate": 7.75269235759699e-05, "loss": 0.9594759941101074, "memory(GiB)": 89.13, "step": 5975, "token_acc": 0.7269017165937395, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.07759456838021339, "grad_norm": 1.0218209028244019, "learning_rate": 7.759179966264435e-05, "loss": 0.990130615234375, "memory(GiB)": 89.13, "step": 5980, "token_acc": 0.7490815131817998, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.07765944678186908, "grad_norm": 0.8960354328155518, "learning_rate": 7.765667574931881e-05, "loss": 0.9253078460693359, "memory(GiB)": 89.13, "step": 5985, "token_acc": 0.7408980134406616, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.07772432518352478, "grad_norm": 1.1006501913070679, "learning_rate": 7.772155183599325e-05, "loss": 0.9831693649291993, "memory(GiB)": 89.13, "step": 5990, "token_acc": 0.7344913151364765, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.07778920358518047, "grad_norm": 1.1371577978134155, "learning_rate": 7.778642792266771e-05, "loss": 0.9938056945800782, "memory(GiB)": 89.13, "step": 5995, "token_acc": 0.7260703337581373, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.07785408198683617, "grad_norm": 0.9492915272712708, "learning_rate": 7.785130400934217e-05, "loss": 0.9498298645019532, "memory(GiB)": 89.13, "step": 6000, "token_acc": 0.7474368183896036, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.07791896038849187, "grad_norm": 0.8697419166564941, "learning_rate": 7.791618009601661e-05, "loss": 0.9031967163085938, "memory(GiB)": 89.13, "step": 6005, "token_acc": 0.7515897512178796, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.07798383879014757, "grad_norm": 0.9243260622024536, "learning_rate": 7.798105618269107e-05, "loss": 0.9760931015014649, "memory(GiB)": 89.13, "step": 6010, "token_acc": 0.7512185146051331, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.07804871719180326, "grad_norm": 1.1652101278305054, "learning_rate": 7.804593226936552e-05, "loss": 0.941656494140625, "memory(GiB)": 89.13, "step": 6015, "token_acc": 0.7496987433293166, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.07811359559345896, "grad_norm": 1.0463218688964844, "learning_rate": 7.811080835603997e-05, "loss": 1.0070379257202149, "memory(GiB)": 89.13, "step": 6020, "token_acc": 0.7125928772738919, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.07817847399511466, "grad_norm": 1.0358612537384033, "learning_rate": 7.817568444271441e-05, "loss": 0.966463279724121, "memory(GiB)": 89.13, "step": 6025, "token_acc": 0.7545677313958987, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.07824335239677035, "grad_norm": 0.9789474606513977, "learning_rate": 7.824056052938888e-05, "loss": 0.9774547576904297, "memory(GiB)": 89.13, "step": 6030, "token_acc": 0.7321275815715508, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.07830823079842605, "grad_norm": 0.9736539125442505, "learning_rate": 7.830543661606332e-05, "loss": 0.9642826080322265, "memory(GiB)": 89.13, "step": 6035, "token_acc": 0.7356372218476062, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.07837310920008175, "grad_norm": 1.0185221433639526, "learning_rate": 7.837031270273776e-05, "loss": 0.9335518836975097, "memory(GiB)": 89.13, "step": 6040, "token_acc": 0.7422811794010311, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.07843798760173744, "grad_norm": 1.0026545524597168, "learning_rate": 7.843518878941223e-05, "loss": 0.9873912811279297, "memory(GiB)": 89.13, "step": 6045, "token_acc": 0.7345309381237525, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.07850286600339314, "grad_norm": 1.0252217054367065, "learning_rate": 7.850006487608668e-05, "loss": 0.9756961822509765, "memory(GiB)": 89.13, "step": 6050, "token_acc": 0.7235842916033847, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.07856774440504884, "grad_norm": 0.8640637397766113, "learning_rate": 7.856494096276113e-05, "loss": 0.940216064453125, "memory(GiB)": 89.13, "step": 6055, "token_acc": 0.7471037388099, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.07863262280670454, "grad_norm": 1.006751537322998, "learning_rate": 7.862981704943558e-05, "loss": 0.9550480842590332, "memory(GiB)": 89.13, "step": 6060, "token_acc": 0.7479275181571418, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.07869750120836023, "grad_norm": 1.072014570236206, "learning_rate": 7.869469313611003e-05, "loss": 0.9298563003540039, "memory(GiB)": 89.13, "step": 6065, "token_acc": 0.7329622649778674, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.07876237961001593, "grad_norm": 0.9046154022216797, "learning_rate": 7.875956922278449e-05, "loss": 0.9796724319458008, "memory(GiB)": 89.13, "step": 6070, "token_acc": 0.7643121564521557, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.07882725801167162, "grad_norm": 0.9753292798995972, "learning_rate": 7.882444530945893e-05, "loss": 0.974726390838623, "memory(GiB)": 89.13, "step": 6075, "token_acc": 0.7584175084175084, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.07889213641332732, "grad_norm": 0.9848453998565674, "learning_rate": 7.888932139613339e-05, "loss": 0.9588285446166992, "memory(GiB)": 89.13, "step": 6080, "token_acc": 0.7325244174805827, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.07895701481498302, "grad_norm": 1.0225125551223755, "learning_rate": 7.895419748280785e-05, "loss": 0.9219192504882813, "memory(GiB)": 89.13, "step": 6085, "token_acc": 0.7372896061079299, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.07902189321663872, "grad_norm": 1.1192944049835205, "learning_rate": 7.901907356948229e-05, "loss": 0.9565130233764648, "memory(GiB)": 89.13, "step": 6090, "token_acc": 0.7474214535068232, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.07908677161829442, "grad_norm": 1.1156508922576904, "learning_rate": 7.908394965615675e-05, "loss": 1.0051286697387696, "memory(GiB)": 89.13, "step": 6095, "token_acc": 0.7247770805812418, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.07915165001995011, "grad_norm": 1.0703126192092896, "learning_rate": 7.91488257428312e-05, "loss": 0.9345063209533692, "memory(GiB)": 89.13, "step": 6100, "token_acc": 0.7333539915559515, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.0792165284216058, "grad_norm": 0.9907615184783936, "learning_rate": 7.921370182950565e-05, "loss": 0.9874360084533691, "memory(GiB)": 89.13, "step": 6105, "token_acc": 0.7488353313870427, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.0792814068232615, "grad_norm": 1.091516375541687, "learning_rate": 7.92785779161801e-05, "loss": 1.0086865425109863, "memory(GiB)": 89.13, "step": 6110, "token_acc": 0.7327047826262137, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.0793462852249172, "grad_norm": 1.0643192529678345, "learning_rate": 7.934345400285456e-05, "loss": 0.9355940818786621, "memory(GiB)": 89.13, "step": 6115, "token_acc": 0.75072681540522, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.0794111636265729, "grad_norm": 0.9789677262306213, "learning_rate": 7.9408330089529e-05, "loss": 0.9414629936218262, "memory(GiB)": 89.13, "step": 6120, "token_acc": 0.7584102644343608, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.0794760420282286, "grad_norm": 1.015950322151184, "learning_rate": 7.947320617620345e-05, "loss": 0.9326620101928711, "memory(GiB)": 89.13, "step": 6125, "token_acc": 0.7343139344844738, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.0795409204298843, "grad_norm": 1.0139902830123901, "learning_rate": 7.953808226287792e-05, "loss": 0.9507707595825196, "memory(GiB)": 89.13, "step": 6130, "token_acc": 0.7331410750813185, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.07960579883153998, "grad_norm": 1.0465147495269775, "learning_rate": 7.960295834955236e-05, "loss": 0.9881669044494629, "memory(GiB)": 89.13, "step": 6135, "token_acc": 0.7344698518541818, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.07967067723319568, "grad_norm": 1.0247001647949219, "learning_rate": 7.96678344362268e-05, "loss": 0.9730347633361817, "memory(GiB)": 89.13, "step": 6140, "token_acc": 0.7559289876677057, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.07973555563485138, "grad_norm": 1.0339200496673584, "learning_rate": 7.973271052290127e-05, "loss": 0.9416119575500488, "memory(GiB)": 89.13, "step": 6145, "token_acc": 0.7365494796080957, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.07980043403650708, "grad_norm": 1.0311707258224487, "learning_rate": 7.979758660957572e-05, "loss": 0.9510904312133789, "memory(GiB)": 89.13, "step": 6150, "token_acc": 0.7554159592529711, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.07986531243816278, "grad_norm": 0.9189770221710205, "learning_rate": 7.986246269625016e-05, "loss": 0.9476083755493164, "memory(GiB)": 89.13, "step": 6155, "token_acc": 0.7636729080715117, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.07993019083981848, "grad_norm": 0.9635843634605408, "learning_rate": 7.992733878292462e-05, "loss": 0.9647759437561035, "memory(GiB)": 89.13, "step": 6160, "token_acc": 0.740818046802874, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.07999506924147416, "grad_norm": 0.8908465504646301, "learning_rate": 7.999221486959907e-05, "loss": 0.9472531318664551, "memory(GiB)": 89.13, "step": 6165, "token_acc": 0.7208974675073087, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.08005994764312986, "grad_norm": 1.021242380142212, "learning_rate": 8.005709095627351e-05, "loss": 0.9616836547851563, "memory(GiB)": 89.13, "step": 6170, "token_acc": 0.7446252578501031, "train_speed(iter/s)": 0.121587 }, { "epoch": 0.08012482604478556, "grad_norm": 1.0226662158966064, "learning_rate": 8.012196704294797e-05, "loss": 0.9807598114013671, "memory(GiB)": 89.13, "step": 6175, "token_acc": 0.747141679509695, "train_speed(iter/s)": 0.121587 }, { "epoch": 0.08018970444644126, "grad_norm": 0.9385281801223755, "learning_rate": 8.018684312962243e-05, "loss": 0.9275224685668946, "memory(GiB)": 89.13, "step": 6180, "token_acc": 0.7532926557035494, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.08025458284809696, "grad_norm": 0.9605461359024048, "learning_rate": 8.025171921629687e-05, "loss": 0.9458171844482421, "memory(GiB)": 89.13, "step": 6185, "token_acc": 0.7630911269293115, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08031946124975266, "grad_norm": 0.9994077086448669, "learning_rate": 8.031659530297133e-05, "loss": 0.9417960166931152, "memory(GiB)": 89.13, "step": 6190, "token_acc": 0.7435593515774795, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.08038433965140834, "grad_norm": 1.0108873844146729, "learning_rate": 8.038147138964578e-05, "loss": 0.9473783493041992, "memory(GiB)": 89.13, "step": 6195, "token_acc": 0.734229540086147, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08044921805306404, "grad_norm": 1.0233559608459473, "learning_rate": 8.044634747632023e-05, "loss": 0.9720149040222168, "memory(GiB)": 89.13, "step": 6200, "token_acc": 0.7536159422877754, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08051409645471974, "grad_norm": 0.9415002465248108, "learning_rate": 8.051122356299468e-05, "loss": 0.8824022293090821, "memory(GiB)": 89.13, "step": 6205, "token_acc": 0.7570479501566211, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.08057897485637544, "grad_norm": 0.9703708291053772, "learning_rate": 8.057609964966914e-05, "loss": 0.9864619255065918, "memory(GiB)": 89.13, "step": 6210, "token_acc": 0.7476389359220379, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08064385325803114, "grad_norm": 0.9973936676979065, "learning_rate": 8.064097573634358e-05, "loss": 0.8989732742309571, "memory(GiB)": 89.13, "step": 6215, "token_acc": 0.782572927021561, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.08070873165968684, "grad_norm": 1.0386316776275635, "learning_rate": 8.070585182301804e-05, "loss": 0.9351300239562989, "memory(GiB)": 89.13, "step": 6220, "token_acc": 0.725812364605899, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08077361006134252, "grad_norm": 0.9859884977340698, "learning_rate": 8.077072790969248e-05, "loss": 0.9366657257080078, "memory(GiB)": 89.13, "step": 6225, "token_acc": 0.7534162610420047, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08083848846299822, "grad_norm": 0.9851006865501404, "learning_rate": 8.083560399636694e-05, "loss": 0.9614997863769531, "memory(GiB)": 89.13, "step": 6230, "token_acc": 0.7588696433551265, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08090336686465392, "grad_norm": 1.1318483352661133, "learning_rate": 8.09004800830414e-05, "loss": 0.9878745079040527, "memory(GiB)": 89.13, "step": 6235, "token_acc": 0.7380979915963147, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08096824526630962, "grad_norm": 1.0377979278564453, "learning_rate": 8.096535616971584e-05, "loss": 0.9807586669921875, "memory(GiB)": 89.13, "step": 6240, "token_acc": 0.7535409790979097, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08103312366796532, "grad_norm": 0.9765267968177795, "learning_rate": 8.103023225639031e-05, "loss": 0.9858023643493652, "memory(GiB)": 89.13, "step": 6245, "token_acc": 0.7409758568293094, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08109800206962102, "grad_norm": 1.2238582372665405, "learning_rate": 8.109510834306475e-05, "loss": 0.9913558959960938, "memory(GiB)": 89.13, "step": 6250, "token_acc": 0.7319618114227779, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.0811628804712767, "grad_norm": 1.0276061296463013, "learning_rate": 8.11599844297392e-05, "loss": 0.9624543190002441, "memory(GiB)": 89.13, "step": 6255, "token_acc": 0.7588776088187879, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.0812277588729324, "grad_norm": 1.0576353073120117, "learning_rate": 8.122486051641365e-05, "loss": 0.9730071067810059, "memory(GiB)": 89.13, "step": 6260, "token_acc": 0.7293099780157436, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.0812926372745881, "grad_norm": 1.0314512252807617, "learning_rate": 8.128973660308811e-05, "loss": 0.9476509094238281, "memory(GiB)": 89.13, "step": 6265, "token_acc": 0.7537331051171575, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.0813575156762438, "grad_norm": 0.9028652310371399, "learning_rate": 8.135461268976255e-05, "loss": 0.9647420883178711, "memory(GiB)": 89.13, "step": 6270, "token_acc": 0.7542103169340936, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.0814223940778995, "grad_norm": 1.166609525680542, "learning_rate": 8.141948877643701e-05, "loss": 0.9642057418823242, "memory(GiB)": 89.13, "step": 6275, "token_acc": 0.7464871922407361, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.0814872724795552, "grad_norm": 1.0399869680404663, "learning_rate": 8.148436486311147e-05, "loss": 1.0141104698181151, "memory(GiB)": 89.13, "step": 6280, "token_acc": 0.7335037258360632, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08155215088121089, "grad_norm": 0.9318342804908752, "learning_rate": 8.154924094978591e-05, "loss": 0.956761646270752, "memory(GiB)": 89.13, "step": 6285, "token_acc": 0.7364263462394304, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.08161702928286658, "grad_norm": 0.9912930727005005, "learning_rate": 8.161411703646037e-05, "loss": 1.0047760009765625, "memory(GiB)": 89.13, "step": 6290, "token_acc": 0.7261857785494437, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.08168190768452228, "grad_norm": 0.9680468440055847, "learning_rate": 8.167899312313482e-05, "loss": 0.9764308929443359, "memory(GiB)": 89.13, "step": 6295, "token_acc": 0.7430655321029948, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.08174678608617798, "grad_norm": 0.9455914497375488, "learning_rate": 8.174386920980927e-05, "loss": 0.9483418464660645, "memory(GiB)": 89.13, "step": 6300, "token_acc": 0.7592772619087527, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.08181166448783368, "grad_norm": 0.9480286240577698, "learning_rate": 8.180874529648372e-05, "loss": 0.9735340118408203, "memory(GiB)": 89.13, "step": 6305, "token_acc": 0.7291516304704416, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08187654288948937, "grad_norm": 1.064112901687622, "learning_rate": 8.187362138315818e-05, "loss": 0.9920537948608399, "memory(GiB)": 89.13, "step": 6310, "token_acc": 0.7338179174995457, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08194142129114507, "grad_norm": 0.9716015458106995, "learning_rate": 8.193849746983262e-05, "loss": 0.982727336883545, "memory(GiB)": 89.13, "step": 6315, "token_acc": 0.7334819553487771, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08200629969280077, "grad_norm": 0.941214382648468, "learning_rate": 8.200337355650708e-05, "loss": 0.9804800987243653, "memory(GiB)": 89.13, "step": 6320, "token_acc": 0.7233273716100302, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08207117809445647, "grad_norm": 1.183273434638977, "learning_rate": 8.206824964318152e-05, "loss": 0.9401464462280273, "memory(GiB)": 89.13, "step": 6325, "token_acc": 0.732926560317577, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08213605649611216, "grad_norm": 1.0206174850463867, "learning_rate": 8.213312572985598e-05, "loss": 0.9517885208129883, "memory(GiB)": 89.13, "step": 6330, "token_acc": 0.7523789004991203, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.08220093489776786, "grad_norm": 0.8885799646377563, "learning_rate": 8.219800181653043e-05, "loss": 0.9350030899047852, "memory(GiB)": 89.13, "step": 6335, "token_acc": 0.7577093918767775, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.08226581329942355, "grad_norm": 1.0875576734542847, "learning_rate": 8.226287790320488e-05, "loss": 0.9279278755187989, "memory(GiB)": 89.13, "step": 6340, "token_acc": 0.7523573542570032, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.08233069170107925, "grad_norm": 1.0299335718154907, "learning_rate": 8.232775398987933e-05, "loss": 0.9759042739868165, "memory(GiB)": 89.13, "step": 6345, "token_acc": 0.7389975994762493, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.08239557010273495, "grad_norm": 1.1546519994735718, "learning_rate": 8.239263007655379e-05, "loss": 0.9679855346679688, "memory(GiB)": 89.13, "step": 6350, "token_acc": 0.7486555148635718, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08246044850439065, "grad_norm": 1.0223453044891357, "learning_rate": 8.245750616322823e-05, "loss": 0.9700302124023438, "memory(GiB)": 89.13, "step": 6355, "token_acc": 0.7412484556098135, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.08252532690604635, "grad_norm": 1.1631182432174683, "learning_rate": 8.252238224990269e-05, "loss": 0.9426576614379882, "memory(GiB)": 89.13, "step": 6360, "token_acc": 0.726788899611077, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08259020530770204, "grad_norm": 0.9540301561355591, "learning_rate": 8.258725833657715e-05, "loss": 0.9283029556274414, "memory(GiB)": 89.13, "step": 6365, "token_acc": 0.7470652845472644, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.08265508370935773, "grad_norm": 0.9679407477378845, "learning_rate": 8.265213442325159e-05, "loss": 0.9894884109497071, "memory(GiB)": 89.13, "step": 6370, "token_acc": 0.7279144229369815, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08271996211101343, "grad_norm": 1.0347061157226562, "learning_rate": 8.271701050992605e-05, "loss": 1.040610408782959, "memory(GiB)": 89.13, "step": 6375, "token_acc": 0.7062641199424933, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08278484051266913, "grad_norm": 0.8917498588562012, "learning_rate": 8.27818865966005e-05, "loss": 0.9618372917175293, "memory(GiB)": 89.13, "step": 6380, "token_acc": 0.7251886881087345, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08284971891432483, "grad_norm": 0.943406343460083, "learning_rate": 8.284676268327495e-05, "loss": 0.9411436080932617, "memory(GiB)": 89.13, "step": 6385, "token_acc": 0.7484064899319136, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08291459731598053, "grad_norm": 0.9842959642410278, "learning_rate": 8.291163876994939e-05, "loss": 0.9801813125610351, "memory(GiB)": 89.13, "step": 6390, "token_acc": 0.7219314139990605, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08297947571763623, "grad_norm": 1.0973758697509766, "learning_rate": 8.297651485662386e-05, "loss": 0.9754556655883789, "memory(GiB)": 89.13, "step": 6395, "token_acc": 0.751749428071592, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08304435411929191, "grad_norm": 1.0328112840652466, "learning_rate": 8.30413909432983e-05, "loss": 0.9880111694335938, "memory(GiB)": 89.13, "step": 6400, "token_acc": 0.734560657180578, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.08310923252094761, "grad_norm": 0.9404907822608948, "learning_rate": 8.310626702997275e-05, "loss": 0.992991828918457, "memory(GiB)": 89.13, "step": 6405, "token_acc": 0.7308359222599523, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.08317411092260331, "grad_norm": 0.8863458633422852, "learning_rate": 8.317114311664722e-05, "loss": 0.9628168106079101, "memory(GiB)": 89.13, "step": 6410, "token_acc": 0.7459700056139226, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.08323898932425901, "grad_norm": 0.9527114629745483, "learning_rate": 8.323601920332166e-05, "loss": 0.9699237823486329, "memory(GiB)": 89.13, "step": 6415, "token_acc": 0.7219752454664663, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.08330386772591471, "grad_norm": 0.9273279309272766, "learning_rate": 8.33008952899961e-05, "loss": 0.949838924407959, "memory(GiB)": 89.13, "step": 6420, "token_acc": 0.7425467336113238, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.0833687461275704, "grad_norm": 0.9681837558746338, "learning_rate": 8.336577137667056e-05, "loss": 0.974232292175293, "memory(GiB)": 89.13, "step": 6425, "token_acc": 0.7299700085689803, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.08343362452922609, "grad_norm": 0.898265540599823, "learning_rate": 8.343064746334502e-05, "loss": 0.9537406921386719, "memory(GiB)": 89.13, "step": 6430, "token_acc": 0.7483560005978179, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08349850293088179, "grad_norm": 0.9164485335350037, "learning_rate": 8.349552355001947e-05, "loss": 1.0105014801025392, "memory(GiB)": 89.13, "step": 6435, "token_acc": 0.7360374414976599, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08356338133253749, "grad_norm": 1.0666943788528442, "learning_rate": 8.356039963669392e-05, "loss": 0.9693307876586914, "memory(GiB)": 89.13, "step": 6440, "token_acc": 0.745813797722706, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08362825973419319, "grad_norm": 1.0284602642059326, "learning_rate": 8.362527572336837e-05, "loss": 0.9517772674560547, "memory(GiB)": 89.13, "step": 6445, "token_acc": 0.7521740841301696, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08369313813584889, "grad_norm": 1.00684654712677, "learning_rate": 8.369015181004283e-05, "loss": 0.9603755950927735, "memory(GiB)": 89.13, "step": 6450, "token_acc": 0.7642066884586372, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.08375801653750459, "grad_norm": 0.9107084274291992, "learning_rate": 8.375502789671727e-05, "loss": 0.9523365020751953, "memory(GiB)": 89.13, "step": 6455, "token_acc": 0.734540797658251, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.08382289493916027, "grad_norm": 0.9536482095718384, "learning_rate": 8.381990398339173e-05, "loss": 0.9841621398925782, "memory(GiB)": 89.13, "step": 6460, "token_acc": 0.7305844816100417, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08388777334081597, "grad_norm": 0.9750139117240906, "learning_rate": 8.388478007006618e-05, "loss": 0.9966953277587891, "memory(GiB)": 89.13, "step": 6465, "token_acc": 0.7266114366538695, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.08395265174247167, "grad_norm": 0.9503470063209534, "learning_rate": 8.394965615674063e-05, "loss": 0.9849124908447265, "memory(GiB)": 89.13, "step": 6470, "token_acc": 0.7279622509612024, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08401753014412737, "grad_norm": 0.9035587310791016, "learning_rate": 8.401453224341508e-05, "loss": 0.9397673606872559, "memory(GiB)": 89.13, "step": 6475, "token_acc": 0.76228393862886, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08408240854578307, "grad_norm": 0.9678300619125366, "learning_rate": 8.407940833008954e-05, "loss": 1.000755500793457, "memory(GiB)": 89.13, "step": 6480, "token_acc": 0.7098235678501416, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.08414728694743877, "grad_norm": 1.0036051273345947, "learning_rate": 8.414428441676398e-05, "loss": 0.9721772193908691, "memory(GiB)": 89.13, "step": 6485, "token_acc": 0.7350177337379794, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.08421216534909445, "grad_norm": 0.9041601419448853, "learning_rate": 8.420916050343843e-05, "loss": 0.9995384216308594, "memory(GiB)": 89.13, "step": 6490, "token_acc": 0.7308960434445306, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08427704375075015, "grad_norm": 0.9866836667060852, "learning_rate": 8.42740365901129e-05, "loss": 0.9699831008911133, "memory(GiB)": 89.13, "step": 6495, "token_acc": 0.7597631830550857, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.08434192215240585, "grad_norm": 0.9310398101806641, "learning_rate": 8.433891267678734e-05, "loss": 0.926972770690918, "memory(GiB)": 89.13, "step": 6500, "token_acc": 0.7574623090154812, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08440680055406155, "grad_norm": 0.9865860342979431, "learning_rate": 8.440378876346178e-05, "loss": 0.9722267150878906, "memory(GiB)": 89.13, "step": 6505, "token_acc": 0.730780426275106, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.08447167895571725, "grad_norm": 1.0489047765731812, "learning_rate": 8.446866485013625e-05, "loss": 0.9770754814147949, "memory(GiB)": 89.13, "step": 6510, "token_acc": 0.7381653833277536, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.08453655735737295, "grad_norm": 1.0053397417068481, "learning_rate": 8.45335409368107e-05, "loss": 0.9845830917358398, "memory(GiB)": 89.13, "step": 6515, "token_acc": 0.7488865795724465, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.08460143575902863, "grad_norm": 0.9646563529968262, "learning_rate": 8.459841702348514e-05, "loss": 0.96866455078125, "memory(GiB)": 89.13, "step": 6520, "token_acc": 0.7304810248972685, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08466631416068433, "grad_norm": 0.8124751448631287, "learning_rate": 8.46632931101596e-05, "loss": 0.9530097007751465, "memory(GiB)": 89.13, "step": 6525, "token_acc": 0.7281263879195482, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.08473119256234003, "grad_norm": 1.1132867336273193, "learning_rate": 8.472816919683405e-05, "loss": 0.9752317428588867, "memory(GiB)": 89.13, "step": 6530, "token_acc": 0.735629798713426, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.08479607096399573, "grad_norm": 1.0654405355453491, "learning_rate": 8.47930452835085e-05, "loss": 0.9897659301757813, "memory(GiB)": 89.13, "step": 6535, "token_acc": 0.74447646493756, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08486094936565143, "grad_norm": 0.9437134265899658, "learning_rate": 8.485792137018295e-05, "loss": 0.9297284126281739, "memory(GiB)": 89.13, "step": 6540, "token_acc": 0.7315142089636646, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.08492582776730713, "grad_norm": 0.987308144569397, "learning_rate": 8.492279745685741e-05, "loss": 0.9421513557434082, "memory(GiB)": 89.13, "step": 6545, "token_acc": 0.7334779519018588, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08499070616896282, "grad_norm": 1.0710052251815796, "learning_rate": 8.498767354353185e-05, "loss": 0.9904962539672851, "memory(GiB)": 89.13, "step": 6550, "token_acc": 0.7493926836238926, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08505558457061851, "grad_norm": 0.8637051582336426, "learning_rate": 8.505254963020631e-05, "loss": 0.9640966415405273, "memory(GiB)": 89.13, "step": 6555, "token_acc": 0.7462541620421753, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08512046297227421, "grad_norm": 0.9634467363357544, "learning_rate": 8.511742571688077e-05, "loss": 0.9659879684448243, "memory(GiB)": 89.13, "step": 6560, "token_acc": 0.7333460803059273, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.08518534137392991, "grad_norm": 0.9956804513931274, "learning_rate": 8.518230180355521e-05, "loss": 0.9874416351318359, "memory(GiB)": 89.13, "step": 6565, "token_acc": 0.7234506384303955, "train_speed(iter/s)": 0.121564 }, { "epoch": 0.08525021977558561, "grad_norm": 1.0266369581222534, "learning_rate": 8.524717789022967e-05, "loss": 0.9187610626220704, "memory(GiB)": 89.13, "step": 6570, "token_acc": 0.7363379863379863, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.08531509817724131, "grad_norm": 0.9750714898109436, "learning_rate": 8.531205397690412e-05, "loss": 0.9758009910583496, "memory(GiB)": 89.13, "step": 6575, "token_acc": 0.7266347332070848, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.085379976578897, "grad_norm": 1.0951954126358032, "learning_rate": 8.537693006357857e-05, "loss": 0.9696211814880371, "memory(GiB)": 89.13, "step": 6580, "token_acc": 0.742472266244057, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.0854448549805527, "grad_norm": 1.0017085075378418, "learning_rate": 8.544180615025302e-05, "loss": 0.9862018585205078, "memory(GiB)": 89.13, "step": 6585, "token_acc": 0.7245682112150955, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.0855097333822084, "grad_norm": 0.948285698890686, "learning_rate": 8.550668223692746e-05, "loss": 1.0013303756713867, "memory(GiB)": 89.13, "step": 6590, "token_acc": 0.7261644375918821, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.0855746117838641, "grad_norm": 1.130082130432129, "learning_rate": 8.557155832360192e-05, "loss": 0.9992529869079589, "memory(GiB)": 89.13, "step": 6595, "token_acc": 0.7308776234209741, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08563949018551979, "grad_norm": 1.041623592376709, "learning_rate": 8.563643441027638e-05, "loss": 0.9438609123229981, "memory(GiB)": 89.13, "step": 6600, "token_acc": 0.7354505936547071, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08570436858717549, "grad_norm": 0.984747588634491, "learning_rate": 8.570131049695082e-05, "loss": 0.9645793914794922, "memory(GiB)": 89.13, "step": 6605, "token_acc": 0.74014170716326, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.08576924698883118, "grad_norm": 0.9684034585952759, "learning_rate": 8.576618658362529e-05, "loss": 0.9385208129882813, "memory(GiB)": 89.13, "step": 6610, "token_acc": 0.7522548272357723, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08583412539048688, "grad_norm": 0.997377872467041, "learning_rate": 8.583106267029973e-05, "loss": 1.0169931411743165, "memory(GiB)": 89.13, "step": 6615, "token_acc": 0.7193305754042874, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08589900379214258, "grad_norm": 0.9528045058250427, "learning_rate": 8.589593875697418e-05, "loss": 0.9980155944824218, "memory(GiB)": 89.13, "step": 6620, "token_acc": 0.7349527967461472, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08596388219379827, "grad_norm": 0.9407920241355896, "learning_rate": 8.596081484364863e-05, "loss": 0.923896312713623, "memory(GiB)": 89.13, "step": 6625, "token_acc": 0.7371521885306266, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08602876059545397, "grad_norm": 0.9549934267997742, "learning_rate": 8.602569093032309e-05, "loss": 0.9593400001525879, "memory(GiB)": 89.13, "step": 6630, "token_acc": 0.7419936185298984, "train_speed(iter/s)": 0.121563 }, { "epoch": 0.08609363899710967, "grad_norm": 0.994210958480835, "learning_rate": 8.609056701699753e-05, "loss": 0.9737372398376465, "memory(GiB)": 89.13, "step": 6635, "token_acc": 0.7444867897058194, "train_speed(iter/s)": 0.121565 }, { "epoch": 0.08615851739876536, "grad_norm": 0.8967296481132507, "learning_rate": 8.615544310367199e-05, "loss": 0.9348080635070801, "memory(GiB)": 89.13, "step": 6640, "token_acc": 0.7395581696600497, "train_speed(iter/s)": 0.121566 }, { "epoch": 0.08622339580042106, "grad_norm": 0.9695926904678345, "learning_rate": 8.622031919034645e-05, "loss": 1.0105104446411133, "memory(GiB)": 89.13, "step": 6645, "token_acc": 0.7338415118310306, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08628827420207676, "grad_norm": 1.028836727142334, "learning_rate": 8.628519527702089e-05, "loss": 0.9670860290527343, "memory(GiB)": 89.13, "step": 6650, "token_acc": 0.7438513461090853, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08635315260373246, "grad_norm": 1.0416395664215088, "learning_rate": 8.635007136369535e-05, "loss": 0.9232273101806641, "memory(GiB)": 89.13, "step": 6655, "token_acc": 0.7468111713213839, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08641803100538815, "grad_norm": 0.8915035128593445, "learning_rate": 8.64149474503698e-05, "loss": 0.9688618659973145, "memory(GiB)": 89.13, "step": 6660, "token_acc": 0.7425860023724793, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08648290940704385, "grad_norm": 1.021373987197876, "learning_rate": 8.647982353704425e-05, "loss": 0.9675500869750977, "memory(GiB)": 89.13, "step": 6665, "token_acc": 0.7425373134328358, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08654778780869954, "grad_norm": 1.1772584915161133, "learning_rate": 8.65446996237187e-05, "loss": 0.9207422256469726, "memory(GiB)": 89.13, "step": 6670, "token_acc": 0.763388409879427, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08661266621035524, "grad_norm": 0.9628501534461975, "learning_rate": 8.660957571039316e-05, "loss": 0.9745618820190429, "memory(GiB)": 89.13, "step": 6675, "token_acc": 0.711226411976602, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08667754461201094, "grad_norm": 0.9598839282989502, "learning_rate": 8.66744517970676e-05, "loss": 0.9606029510498046, "memory(GiB)": 89.13, "step": 6680, "token_acc": 0.740521910388971, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08674242301366664, "grad_norm": 0.975412905216217, "learning_rate": 8.673932788374206e-05, "loss": 0.9974966049194336, "memory(GiB)": 89.13, "step": 6685, "token_acc": 0.750982768303136, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08680730141532234, "grad_norm": 0.975484311580658, "learning_rate": 8.68042039704165e-05, "loss": 0.9870624542236328, "memory(GiB)": 89.13, "step": 6690, "token_acc": 0.729459503327382, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.08687217981697803, "grad_norm": 0.9353969097137451, "learning_rate": 8.686908005709096e-05, "loss": 1.0003520965576171, "memory(GiB)": 89.13, "step": 6695, "token_acc": 0.7554888152444076, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.08693705821863372, "grad_norm": 0.9879118204116821, "learning_rate": 8.693395614376542e-05, "loss": 0.9745491027832032, "memory(GiB)": 89.13, "step": 6700, "token_acc": 0.7325820645120446, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08700193662028942, "grad_norm": 1.0103782415390015, "learning_rate": 8.699883223043986e-05, "loss": 1.0683706283569336, "memory(GiB)": 89.13, "step": 6705, "token_acc": 0.7207994300498707, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08706681502194512, "grad_norm": 0.9375943541526794, "learning_rate": 8.706370831711432e-05, "loss": 0.9781109809875488, "memory(GiB)": 89.13, "step": 6710, "token_acc": 0.7262909807232306, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08713169342360082, "grad_norm": 0.9760564565658569, "learning_rate": 8.712858440378877e-05, "loss": 1.010506820678711, "memory(GiB)": 89.13, "step": 6715, "token_acc": 0.724614372395643, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08719657182525652, "grad_norm": 0.9507816433906555, "learning_rate": 8.719346049046322e-05, "loss": 1.0136941909790038, "memory(GiB)": 89.13, "step": 6720, "token_acc": 0.7325358195653664, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.08726145022691222, "grad_norm": 0.9922086596488953, "learning_rate": 8.725833657713767e-05, "loss": 0.980384635925293, "memory(GiB)": 89.13, "step": 6725, "token_acc": 0.742098997734342, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.0873263286285679, "grad_norm": 1.1822853088378906, "learning_rate": 8.732321266381213e-05, "loss": 1.0041351318359375, "memory(GiB)": 89.13, "step": 6730, "token_acc": 0.7289523619294916, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.0873912070302236, "grad_norm": 0.9092561602592468, "learning_rate": 8.738808875048657e-05, "loss": 0.9830171585083007, "memory(GiB)": 89.13, "step": 6735, "token_acc": 0.7530991110842353, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.0874560854318793, "grad_norm": 1.031224250793457, "learning_rate": 8.745296483716103e-05, "loss": 0.9516434669494629, "memory(GiB)": 89.13, "step": 6740, "token_acc": 0.7444460630781558, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.087520963833535, "grad_norm": 0.9629105925559998, "learning_rate": 8.751784092383548e-05, "loss": 1.0394021987915039, "memory(GiB)": 89.13, "step": 6745, "token_acc": 0.7316975382888301, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.0875858422351907, "grad_norm": 0.9646713733673096, "learning_rate": 8.758271701050993e-05, "loss": 0.9690254211425782, "memory(GiB)": 89.13, "step": 6750, "token_acc": 0.7205964131794801, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.0876507206368464, "grad_norm": 1.0610014200210571, "learning_rate": 8.764759309718437e-05, "loss": 0.9482048034667969, "memory(GiB)": 89.13, "step": 6755, "token_acc": 0.7455119608945797, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.08771559903850208, "grad_norm": 1.0004435777664185, "learning_rate": 8.771246918385884e-05, "loss": 0.9616815567016601, "memory(GiB)": 89.13, "step": 6760, "token_acc": 0.7235371135058918, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.08778047744015778, "grad_norm": 0.9778384566307068, "learning_rate": 8.777734527053328e-05, "loss": 0.9616521835327149, "memory(GiB)": 89.13, "step": 6765, "token_acc": 0.7415126297960085, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.08784535584181348, "grad_norm": 0.878879964351654, "learning_rate": 8.784222135720773e-05, "loss": 0.9524013519287109, "memory(GiB)": 89.13, "step": 6770, "token_acc": 0.7351186346607465, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.08791023424346918, "grad_norm": 1.009443759918213, "learning_rate": 8.79070974438822e-05, "loss": 0.9804861068725585, "memory(GiB)": 89.13, "step": 6775, "token_acc": 0.7356168939142002, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08797511264512488, "grad_norm": 0.9919083118438721, "learning_rate": 8.797197353055664e-05, "loss": 0.9835152626037598, "memory(GiB)": 89.13, "step": 6780, "token_acc": 0.7464926480507217, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.08803999104678058, "grad_norm": 0.8935465812683105, "learning_rate": 8.803684961723108e-05, "loss": 0.9906543731689453, "memory(GiB)": 89.13, "step": 6785, "token_acc": 0.7373356535189481, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.08810486944843626, "grad_norm": 1.1023648977279663, "learning_rate": 8.810172570390554e-05, "loss": 0.9826009750366211, "memory(GiB)": 89.13, "step": 6790, "token_acc": 0.7369004344356769, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08816974785009196, "grad_norm": 0.9911883473396301, "learning_rate": 8.816660179058e-05, "loss": 0.9611760139465332, "memory(GiB)": 89.13, "step": 6795, "token_acc": 0.737184249628529, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.08823462625174766, "grad_norm": 0.9631414413452148, "learning_rate": 8.823147787725445e-05, "loss": 1.016391658782959, "memory(GiB)": 89.13, "step": 6800, "token_acc": 0.730857269438086, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.08829950465340336, "grad_norm": 1.1406364440917969, "learning_rate": 8.82963539639289e-05, "loss": 1.0132652282714845, "memory(GiB)": 89.13, "step": 6805, "token_acc": 0.7195130245528123, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.08836438305505906, "grad_norm": 0.9884994029998779, "learning_rate": 8.836123005060335e-05, "loss": 0.9841888427734375, "memory(GiB)": 89.13, "step": 6810, "token_acc": 0.7474645759907933, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.08842926145671476, "grad_norm": 1.0813473463058472, "learning_rate": 8.842610613727781e-05, "loss": 1.0057215690612793, "memory(GiB)": 89.13, "step": 6815, "token_acc": 0.7366318656377667, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.08849413985837044, "grad_norm": 1.021356463432312, "learning_rate": 8.849098222395225e-05, "loss": 1.0061827659606934, "memory(GiB)": 89.13, "step": 6820, "token_acc": 0.7172053067304028, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.08855901826002614, "grad_norm": 1.0130966901779175, "learning_rate": 8.855585831062671e-05, "loss": 0.937005615234375, "memory(GiB)": 89.13, "step": 6825, "token_acc": 0.7442116134190625, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.08862389666168184, "grad_norm": 1.1920769214630127, "learning_rate": 8.862073439730117e-05, "loss": 0.9500310897827149, "memory(GiB)": 89.13, "step": 6830, "token_acc": 0.7438487526874381, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.08868877506333754, "grad_norm": 5.023373603820801, "learning_rate": 8.868561048397561e-05, "loss": 0.9975303649902344, "memory(GiB)": 89.13, "step": 6835, "token_acc": 0.7336543435870906, "train_speed(iter/s)": 0.12159 }, { "epoch": 0.08875365346499324, "grad_norm": 1.0514001846313477, "learning_rate": 8.875048657065007e-05, "loss": 1.004549217224121, "memory(GiB)": 89.13, "step": 6840, "token_acc": 0.7272652659225214, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.08881853186664894, "grad_norm": 0.9656239151954651, "learning_rate": 8.881536265732452e-05, "loss": 1.0209367752075196, "memory(GiB)": 89.13, "step": 6845, "token_acc": 0.739138705014076, "train_speed(iter/s)": 0.12159 }, { "epoch": 0.08888341026830462, "grad_norm": 0.9983412027359009, "learning_rate": 8.888023874399897e-05, "loss": 0.9831819534301758, "memory(GiB)": 89.13, "step": 6850, "token_acc": 0.7338115274914575, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.08894828866996032, "grad_norm": 0.8916625380516052, "learning_rate": 8.894511483067341e-05, "loss": 0.9579207420349121, "memory(GiB)": 89.13, "step": 6855, "token_acc": 0.7380734089975024, "train_speed(iter/s)": 0.121587 }, { "epoch": 0.08901316707161602, "grad_norm": 1.0966088771820068, "learning_rate": 8.900999091734788e-05, "loss": 0.9529075622558594, "memory(GiB)": 89.13, "step": 6860, "token_acc": 0.752951066587271, "train_speed(iter/s)": 0.121586 }, { "epoch": 0.08907804547327172, "grad_norm": 1.0979697704315186, "learning_rate": 8.907486700402232e-05, "loss": 0.9757758140563965, "memory(GiB)": 89.13, "step": 6865, "token_acc": 0.735381772664958, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.08914292387492742, "grad_norm": 0.9922294616699219, "learning_rate": 8.913974309069677e-05, "loss": 0.946745491027832, "memory(GiB)": 89.13, "step": 6870, "token_acc": 0.7513167134831461, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.08920780227658312, "grad_norm": 0.9030492901802063, "learning_rate": 8.920461917737124e-05, "loss": 0.996987247467041, "memory(GiB)": 89.13, "step": 6875, "token_acc": 0.7528623104237879, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.0892726806782388, "grad_norm": 1.0377088785171509, "learning_rate": 8.926949526404568e-05, "loss": 0.9776585578918457, "memory(GiB)": 89.13, "step": 6880, "token_acc": 0.7290310582108983, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.0893375590798945, "grad_norm": 0.9750199317932129, "learning_rate": 8.933437135072012e-05, "loss": 1.0016138076782226, "memory(GiB)": 89.13, "step": 6885, "token_acc": 0.7236550792144685, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.0894024374815502, "grad_norm": 1.0449302196502686, "learning_rate": 8.939924743739458e-05, "loss": 1.0042454719543457, "memory(GiB)": 89.13, "step": 6890, "token_acc": 0.7308446455505279, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.0894673158832059, "grad_norm": 0.8864238262176514, "learning_rate": 8.946412352406903e-05, "loss": 0.9798280715942382, "memory(GiB)": 89.13, "step": 6895, "token_acc": 0.7373869313914378, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.0895321942848616, "grad_norm": 0.9272863864898682, "learning_rate": 8.952899961074348e-05, "loss": 0.9963461875915527, "memory(GiB)": 89.13, "step": 6900, "token_acc": 0.7335084351954743, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.0895970726865173, "grad_norm": 0.9388284683227539, "learning_rate": 8.959387569741793e-05, "loss": 0.96842041015625, "memory(GiB)": 89.13, "step": 6905, "token_acc": 0.751365275631118, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.08966195108817299, "grad_norm": 0.9951668977737427, "learning_rate": 8.965875178409239e-05, "loss": 1.0061222076416017, "memory(GiB)": 89.13, "step": 6910, "token_acc": 0.7265850144092219, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.08972682948982869, "grad_norm": 1.0209271907806396, "learning_rate": 8.972362787076683e-05, "loss": 1.0152436256408692, "memory(GiB)": 89.13, "step": 6915, "token_acc": 0.7510179759658357, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.08979170789148438, "grad_norm": 1.075881004333496, "learning_rate": 8.978850395744129e-05, "loss": 1.0405126571655274, "memory(GiB)": 89.13, "step": 6920, "token_acc": 0.7341577929849327, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.08985658629314008, "grad_norm": 0.942657470703125, "learning_rate": 8.985338004411575e-05, "loss": 1.021531105041504, "memory(GiB)": 89.13, "step": 6925, "token_acc": 0.7267491346535777, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.08992146469479578, "grad_norm": 0.9818950891494751, "learning_rate": 8.991825613079019e-05, "loss": 0.9658201217651368, "memory(GiB)": 89.13, "step": 6930, "token_acc": 0.7637105568649524, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.08998634309645148, "grad_norm": 0.9440091848373413, "learning_rate": 8.998313221746465e-05, "loss": 0.9403085708618164, "memory(GiB)": 89.13, "step": 6935, "token_acc": 0.7482080558198541, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.09005122149810717, "grad_norm": 1.0977939367294312, "learning_rate": 9.00480083041391e-05, "loss": 0.9740276336669922, "memory(GiB)": 89.13, "step": 6940, "token_acc": 0.7372383330132514, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.09011609989976287, "grad_norm": 0.9769631624221802, "learning_rate": 9.011288439081355e-05, "loss": 0.9862232208251953, "memory(GiB)": 89.13, "step": 6945, "token_acc": 0.7243984694208444, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09018097830141857, "grad_norm": 1.0680222511291504, "learning_rate": 9.0177760477488e-05, "loss": 0.9914335250854492, "memory(GiB)": 89.13, "step": 6950, "token_acc": 0.7375437803638007, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09024585670307426, "grad_norm": 1.142669916152954, "learning_rate": 9.024263656416245e-05, "loss": 1.060931396484375, "memory(GiB)": 89.13, "step": 6955, "token_acc": 0.7185543030953659, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09031073510472996, "grad_norm": 1.0133545398712158, "learning_rate": 9.03075126508369e-05, "loss": 1.0185447692871095, "memory(GiB)": 89.13, "step": 6960, "token_acc": 0.7343235558216102, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09037561350638566, "grad_norm": 0.9353241920471191, "learning_rate": 9.037238873751136e-05, "loss": 0.9949161529541015, "memory(GiB)": 89.13, "step": 6965, "token_acc": 0.7530329565355836, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.09044049190804135, "grad_norm": 1.0700513124465942, "learning_rate": 9.04372648241858e-05, "loss": 0.9778785705566406, "memory(GiB)": 89.13, "step": 6970, "token_acc": 0.7304215915555274, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.09050537030969705, "grad_norm": 0.8944912552833557, "learning_rate": 9.050214091086026e-05, "loss": 1.009715461730957, "memory(GiB)": 89.13, "step": 6975, "token_acc": 0.7247138755206859, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.09057024871135275, "grad_norm": 1.0644683837890625, "learning_rate": 9.056701699753472e-05, "loss": 0.9738945960998535, "memory(GiB)": 89.13, "step": 6980, "token_acc": 0.7661444519829766, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.09063512711300845, "grad_norm": 0.9411346316337585, "learning_rate": 9.063189308420916e-05, "loss": 0.9446498870849609, "memory(GiB)": 89.13, "step": 6985, "token_acc": 0.745972114639814, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09070000551466414, "grad_norm": 1.033280372619629, "learning_rate": 9.069676917088362e-05, "loss": 0.9753680229187012, "memory(GiB)": 89.13, "step": 6990, "token_acc": 0.7458823529411764, "train_speed(iter/s)": 0.12159 }, { "epoch": 0.09076488391631984, "grad_norm": 0.9107504487037659, "learning_rate": 9.076164525755807e-05, "loss": 0.9958051681518555, "memory(GiB)": 89.13, "step": 6995, "token_acc": 0.7462557048274089, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.09082976231797553, "grad_norm": 0.9927100539207458, "learning_rate": 9.082652134423252e-05, "loss": 1.0440411567687988, "memory(GiB)": 89.13, "step": 7000, "token_acc": 0.7347494753893163, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.09089464071963123, "grad_norm": 1.0393016338348389, "learning_rate": 9.089139743090697e-05, "loss": 0.9712390899658203, "memory(GiB)": 89.13, "step": 7005, "token_acc": 0.7800559179869525, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.09095951912128693, "grad_norm": 0.8672062754631042, "learning_rate": 9.095627351758143e-05, "loss": 0.9706291198730469, "memory(GiB)": 89.13, "step": 7010, "token_acc": 0.7285074517289479, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.09102439752294263, "grad_norm": 0.8600600361824036, "learning_rate": 9.102114960425587e-05, "loss": 0.9626895904541015, "memory(GiB)": 89.13, "step": 7015, "token_acc": 0.7452934662236987, "train_speed(iter/s)": 0.12159 }, { "epoch": 0.09108927592459833, "grad_norm": 0.992891252040863, "learning_rate": 9.108602569093033e-05, "loss": 0.971992015838623, "memory(GiB)": 89.13, "step": 7020, "token_acc": 0.7406811643517802, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09115415432625402, "grad_norm": 0.9217731356620789, "learning_rate": 9.115090177760478e-05, "loss": 0.9451288223266602, "memory(GiB)": 89.13, "step": 7025, "token_acc": 0.7543696607927941, "train_speed(iter/s)": 0.121587 }, { "epoch": 0.09121903272790971, "grad_norm": 0.9886902570724487, "learning_rate": 9.121577786427923e-05, "loss": 1.0071176528930663, "memory(GiB)": 89.13, "step": 7030, "token_acc": 0.7344002296458431, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09128391112956541, "grad_norm": 0.8762850761413574, "learning_rate": 9.128065395095368e-05, "loss": 0.9805797576904297, "memory(GiB)": 89.13, "step": 7035, "token_acc": 0.7579960373620153, "train_speed(iter/s)": 0.121587 }, { "epoch": 0.09134878953122111, "grad_norm": 0.9578309655189514, "learning_rate": 9.134553003762814e-05, "loss": 0.9472015380859375, "memory(GiB)": 89.13, "step": 7040, "token_acc": 0.7323540974232515, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.09141366793287681, "grad_norm": 1.1019693613052368, "learning_rate": 9.141040612430258e-05, "loss": 0.9754462242126465, "memory(GiB)": 89.13, "step": 7045, "token_acc": 0.746946662291735, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.0914785463345325, "grad_norm": 0.982083261013031, "learning_rate": 9.147528221097704e-05, "loss": 0.9997190475463867, "memory(GiB)": 89.13, "step": 7050, "token_acc": 0.7422042467138523, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.0915434247361882, "grad_norm": 1.0695081949234009, "learning_rate": 9.154015829765148e-05, "loss": 1.024611473083496, "memory(GiB)": 89.13, "step": 7055, "token_acc": 0.7162189054726368, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09160830313784389, "grad_norm": 0.9625498056411743, "learning_rate": 9.160503438432594e-05, "loss": 0.9847209930419922, "memory(GiB)": 89.13, "step": 7060, "token_acc": 0.751204650729008, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09167318153949959, "grad_norm": 0.9901791214942932, "learning_rate": 9.16699104710004e-05, "loss": 0.9628775596618653, "memory(GiB)": 89.13, "step": 7065, "token_acc": 0.7438023279558914, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.09173805994115529, "grad_norm": 0.8832181096076965, "learning_rate": 9.173478655767484e-05, "loss": 0.9970659255981446, "memory(GiB)": 89.13, "step": 7070, "token_acc": 0.7253133155062985, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.09180293834281099, "grad_norm": 1.0435868501663208, "learning_rate": 9.17996626443493e-05, "loss": 0.9690035820007324, "memory(GiB)": 89.13, "step": 7075, "token_acc": 0.7467828418230563, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09186781674446669, "grad_norm": 0.9656345844268799, "learning_rate": 9.186453873102375e-05, "loss": 0.9597922325134277, "memory(GiB)": 89.13, "step": 7080, "token_acc": 0.7258383990164606, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.09193269514612239, "grad_norm": 0.9357357025146484, "learning_rate": 9.19294148176982e-05, "loss": 0.9843554496765137, "memory(GiB)": 89.13, "step": 7085, "token_acc": 0.7340247452692867, "train_speed(iter/s)": 0.121589 }, { "epoch": 0.09199757354777807, "grad_norm": 0.9894979000091553, "learning_rate": 9.199429090437265e-05, "loss": 0.9481256484985352, "memory(GiB)": 89.13, "step": 7090, "token_acc": 0.7645032229384308, "train_speed(iter/s)": 0.121593 }, { "epoch": 0.09206245194943377, "grad_norm": 0.9075835943222046, "learning_rate": 9.205916699104711e-05, "loss": 0.9765890121459961, "memory(GiB)": 89.13, "step": 7095, "token_acc": 0.7161613299329366, "train_speed(iter/s)": 0.121591 }, { "epoch": 0.09212733035108947, "grad_norm": 0.9313375353813171, "learning_rate": 9.212404307772155e-05, "loss": 0.963507080078125, "memory(GiB)": 89.13, "step": 7100, "token_acc": 0.7483104092375615, "train_speed(iter/s)": 0.121595 }, { "epoch": 0.09219220875274517, "grad_norm": 0.9564356803894043, "learning_rate": 9.218891916439601e-05, "loss": 1.0014965057373046, "memory(GiB)": 89.13, "step": 7105, "token_acc": 0.7378130967089745, "train_speed(iter/s)": 0.121593 }, { "epoch": 0.09225708715440087, "grad_norm": 0.9445807933807373, "learning_rate": 9.225379525107047e-05, "loss": 1.0161091804504394, "memory(GiB)": 89.13, "step": 7110, "token_acc": 0.7233546084688766, "train_speed(iter/s)": 0.121594 }, { "epoch": 0.09232196555605657, "grad_norm": 0.9736127853393555, "learning_rate": 9.231867133774491e-05, "loss": 0.9449707984924316, "memory(GiB)": 89.13, "step": 7115, "token_acc": 0.7289807573741766, "train_speed(iter/s)": 0.121595 }, { "epoch": 0.09238684395771225, "grad_norm": 1.1882919073104858, "learning_rate": 9.238354742441935e-05, "loss": 0.9970000267028809, "memory(GiB)": 89.13, "step": 7120, "token_acc": 0.7233865755701228, "train_speed(iter/s)": 0.121595 }, { "epoch": 0.09245172235936795, "grad_norm": 0.9410256743431091, "learning_rate": 9.244842351109382e-05, "loss": 1.0127625465393066, "memory(GiB)": 89.13, "step": 7125, "token_acc": 0.7435857467967703, "train_speed(iter/s)": 0.121594 }, { "epoch": 0.09251660076102365, "grad_norm": 0.9633681774139404, "learning_rate": 9.251329959776827e-05, "loss": 0.9777100563049317, "memory(GiB)": 89.13, "step": 7130, "token_acc": 0.7009747232777135, "train_speed(iter/s)": 0.121597 }, { "epoch": 0.09258147916267935, "grad_norm": 0.9469655156135559, "learning_rate": 9.257817568444271e-05, "loss": 1.0109508514404297, "memory(GiB)": 89.13, "step": 7135, "token_acc": 0.7045116017344676, "train_speed(iter/s)": 0.121599 }, { "epoch": 0.09264635756433505, "grad_norm": 0.9134995937347412, "learning_rate": 9.264305177111718e-05, "loss": 0.9644187927246094, "memory(GiB)": 89.13, "step": 7140, "token_acc": 0.756535535282876, "train_speed(iter/s)": 0.121599 }, { "epoch": 0.09271123596599075, "grad_norm": 1.0389479398727417, "learning_rate": 9.270792785779162e-05, "loss": 0.9681709289550782, "memory(GiB)": 89.13, "step": 7145, "token_acc": 0.7376738305941846, "train_speed(iter/s)": 0.121598 }, { "epoch": 0.09277611436764643, "grad_norm": 1.0531755685806274, "learning_rate": 9.277280394446607e-05, "loss": 1.0073566436767578, "memory(GiB)": 89.13, "step": 7150, "token_acc": 0.721233599758709, "train_speed(iter/s)": 0.121601 }, { "epoch": 0.09284099276930213, "grad_norm": 0.9406887292861938, "learning_rate": 9.283768003114052e-05, "loss": 0.9685914993286133, "memory(GiB)": 89.13, "step": 7155, "token_acc": 0.7310223078157599, "train_speed(iter/s)": 0.121598 }, { "epoch": 0.09290587117095783, "grad_norm": 0.8524556756019592, "learning_rate": 9.290255611781498e-05, "loss": 0.9455994606018067, "memory(GiB)": 89.13, "step": 7160, "token_acc": 0.7583680266444629, "train_speed(iter/s)": 0.121595 }, { "epoch": 0.09297074957261353, "grad_norm": 0.990800142288208, "learning_rate": 9.296743220448942e-05, "loss": 0.9941011428833008, "memory(GiB)": 89.13, "step": 7165, "token_acc": 0.737875925051173, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.09303562797426923, "grad_norm": 0.9600595831871033, "learning_rate": 9.303230829116388e-05, "loss": 1.0152209281921387, "memory(GiB)": 89.13, "step": 7170, "token_acc": 0.7540469749667225, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.09310050637592493, "grad_norm": 1.0442744493484497, "learning_rate": 9.309718437783833e-05, "loss": 1.003508949279785, "memory(GiB)": 89.13, "step": 7175, "token_acc": 0.7374070174246798, "train_speed(iter/s)": 0.121594 }, { "epoch": 0.09316538477758061, "grad_norm": 0.9269804954528809, "learning_rate": 9.316206046451279e-05, "loss": 0.9630768775939942, "memory(GiB)": 89.13, "step": 7180, "token_acc": 0.7366223618186694, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.09323026317923631, "grad_norm": 0.9328075051307678, "learning_rate": 9.322693655118723e-05, "loss": 1.0250890731811524, "memory(GiB)": 89.13, "step": 7185, "token_acc": 0.7298765917051294, "train_speed(iter/s)": 0.121592 }, { "epoch": 0.09329514158089201, "grad_norm": 1.0151560306549072, "learning_rate": 9.329181263786169e-05, "loss": 1.0124229431152343, "memory(GiB)": 89.13, "step": 7190, "token_acc": 0.7502630286876621, "train_speed(iter/s)": 0.121588 }, { "epoch": 0.09336001998254771, "grad_norm": 0.9904279112815857, "learning_rate": 9.335668872453615e-05, "loss": 0.9794008255004882, "memory(GiB)": 89.13, "step": 7195, "token_acc": 0.7425455949049503, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.09342489838420341, "grad_norm": 0.9928146004676819, "learning_rate": 9.342156481121059e-05, "loss": 1.0031620025634767, "memory(GiB)": 89.13, "step": 7200, "token_acc": 0.7358013254471603, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.0934897767858591, "grad_norm": 0.8808194994926453, "learning_rate": 9.348644089788505e-05, "loss": 0.956937599182129, "memory(GiB)": 89.13, "step": 7205, "token_acc": 0.74001623975788, "train_speed(iter/s)": 0.121585 }, { "epoch": 0.0935546551875148, "grad_norm": 1.1702252626419067, "learning_rate": 9.35513169845595e-05, "loss": 0.9942723274230957, "memory(GiB)": 89.13, "step": 7210, "token_acc": 0.7294073157315731, "train_speed(iter/s)": 0.121584 }, { "epoch": 0.0936195335891705, "grad_norm": 0.9686222076416016, "learning_rate": 9.361619307123395e-05, "loss": 1.0236702919006349, "memory(GiB)": 89.13, "step": 7215, "token_acc": 0.7378759760495816, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.0936844119908262, "grad_norm": 0.858786404132843, "learning_rate": 9.368106915790839e-05, "loss": 0.9434444427490234, "memory(GiB)": 89.13, "step": 7220, "token_acc": 0.7504696305572949, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.0937492903924819, "grad_norm": 1.0512419939041138, "learning_rate": 9.374594524458286e-05, "loss": 1.0038225173950195, "memory(GiB)": 89.13, "step": 7225, "token_acc": 0.7391320726286447, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.09381416879413759, "grad_norm": 0.9181923270225525, "learning_rate": 9.38108213312573e-05, "loss": 1.0344453811645509, "memory(GiB)": 89.13, "step": 7230, "token_acc": 0.7374914816541731, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09387904719579328, "grad_norm": 0.8266717195510864, "learning_rate": 9.387569741793175e-05, "loss": 0.9591407775878906, "memory(GiB)": 89.13, "step": 7235, "token_acc": 0.7601836445430455, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09394392559744898, "grad_norm": 1.0397521257400513, "learning_rate": 9.394057350460622e-05, "loss": 0.972964096069336, "memory(GiB)": 89.13, "step": 7240, "token_acc": 0.7327017985492366, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09400880399910468, "grad_norm": 0.9261758923530579, "learning_rate": 9.400544959128066e-05, "loss": 0.9737716674804687, "memory(GiB)": 89.13, "step": 7245, "token_acc": 0.7126608176994833, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.09407368240076038, "grad_norm": 1.0654032230377197, "learning_rate": 9.40703256779551e-05, "loss": 1.000113296508789, "memory(GiB)": 89.13, "step": 7250, "token_acc": 0.7232581298719137, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09413856080241607, "grad_norm": 0.8968384861946106, "learning_rate": 9.413520176462956e-05, "loss": 1.0047381401062012, "memory(GiB)": 89.13, "step": 7255, "token_acc": 0.7126296934671669, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09420343920407177, "grad_norm": 1.0113643407821655, "learning_rate": 9.420007785130402e-05, "loss": 1.0161388397216797, "memory(GiB)": 89.13, "step": 7260, "token_acc": 0.7351106069200227, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09426831760572746, "grad_norm": 1.0083034038543701, "learning_rate": 9.426495393797846e-05, "loss": 0.9568971633911133, "memory(GiB)": 89.13, "step": 7265, "token_acc": 0.7604857334425985, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09433319600738316, "grad_norm": 1.0705342292785645, "learning_rate": 9.432983002465292e-05, "loss": 0.9835772514343262, "memory(GiB)": 89.13, "step": 7270, "token_acc": 0.7267686424474188, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09439807440903886, "grad_norm": 1.0987374782562256, "learning_rate": 9.439470611132737e-05, "loss": 0.983738899230957, "memory(GiB)": 89.13, "step": 7275, "token_acc": 0.7321003416031193, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09446295281069456, "grad_norm": 0.9762403964996338, "learning_rate": 9.445958219800182e-05, "loss": 0.993045425415039, "memory(GiB)": 89.13, "step": 7280, "token_acc": 0.7232018220556903, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09452783121235026, "grad_norm": 1.0739717483520508, "learning_rate": 9.452445828467627e-05, "loss": 1.0095361709594726, "memory(GiB)": 89.13, "step": 7285, "token_acc": 0.7474819444894257, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09459270961400595, "grad_norm": 0.8529001474380493, "learning_rate": 9.458933437135073e-05, "loss": 0.9625384330749511, "memory(GiB)": 89.13, "step": 7290, "token_acc": 0.7257116740663359, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09465758801566164, "grad_norm": 0.8843984603881836, "learning_rate": 9.465421045802517e-05, "loss": 0.9747405052185059, "memory(GiB)": 89.13, "step": 7295, "token_acc": 0.7251707661225668, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09472246641731734, "grad_norm": 0.9005332589149475, "learning_rate": 9.471908654469963e-05, "loss": 0.9929369926452637, "memory(GiB)": 89.13, "step": 7300, "token_acc": 0.734008473700527, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.09478734481897304, "grad_norm": 0.9272375702857971, "learning_rate": 9.478396263137408e-05, "loss": 1.0159122467041015, "memory(GiB)": 89.13, "step": 7305, "token_acc": 0.7431663071537183, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.09485222322062874, "grad_norm": 0.9048323631286621, "learning_rate": 9.484883871804853e-05, "loss": 0.9805566787719726, "memory(GiB)": 89.13, "step": 7310, "token_acc": 0.7406883032019168, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09491710162228444, "grad_norm": 0.9806305766105652, "learning_rate": 9.491371480472298e-05, "loss": 0.9929019927978515, "memory(GiB)": 89.13, "step": 7315, "token_acc": 0.7277648632296277, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09498198002394014, "grad_norm": 0.9532061815261841, "learning_rate": 9.497859089139743e-05, "loss": 0.9914905548095703, "memory(GiB)": 89.13, "step": 7320, "token_acc": 0.7133596740338314, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.09504685842559582, "grad_norm": 0.9747717976570129, "learning_rate": 9.504346697807188e-05, "loss": 1.0215042114257813, "memory(GiB)": 89.13, "step": 7325, "token_acc": 0.7328647925033467, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09511173682725152, "grad_norm": 1.0942456722259521, "learning_rate": 9.510834306474634e-05, "loss": 0.9797447204589844, "memory(GiB)": 89.13, "step": 7330, "token_acc": 0.7557770233271274, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09517661522890722, "grad_norm": 0.940068781375885, "learning_rate": 9.517321915142078e-05, "loss": 1.0370245933532716, "memory(GiB)": 89.13, "step": 7335, "token_acc": 0.7344024254029041, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.09524149363056292, "grad_norm": 0.9782463908195496, "learning_rate": 9.523809523809524e-05, "loss": 0.9851066589355468, "memory(GiB)": 89.13, "step": 7340, "token_acc": 0.7401616890492416, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09530637203221862, "grad_norm": 0.9186102151870728, "learning_rate": 9.53029713247697e-05, "loss": 0.9444065093994141, "memory(GiB)": 89.13, "step": 7345, "token_acc": 0.7798502088434395, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09537125043387432, "grad_norm": 0.9800398349761963, "learning_rate": 9.536784741144414e-05, "loss": 0.9508193969726563, "memory(GiB)": 89.13, "step": 7350, "token_acc": 0.7211085801063022, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09543612883553, "grad_norm": 0.9362871646881104, "learning_rate": 9.54327234981186e-05, "loss": 1.0236140251159669, "memory(GiB)": 89.13, "step": 7355, "token_acc": 0.7392460760491233, "train_speed(iter/s)": 0.121581 }, { "epoch": 0.0955010072371857, "grad_norm": 0.9620161652565002, "learning_rate": 9.549759958479305e-05, "loss": 0.9993377685546875, "memory(GiB)": 89.13, "step": 7360, "token_acc": 0.7498532002348797, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.0955658856388414, "grad_norm": 0.9739927649497986, "learning_rate": 9.55624756714675e-05, "loss": 1.0113235473632813, "memory(GiB)": 89.13, "step": 7365, "token_acc": 0.725653617374464, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.0956307640404971, "grad_norm": 0.8824233412742615, "learning_rate": 9.562735175814195e-05, "loss": 0.984654426574707, "memory(GiB)": 89.13, "step": 7370, "token_acc": 0.7323038198578183, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.0956956424421528, "grad_norm": 0.8987405300140381, "learning_rate": 9.569222784481641e-05, "loss": 1.0351774215698242, "memory(GiB)": 89.13, "step": 7375, "token_acc": 0.7243078755002812, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.0957605208438085, "grad_norm": 0.958381712436676, "learning_rate": 9.575710393149085e-05, "loss": 0.9876920700073242, "memory(GiB)": 89.13, "step": 7380, "token_acc": 0.7050970419954378, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09582539924546418, "grad_norm": 1.0309057235717773, "learning_rate": 9.582198001816531e-05, "loss": 0.9907495498657226, "memory(GiB)": 89.13, "step": 7385, "token_acc": 0.7451616780863084, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09589027764711988, "grad_norm": 0.9588906764984131, "learning_rate": 9.588685610483977e-05, "loss": 0.983721923828125, "memory(GiB)": 89.13, "step": 7390, "token_acc": 0.7676064333017976, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.09595515604877558, "grad_norm": 0.8502411246299744, "learning_rate": 9.595173219151421e-05, "loss": 0.9586681365966797, "memory(GiB)": 89.13, "step": 7395, "token_acc": 0.7592047342798006, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09602003445043128, "grad_norm": 0.9542590379714966, "learning_rate": 9.601660827818867e-05, "loss": 1.0653684616088868, "memory(GiB)": 89.13, "step": 7400, "token_acc": 0.7269949618072485, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09608491285208698, "grad_norm": 0.945134162902832, "learning_rate": 9.608148436486312e-05, "loss": 0.9625123977661133, "memory(GiB)": 89.13, "step": 7405, "token_acc": 0.7476331823152632, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09614979125374268, "grad_norm": 0.9356964230537415, "learning_rate": 9.614636045153757e-05, "loss": 0.9491386413574219, "memory(GiB)": 89.13, "step": 7410, "token_acc": 0.7538359788359789, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09621466965539836, "grad_norm": 0.9439274668693542, "learning_rate": 9.621123653821202e-05, "loss": 0.9704273223876954, "memory(GiB)": 89.13, "step": 7415, "token_acc": 0.7347417116422513, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.09627954805705406, "grad_norm": 1.1318573951721191, "learning_rate": 9.627611262488647e-05, "loss": 1.0300151824951171, "memory(GiB)": 89.13, "step": 7420, "token_acc": 0.7333111185160502, "train_speed(iter/s)": 0.121569 }, { "epoch": 0.09634442645870976, "grad_norm": 1.0603137016296387, "learning_rate": 9.634098871156092e-05, "loss": 0.9886566162109375, "memory(GiB)": 89.13, "step": 7425, "token_acc": 0.735249340908247, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09640930486036546, "grad_norm": 0.8495898246765137, "learning_rate": 9.640586479823538e-05, "loss": 1.0158121109008789, "memory(GiB)": 89.13, "step": 7430, "token_acc": 0.7327121487986862, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.09647418326202116, "grad_norm": 0.9217912554740906, "learning_rate": 9.647074088490982e-05, "loss": 1.0061368942260742, "memory(GiB)": 89.13, "step": 7435, "token_acc": 0.7401018967528084, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09653906166367686, "grad_norm": 1.007940649986267, "learning_rate": 9.653561697158428e-05, "loss": 1.00764741897583, "memory(GiB)": 89.13, "step": 7440, "token_acc": 0.7310403365686239, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09660394006533254, "grad_norm": 0.9936829805374146, "learning_rate": 9.660049305825874e-05, "loss": 0.9768404960632324, "memory(GiB)": 89.13, "step": 7445, "token_acc": 0.7412395755843546, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09666881846698824, "grad_norm": 1.0002847909927368, "learning_rate": 9.666536914493318e-05, "loss": 1.015312385559082, "memory(GiB)": 89.13, "step": 7450, "token_acc": 0.7266579406631762, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.09673369686864394, "grad_norm": 1.0348551273345947, "learning_rate": 9.673024523160763e-05, "loss": 0.9904894828796387, "memory(GiB)": 89.13, "step": 7455, "token_acc": 0.7341584589906738, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.09679857527029964, "grad_norm": 1.0642454624176025, "learning_rate": 9.679512131828209e-05, "loss": 0.9638532638549805, "memory(GiB)": 89.13, "step": 7460, "token_acc": 0.7322418714524754, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09686345367195534, "grad_norm": 0.9306467175483704, "learning_rate": 9.685999740495653e-05, "loss": 1.0074665069580078, "memory(GiB)": 89.13, "step": 7465, "token_acc": 0.7299948038451546, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09692833207361104, "grad_norm": 0.9369504451751709, "learning_rate": 9.692487349163099e-05, "loss": 1.007098388671875, "memory(GiB)": 89.13, "step": 7470, "token_acc": 0.7119689264405558, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09699321047526673, "grad_norm": 1.0550923347473145, "learning_rate": 9.698974957830545e-05, "loss": 1.054002571105957, "memory(GiB)": 89.13, "step": 7475, "token_acc": 0.7336612789880534, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.09705808887692242, "grad_norm": 0.9430214166641235, "learning_rate": 9.705462566497989e-05, "loss": 0.9923044204711914, "memory(GiB)": 89.13, "step": 7480, "token_acc": 0.7450480676172656, "train_speed(iter/s)": 0.12158 }, { "epoch": 0.09712296727857812, "grad_norm": 0.9589431285858154, "learning_rate": 9.711950175165433e-05, "loss": 0.9402302742004395, "memory(GiB)": 89.13, "step": 7485, "token_acc": 0.7512137292536977, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09718784568023382, "grad_norm": 0.9287962317466736, "learning_rate": 9.71843778383288e-05, "loss": 0.9845195770263672, "memory(GiB)": 89.13, "step": 7490, "token_acc": 0.7405791603439554, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09725272408188952, "grad_norm": 1.0666640996932983, "learning_rate": 9.724925392500325e-05, "loss": 1.0136554718017579, "memory(GiB)": 89.13, "step": 7495, "token_acc": 0.7408704946262098, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.09731760248354522, "grad_norm": 0.9335580468177795, "learning_rate": 9.731413001167769e-05, "loss": 0.9967855453491211, "memory(GiB)": 89.13, "step": 7500, "token_acc": 0.7285033605755578, "train_speed(iter/s)": 0.121568 }, { "epoch": 0.0973824808852009, "grad_norm": 0.8933122754096985, "learning_rate": 9.737900609835216e-05, "loss": 0.965910530090332, "memory(GiB)": 89.13, "step": 7505, "token_acc": 0.7482093455110471, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.0974473592868566, "grad_norm": 0.8080366253852844, "learning_rate": 9.74438821850266e-05, "loss": 0.9678458213806153, "memory(GiB)": 89.13, "step": 7510, "token_acc": 0.73919723449203, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.0975122376885123, "grad_norm": 0.9443601965904236, "learning_rate": 9.750875827170105e-05, "loss": 1.0141765594482421, "memory(GiB)": 89.13, "step": 7515, "token_acc": 0.7462774583517617, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.097577116090168, "grad_norm": 1.0114281177520752, "learning_rate": 9.75736343583755e-05, "loss": 1.038348388671875, "memory(GiB)": 89.13, "step": 7520, "token_acc": 0.7460827777589766, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.0976419944918237, "grad_norm": 1.147032618522644, "learning_rate": 9.763851044504996e-05, "loss": 1.0021699905395507, "memory(GiB)": 89.13, "step": 7525, "token_acc": 0.7453096213858705, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.0977068728934794, "grad_norm": 0.9814774990081787, "learning_rate": 9.77033865317244e-05, "loss": 1.0183626174926759, "memory(GiB)": 89.13, "step": 7530, "token_acc": 0.7107794624124144, "train_speed(iter/s)": 0.121582 }, { "epoch": 0.09777175129513509, "grad_norm": 0.8764054179191589, "learning_rate": 9.776826261839886e-05, "loss": 1.004580020904541, "memory(GiB)": 89.13, "step": 7535, "token_acc": 0.7209826318260053, "train_speed(iter/s)": 0.121579 }, { "epoch": 0.09783662969679079, "grad_norm": 0.9199085235595703, "learning_rate": 9.783313870507332e-05, "loss": 1.0336655616760253, "memory(GiB)": 89.13, "step": 7540, "token_acc": 0.7104191302062803, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.09790150809844649, "grad_norm": 0.8746131062507629, "learning_rate": 9.789801479174777e-05, "loss": 0.9456822395324707, "memory(GiB)": 89.13, "step": 7545, "token_acc": 0.7220859687621014, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09796638650010218, "grad_norm": 0.9375726580619812, "learning_rate": 9.796289087842222e-05, "loss": 0.9675101280212403, "memory(GiB)": 89.13, "step": 7550, "token_acc": 0.7452310446938916, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09803126490175788, "grad_norm": 0.9158331155776978, "learning_rate": 9.802776696509667e-05, "loss": 0.9388223648071289, "memory(GiB)": 89.13, "step": 7555, "token_acc": 0.749477220527236, "train_speed(iter/s)": 0.12157 }, { "epoch": 0.09809614330341358, "grad_norm": 0.9947110414505005, "learning_rate": 9.809264305177113e-05, "loss": 1.0415807723999024, "memory(GiB)": 89.13, "step": 7560, "token_acc": 0.7131160203923992, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09816102170506927, "grad_norm": 1.1180232763290405, "learning_rate": 9.815751913844557e-05, "loss": 1.0375352859497071, "memory(GiB)": 89.13, "step": 7565, "token_acc": 0.7318484799219639, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09822590010672497, "grad_norm": 0.8911582231521606, "learning_rate": 9.822239522512003e-05, "loss": 1.0309473037719727, "memory(GiB)": 89.13, "step": 7570, "token_acc": 0.7341103341103341, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09829077850838067, "grad_norm": 0.9335363507270813, "learning_rate": 9.828727131179449e-05, "loss": 1.0361138343811036, "memory(GiB)": 89.13, "step": 7575, "token_acc": 0.7427296090896794, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09835565691003637, "grad_norm": 0.9939594864845276, "learning_rate": 9.835214739846893e-05, "loss": 1.047262954711914, "memory(GiB)": 89.13, "step": 7580, "token_acc": 0.7195769055065537, "train_speed(iter/s)": 0.121576 }, { "epoch": 0.09842053531169206, "grad_norm": 0.9555116891860962, "learning_rate": 9.841702348514337e-05, "loss": 1.0137920379638672, "memory(GiB)": 89.13, "step": 7585, "token_acc": 0.7380171805618957, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09848541371334776, "grad_norm": 0.9924075603485107, "learning_rate": 9.848189957181784e-05, "loss": 1.0032430648803712, "memory(GiB)": 89.13, "step": 7590, "token_acc": 0.7361501711110359, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.09855029211500345, "grad_norm": 0.9705674648284912, "learning_rate": 9.854677565849228e-05, "loss": 1.0086929321289062, "memory(GiB)": 89.13, "step": 7595, "token_acc": 0.7503222530369907, "train_speed(iter/s)": 0.121577 }, { "epoch": 0.09861517051665915, "grad_norm": 1.1820871829986572, "learning_rate": 9.861165174516673e-05, "loss": 0.9971777915954589, "memory(GiB)": 89.13, "step": 7600, "token_acc": 0.7408496527560533, "train_speed(iter/s)": 0.121578 }, { "epoch": 0.09868004891831485, "grad_norm": 1.2225207090377808, "learning_rate": 9.86765278318412e-05, "loss": 1.0032736778259277, "memory(GiB)": 89.13, "step": 7605, "token_acc": 0.7349795669941744, "train_speed(iter/s)": 0.121583 }, { "epoch": 0.09874492731997055, "grad_norm": 1.0200626850128174, "learning_rate": 9.874140391851564e-05, "loss": 0.921210765838623, "memory(GiB)": 89.13, "step": 7610, "token_acc": 0.7395150842124746, "train_speed(iter/s)": 0.121575 }, { "epoch": 0.09880980572162625, "grad_norm": 0.8698384165763855, "learning_rate": 9.880628000519008e-05, "loss": 0.9716634750366211, "memory(GiB)": 89.13, "step": 7615, "token_acc": 0.7588553418573833, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09887468412328194, "grad_norm": 1.0759915113449097, "learning_rate": 9.887115609186454e-05, "loss": 1.0229912757873536, "memory(GiB)": 89.13, "step": 7620, "token_acc": 0.7285200914733747, "train_speed(iter/s)": 0.121574 }, { "epoch": 0.09893956252493763, "grad_norm": 0.9760597348213196, "learning_rate": 9.8936032178539e-05, "loss": 1.0300661087036134, "memory(GiB)": 89.13, "step": 7625, "token_acc": 0.7218135777161462, "train_speed(iter/s)": 0.121573 }, { "epoch": 0.09900444092659333, "grad_norm": 0.8309049010276794, "learning_rate": 9.900090826521344e-05, "loss": 0.9586987495422363, "memory(GiB)": 89.13, "step": 7630, "token_acc": 0.7522553564716201, "train_speed(iter/s)": 0.121572 }, { "epoch": 0.09906931932824903, "grad_norm": 0.9314054250717163, "learning_rate": 9.90657843518879e-05, "loss": 0.9896963119506836, "memory(GiB)": 89.13, "step": 7635, "token_acc": 0.7419807186678352, "train_speed(iter/s)": 0.121571 }, { "epoch": 0.09913419772990473, "grad_norm": 1.023512840270996, "learning_rate": 9.913066043856235e-05, "loss": 0.9387906074523926, "memory(GiB)": 89.13, "step": 7640, "token_acc": 0.7402526084568918, "train_speed(iter/s)": 0.121567 }, { "epoch": 0.09919907613156043, "grad_norm": 1.0892293453216553, "learning_rate": 9.91955365252368e-05, "loss": 0.983650016784668, "memory(GiB)": 89.13, "step": 7645, "token_acc": 0.7477745834920855, "train_speed(iter/s)": 0.121562 }, { "epoch": 0.09926395453321613, "grad_norm": 1.0596755743026733, "learning_rate": 9.926041261191125e-05, "loss": 1.037545394897461, "memory(GiB)": 89.13, "step": 7650, "token_acc": 0.7226890756302521, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.09932883293487181, "grad_norm": 0.8476898074150085, "learning_rate": 9.932528869858571e-05, "loss": 1.0176932334899902, "memory(GiB)": 89.13, "step": 7655, "token_acc": 0.7368256254501049, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.09939371133652751, "grad_norm": 0.9427684545516968, "learning_rate": 9.939016478526015e-05, "loss": 1.0234504699707032, "memory(GiB)": 89.13, "step": 7660, "token_acc": 0.7217168435492022, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.09945858973818321, "grad_norm": 0.9811166524887085, "learning_rate": 9.945504087193461e-05, "loss": 1.0088695526123046, "memory(GiB)": 89.13, "step": 7665, "token_acc": 0.7372159090909091, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.09952346813983891, "grad_norm": 0.9621495008468628, "learning_rate": 9.951991695860907e-05, "loss": 1.0050294876098633, "memory(GiB)": 89.13, "step": 7670, "token_acc": 0.7228752098888929, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.09958834654149461, "grad_norm": 0.8790849447250366, "learning_rate": 9.958479304528351e-05, "loss": 1.0271270751953125, "memory(GiB)": 89.13, "step": 7675, "token_acc": 0.7156336658623546, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.0996532249431503, "grad_norm": 0.9330216646194458, "learning_rate": 9.964966913195797e-05, "loss": 1.0031145095825196, "memory(GiB)": 89.13, "step": 7680, "token_acc": 0.7169872839908705, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.09971810334480599, "grad_norm": 0.9155007004737854, "learning_rate": 9.971454521863241e-05, "loss": 0.9914966583251953, "memory(GiB)": 89.13, "step": 7685, "token_acc": 0.7203708191953466, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.09978298174646169, "grad_norm": 0.9496882557868958, "learning_rate": 9.977942130530687e-05, "loss": 1.0076119422912597, "memory(GiB)": 89.13, "step": 7690, "token_acc": 0.7209789512187011, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.09984786014811739, "grad_norm": 0.886504590511322, "learning_rate": 9.984429739198132e-05, "loss": 0.9900485038757324, "memory(GiB)": 89.13, "step": 7695, "token_acc": 0.7558090247003969, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.09991273854977309, "grad_norm": 0.9648659825325012, "learning_rate": 9.990917347865577e-05, "loss": 1.0117545127868652, "memory(GiB)": 89.13, "step": 7700, "token_acc": 0.7356004392240375, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.09997761695142879, "grad_norm": 0.9933404326438904, "learning_rate": 9.997404956533022e-05, "loss": 0.9938316345214844, "memory(GiB)": 89.13, "step": 7705, "token_acc": 0.7581958031513704, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.10004249535308449, "grad_norm": 1.0412521362304688, "learning_rate": 9.999999989642858e-05, "loss": 0.9881864547729492, "memory(GiB)": 89.13, "step": 7710, "token_acc": 0.7271122776847536, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.10010737375474017, "grad_norm": 0.9106295108795166, "learning_rate": 9.999999926349213e-05, "loss": 0.9719056129455567, "memory(GiB)": 89.13, "step": 7715, "token_acc": 0.7656297600108317, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.10017225215639587, "grad_norm": 1.062477707862854, "learning_rate": 9.999999805515889e-05, "loss": 1.0367202758789062, "memory(GiB)": 89.13, "step": 7720, "token_acc": 0.7186832468721391, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.10023713055805157, "grad_norm": 1.043255090713501, "learning_rate": 9.99999962714289e-05, "loss": 0.9813391685485839, "memory(GiB)": 89.13, "step": 7725, "token_acc": 0.7362993619589584, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.10030200895970727, "grad_norm": 1.04062819480896, "learning_rate": 9.999999391230217e-05, "loss": 1.0440729141235352, "memory(GiB)": 89.13, "step": 7730, "token_acc": 0.740715109573241, "train_speed(iter/s)": 0.121561 }, { "epoch": 0.10036688736136297, "grad_norm": 1.0358823537826538, "learning_rate": 9.999999097777872e-05, "loss": 1.0375226974487304, "memory(GiB)": 89.13, "step": 7735, "token_acc": 0.7095473043540894, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.10043176576301867, "grad_norm": 0.9652231931686401, "learning_rate": 9.99999874678586e-05, "loss": 0.9967095375061035, "memory(GiB)": 89.13, "step": 7740, "token_acc": 0.7497210737021723, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.10049664416467435, "grad_norm": 0.882781445980072, "learning_rate": 9.999998338254184e-05, "loss": 0.9776859283447266, "memory(GiB)": 89.13, "step": 7745, "token_acc": 0.7572432926637032, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.10056152256633005, "grad_norm": 0.9058821797370911, "learning_rate": 9.999997872182849e-05, "loss": 0.9995176315307617, "memory(GiB)": 89.13, "step": 7750, "token_acc": 0.7258310643008886, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.10062640096798575, "grad_norm": 0.910384476184845, "learning_rate": 9.99999734857186e-05, "loss": 0.9806253433227539, "memory(GiB)": 89.13, "step": 7755, "token_acc": 0.7323413245154772, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.10069127936964145, "grad_norm": 0.9048728942871094, "learning_rate": 9.999996767421225e-05, "loss": 0.9797175407409668, "memory(GiB)": 89.13, "step": 7760, "token_acc": 0.7324020728056906, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.10075615777129715, "grad_norm": 0.9220802187919617, "learning_rate": 9.999996128730947e-05, "loss": 1.007463550567627, "memory(GiB)": 89.13, "step": 7765, "token_acc": 0.7615643218860377, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.10082103617295285, "grad_norm": 0.9969472289085388, "learning_rate": 9.999995432501036e-05, "loss": 1.0341331481933593, "memory(GiB)": 89.13, "step": 7770, "token_acc": 0.7422155469265148, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10088591457460853, "grad_norm": 1.0454678535461426, "learning_rate": 9.999994678731499e-05, "loss": 1.0099912643432618, "memory(GiB)": 89.13, "step": 7775, "token_acc": 0.7309866392600206, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.10095079297626423, "grad_norm": 1.2984431982040405, "learning_rate": 9.999993867422346e-05, "loss": 1.051440715789795, "memory(GiB)": 89.13, "step": 7780, "token_acc": 0.7362893815635939, "train_speed(iter/s)": 0.121559 }, { "epoch": 0.10101567137791993, "grad_norm": 0.989984929561615, "learning_rate": 9.999992998573584e-05, "loss": 0.9816394805908203, "memory(GiB)": 89.13, "step": 7785, "token_acc": 0.7352767846619259, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.10108054977957563, "grad_norm": 0.8945564031600952, "learning_rate": 9.999992072185225e-05, "loss": 1.0313477516174316, "memory(GiB)": 89.13, "step": 7790, "token_acc": 0.7322754551927576, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.10114542818123133, "grad_norm": 0.8963878750801086, "learning_rate": 9.99999108825728e-05, "loss": 0.9771123886108398, "memory(GiB)": 89.13, "step": 7795, "token_acc": 0.7301064368981247, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.10121030658288703, "grad_norm": 0.9366214275360107, "learning_rate": 9.999990046789758e-05, "loss": 1.0035662651062012, "memory(GiB)": 89.13, "step": 7800, "token_acc": 0.731529449270742, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.10127518498454272, "grad_norm": 0.9348906874656677, "learning_rate": 9.999988947782672e-05, "loss": 1.029129409790039, "memory(GiB)": 89.13, "step": 7805, "token_acc": 0.7313422068318725, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.10134006338619841, "grad_norm": 0.9444119334220886, "learning_rate": 9.999987791236036e-05, "loss": 1.0678500175476073, "memory(GiB)": 89.13, "step": 7810, "token_acc": 0.7402651058702014, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.10140494178785411, "grad_norm": 0.8878713250160217, "learning_rate": 9.999986577149863e-05, "loss": 0.9700538635253906, "memory(GiB)": 89.13, "step": 7815, "token_acc": 0.7539421536385867, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10146982018950981, "grad_norm": 0.8799527287483215, "learning_rate": 9.999985305524164e-05, "loss": 0.9468729019165039, "memory(GiB)": 89.13, "step": 7820, "token_acc": 0.7581816947311244, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.10153469859116551, "grad_norm": 0.9625751972198486, "learning_rate": 9.999983976358957e-05, "loss": 1.0187742233276367, "memory(GiB)": 89.13, "step": 7825, "token_acc": 0.7205272056563172, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.10159957699282121, "grad_norm": 1.0513559579849243, "learning_rate": 9.999982589654258e-05, "loss": 1.031821060180664, "memory(GiB)": 89.13, "step": 7830, "token_acc": 0.713650819538449, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.1016644553944769, "grad_norm": 0.9652976989746094, "learning_rate": 9.999981145410079e-05, "loss": 0.9548957824707032, "memory(GiB)": 89.13, "step": 7835, "token_acc": 0.7492030696576151, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.1017293337961326, "grad_norm": 0.9549874663352966, "learning_rate": 9.99997964362644e-05, "loss": 1.0450479507446289, "memory(GiB)": 89.13, "step": 7840, "token_acc": 0.7113236158821526, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.1017942121977883, "grad_norm": 0.8725703954696655, "learning_rate": 9.999978084303356e-05, "loss": 0.9985557556152344, "memory(GiB)": 89.13, "step": 7845, "token_acc": 0.7103947327556757, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.101859090599444, "grad_norm": 0.9214089512825012, "learning_rate": 9.999976467440846e-05, "loss": 0.9811445236206054, "memory(GiB)": 89.13, "step": 7850, "token_acc": 0.737033932281226, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.10192396900109969, "grad_norm": 0.8649849891662598, "learning_rate": 9.999974793038928e-05, "loss": 0.9881103515625, "memory(GiB)": 89.13, "step": 7855, "token_acc": 0.7604345509442899, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10198884740275539, "grad_norm": 0.9559375643730164, "learning_rate": 9.999973061097623e-05, "loss": 0.9613311767578125, "memory(GiB)": 89.13, "step": 7860, "token_acc": 0.7258979898209512, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10205372580441108, "grad_norm": 0.9712839722633362, "learning_rate": 9.99997127161695e-05, "loss": 1.0081775665283204, "memory(GiB)": 89.13, "step": 7865, "token_acc": 0.7315532816372212, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.10211860420606678, "grad_norm": 1.0041518211364746, "learning_rate": 9.99996942459693e-05, "loss": 0.9874235153198242, "memory(GiB)": 89.13, "step": 7870, "token_acc": 0.7462932454695222, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10218348260772248, "grad_norm": 0.9601787328720093, "learning_rate": 9.999967520037584e-05, "loss": 1.014573860168457, "memory(GiB)": 89.13, "step": 7875, "token_acc": 0.7373586161011311, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10224836100937817, "grad_norm": 0.9036638736724854, "learning_rate": 9.999965557938931e-05, "loss": 0.9825645446777344, "memory(GiB)": 89.13, "step": 7880, "token_acc": 0.7483224035382035, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.10231323941103387, "grad_norm": 0.994600772857666, "learning_rate": 9.999963538300998e-05, "loss": 0.9928817749023438, "memory(GiB)": 89.13, "step": 7885, "token_acc": 0.7605513307984791, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10237811781268957, "grad_norm": 1.3652909994125366, "learning_rate": 9.999961461123806e-05, "loss": 0.992668342590332, "memory(GiB)": 89.13, "step": 7890, "token_acc": 0.7514727409047092, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10244299621434526, "grad_norm": 0.9251009821891785, "learning_rate": 9.999959326407379e-05, "loss": 0.9921878814697266, "memory(GiB)": 89.13, "step": 7895, "token_acc": 0.7505418719211823, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.10250787461600096, "grad_norm": 1.0007599592208862, "learning_rate": 9.999957134151742e-05, "loss": 1.0106704711914063, "memory(GiB)": 89.13, "step": 7900, "token_acc": 0.7401379310344828, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10257275301765666, "grad_norm": 0.9915769100189209, "learning_rate": 9.999954884356919e-05, "loss": 1.0014759063720704, "memory(GiB)": 89.13, "step": 7905, "token_acc": 0.7288508557457213, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10263763141931236, "grad_norm": 0.938555121421814, "learning_rate": 9.99995257702294e-05, "loss": 1.0016480445861817, "memory(GiB)": 89.13, "step": 7910, "token_acc": 0.7181428571428572, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10270250982096805, "grad_norm": 0.8984293341636658, "learning_rate": 9.999950212149826e-05, "loss": 0.9996466636657715, "memory(GiB)": 89.13, "step": 7915, "token_acc": 0.7452227024324035, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10276738822262375, "grad_norm": 0.9361212849617004, "learning_rate": 9.999947789737607e-05, "loss": 1.0131608963012695, "memory(GiB)": 89.13, "step": 7920, "token_acc": 0.7318340677708555, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10283226662427944, "grad_norm": 0.9013607501983643, "learning_rate": 9.999945309786312e-05, "loss": 1.0615806579589844, "memory(GiB)": 89.13, "step": 7925, "token_acc": 0.7398515188666395, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10289714502593514, "grad_norm": 0.9428006410598755, "learning_rate": 9.999942772295967e-05, "loss": 1.038295841217041, "memory(GiB)": 89.13, "step": 7930, "token_acc": 0.7265772985574821, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10296202342759084, "grad_norm": 0.9005557894706726, "learning_rate": 9.999940177266603e-05, "loss": 1.0393656730651855, "memory(GiB)": 89.13, "step": 7935, "token_acc": 0.7375964352221336, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10302690182924654, "grad_norm": 0.9798007607460022, "learning_rate": 9.999937524698248e-05, "loss": 1.0269756317138672, "memory(GiB)": 89.13, "step": 7940, "token_acc": 0.739978798118333, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.10309178023090224, "grad_norm": 0.8545849919319153, "learning_rate": 9.999934814590935e-05, "loss": 0.9913545608520508, "memory(GiB)": 89.13, "step": 7945, "token_acc": 0.750947375358355, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.10315665863255793, "grad_norm": 0.9509168267250061, "learning_rate": 9.999932046944693e-05, "loss": 0.9900833129882812, "memory(GiB)": 89.13, "step": 7950, "token_acc": 0.7487109177073709, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10322153703421362, "grad_norm": 0.9299849271774292, "learning_rate": 9.999929221759556e-05, "loss": 0.9632368087768555, "memory(GiB)": 89.13, "step": 7955, "token_acc": 0.7734572799758153, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10328641543586932, "grad_norm": 0.9001189470291138, "learning_rate": 9.999926339035555e-05, "loss": 1.0001896858215331, "memory(GiB)": 89.13, "step": 7960, "token_acc": 0.7430143765197731, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.10335129383752502, "grad_norm": 1.0413341522216797, "learning_rate": 9.999923398772721e-05, "loss": 1.0634191513061524, "memory(GiB)": 89.13, "step": 7965, "token_acc": 0.7226731078904992, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.10341617223918072, "grad_norm": 1.0876383781433105, "learning_rate": 9.999920400971093e-05, "loss": 1.0886558532714843, "memory(GiB)": 89.13, "step": 7970, "token_acc": 0.7278898519598067, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.10348105064083642, "grad_norm": 0.8781169056892395, "learning_rate": 9.999917345630703e-05, "loss": 1.029487705230713, "memory(GiB)": 89.13, "step": 7975, "token_acc": 0.7302390287087391, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10354592904249212, "grad_norm": 0.9308024048805237, "learning_rate": 9.999914232751585e-05, "loss": 1.0104039192199707, "memory(GiB)": 89.13, "step": 7980, "token_acc": 0.7261678693505507, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.1036108074441478, "grad_norm": 0.9967846274375916, "learning_rate": 9.999911062333775e-05, "loss": 0.9919163703918457, "memory(GiB)": 89.13, "step": 7985, "token_acc": 0.7378640776699029, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.1036756858458035, "grad_norm": 0.9511969089508057, "learning_rate": 9.999907834377312e-05, "loss": 1.0487144470214844, "memory(GiB)": 89.13, "step": 7990, "token_acc": 0.7254758686200723, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.1037405642474592, "grad_norm": 1.0331292152404785, "learning_rate": 9.99990454888223e-05, "loss": 1.0791275024414062, "memory(GiB)": 89.13, "step": 7995, "token_acc": 0.7194951914482858, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.1038054426491149, "grad_norm": 1.0504544973373413, "learning_rate": 9.99990120584857e-05, "loss": 0.9790698051452636, "memory(GiB)": 89.13, "step": 8000, "token_acc": 0.7521504642271982, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.1038703210507706, "grad_norm": 0.964063286781311, "learning_rate": 9.999897805276368e-05, "loss": 0.9461848258972168, "memory(GiB)": 89.13, "step": 8005, "token_acc": 0.7567177348199245, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.1039351994524263, "grad_norm": 0.8946437239646912, "learning_rate": 9.999894347165664e-05, "loss": 0.9821965217590332, "memory(GiB)": 89.13, "step": 8010, "token_acc": 0.7479034393779921, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10400007785408198, "grad_norm": 1.004324197769165, "learning_rate": 9.999890831516496e-05, "loss": 1.0133811950683593, "memory(GiB)": 89.13, "step": 8015, "token_acc": 0.7288941736028538, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10406495625573768, "grad_norm": 1.0680936574935913, "learning_rate": 9.999887258328909e-05, "loss": 0.9971589088439942, "memory(GiB)": 89.13, "step": 8020, "token_acc": 0.7303886925795053, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.10412983465739338, "grad_norm": 0.984992265701294, "learning_rate": 9.999883627602938e-05, "loss": 0.9948573112487793, "memory(GiB)": 89.13, "step": 8025, "token_acc": 0.718274782083695, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.10419471305904908, "grad_norm": 0.9029020667076111, "learning_rate": 9.99987993933863e-05, "loss": 1.002594566345215, "memory(GiB)": 89.13, "step": 8030, "token_acc": 0.7471970511442175, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.10425959146070478, "grad_norm": 0.9626374840736389, "learning_rate": 9.999876193536025e-05, "loss": 0.9801532745361328, "memory(GiB)": 89.13, "step": 8035, "token_acc": 0.7465102366391917, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.10432446986236048, "grad_norm": 0.9746389985084534, "learning_rate": 9.999872390195166e-05, "loss": 1.0038041114807128, "memory(GiB)": 89.13, "step": 8040, "token_acc": 0.7316866020242055, "train_speed(iter/s)": 0.121537 }, { "epoch": 0.10438934826401616, "grad_norm": 0.935921311378479, "learning_rate": 9.999868529316099e-05, "loss": 0.9995872497558593, "memory(GiB)": 89.13, "step": 8045, "token_acc": 0.7329681274900398, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.10445422666567186, "grad_norm": 0.9793906211853027, "learning_rate": 9.999864610898866e-05, "loss": 1.0160533905029296, "memory(GiB)": 89.13, "step": 8050, "token_acc": 0.744188645507376, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.10451910506732756, "grad_norm": 0.9122878313064575, "learning_rate": 9.999860634943513e-05, "loss": 0.9889345169067383, "memory(GiB)": 89.13, "step": 8055, "token_acc": 0.7332984293193717, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.10458398346898326, "grad_norm": 0.820298433303833, "learning_rate": 9.999856601450084e-05, "loss": 1.0504621505737304, "memory(GiB)": 89.13, "step": 8060, "token_acc": 0.7308327435715971, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.10464886187063896, "grad_norm": 0.9948912858963013, "learning_rate": 9.999852510418628e-05, "loss": 1.048396110534668, "memory(GiB)": 89.13, "step": 8065, "token_acc": 0.7228876306620209, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.10471374027229466, "grad_norm": 0.9305578470230103, "learning_rate": 9.99984836184919e-05, "loss": 1.0252123832702638, "memory(GiB)": 89.13, "step": 8070, "token_acc": 0.7256535572539115, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10477861867395034, "grad_norm": 0.9629974365234375, "learning_rate": 9.99984415574182e-05, "loss": 1.0409429550170899, "memory(GiB)": 89.13, "step": 8075, "token_acc": 0.7127045412759698, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.10484349707560604, "grad_norm": 0.8562747240066528, "learning_rate": 9.999839892096566e-05, "loss": 0.9701717376708985, "memory(GiB)": 89.13, "step": 8080, "token_acc": 0.7368281236163995, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.10490837547726174, "grad_norm": 0.8686419725418091, "learning_rate": 9.999835570913476e-05, "loss": 0.9898746490478516, "memory(GiB)": 89.13, "step": 8085, "token_acc": 0.7284684500017081, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.10497325387891744, "grad_norm": 0.9396279454231262, "learning_rate": 9.999831192192599e-05, "loss": 1.001972770690918, "memory(GiB)": 89.13, "step": 8090, "token_acc": 0.7355385314549113, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.10503813228057314, "grad_norm": 0.8539882898330688, "learning_rate": 9.999826755933988e-05, "loss": 1.0164117813110352, "memory(GiB)": 89.13, "step": 8095, "token_acc": 0.7301207363863564, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.10510301068222883, "grad_norm": 0.9208024740219116, "learning_rate": 9.99982226213769e-05, "loss": 1.0271871566772461, "memory(GiB)": 89.13, "step": 8100, "token_acc": 0.7374296345768534, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10516788908388452, "grad_norm": 1.1056792736053467, "learning_rate": 9.999817710803761e-05, "loss": 1.0301962852478028, "memory(GiB)": 89.13, "step": 8105, "token_acc": 0.7203078606032193, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.10523276748554022, "grad_norm": 0.8482903242111206, "learning_rate": 9.999813101932252e-05, "loss": 1.0259559631347657, "memory(GiB)": 89.13, "step": 8110, "token_acc": 0.7289798919652705, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.10529764588719592, "grad_norm": 0.9709965586662292, "learning_rate": 9.999808435523215e-05, "loss": 1.0284538269042969, "memory(GiB)": 89.13, "step": 8115, "token_acc": 0.7382441220610305, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.10536252428885162, "grad_norm": 1.0032857656478882, "learning_rate": 9.999803711576704e-05, "loss": 1.0210309028625488, "memory(GiB)": 89.13, "step": 8120, "token_acc": 0.7389295340777821, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10542740269050732, "grad_norm": 0.8944088816642761, "learning_rate": 9.999798930092774e-05, "loss": 0.9834169387817383, "memory(GiB)": 89.13, "step": 8125, "token_acc": 0.7345239289655864, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.105492281092163, "grad_norm": 0.9863624572753906, "learning_rate": 9.999794091071479e-05, "loss": 0.9952836036682129, "memory(GiB)": 89.13, "step": 8130, "token_acc": 0.7366419126969752, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.1055571594938187, "grad_norm": 1.007417917251587, "learning_rate": 9.999789194512876e-05, "loss": 0.9562044143676758, "memory(GiB)": 89.13, "step": 8135, "token_acc": 0.7478667571137848, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.1056220378954744, "grad_norm": 0.8596062660217285, "learning_rate": 9.999784240417021e-05, "loss": 0.9879474639892578, "memory(GiB)": 89.13, "step": 8140, "token_acc": 0.7230707876370888, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.1056869162971301, "grad_norm": 0.8522611856460571, "learning_rate": 9.99977922878397e-05, "loss": 0.9593499183654786, "memory(GiB)": 89.13, "step": 8145, "token_acc": 0.73337830025053, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.1057517946987858, "grad_norm": 0.9656156897544861, "learning_rate": 9.999774159613782e-05, "loss": 1.0250813484191894, "memory(GiB)": 89.13, "step": 8150, "token_acc": 0.7398276508055451, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.1058166731004415, "grad_norm": 0.8775205612182617, "learning_rate": 9.999769032906515e-05, "loss": 1.0340733528137207, "memory(GiB)": 89.13, "step": 8155, "token_acc": 0.7259272661480007, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.10588155150209719, "grad_norm": 0.9489590525627136, "learning_rate": 9.999763848662227e-05, "loss": 1.014878273010254, "memory(GiB)": 89.13, "step": 8160, "token_acc": 0.7343558524726252, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.10594642990375289, "grad_norm": 0.9493241906166077, "learning_rate": 9.99975860688098e-05, "loss": 0.9861207962036133, "memory(GiB)": 89.13, "step": 8165, "token_acc": 0.7560194850382742, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.10601130830540859, "grad_norm": 0.9011742472648621, "learning_rate": 9.999753307562832e-05, "loss": 1.085366439819336, "memory(GiB)": 89.13, "step": 8170, "token_acc": 0.7237159064418369, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10607618670706428, "grad_norm": 0.9183640480041504, "learning_rate": 9.999747950707842e-05, "loss": 1.0376290321350097, "memory(GiB)": 89.13, "step": 8175, "token_acc": 0.7189170742805142, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10614106510871998, "grad_norm": 0.9487308263778687, "learning_rate": 9.999742536316076e-05, "loss": 0.9790715217590332, "memory(GiB)": 89.13, "step": 8180, "token_acc": 0.7384696200315182, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.10620594351037568, "grad_norm": 0.9814698696136475, "learning_rate": 9.999737064387598e-05, "loss": 1.035869026184082, "memory(GiB)": 89.13, "step": 8185, "token_acc": 0.7089430276310289, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.10627082191203137, "grad_norm": 1.0057905912399292, "learning_rate": 9.999731534922462e-05, "loss": 1.0183965682983398, "memory(GiB)": 89.13, "step": 8190, "token_acc": 0.7112447249505173, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.10633570031368707, "grad_norm": 1.1228647232055664, "learning_rate": 9.999725947920742e-05, "loss": 1.046732521057129, "memory(GiB)": 89.13, "step": 8195, "token_acc": 0.7103533702645976, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.10640057871534277, "grad_norm": 0.8877583742141724, "learning_rate": 9.999720303382495e-05, "loss": 0.9904163360595704, "memory(GiB)": 89.13, "step": 8200, "token_acc": 0.7421035374694889, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.10646545711699847, "grad_norm": 0.9335121512413025, "learning_rate": 9.99971460130779e-05, "loss": 1.0258428573608398, "memory(GiB)": 89.13, "step": 8205, "token_acc": 0.7155757082509786, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.10653033551865417, "grad_norm": 0.9186772704124451, "learning_rate": 9.999708841696692e-05, "loss": 1.0349155426025392, "memory(GiB)": 89.13, "step": 8210, "token_acc": 0.7103426644711048, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.10659521392030986, "grad_norm": 1.0639586448669434, "learning_rate": 9.999703024549263e-05, "loss": 1.0022104263305665, "memory(GiB)": 89.13, "step": 8215, "token_acc": 0.7364477902243994, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10666009232196555, "grad_norm": 0.8285014033317566, "learning_rate": 9.999697149865576e-05, "loss": 1.0340110778808593, "memory(GiB)": 89.13, "step": 8220, "token_acc": 0.7258696548879418, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.10672497072362125, "grad_norm": 0.9023734331130981, "learning_rate": 9.999691217645695e-05, "loss": 0.979707145690918, "memory(GiB)": 89.13, "step": 8225, "token_acc": 0.7485895879454403, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.10678984912527695, "grad_norm": 0.8427553176879883, "learning_rate": 9.99968522788969e-05, "loss": 0.9876091003417968, "memory(GiB)": 89.13, "step": 8230, "token_acc": 0.7194132927833745, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.10685472752693265, "grad_norm": 0.9931334853172302, "learning_rate": 9.999679180597628e-05, "loss": 1.051925277709961, "memory(GiB)": 89.13, "step": 8235, "token_acc": 0.7403138793526238, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.10691960592858835, "grad_norm": 0.8899463415145874, "learning_rate": 9.99967307576958e-05, "loss": 0.9903312683105469, "memory(GiB)": 89.13, "step": 8240, "token_acc": 0.7400515206274751, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10698448433024405, "grad_norm": 0.9746626019477844, "learning_rate": 9.999666913405617e-05, "loss": 1.0270977020263672, "memory(GiB)": 89.13, "step": 8245, "token_acc": 0.7414333836682542, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.10704936273189973, "grad_norm": 0.9315205216407776, "learning_rate": 9.999660693505808e-05, "loss": 0.9871282577514648, "memory(GiB)": 89.13, "step": 8250, "token_acc": 0.7527886426191511, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.10711424113355543, "grad_norm": 0.9100365042686462, "learning_rate": 9.999654416070225e-05, "loss": 1.0055384635925293, "memory(GiB)": 89.13, "step": 8255, "token_acc": 0.7514126276629347, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10717911953521113, "grad_norm": 0.8854137659072876, "learning_rate": 9.999648081098942e-05, "loss": 0.9588442802429199, "memory(GiB)": 89.13, "step": 8260, "token_acc": 0.7598146877144818, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10724399793686683, "grad_norm": 1.148740530014038, "learning_rate": 9.999641688592031e-05, "loss": 1.0393712997436524, "memory(GiB)": 89.13, "step": 8265, "token_acc": 0.7409242741254635, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.10730887633852253, "grad_norm": 0.8355949521064758, "learning_rate": 9.999635238549564e-05, "loss": 0.9661390304565429, "memory(GiB)": 89.13, "step": 8270, "token_acc": 0.7489786297925832, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10737375474017823, "grad_norm": 0.8560447096824646, "learning_rate": 9.999628730971618e-05, "loss": 0.9908267974853515, "memory(GiB)": 89.13, "step": 8275, "token_acc": 0.737123745819398, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10743863314183391, "grad_norm": 0.9865535497665405, "learning_rate": 9.999622165858264e-05, "loss": 1.0118728637695313, "memory(GiB)": 89.13, "step": 8280, "token_acc": 0.7377770022683651, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10750351154348961, "grad_norm": 0.8825473785400391, "learning_rate": 9.999615543209582e-05, "loss": 0.9843449592590332, "memory(GiB)": 89.13, "step": 8285, "token_acc": 0.7337365757313912, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.10756838994514531, "grad_norm": 1.1099598407745361, "learning_rate": 9.999608863025645e-05, "loss": 1.0472526550292969, "memory(GiB)": 89.13, "step": 8290, "token_acc": 0.7332963681729969, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.10763326834680101, "grad_norm": 0.8802152276039124, "learning_rate": 9.99960212530653e-05, "loss": 0.97281494140625, "memory(GiB)": 89.13, "step": 8295, "token_acc": 0.7353568657874321, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10769814674845671, "grad_norm": 1.081792950630188, "learning_rate": 9.999595330052318e-05, "loss": 1.033921241760254, "memory(GiB)": 89.13, "step": 8300, "token_acc": 0.7378157947085737, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.10776302515011241, "grad_norm": 0.8055312037467957, "learning_rate": 9.999588477263083e-05, "loss": 1.03216552734375, "memory(GiB)": 89.13, "step": 8305, "token_acc": 0.7469413809000319, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.10782790355176809, "grad_norm": 1.0869123935699463, "learning_rate": 9.999581566938907e-05, "loss": 1.0051465034484863, "memory(GiB)": 89.13, "step": 8310, "token_acc": 0.7224506924045321, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10789278195342379, "grad_norm": 0.9654251337051392, "learning_rate": 9.999574599079866e-05, "loss": 1.0583910942077637, "memory(GiB)": 89.13, "step": 8315, "token_acc": 0.7300556928410725, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.10795766035507949, "grad_norm": 0.942957878112793, "learning_rate": 9.999567573686044e-05, "loss": 0.9799916267395019, "memory(GiB)": 89.13, "step": 8320, "token_acc": 0.7325390685440198, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10802253875673519, "grad_norm": 0.9555338621139526, "learning_rate": 9.99956049075752e-05, "loss": 1.031672477722168, "memory(GiB)": 89.13, "step": 8325, "token_acc": 0.7389725841476655, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10808741715839089, "grad_norm": 0.8324995636940002, "learning_rate": 9.999553350294375e-05, "loss": 1.0340376853942872, "memory(GiB)": 89.13, "step": 8330, "token_acc": 0.7386387805742645, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.10815229556004659, "grad_norm": 0.8527424335479736, "learning_rate": 9.999546152296693e-05, "loss": 0.9936029434204101, "memory(GiB)": 89.13, "step": 8335, "token_acc": 0.730514859773964, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.10821717396170227, "grad_norm": 0.8701366186141968, "learning_rate": 9.999538896764553e-05, "loss": 0.9957001686096192, "memory(GiB)": 89.13, "step": 8340, "token_acc": 0.7410177726633422, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.10828205236335797, "grad_norm": 1.0778858661651611, "learning_rate": 9.999531583698042e-05, "loss": 0.991832160949707, "memory(GiB)": 89.13, "step": 8345, "token_acc": 0.7334346111577995, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.10834693076501367, "grad_norm": 1.0457395315170288, "learning_rate": 9.999524213097245e-05, "loss": 0.9932050704956055, "memory(GiB)": 89.13, "step": 8350, "token_acc": 0.7331040470148257, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10841180916666937, "grad_norm": 1.0455800294876099, "learning_rate": 9.999516784962245e-05, "loss": 1.0233978271484374, "memory(GiB)": 89.13, "step": 8355, "token_acc": 0.7458721458116152, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.10847668756832507, "grad_norm": 0.8959933519363403, "learning_rate": 9.999509299293126e-05, "loss": 1.0192441940307617, "memory(GiB)": 89.13, "step": 8360, "token_acc": 0.7389314529798631, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10854156596998077, "grad_norm": 0.9640218019485474, "learning_rate": 9.999501756089978e-05, "loss": 0.9514904022216797, "memory(GiB)": 89.13, "step": 8365, "token_acc": 0.7448216439772531, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.10860644437163645, "grad_norm": 0.9586824178695679, "learning_rate": 9.999494155352883e-05, "loss": 1.0711588859558105, "memory(GiB)": 89.13, "step": 8370, "token_acc": 0.7192743462803601, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10867132277329215, "grad_norm": 0.833195686340332, "learning_rate": 9.999486497081933e-05, "loss": 0.9592987060546875, "memory(GiB)": 89.13, "step": 8375, "token_acc": 0.7365471632439958, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10873620117494785, "grad_norm": 0.9958425164222717, "learning_rate": 9.999478781277212e-05, "loss": 1.0522345542907714, "memory(GiB)": 89.13, "step": 8380, "token_acc": 0.7303661616161616, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.10880107957660355, "grad_norm": 1.0254619121551514, "learning_rate": 9.999471007938813e-05, "loss": 0.994744873046875, "memory(GiB)": 89.13, "step": 8385, "token_acc": 0.7554912772304077, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10886595797825925, "grad_norm": 0.8792033791542053, "learning_rate": 9.999463177066823e-05, "loss": 1.002612018585205, "memory(GiB)": 89.13, "step": 8390, "token_acc": 0.7452166224580018, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.10893083637991495, "grad_norm": 0.9980478882789612, "learning_rate": 9.999455288661333e-05, "loss": 0.987468147277832, "memory(GiB)": 89.13, "step": 8395, "token_acc": 0.7536464239627733, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.10899571478157064, "grad_norm": 1.0059926509857178, "learning_rate": 9.999447342722432e-05, "loss": 1.0278911590576172, "memory(GiB)": 89.13, "step": 8400, "token_acc": 0.7152624014295148, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.10906059318322633, "grad_norm": 1.0048463344573975, "learning_rate": 9.999439339250213e-05, "loss": 1.0172024726867677, "memory(GiB)": 89.13, "step": 8405, "token_acc": 0.732101469981268, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10912547158488203, "grad_norm": 0.8024690747261047, "learning_rate": 9.999431278244768e-05, "loss": 0.9700801849365235, "memory(GiB)": 89.13, "step": 8410, "token_acc": 0.7620237565467429, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.10919034998653773, "grad_norm": 1.020496129989624, "learning_rate": 9.99942315970619e-05, "loss": 1.003179931640625, "memory(GiB)": 89.13, "step": 8415, "token_acc": 0.7373622324210244, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10925522838819343, "grad_norm": 1.0118383169174194, "learning_rate": 9.999414983634572e-05, "loss": 1.0039026260375976, "memory(GiB)": 89.13, "step": 8420, "token_acc": 0.7516527304011833, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10932010678984913, "grad_norm": 0.9971678256988525, "learning_rate": 9.999406750030007e-05, "loss": 1.0365229606628419, "memory(GiB)": 89.13, "step": 8425, "token_acc": 0.7298926628436995, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.10938498519150482, "grad_norm": 0.9011861085891724, "learning_rate": 9.999398458892592e-05, "loss": 1.025630569458008, "memory(GiB)": 89.13, "step": 8430, "token_acc": 0.7201824619605091, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.10944986359316052, "grad_norm": 0.8556617498397827, "learning_rate": 9.999390110222421e-05, "loss": 0.9561986923217773, "memory(GiB)": 89.13, "step": 8435, "token_acc": 0.750113310167699, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.10951474199481621, "grad_norm": 0.8425412774085999, "learning_rate": 9.99938170401959e-05, "loss": 1.0020620346069335, "memory(GiB)": 89.13, "step": 8440, "token_acc": 0.7532301682692307, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.10957962039647191, "grad_norm": 0.8608978390693665, "learning_rate": 9.999373240284198e-05, "loss": 1.047930908203125, "memory(GiB)": 89.13, "step": 8445, "token_acc": 0.7218083575790517, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.10964449879812761, "grad_norm": 0.8682669401168823, "learning_rate": 9.999364719016338e-05, "loss": 0.9978510856628418, "memory(GiB)": 89.13, "step": 8450, "token_acc": 0.7257075004947556, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.10970937719978331, "grad_norm": 1.053248405456543, "learning_rate": 9.999356140216112e-05, "loss": 0.9698232650756836, "memory(GiB)": 89.13, "step": 8455, "token_acc": 0.7246693845083026, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.109774255601439, "grad_norm": 1.0030180215835571, "learning_rate": 9.999347503883616e-05, "loss": 1.069112777709961, "memory(GiB)": 89.13, "step": 8460, "token_acc": 0.7147182630529503, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.1098391340030947, "grad_norm": 0.9984250068664551, "learning_rate": 9.999338810018952e-05, "loss": 1.027686882019043, "memory(GiB)": 89.13, "step": 8465, "token_acc": 0.7365278687261662, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.1099040124047504, "grad_norm": 0.9643036127090454, "learning_rate": 9.999330058622219e-05, "loss": 1.0184854507446288, "memory(GiB)": 89.13, "step": 8470, "token_acc": 0.7457905941552158, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.1099688908064061, "grad_norm": 0.8709520101547241, "learning_rate": 9.999321249693516e-05, "loss": 1.0439897537231446, "memory(GiB)": 89.13, "step": 8475, "token_acc": 0.7217730291654635, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.1100337692080618, "grad_norm": 0.9757399559020996, "learning_rate": 9.999312383232946e-05, "loss": 0.9906270980834961, "memory(GiB)": 89.13, "step": 8480, "token_acc": 0.7579895303290991, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11009864760971749, "grad_norm": 0.9730418920516968, "learning_rate": 9.999303459240612e-05, "loss": 0.9678607940673828, "memory(GiB)": 89.13, "step": 8485, "token_acc": 0.7544372175063389, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11016352601137318, "grad_norm": 0.8886470794677734, "learning_rate": 9.999294477716614e-05, "loss": 0.9942654609680176, "memory(GiB)": 89.13, "step": 8490, "token_acc": 0.7348574290723051, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11022840441302888, "grad_norm": 0.9714106917381287, "learning_rate": 9.999285438661059e-05, "loss": 1.0217839241027833, "memory(GiB)": 89.13, "step": 8495, "token_acc": 0.7216478794295323, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11029328281468458, "grad_norm": 0.8818594813346863, "learning_rate": 9.999276342074047e-05, "loss": 1.0124670028686524, "memory(GiB)": 89.13, "step": 8500, "token_acc": 0.7248887947468756, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.11035816121634028, "grad_norm": 0.9811005592346191, "learning_rate": 9.999267187955686e-05, "loss": 1.004260540008545, "memory(GiB)": 89.13, "step": 8505, "token_acc": 0.7693944045022484, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11042303961799597, "grad_norm": 0.749987006187439, "learning_rate": 9.99925797630608e-05, "loss": 0.9856292724609375, "memory(GiB)": 89.13, "step": 8510, "token_acc": 0.7370660953649286, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11048791801965167, "grad_norm": 0.9511091709136963, "learning_rate": 9.999248707125334e-05, "loss": 1.0481661796569823, "memory(GiB)": 89.13, "step": 8515, "token_acc": 0.7450096587250483, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11055279642130736, "grad_norm": 0.9163221120834351, "learning_rate": 9.999239380413556e-05, "loss": 1.0118417739868164, "memory(GiB)": 89.13, "step": 8520, "token_acc": 0.7230282914493441, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11061767482296306, "grad_norm": 0.9564377665519714, "learning_rate": 9.999229996170852e-05, "loss": 1.0499601364135742, "memory(GiB)": 89.13, "step": 8525, "token_acc": 0.7228228228228228, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11068255322461876, "grad_norm": 0.9877768754959106, "learning_rate": 9.999220554397332e-05, "loss": 1.005086898803711, "memory(GiB)": 89.13, "step": 8530, "token_acc": 0.7349465589615154, "train_speed(iter/s)": 0.121537 }, { "epoch": 0.11074743162627446, "grad_norm": 0.9086194038391113, "learning_rate": 9.999211055093104e-05, "loss": 1.0164568901062012, "memory(GiB)": 89.13, "step": 8535, "token_acc": 0.748356866568131, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.11081231002793016, "grad_norm": 0.8935666680335999, "learning_rate": 9.999201498258276e-05, "loss": 1.0270002365112305, "memory(GiB)": 89.13, "step": 8540, "token_acc": 0.7388754045307443, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11087718842958585, "grad_norm": 0.8343422412872314, "learning_rate": 9.99919188389296e-05, "loss": 0.9608371734619141, "memory(GiB)": 89.13, "step": 8545, "token_acc": 0.7479820900977735, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.11094206683124154, "grad_norm": 0.911495566368103, "learning_rate": 9.999182211997264e-05, "loss": 0.9865175247192383, "memory(GiB)": 89.13, "step": 8550, "token_acc": 0.7522514071294559, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.11100694523289724, "grad_norm": 0.7971817255020142, "learning_rate": 9.999172482571301e-05, "loss": 1.0176482200622559, "memory(GiB)": 89.13, "step": 8555, "token_acc": 0.7392858189061722, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.11107182363455294, "grad_norm": 1.0142285823822021, "learning_rate": 9.999162695615184e-05, "loss": 1.005585289001465, "memory(GiB)": 89.13, "step": 8560, "token_acc": 0.7309830255549338, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.11113670203620864, "grad_norm": 0.9182499647140503, "learning_rate": 9.999152851129025e-05, "loss": 1.029698371887207, "memory(GiB)": 89.13, "step": 8565, "token_acc": 0.7446793289694889, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.11120158043786434, "grad_norm": 0.940292477607727, "learning_rate": 9.999142949112935e-05, "loss": 1.0120935440063477, "memory(GiB)": 89.13, "step": 8570, "token_acc": 0.723702259764638, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.11126645883952004, "grad_norm": 0.9596928358078003, "learning_rate": 9.999132989567031e-05, "loss": 0.9963912963867188, "memory(GiB)": 89.13, "step": 8575, "token_acc": 0.732488795051302, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11133133724117572, "grad_norm": 1.012502670288086, "learning_rate": 9.999122972491425e-05, "loss": 1.0346909523010255, "memory(GiB)": 89.13, "step": 8580, "token_acc": 0.7041678356557249, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11139621564283142, "grad_norm": 0.8617397546768188, "learning_rate": 9.999112897886234e-05, "loss": 0.9967539787292481, "memory(GiB)": 89.13, "step": 8585, "token_acc": 0.7427339084273391, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.11146109404448712, "grad_norm": 0.8187331557273865, "learning_rate": 9.999102765751573e-05, "loss": 0.9721366882324218, "memory(GiB)": 89.13, "step": 8590, "token_acc": 0.7412053884951797, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.11152597244614282, "grad_norm": 0.9838489890098572, "learning_rate": 9.999092576087562e-05, "loss": 1.0291923522949218, "memory(GiB)": 89.13, "step": 8595, "token_acc": 0.7440981093510475, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.11159085084779852, "grad_norm": 1.0046796798706055, "learning_rate": 9.999082328894312e-05, "loss": 1.045160675048828, "memory(GiB)": 89.13, "step": 8600, "token_acc": 0.7045717246602097, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.11165572924945422, "grad_norm": 1.0501105785369873, "learning_rate": 9.999072024171945e-05, "loss": 1.0210021018981934, "memory(GiB)": 89.13, "step": 8605, "token_acc": 0.7100470229816545, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.1117206076511099, "grad_norm": 0.8761889338493347, "learning_rate": 9.999061661920579e-05, "loss": 1.005143165588379, "memory(GiB)": 89.13, "step": 8610, "token_acc": 0.7340741335044929, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.1117854860527656, "grad_norm": 0.7634043097496033, "learning_rate": 9.999051242140333e-05, "loss": 1.0502471923828125, "memory(GiB)": 89.13, "step": 8615, "token_acc": 0.6980354267310789, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.1118503644544213, "grad_norm": 1.0728827714920044, "learning_rate": 9.99904076483133e-05, "loss": 0.9949776649475097, "memory(GiB)": 89.13, "step": 8620, "token_acc": 0.7489038985661808, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.111915242856077, "grad_norm": 0.9283134937286377, "learning_rate": 9.999030229993684e-05, "loss": 1.0215755462646485, "memory(GiB)": 89.13, "step": 8625, "token_acc": 0.7330695655514156, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.1119801212577327, "grad_norm": 0.866020679473877, "learning_rate": 9.999019637627521e-05, "loss": 0.9908044815063477, "memory(GiB)": 89.13, "step": 8630, "token_acc": 0.7469773344143255, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.1120449996593884, "grad_norm": 0.8552993535995483, "learning_rate": 9.999008987732961e-05, "loss": 0.9917032241821289, "memory(GiB)": 89.13, "step": 8635, "token_acc": 0.7338948396350209, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11210987806104408, "grad_norm": 0.7908280491828918, "learning_rate": 9.998998280310129e-05, "loss": 1.0070969581604003, "memory(GiB)": 89.13, "step": 8640, "token_acc": 0.7595502203312566, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.11217475646269978, "grad_norm": 0.843694269657135, "learning_rate": 9.998987515359146e-05, "loss": 1.0543622970581055, "memory(GiB)": 89.13, "step": 8645, "token_acc": 0.7161426380368098, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11223963486435548, "grad_norm": 1.0133874416351318, "learning_rate": 9.998976692880136e-05, "loss": 0.9984733581542968, "memory(GiB)": 89.13, "step": 8650, "token_acc": 0.7444527711707015, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.11230451326601118, "grad_norm": 1.008786678314209, "learning_rate": 9.998965812873224e-05, "loss": 0.9832120895385742, "memory(GiB)": 89.13, "step": 8655, "token_acc": 0.7570725707257072, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11236939166766688, "grad_norm": 1.0752453804016113, "learning_rate": 9.998954875338534e-05, "loss": 0.9811307907104492, "memory(GiB)": 89.13, "step": 8660, "token_acc": 0.7318801089918257, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.11243427006932258, "grad_norm": 0.9377883672714233, "learning_rate": 9.998943880276194e-05, "loss": 1.0314659118652343, "memory(GiB)": 89.13, "step": 8665, "token_acc": 0.7330418449979986, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.11249914847097826, "grad_norm": 0.9490870833396912, "learning_rate": 9.99893282768633e-05, "loss": 0.9629316329956055, "memory(GiB)": 89.13, "step": 8670, "token_acc": 0.7475433343174054, "train_speed(iter/s)": 0.121535 }, { "epoch": 0.11256402687263396, "grad_norm": 1.0096989870071411, "learning_rate": 9.998921717569069e-05, "loss": 1.044102382659912, "memory(GiB)": 89.13, "step": 8675, "token_acc": 0.730551281570158, "train_speed(iter/s)": 0.121538 }, { "epoch": 0.11262890527428966, "grad_norm": 0.9424781799316406, "learning_rate": 9.998910549924539e-05, "loss": 1.0086824417114257, "memory(GiB)": 89.13, "step": 8680, "token_acc": 0.7275578739217893, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.11269378367594536, "grad_norm": 0.8996102809906006, "learning_rate": 9.998899324752866e-05, "loss": 0.9534784317016601, "memory(GiB)": 89.13, "step": 8685, "token_acc": 0.7309205100551925, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.11275866207760106, "grad_norm": 1.1126258373260498, "learning_rate": 9.998888042054182e-05, "loss": 1.0077146530151366, "memory(GiB)": 89.13, "step": 8690, "token_acc": 0.7401133873604396, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.11282354047925676, "grad_norm": 0.9441383481025696, "learning_rate": 9.998876701828617e-05, "loss": 1.0461727142333985, "memory(GiB)": 89.13, "step": 8695, "token_acc": 0.7221832514948058, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.11288841888091244, "grad_norm": 0.9653375744819641, "learning_rate": 9.9988653040763e-05, "loss": 0.9785085678100586, "memory(GiB)": 89.13, "step": 8700, "token_acc": 0.7652907022985763, "train_speed(iter/s)": 0.121546 }, { "epoch": 0.11295329728256814, "grad_norm": 0.965718686580658, "learning_rate": 9.998853848797363e-05, "loss": 1.0476585388183595, "memory(GiB)": 89.13, "step": 8705, "token_acc": 0.7425278810408922, "train_speed(iter/s)": 0.121548 }, { "epoch": 0.11301817568422384, "grad_norm": 0.8917732238769531, "learning_rate": 9.998842335991938e-05, "loss": 0.9945413589477539, "memory(GiB)": 89.13, "step": 8710, "token_acc": 0.753538848694558, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.11308305408587954, "grad_norm": 0.8513511419296265, "learning_rate": 9.998830765660156e-05, "loss": 1.0421257019042969, "memory(GiB)": 89.13, "step": 8715, "token_acc": 0.7206163028767741, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.11314793248753524, "grad_norm": 0.8976889252662659, "learning_rate": 9.998819137802154e-05, "loss": 0.997071361541748, "memory(GiB)": 89.13, "step": 8720, "token_acc": 0.7231339928057554, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.11321281088919094, "grad_norm": 1.02341628074646, "learning_rate": 9.99880745241806e-05, "loss": 1.0247861862182617, "memory(GiB)": 89.13, "step": 8725, "token_acc": 0.737332038206249, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.11327768929084663, "grad_norm": 0.8845368027687073, "learning_rate": 9.998795709508013e-05, "loss": 1.0213876724243165, "memory(GiB)": 89.13, "step": 8730, "token_acc": 0.7246893089908109, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.11334256769250232, "grad_norm": 0.8665318489074707, "learning_rate": 9.998783909072146e-05, "loss": 1.0154251098632812, "memory(GiB)": 89.13, "step": 8735, "token_acc": 0.7452168088762049, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.11340744609415802, "grad_norm": 0.9278128147125244, "learning_rate": 9.998772051110597e-05, "loss": 1.0327749252319336, "memory(GiB)": 89.13, "step": 8740, "token_acc": 0.7315330712698684, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.11347232449581372, "grad_norm": 0.8797795176506042, "learning_rate": 9.9987601356235e-05, "loss": 0.9957424163818359, "memory(GiB)": 89.13, "step": 8745, "token_acc": 0.7150278293135436, "train_speed(iter/s)": 0.121556 }, { "epoch": 0.11353720289746942, "grad_norm": 0.9033949971199036, "learning_rate": 9.998748162610991e-05, "loss": 1.0164868354797363, "memory(GiB)": 89.13, "step": 8750, "token_acc": 0.7424907725595011, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.11360208129912512, "grad_norm": 0.8784922957420349, "learning_rate": 9.998736132073213e-05, "loss": 0.9866645812988282, "memory(GiB)": 89.13, "step": 8755, "token_acc": 0.7270588993426988, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.1136669597007808, "grad_norm": 0.854708731174469, "learning_rate": 9.9987240440103e-05, "loss": 1.038928508758545, "memory(GiB)": 89.13, "step": 8760, "token_acc": 0.7342264477095938, "train_speed(iter/s)": 0.121555 }, { "epoch": 0.1137318381024365, "grad_norm": 0.903908908367157, "learning_rate": 9.998711898422393e-05, "loss": 1.0096566200256347, "memory(GiB)": 89.13, "step": 8765, "token_acc": 0.7262667235980643, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.1137967165040922, "grad_norm": 0.9123968482017517, "learning_rate": 9.99869969530963e-05, "loss": 1.0491687774658203, "memory(GiB)": 89.13, "step": 8770, "token_acc": 0.7314487632508834, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.1138615949057479, "grad_norm": 0.9818625450134277, "learning_rate": 9.998687434672153e-05, "loss": 0.9965860366821289, "memory(GiB)": 89.13, "step": 8775, "token_acc": 0.7228168797166179, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.1139264733074036, "grad_norm": 1.0319520235061646, "learning_rate": 9.998675116510103e-05, "loss": 1.0786592483520507, "memory(GiB)": 89.13, "step": 8780, "token_acc": 0.7254560863553237, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.1139913517090593, "grad_norm": 0.9751595854759216, "learning_rate": 9.99866274082362e-05, "loss": 0.9951699256896973, "memory(GiB)": 89.13, "step": 8785, "token_acc": 0.7115484941901826, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.11405623011071499, "grad_norm": 0.9536406993865967, "learning_rate": 9.998650307612851e-05, "loss": 1.0160399436950684, "memory(GiB)": 89.13, "step": 8790, "token_acc": 0.7258582652784766, "train_speed(iter/s)": 0.12156 }, { "epoch": 0.11412110851237069, "grad_norm": 0.790550708770752, "learning_rate": 9.998637816877933e-05, "loss": 0.9988710403442382, "memory(GiB)": 89.13, "step": 8795, "token_acc": 0.7223026153248862, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.11418598691402639, "grad_norm": 1.2218252420425415, "learning_rate": 9.998625268619016e-05, "loss": 1.0597263336181642, "memory(GiB)": 89.13, "step": 8800, "token_acc": 0.7446277032575965, "train_speed(iter/s)": 0.121557 }, { "epoch": 0.11425086531568208, "grad_norm": 0.8945978283882141, "learning_rate": 9.998612662836239e-05, "loss": 1.0663063049316406, "memory(GiB)": 89.13, "step": 8805, "token_acc": 0.712203763746365, "train_speed(iter/s)": 0.121558 }, { "epoch": 0.11431574371733778, "grad_norm": 0.9192918539047241, "learning_rate": 9.998599999529749e-05, "loss": 0.9679582595825196, "memory(GiB)": 89.13, "step": 8810, "token_acc": 0.7334573954120889, "train_speed(iter/s)": 0.121553 }, { "epoch": 0.11438062211899348, "grad_norm": 0.9083953499794006, "learning_rate": 9.998587278699692e-05, "loss": 0.9690715789794921, "memory(GiB)": 89.13, "step": 8815, "token_acc": 0.7615343766594821, "train_speed(iter/s)": 0.121554 }, { "epoch": 0.11444550052064917, "grad_norm": 0.9548837542533875, "learning_rate": 9.998574500346215e-05, "loss": 0.9704306602478028, "memory(GiB)": 89.13, "step": 8820, "token_acc": 0.7619124120425409, "train_speed(iter/s)": 0.121552 }, { "epoch": 0.11451037892230487, "grad_norm": 0.8287299871444702, "learning_rate": 9.998561664469463e-05, "loss": 1.0130434036254883, "memory(GiB)": 89.13, "step": 8825, "token_acc": 0.760428756741995, "train_speed(iter/s)": 0.12155 }, { "epoch": 0.11457525732396057, "grad_norm": 0.8810906410217285, "learning_rate": 9.998548771069587e-05, "loss": 1.0100058555603026, "memory(GiB)": 89.13, "step": 8830, "token_acc": 0.7253688261706221, "train_speed(iter/s)": 0.121551 }, { "epoch": 0.11464013572561627, "grad_norm": 0.9407358169555664, "learning_rate": 9.998535820146733e-05, "loss": 0.9038747787475586, "memory(GiB)": 89.13, "step": 8835, "token_acc": 0.761463507833397, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.11470501412727196, "grad_norm": 0.936881422996521, "learning_rate": 9.99852281170105e-05, "loss": 1.025868034362793, "memory(GiB)": 89.13, "step": 8840, "token_acc": 0.7144507492100166, "train_speed(iter/s)": 0.121549 }, { "epoch": 0.11476989252892766, "grad_norm": 0.942198634147644, "learning_rate": 9.998509745732688e-05, "loss": 1.019575023651123, "memory(GiB)": 89.13, "step": 8845, "token_acc": 0.7464691264823378, "train_speed(iter/s)": 0.121547 }, { "epoch": 0.11483477093058335, "grad_norm": 0.7805351614952087, "learning_rate": 9.998496622241798e-05, "loss": 1.0180040359497071, "memory(GiB)": 89.13, "step": 8850, "token_acc": 0.7552899197145406, "train_speed(iter/s)": 0.121545 }, { "epoch": 0.11489964933223905, "grad_norm": 0.9711109399795532, "learning_rate": 9.99848344122853e-05, "loss": 0.9991961479187011, "memory(GiB)": 89.13, "step": 8855, "token_acc": 0.7477989301970165, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11496452773389475, "grad_norm": 1.0131515264511108, "learning_rate": 9.998470202693036e-05, "loss": 0.9896500587463379, "memory(GiB)": 89.13, "step": 8860, "token_acc": 0.734094318223393, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.11502940613555045, "grad_norm": 0.9523619413375854, "learning_rate": 9.998456906635469e-05, "loss": 1.0286205291748047, "memory(GiB)": 89.13, "step": 8865, "token_acc": 0.7320025259612686, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11509428453720615, "grad_norm": 0.9084773659706116, "learning_rate": 9.998443553055982e-05, "loss": 1.0112890243530273, "memory(GiB)": 89.13, "step": 8870, "token_acc": 0.7437678962438943, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11515916293886184, "grad_norm": 0.8896310329437256, "learning_rate": 9.998430141954728e-05, "loss": 1.0187673568725586, "memory(GiB)": 89.13, "step": 8875, "token_acc": 0.7247394957983193, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.11522404134051753, "grad_norm": 0.9324513673782349, "learning_rate": 9.998416673331861e-05, "loss": 1.0172123908996582, "memory(GiB)": 89.13, "step": 8880, "token_acc": 0.7202814959915841, "train_speed(iter/s)": 0.121539 }, { "epoch": 0.11528891974217323, "grad_norm": 0.9201903343200684, "learning_rate": 9.998403147187538e-05, "loss": 1.0380825042724608, "memory(GiB)": 89.13, "step": 8885, "token_acc": 0.7187188664624922, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.11535379814382893, "grad_norm": 0.9047064185142517, "learning_rate": 9.998389563521913e-05, "loss": 0.9686148643493653, "memory(GiB)": 89.13, "step": 8890, "token_acc": 0.7568199812855395, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.11541867654548463, "grad_norm": 0.8695628046989441, "learning_rate": 9.998375922335142e-05, "loss": 0.9882789611816406, "memory(GiB)": 89.13, "step": 8895, "token_acc": 0.753101777588634, "train_speed(iter/s)": 0.121544 }, { "epoch": 0.11548355494714033, "grad_norm": 0.8308716416358948, "learning_rate": 9.998362223627383e-05, "loss": 0.9908605575561523, "memory(GiB)": 89.13, "step": 8900, "token_acc": 0.7514128335360183, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11554843334879603, "grad_norm": 0.9056931734085083, "learning_rate": 9.998348467398792e-05, "loss": 1.0209820747375489, "memory(GiB)": 89.13, "step": 8905, "token_acc": 0.7219768314026169, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11561331175045171, "grad_norm": 0.875654935836792, "learning_rate": 9.99833465364953e-05, "loss": 0.981844425201416, "memory(GiB)": 89.13, "step": 8910, "token_acc": 0.7167137013666487, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11567819015210741, "grad_norm": 0.8623732924461365, "learning_rate": 9.998320782379753e-05, "loss": 0.972050666809082, "memory(GiB)": 89.13, "step": 8915, "token_acc": 0.7336055561372282, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11574306855376311, "grad_norm": 0.9552487134933472, "learning_rate": 9.998306853589623e-05, "loss": 1.034543800354004, "memory(GiB)": 89.13, "step": 8920, "token_acc": 0.7107607466063348, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.11580794695541881, "grad_norm": 0.9924558997154236, "learning_rate": 9.9982928672793e-05, "loss": 1.0130714416503905, "memory(GiB)": 89.13, "step": 8925, "token_acc": 0.723698458975427, "train_speed(iter/s)": 0.121543 }, { "epoch": 0.11587282535707451, "grad_norm": 0.8554936647415161, "learning_rate": 9.998278823448943e-05, "loss": 0.9648589134216309, "memory(GiB)": 89.13, "step": 8930, "token_acc": 0.736319333358134, "train_speed(iter/s)": 0.121542 }, { "epoch": 0.1159377037587302, "grad_norm": 0.8841346502304077, "learning_rate": 9.998264722098716e-05, "loss": 0.9671295166015625, "memory(GiB)": 89.13, "step": 8935, "token_acc": 0.7462545870787463, "train_speed(iter/s)": 0.121541 }, { "epoch": 0.11600258216038589, "grad_norm": 0.9755989909172058, "learning_rate": 9.998250563228781e-05, "loss": 1.0331926345825195, "memory(GiB)": 89.13, "step": 8940, "token_acc": 0.753405765414242, "train_speed(iter/s)": 0.12154 }, { "epoch": 0.11606746056204159, "grad_norm": 0.9287661910057068, "learning_rate": 9.998236346839299e-05, "loss": 0.9686254501342774, "memory(GiB)": 89.13, "step": 8945, "token_acc": 0.7401644157369348, "train_speed(iter/s)": 0.121536 }, { "epoch": 0.11613233896369729, "grad_norm": 1.0085761547088623, "learning_rate": 9.998222072930436e-05, "loss": 0.9894817352294922, "memory(GiB)": 89.13, "step": 8950, "token_acc": 0.749810606060606, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.11619721736535299, "grad_norm": 0.8934578895568848, "learning_rate": 9.998207741502353e-05, "loss": 0.9875911712646485, "memory(GiB)": 89.13, "step": 8955, "token_acc": 0.7309779557618175, "train_speed(iter/s)": 0.121534 }, { "epoch": 0.11626209576700869, "grad_norm": 0.8662419319152832, "learning_rate": 9.998193352555218e-05, "loss": 1.0751062393188477, "memory(GiB)": 89.13, "step": 8960, "token_acc": 0.7327450578014912, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.11632697416866439, "grad_norm": 0.8651343584060669, "learning_rate": 9.998178906089197e-05, "loss": 1.0043697357177734, "memory(GiB)": 89.13, "step": 8965, "token_acc": 0.7283663432141433, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.11639185257032007, "grad_norm": 0.8620626926422119, "learning_rate": 9.998164402104453e-05, "loss": 1.001226806640625, "memory(GiB)": 89.13, "step": 8970, "token_acc": 0.7235591801070376, "train_speed(iter/s)": 0.121533 }, { "epoch": 0.11645673097197577, "grad_norm": 0.8586058616638184, "learning_rate": 9.998149840601155e-05, "loss": 1.0086363792419433, "memory(GiB)": 89.13, "step": 8975, "token_acc": 0.7461703038083013, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.11652160937363147, "grad_norm": 1.0210700035095215, "learning_rate": 9.998135221579471e-05, "loss": 1.0155012130737304, "memory(GiB)": 89.13, "step": 8980, "token_acc": 0.7294073026928248, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.11658648777528717, "grad_norm": 0.8011159300804138, "learning_rate": 9.998120545039569e-05, "loss": 0.9731201171875, "memory(GiB)": 89.13, "step": 8985, "token_acc": 0.7301231802911534, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11665136617694287, "grad_norm": 0.9624505043029785, "learning_rate": 9.998105810981616e-05, "loss": 1.0470304489135742, "memory(GiB)": 89.13, "step": 8990, "token_acc": 0.736541598694943, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11671624457859855, "grad_norm": 0.8374166488647461, "learning_rate": 9.998091019405785e-05, "loss": 1.005757713317871, "memory(GiB)": 89.13, "step": 8995, "token_acc": 0.7641428236091912, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.11678112298025425, "grad_norm": 0.8856792449951172, "learning_rate": 9.998076170312241e-05, "loss": 0.9358627319335937, "memory(GiB)": 89.13, "step": 9000, "token_acc": 0.7495001320405931, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11684600138190995, "grad_norm": 1.057767391204834, "learning_rate": 9.998061263701161e-05, "loss": 1.051310920715332, "memory(GiB)": 89.13, "step": 9005, "token_acc": 0.7016111069864133, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.11691087978356565, "grad_norm": 1.0531736612319946, "learning_rate": 9.998046299572711e-05, "loss": 0.9938661575317382, "memory(GiB)": 89.13, "step": 9010, "token_acc": 0.7443273594338897, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11697575818522135, "grad_norm": 0.8380805253982544, "learning_rate": 9.998031277927069e-05, "loss": 1.043484592437744, "memory(GiB)": 89.13, "step": 9015, "token_acc": 0.7260700123068509, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11704063658687705, "grad_norm": 0.9105972647666931, "learning_rate": 9.998016198764403e-05, "loss": 1.042647933959961, "memory(GiB)": 89.13, "step": 9020, "token_acc": 0.7252447728006312, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11710551498853274, "grad_norm": 1.010205626487732, "learning_rate": 9.998001062084889e-05, "loss": 0.9926405906677246, "memory(GiB)": 89.13, "step": 9025, "token_acc": 0.726754575143262, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.11717039339018843, "grad_norm": 0.9229644536972046, "learning_rate": 9.9979858678887e-05, "loss": 0.9757490158081055, "memory(GiB)": 89.13, "step": 9030, "token_acc": 0.7437032514119982, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.11723527179184413, "grad_norm": 0.8790019154548645, "learning_rate": 9.997970616176012e-05, "loss": 1.0330771446228026, "memory(GiB)": 89.13, "step": 9035, "token_acc": 0.7466346503674598, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11730015019349983, "grad_norm": 0.9181070923805237, "learning_rate": 9.997955306946998e-05, "loss": 1.030296802520752, "memory(GiB)": 89.13, "step": 9040, "token_acc": 0.7407705830647524, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.11736502859515553, "grad_norm": 0.8699054718017578, "learning_rate": 9.997939940201838e-05, "loss": 1.013477611541748, "memory(GiB)": 89.13, "step": 9045, "token_acc": 0.7409394986400059, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11742990699681123, "grad_norm": 0.9242104887962341, "learning_rate": 9.997924515940704e-05, "loss": 0.9869686126708984, "memory(GiB)": 89.13, "step": 9050, "token_acc": 0.7290110039396821, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11749478539846692, "grad_norm": 0.9024204611778259, "learning_rate": 9.997909034163779e-05, "loss": 0.9581650733947754, "memory(GiB)": 89.13, "step": 9055, "token_acc": 0.7477856177751089, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.11755966380012262, "grad_norm": 0.8839772343635559, "learning_rate": 9.997893494871237e-05, "loss": 1.0218900680541991, "memory(GiB)": 89.13, "step": 9060, "token_acc": 0.7520205852274602, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.11762454220177831, "grad_norm": 0.8775075078010559, "learning_rate": 9.997877898063259e-05, "loss": 0.9798784255981445, "memory(GiB)": 89.13, "step": 9065, "token_acc": 0.7242846094354215, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.11768942060343401, "grad_norm": 0.8994531035423279, "learning_rate": 9.997862243740024e-05, "loss": 1.0358170509338378, "memory(GiB)": 89.13, "step": 9070, "token_acc": 0.7336985043295723, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.11775429900508971, "grad_norm": 0.9029361605644226, "learning_rate": 9.99784653190171e-05, "loss": 1.030044174194336, "memory(GiB)": 89.13, "step": 9075, "token_acc": 0.7148473046263605, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.11781917740674541, "grad_norm": 0.9086641669273376, "learning_rate": 9.997830762548503e-05, "loss": 0.9778148651123046, "memory(GiB)": 89.13, "step": 9080, "token_acc": 0.7386398061193578, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.1178840558084011, "grad_norm": 0.9332831501960754, "learning_rate": 9.997814935680581e-05, "loss": 1.0124414443969727, "memory(GiB)": 89.13, "step": 9085, "token_acc": 0.7409245814194696, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.1179489342100568, "grad_norm": 0.9779094457626343, "learning_rate": 9.997799051298124e-05, "loss": 1.0080316543579102, "memory(GiB)": 89.13, "step": 9090, "token_acc": 0.7397706305850006, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.1180138126117125, "grad_norm": 0.8620627522468567, "learning_rate": 9.997783109401318e-05, "loss": 0.9713650703430176, "memory(GiB)": 89.13, "step": 9095, "token_acc": 0.7603230947551435, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.1180786910133682, "grad_norm": 0.9308322668075562, "learning_rate": 9.997767109990347e-05, "loss": 1.063162612915039, "memory(GiB)": 89.13, "step": 9100, "token_acc": 0.7263313609467456, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.1181435694150239, "grad_norm": 0.9039215445518494, "learning_rate": 9.997751053065392e-05, "loss": 1.020564079284668, "memory(GiB)": 89.13, "step": 9105, "token_acc": 0.7466875981161696, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.1182084478166796, "grad_norm": 0.9147667288780212, "learning_rate": 9.99773493862664e-05, "loss": 1.0280185699462892, "memory(GiB)": 89.13, "step": 9110, "token_acc": 0.7410398964998777, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.11827332621833528, "grad_norm": 0.7981624007225037, "learning_rate": 9.997718766674278e-05, "loss": 0.9824342727661133, "memory(GiB)": 89.13, "step": 9115, "token_acc": 0.7245110433553953, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.11833820461999098, "grad_norm": 0.8496224284172058, "learning_rate": 9.997702537208487e-05, "loss": 0.9880992889404296, "memory(GiB)": 89.13, "step": 9120, "token_acc": 0.7641480457651847, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.11840308302164668, "grad_norm": 0.897395670413971, "learning_rate": 9.99768625022946e-05, "loss": 0.9666858673095703, "memory(GiB)": 89.13, "step": 9125, "token_acc": 0.7549257676820459, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.11846796142330238, "grad_norm": 0.9020940661430359, "learning_rate": 9.997669905737378e-05, "loss": 1.0714794158935548, "memory(GiB)": 89.13, "step": 9130, "token_acc": 0.7130871305202555, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11853283982495808, "grad_norm": 0.9041239619255066, "learning_rate": 9.997653503732433e-05, "loss": 0.9997821807861328, "memory(GiB)": 89.13, "step": 9135, "token_acc": 0.7287824182036605, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11859771822661377, "grad_norm": 0.8911957740783691, "learning_rate": 9.997637044214815e-05, "loss": 1.0328304290771484, "memory(GiB)": 89.13, "step": 9140, "token_acc": 0.7422987436016752, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11866259662826946, "grad_norm": 0.9523221850395203, "learning_rate": 9.997620527184711e-05, "loss": 0.9933004379272461, "memory(GiB)": 89.13, "step": 9145, "token_acc": 0.7579929260004161, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11872747502992516, "grad_norm": 0.8824868202209473, "learning_rate": 9.99760395264231e-05, "loss": 1.002810287475586, "memory(GiB)": 89.13, "step": 9150, "token_acc": 0.7377618286070913, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.11879235343158086, "grad_norm": 1.0080586671829224, "learning_rate": 9.997587320587805e-05, "loss": 0.9764045715332031, "memory(GiB)": 89.13, "step": 9155, "token_acc": 0.7321044326495482, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.11885723183323656, "grad_norm": 1.0032587051391602, "learning_rate": 9.997570631021388e-05, "loss": 1.0396472930908203, "memory(GiB)": 89.13, "step": 9160, "token_acc": 0.7299684305472038, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11892211023489226, "grad_norm": 0.9725304841995239, "learning_rate": 9.99755388394325e-05, "loss": 1.051912784576416, "memory(GiB)": 89.13, "step": 9165, "token_acc": 0.7275975715524718, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11898698863654796, "grad_norm": 1.0466797351837158, "learning_rate": 9.997537079353581e-05, "loss": 1.0375804901123047, "memory(GiB)": 89.13, "step": 9170, "token_acc": 0.7127559384701245, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11905186703820364, "grad_norm": 0.7884560823440552, "learning_rate": 9.997520217252578e-05, "loss": 1.009671974182129, "memory(GiB)": 89.13, "step": 9175, "token_acc": 0.7400544597197317, "train_speed(iter/s)": 0.121526 }, { "epoch": 0.11911674543985934, "grad_norm": 0.9321140646934509, "learning_rate": 9.997503297640435e-05, "loss": 1.0403911590576171, "memory(GiB)": 89.13, "step": 9180, "token_acc": 0.7146667615050857, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11918162384151504, "grad_norm": 0.7692103385925293, "learning_rate": 9.997486320517347e-05, "loss": 0.9331106185913086, "memory(GiB)": 89.13, "step": 9185, "token_acc": 0.7371360703624049, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11924650224317074, "grad_norm": 0.812863826751709, "learning_rate": 9.997469285883505e-05, "loss": 1.0085657119750977, "memory(GiB)": 89.13, "step": 9190, "token_acc": 0.7429228531809111, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.11931138064482644, "grad_norm": 0.9005770683288574, "learning_rate": 9.99745219373911e-05, "loss": 0.989743995666504, "memory(GiB)": 89.13, "step": 9195, "token_acc": 0.7364174873714431, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11937625904648214, "grad_norm": 0.8497782945632935, "learning_rate": 9.997435044084356e-05, "loss": 0.9690094947814941, "memory(GiB)": 89.13, "step": 9200, "token_acc": 0.7564331445456784, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.11944113744813782, "grad_norm": 0.9088004231452942, "learning_rate": 9.99741783691944e-05, "loss": 0.9988765716552734, "memory(GiB)": 89.13, "step": 9205, "token_acc": 0.7301273944291927, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.11950601584979352, "grad_norm": 0.7919227480888367, "learning_rate": 9.997400572244564e-05, "loss": 0.9333955764770507, "memory(GiB)": 89.13, "step": 9210, "token_acc": 0.7502395894518812, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11957089425144922, "grad_norm": 0.8521004319190979, "learning_rate": 9.997383250059922e-05, "loss": 1.0225988388061524, "memory(GiB)": 89.13, "step": 9215, "token_acc": 0.7551752130970099, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11963577265310492, "grad_norm": 0.949195384979248, "learning_rate": 9.997365870365716e-05, "loss": 0.9972134590148926, "memory(GiB)": 89.13, "step": 9220, "token_acc": 0.731951393852752, "train_speed(iter/s)": 0.121528 }, { "epoch": 0.11970065105476062, "grad_norm": 0.9331569671630859, "learning_rate": 9.997348433162144e-05, "loss": 0.9744075775146485, "memory(GiB)": 89.13, "step": 9225, "token_acc": 0.7400913184095377, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.11976552945641632, "grad_norm": 0.786604642868042, "learning_rate": 9.997330938449408e-05, "loss": 0.9776001930236816, "memory(GiB)": 89.13, "step": 9230, "token_acc": 0.7185925723434998, "train_speed(iter/s)": 0.121531 }, { "epoch": 0.119830407858072, "grad_norm": 0.9117927551269531, "learning_rate": 9.99731338622771e-05, "loss": 1.009998893737793, "memory(GiB)": 89.13, "step": 9235, "token_acc": 0.7455707926264528, "train_speed(iter/s)": 0.12153 }, { "epoch": 0.1198952862597277, "grad_norm": 0.8616802096366882, "learning_rate": 9.997295776497251e-05, "loss": 0.9862214088439941, "memory(GiB)": 89.13, "step": 9240, "token_acc": 0.7419785157621288, "train_speed(iter/s)": 0.121532 }, { "epoch": 0.1199601646613834, "grad_norm": 0.8401492238044739, "learning_rate": 9.997278109258233e-05, "loss": 0.9739301681518555, "memory(GiB)": 89.13, "step": 9245, "token_acc": 0.7576830249396621, "train_speed(iter/s)": 0.121529 }, { "epoch": 0.1200250430630391, "grad_norm": 0.8951522707939148, "learning_rate": 9.997260384510861e-05, "loss": 1.0066339492797851, "memory(GiB)": 89.13, "step": 9250, "token_acc": 0.7334919690660321, "train_speed(iter/s)": 0.121527 }, { "epoch": 0.1200899214646948, "grad_norm": 0.9256322979927063, "learning_rate": 9.997242602255339e-05, "loss": 1.017308807373047, "memory(GiB)": 89.13, "step": 9255, "token_acc": 0.7310685568508413, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.1201547998663505, "grad_norm": 0.9762560725212097, "learning_rate": 9.99722476249187e-05, "loss": 0.9757088661193848, "memory(GiB)": 89.13, "step": 9260, "token_acc": 0.7328068574149768, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12021967826800618, "grad_norm": 0.9690214991569519, "learning_rate": 9.997206865220658e-05, "loss": 1.0025627136230468, "memory(GiB)": 89.13, "step": 9265, "token_acc": 0.7522827598718026, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.12028455666966188, "grad_norm": 0.8235870599746704, "learning_rate": 9.997188910441914e-05, "loss": 0.961033821105957, "memory(GiB)": 89.13, "step": 9270, "token_acc": 0.7386654972338416, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12034943507131758, "grad_norm": 0.8272064328193665, "learning_rate": 9.99717089815584e-05, "loss": 0.9775489807128906, "memory(GiB)": 89.13, "step": 9275, "token_acc": 0.7402684115643987, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12041431347297328, "grad_norm": 0.812735915184021, "learning_rate": 9.997152828362646e-05, "loss": 0.9659663200378418, "memory(GiB)": 89.13, "step": 9280, "token_acc": 0.7515952747144867, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12047919187462898, "grad_norm": 1.0179599523544312, "learning_rate": 9.997134701062538e-05, "loss": 1.0306926727294923, "memory(GiB)": 89.13, "step": 9285, "token_acc": 0.7375056911708052, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12054407027628468, "grad_norm": 0.864913284778595, "learning_rate": 9.997116516255725e-05, "loss": 0.9805917739868164, "memory(GiB)": 89.13, "step": 9290, "token_acc": 0.7489053092501369, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12060894867794036, "grad_norm": 0.9041908383369446, "learning_rate": 9.997098273942417e-05, "loss": 1.039902114868164, "memory(GiB)": 89.13, "step": 9295, "token_acc": 0.744862891100451, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12067382707959606, "grad_norm": 0.9913057684898376, "learning_rate": 9.997079974122824e-05, "loss": 0.969507884979248, "memory(GiB)": 89.13, "step": 9300, "token_acc": 0.7313988858129122, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12073870548125176, "grad_norm": 0.8999226689338684, "learning_rate": 9.997061616797156e-05, "loss": 1.0164180755615235, "memory(GiB)": 89.13, "step": 9305, "token_acc": 0.7604413670998135, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12080358388290746, "grad_norm": 0.8274781107902527, "learning_rate": 9.997043201965626e-05, "loss": 0.9877872467041016, "memory(GiB)": 89.13, "step": 9310, "token_acc": 0.7350634746841062, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12086846228456316, "grad_norm": 0.9128315448760986, "learning_rate": 9.997024729628444e-05, "loss": 1.0361309051513672, "memory(GiB)": 89.13, "step": 9315, "token_acc": 0.7037141447600254, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12093334068621886, "grad_norm": 1.0025990009307861, "learning_rate": 9.997006199785822e-05, "loss": 1.023311233520508, "memory(GiB)": 89.13, "step": 9320, "token_acc": 0.7208231400201468, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12099821908787455, "grad_norm": 0.8905766010284424, "learning_rate": 9.996987612437974e-05, "loss": 0.9722475051879883, "memory(GiB)": 89.13, "step": 9325, "token_acc": 0.7511089714934149, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12106309748953024, "grad_norm": 0.921966016292572, "learning_rate": 9.996968967585116e-05, "loss": 1.0061055183410645, "memory(GiB)": 89.13, "step": 9330, "token_acc": 0.7474576893424869, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12112797589118594, "grad_norm": 0.896794855594635, "learning_rate": 9.996950265227458e-05, "loss": 1.0515162467956543, "memory(GiB)": 89.13, "step": 9335, "token_acc": 0.7366861136243097, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12119285429284164, "grad_norm": 0.827355682849884, "learning_rate": 9.99693150536522e-05, "loss": 1.0313545227050782, "memory(GiB)": 89.13, "step": 9340, "token_acc": 0.7281828956396981, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.12125773269449734, "grad_norm": 0.8300125002861023, "learning_rate": 9.996912687998615e-05, "loss": 1.0013924598693849, "memory(GiB)": 89.13, "step": 9345, "token_acc": 0.7225506878551033, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12132261109615304, "grad_norm": 0.9452537298202515, "learning_rate": 9.996893813127861e-05, "loss": 0.9955541610717773, "memory(GiB)": 89.13, "step": 9350, "token_acc": 0.7280099602934249, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12138748949780873, "grad_norm": 1.0445111989974976, "learning_rate": 9.996874880753174e-05, "loss": 1.0117291450500487, "memory(GiB)": 89.13, "step": 9355, "token_acc": 0.7150041620421753, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12145236789946443, "grad_norm": 0.8964200615882874, "learning_rate": 9.996855890874772e-05, "loss": 1.0174034118652344, "memory(GiB)": 89.13, "step": 9360, "token_acc": 0.730718085106383, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.12151724630112012, "grad_norm": 0.9630735516548157, "learning_rate": 9.996836843492875e-05, "loss": 1.0214977264404297, "memory(GiB)": 89.13, "step": 9365, "token_acc": 0.7491349480968859, "train_speed(iter/s)": 0.121511 }, { "epoch": 0.12158212470277582, "grad_norm": 0.816013753414154, "learning_rate": 9.996817738607702e-05, "loss": 0.9620486259460449, "memory(GiB)": 89.13, "step": 9370, "token_acc": 0.7672974249082919, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12164700310443152, "grad_norm": 0.803568959236145, "learning_rate": 9.99679857621947e-05, "loss": 1.0468764305114746, "memory(GiB)": 89.13, "step": 9375, "token_acc": 0.7250928856474473, "train_speed(iter/s)": 0.121509 }, { "epoch": 0.12171188150608722, "grad_norm": 0.9894695281982422, "learning_rate": 9.996779356328403e-05, "loss": 1.0200501441955567, "memory(GiB)": 89.13, "step": 9380, "token_acc": 0.7356735039120321, "train_speed(iter/s)": 0.121509 }, { "epoch": 0.1217767599077429, "grad_norm": 0.9668382406234741, "learning_rate": 9.99676007893472e-05, "loss": 1.0313633918762206, "memory(GiB)": 89.13, "step": 9385, "token_acc": 0.7277266015048168, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.1218416383093986, "grad_norm": 1.1111856698989868, "learning_rate": 9.996740744038644e-05, "loss": 1.0402121543884277, "memory(GiB)": 89.13, "step": 9390, "token_acc": 0.7411108844924876, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.1219065167110543, "grad_norm": 0.9642848968505859, "learning_rate": 9.996721351640397e-05, "loss": 0.9811281204223633, "memory(GiB)": 89.13, "step": 9395, "token_acc": 0.7321098963509868, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12197139511271, "grad_norm": 0.8641611933708191, "learning_rate": 9.996701901740202e-05, "loss": 0.9848054885864258, "memory(GiB)": 89.13, "step": 9400, "token_acc": 0.7199730549006399, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.1220362735143657, "grad_norm": 0.9150604009628296, "learning_rate": 9.996682394338284e-05, "loss": 1.017814826965332, "memory(GiB)": 89.13, "step": 9405, "token_acc": 0.7297754426371096, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.1221011519160214, "grad_norm": 0.8897842168807983, "learning_rate": 9.996662829434866e-05, "loss": 1.0022932052612306, "memory(GiB)": 89.13, "step": 9410, "token_acc": 0.7307705532441633, "train_speed(iter/s)": 0.121509 }, { "epoch": 0.12216603031767709, "grad_norm": 1.1306848526000977, "learning_rate": 9.996643207030174e-05, "loss": 1.0590080261230468, "memory(GiB)": 89.13, "step": 9415, "token_acc": 0.7132461128564044, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12223090871933279, "grad_norm": 0.9771133065223694, "learning_rate": 9.996623527124434e-05, "loss": 1.0392747879028321, "memory(GiB)": 89.13, "step": 9420, "token_acc": 0.7267521669215986, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12229578712098849, "grad_norm": 0.8758622407913208, "learning_rate": 9.996603789717869e-05, "loss": 0.9919536590576172, "memory(GiB)": 89.13, "step": 9425, "token_acc": 0.7528485254691689, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12236066552264419, "grad_norm": 0.9972097277641296, "learning_rate": 9.996583994810713e-05, "loss": 1.0420683860778808, "memory(GiB)": 89.13, "step": 9430, "token_acc": 0.7256177385043364, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12242554392429988, "grad_norm": 0.9903465509414673, "learning_rate": 9.996564142403189e-05, "loss": 1.0012709617614746, "memory(GiB)": 89.13, "step": 9435, "token_acc": 0.7252392568962281, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.12249042232595558, "grad_norm": 0.9629032015800476, "learning_rate": 9.996544232495525e-05, "loss": 1.017998504638672, "memory(GiB)": 89.13, "step": 9440, "token_acc": 0.7443693299239903, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12255530072761127, "grad_norm": 0.9064805507659912, "learning_rate": 9.996524265087951e-05, "loss": 1.0082244873046875, "memory(GiB)": 89.13, "step": 9445, "token_acc": 0.7471093881678746, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12262017912926697, "grad_norm": 0.9639217257499695, "learning_rate": 9.9965042401807e-05, "loss": 0.9776721000671387, "memory(GiB)": 89.13, "step": 9450, "token_acc": 0.7512526597570184, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12268505753092267, "grad_norm": 0.8743109703063965, "learning_rate": 9.996484157773998e-05, "loss": 1.0411916732788087, "memory(GiB)": 89.13, "step": 9455, "token_acc": 0.7506176775324779, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12274993593257837, "grad_norm": 0.9327901005744934, "learning_rate": 9.996464017868078e-05, "loss": 0.9506240844726562, "memory(GiB)": 89.13, "step": 9460, "token_acc": 0.7530027202343587, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12281481433423407, "grad_norm": 1.0858312845230103, "learning_rate": 9.996443820463173e-05, "loss": 0.9478174209594726, "memory(GiB)": 89.13, "step": 9465, "token_acc": 0.7637512703252033, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12287969273588976, "grad_norm": 0.8592067360877991, "learning_rate": 9.996423565559514e-05, "loss": 1.013843822479248, "memory(GiB)": 89.13, "step": 9470, "token_acc": 0.7339360671617694, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12294457113754545, "grad_norm": 0.9703384041786194, "learning_rate": 9.996403253157332e-05, "loss": 0.9720752716064454, "memory(GiB)": 89.13, "step": 9475, "token_acc": 0.7331250406821584, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.12300944953920115, "grad_norm": 0.8164506554603577, "learning_rate": 9.996382883256865e-05, "loss": 1.010166549682617, "memory(GiB)": 89.13, "step": 9480, "token_acc": 0.7596300050684237, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12307432794085685, "grad_norm": 0.840613603591919, "learning_rate": 9.996362455858343e-05, "loss": 1.0112676620483398, "memory(GiB)": 89.13, "step": 9485, "token_acc": 0.7283076820351885, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.12313920634251255, "grad_norm": 0.8678216934204102, "learning_rate": 9.996341970962006e-05, "loss": 0.9453825950622559, "memory(GiB)": 89.13, "step": 9490, "token_acc": 0.7560765041838226, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12320408474416825, "grad_norm": 0.9438188672065735, "learning_rate": 9.996321428568088e-05, "loss": 1.0137704849243163, "memory(GiB)": 89.13, "step": 9495, "token_acc": 0.7371516933992298, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12326896314582395, "grad_norm": 0.8863827586174011, "learning_rate": 9.996300828676821e-05, "loss": 1.0432153701782227, "memory(GiB)": 89.13, "step": 9500, "token_acc": 0.7219878280999141, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.12333384154747963, "grad_norm": 0.8872925639152527, "learning_rate": 9.996280171288448e-05, "loss": 0.9984566688537597, "memory(GiB)": 89.13, "step": 9505, "token_acc": 0.7187330720453748, "train_speed(iter/s)": 0.121512 }, { "epoch": 0.12339871994913533, "grad_norm": 0.9196987748146057, "learning_rate": 9.996259456403203e-05, "loss": 1.0199394226074219, "memory(GiB)": 89.13, "step": 9510, "token_acc": 0.7392013349008305, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.12346359835079103, "grad_norm": 0.912968099117279, "learning_rate": 9.996238684021327e-05, "loss": 1.0223964691162108, "memory(GiB)": 89.13, "step": 9515, "token_acc": 0.7439185140802876, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12352847675244673, "grad_norm": 0.9238014221191406, "learning_rate": 9.996217854143057e-05, "loss": 1.0025007247924804, "memory(GiB)": 89.13, "step": 9520, "token_acc": 0.7340730136005726, "train_speed(iter/s)": 0.121509 }, { "epoch": 0.12359335515410243, "grad_norm": 0.8389246463775635, "learning_rate": 9.996196966768634e-05, "loss": 0.965212059020996, "memory(GiB)": 89.13, "step": 9525, "token_acc": 0.728934134674353, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12365823355575813, "grad_norm": 0.8513078093528748, "learning_rate": 9.996176021898297e-05, "loss": 0.997378158569336, "memory(GiB)": 89.13, "step": 9530, "token_acc": 0.7373715948001576, "train_speed(iter/s)": 0.12151 }, { "epoch": 0.12372311195741381, "grad_norm": 0.8970050811767578, "learning_rate": 9.996155019532288e-05, "loss": 1.0194887161254882, "memory(GiB)": 89.13, "step": 9535, "token_acc": 0.7370121972408521, "train_speed(iter/s)": 0.121513 }, { "epoch": 0.12378799035906951, "grad_norm": 0.9061025381088257, "learning_rate": 9.99613395967085e-05, "loss": 0.9712902069091797, "memory(GiB)": 89.13, "step": 9540, "token_acc": 0.7556104921822326, "train_speed(iter/s)": 0.121511 }, { "epoch": 0.12385286876072521, "grad_norm": 0.9176504015922546, "learning_rate": 9.996112842314222e-05, "loss": 1.0309198379516602, "memory(GiB)": 89.13, "step": 9545, "token_acc": 0.7185170719005306, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12391774716238091, "grad_norm": 0.9286296367645264, "learning_rate": 9.99609166746265e-05, "loss": 0.94794921875, "memory(GiB)": 89.13, "step": 9550, "token_acc": 0.755568189418768, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12398262556403661, "grad_norm": 0.8484618663787842, "learning_rate": 9.996070435116376e-05, "loss": 1.0168052673339845, "memory(GiB)": 89.13, "step": 9555, "token_acc": 0.7170081889882637, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12404750396569231, "grad_norm": 0.9403774738311768, "learning_rate": 9.996049145275647e-05, "loss": 0.9890077590942383, "memory(GiB)": 89.13, "step": 9560, "token_acc": 0.7553937061976089, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12411238236734799, "grad_norm": 1.035077452659607, "learning_rate": 9.996027797940703e-05, "loss": 1.0509371757507324, "memory(GiB)": 89.13, "step": 9565, "token_acc": 0.7485835963344614, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12417726076900369, "grad_norm": 0.9488494992256165, "learning_rate": 9.996006393111795e-05, "loss": 1.0345270156860351, "memory(GiB)": 89.13, "step": 9570, "token_acc": 0.7492643124665597, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12424213917065939, "grad_norm": 0.883781373500824, "learning_rate": 9.995984930789166e-05, "loss": 0.9695577621459961, "memory(GiB)": 89.13, "step": 9575, "token_acc": 0.7598082668505204, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12430701757231509, "grad_norm": 0.953224241733551, "learning_rate": 9.995963410973064e-05, "loss": 0.9915489196777344, "memory(GiB)": 89.13, "step": 9580, "token_acc": 0.7513433739498978, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12437189597397079, "grad_norm": 0.9398066401481628, "learning_rate": 9.995941833663738e-05, "loss": 1.0007981300354003, "memory(GiB)": 89.13, "step": 9585, "token_acc": 0.732831430107867, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12443677437562649, "grad_norm": 0.7504693269729614, "learning_rate": 9.995920198861432e-05, "loss": 0.9429265022277832, "memory(GiB)": 89.13, "step": 9590, "token_acc": 0.7510782501540357, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12450165277728217, "grad_norm": 0.889607846736908, "learning_rate": 9.9958985065664e-05, "loss": 1.0246879577636718, "memory(GiB)": 89.13, "step": 9595, "token_acc": 0.7238120356938766, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12456653117893787, "grad_norm": 0.7871553301811218, "learning_rate": 9.995876756778889e-05, "loss": 1.0123298645019532, "memory(GiB)": 89.13, "step": 9600, "token_acc": 0.7475387497870891, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12463140958059357, "grad_norm": 0.8055204153060913, "learning_rate": 9.995854949499151e-05, "loss": 0.9796612739562989, "memory(GiB)": 89.13, "step": 9605, "token_acc": 0.7338656021290751, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.12469628798224927, "grad_norm": 0.9699760675430298, "learning_rate": 9.995833084727433e-05, "loss": 1.0356735229492187, "memory(GiB)": 89.13, "step": 9610, "token_acc": 0.7329093799682035, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12476116638390497, "grad_norm": 0.9610076546669006, "learning_rate": 9.995811162463992e-05, "loss": 1.0073433876037599, "memory(GiB)": 89.13, "step": 9615, "token_acc": 0.7267895878524946, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12482604478556067, "grad_norm": 0.8395575284957886, "learning_rate": 9.995789182709077e-05, "loss": 0.9906181335449219, "memory(GiB)": 89.13, "step": 9620, "token_acc": 0.7351640679543868, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12489092318721635, "grad_norm": 0.839756190776825, "learning_rate": 9.995767145462941e-05, "loss": 0.983957576751709, "memory(GiB)": 89.13, "step": 9625, "token_acc": 0.7669185788051841, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12495580158887205, "grad_norm": 0.8619343638420105, "learning_rate": 9.995745050725838e-05, "loss": 0.9942158699035645, "memory(GiB)": 89.13, "step": 9630, "token_acc": 0.7352123444776535, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12502067999052777, "grad_norm": 0.8223438262939453, "learning_rate": 9.995722898498023e-05, "loss": 0.9921298027038574, "memory(GiB)": 89.13, "step": 9635, "token_acc": 0.7404642924916187, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12508555839218344, "grad_norm": 0.8642477989196777, "learning_rate": 9.99570068877975e-05, "loss": 1.0102680206298829, "memory(GiB)": 89.13, "step": 9640, "token_acc": 0.7277864862635096, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12515043679383914, "grad_norm": 0.8415237069129944, "learning_rate": 9.995678421571276e-05, "loss": 0.9966818809509277, "memory(GiB)": 89.13, "step": 9645, "token_acc": 0.7286107454828288, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12521531519549484, "grad_norm": 0.8257467150688171, "learning_rate": 9.995656096872856e-05, "loss": 1.014536476135254, "memory(GiB)": 89.13, "step": 9650, "token_acc": 0.7398417697211768, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12528019359715054, "grad_norm": 0.8889100551605225, "learning_rate": 9.995633714684747e-05, "loss": 1.0182672500610352, "memory(GiB)": 89.13, "step": 9655, "token_acc": 0.7337511046838595, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12534507199880623, "grad_norm": 0.920447826385498, "learning_rate": 9.995611275007206e-05, "loss": 0.9870240211486816, "memory(GiB)": 89.13, "step": 9660, "token_acc": 0.7512820512820513, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12540995040046193, "grad_norm": 0.8839050531387329, "learning_rate": 9.995588777840492e-05, "loss": 0.9639423370361329, "memory(GiB)": 89.13, "step": 9665, "token_acc": 0.7279927109615923, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12547482880211763, "grad_norm": 0.9170861840248108, "learning_rate": 9.995566223184865e-05, "loss": 1.0279204368591308, "memory(GiB)": 89.13, "step": 9670, "token_acc": 0.7396800316018171, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12553970720377333, "grad_norm": 1.0507194995880127, "learning_rate": 9.995543611040583e-05, "loss": 1.0466115951538086, "memory(GiB)": 89.13, "step": 9675, "token_acc": 0.7522268937504454, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12560458560542903, "grad_norm": 0.9850727915763855, "learning_rate": 9.995520941407906e-05, "loss": 0.9999079704284668, "memory(GiB)": 89.13, "step": 9680, "token_acc": 0.7354996786632391, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12566946400708473, "grad_norm": 0.8940911293029785, "learning_rate": 9.995498214287096e-05, "loss": 1.0017257690429688, "memory(GiB)": 89.13, "step": 9685, "token_acc": 0.716677440206852, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12573434240874043, "grad_norm": 0.8823124170303345, "learning_rate": 9.995475429678413e-05, "loss": 0.971040916442871, "memory(GiB)": 89.13, "step": 9690, "token_acc": 0.7447428126375673, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12579922081039613, "grad_norm": 0.9586343765258789, "learning_rate": 9.995452587582121e-05, "loss": 0.9906285285949707, "memory(GiB)": 89.13, "step": 9695, "token_acc": 0.744627978365736, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.1258640992120518, "grad_norm": 0.9519805312156677, "learning_rate": 9.995429687998483e-05, "loss": 0.9876699447631836, "memory(GiB)": 89.13, "step": 9700, "token_acc": 0.7344456030587417, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.1259289776137075, "grad_norm": 0.9103536605834961, "learning_rate": 9.995406730927762e-05, "loss": 0.9973513603210449, "memory(GiB)": 89.13, "step": 9705, "token_acc": 0.7384094359453462, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.1259938560153632, "grad_norm": 0.9600211977958679, "learning_rate": 9.995383716370221e-05, "loss": 1.01983642578125, "memory(GiB)": 89.13, "step": 9710, "token_acc": 0.7433166396275658, "train_speed(iter/s)": 0.121514 }, { "epoch": 0.1260587344170189, "grad_norm": 0.8877113461494446, "learning_rate": 9.995360644326123e-05, "loss": 1.035639190673828, "memory(GiB)": 89.13, "step": 9715, "token_acc": 0.7289740202553942, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.1261236128186746, "grad_norm": 0.9360063672065735, "learning_rate": 9.99533751479574e-05, "loss": 0.9624942779541016, "memory(GiB)": 89.13, "step": 9720, "token_acc": 0.7370931112793339, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.1261884912203303, "grad_norm": 0.9952080249786377, "learning_rate": 9.995314327779333e-05, "loss": 0.9886260986328125, "memory(GiB)": 89.13, "step": 9725, "token_acc": 0.7325072588230483, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.126253369621986, "grad_norm": 0.8262395858764648, "learning_rate": 9.995291083277171e-05, "loss": 1.0224583625793457, "memory(GiB)": 89.13, "step": 9730, "token_acc": 0.7312442584464632, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.1263182480236417, "grad_norm": 0.8500242829322815, "learning_rate": 9.995267781289519e-05, "loss": 0.9925769805908203, "memory(GiB)": 89.13, "step": 9735, "token_acc": 0.7184402757296203, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.1263831264252974, "grad_norm": 1.0049175024032593, "learning_rate": 9.995244421816649e-05, "loss": 0.9864459991455078, "memory(GiB)": 89.13, "step": 9740, "token_acc": 0.7455347762134902, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.1264480048269531, "grad_norm": 0.8418087959289551, "learning_rate": 9.995221004858826e-05, "loss": 1.0574092864990234, "memory(GiB)": 89.13, "step": 9745, "token_acc": 0.7261736644507973, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.1265128832286088, "grad_norm": 1.0093237161636353, "learning_rate": 9.995197530416323e-05, "loss": 1.0234622955322266, "memory(GiB)": 89.13, "step": 9750, "token_acc": 0.7519117963207518, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.1265777616302645, "grad_norm": 0.9406594038009644, "learning_rate": 9.995173998489407e-05, "loss": 0.985432243347168, "memory(GiB)": 89.13, "step": 9755, "token_acc": 0.7617464170518405, "train_speed(iter/s)": 0.121519 }, { "epoch": 0.12664264003192016, "grad_norm": 0.878449559211731, "learning_rate": 9.995150409078348e-05, "loss": 0.9614012718200684, "memory(GiB)": 89.13, "step": 9760, "token_acc": 0.7624320874031411, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.12670751843357586, "grad_norm": 0.910823404788971, "learning_rate": 9.995126762183422e-05, "loss": 0.9969350814819335, "memory(GiB)": 89.13, "step": 9765, "token_acc": 0.7271297930370608, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.12677239683523156, "grad_norm": 0.8728988766670227, "learning_rate": 9.9951030578049e-05, "loss": 0.9937941551208496, "memory(GiB)": 89.13, "step": 9770, "token_acc": 0.7398605592623135, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12683727523688726, "grad_norm": 0.9192800521850586, "learning_rate": 9.99507929594305e-05, "loss": 1.0012078285217285, "memory(GiB)": 89.13, "step": 9775, "token_acc": 0.7470357142857142, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12690215363854296, "grad_norm": 0.8336300849914551, "learning_rate": 9.995055476598152e-05, "loss": 1.0135376930236817, "memory(GiB)": 89.13, "step": 9780, "token_acc": 0.7123361344537815, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12696703204019866, "grad_norm": 0.8521347045898438, "learning_rate": 9.995031599770476e-05, "loss": 0.9728460311889648, "memory(GiB)": 89.13, "step": 9785, "token_acc": 0.7377699427060379, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12703191044185436, "grad_norm": 0.9043740630149841, "learning_rate": 9.995007665460297e-05, "loss": 0.9737552642822266, "memory(GiB)": 89.13, "step": 9790, "token_acc": 0.7527980898373378, "train_speed(iter/s)": 0.121517 }, { "epoch": 0.12709678884351006, "grad_norm": 0.8940498232841492, "learning_rate": 9.994983673667893e-05, "loss": 0.9985292434692383, "memory(GiB)": 89.13, "step": 9795, "token_acc": 0.7399722438479505, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12716166724516575, "grad_norm": 0.8830583691596985, "learning_rate": 9.994959624393537e-05, "loss": 0.9969388961791992, "memory(GiB)": 89.13, "step": 9800, "token_acc": 0.7347512799443777, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12722654564682145, "grad_norm": 0.9992518424987793, "learning_rate": 9.994935517637507e-05, "loss": 0.9846542358398438, "memory(GiB)": 89.13, "step": 9805, "token_acc": 0.7485185908706128, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12729142404847715, "grad_norm": 0.8732506036758423, "learning_rate": 9.994911353400082e-05, "loss": 0.9633369445800781, "memory(GiB)": 89.13, "step": 9810, "token_acc": 0.7368600102058173, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.12735630245013285, "grad_norm": 0.8786276578903198, "learning_rate": 9.994887131681537e-05, "loss": 1.0198339462280273, "memory(GiB)": 89.13, "step": 9815, "token_acc": 0.7306216171887814, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12742118085178852, "grad_norm": 0.8346560597419739, "learning_rate": 9.994862852482153e-05, "loss": 1.0020389556884766, "memory(GiB)": 89.13, "step": 9820, "token_acc": 0.7454411764705883, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.12748605925344422, "grad_norm": 0.9515207409858704, "learning_rate": 9.994838515802209e-05, "loss": 1.0401018142700196, "memory(GiB)": 89.13, "step": 9825, "token_acc": 0.716413669940742, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.12755093765509992, "grad_norm": 0.8189806342124939, "learning_rate": 9.994814121641985e-05, "loss": 1.015455722808838, "memory(GiB)": 89.13, "step": 9830, "token_acc": 0.7320424927428818, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12761581605675562, "grad_norm": 0.8837578296661377, "learning_rate": 9.994789670001761e-05, "loss": 1.0030722618103027, "memory(GiB)": 89.13, "step": 9835, "token_acc": 0.7477896050761069, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.12768069445841132, "grad_norm": 1.0262722969055176, "learning_rate": 9.994765160881818e-05, "loss": 1.0124642372131347, "memory(GiB)": 89.13, "step": 9840, "token_acc": 0.7356447480785653, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12774557286006702, "grad_norm": 0.8934503793716431, "learning_rate": 9.994740594282441e-05, "loss": 0.9950139045715332, "memory(GiB)": 89.13, "step": 9845, "token_acc": 0.7478868601119689, "train_speed(iter/s)": 0.121525 }, { "epoch": 0.12781045126172272, "grad_norm": 0.9047783613204956, "learning_rate": 9.994715970203909e-05, "loss": 1.0176402091979981, "memory(GiB)": 89.13, "step": 9850, "token_acc": 0.7289323194872117, "train_speed(iter/s)": 0.121522 }, { "epoch": 0.12787532966337842, "grad_norm": 0.8523476123809814, "learning_rate": 9.994691288646508e-05, "loss": 0.9548182487487793, "memory(GiB)": 89.13, "step": 9855, "token_acc": 0.7562148876404494, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12794020806503412, "grad_norm": 0.8482657670974731, "learning_rate": 9.994666549610521e-05, "loss": 1.0207716941833496, "memory(GiB)": 89.13, "step": 9860, "token_acc": 0.7386578781291635, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12800508646668982, "grad_norm": 0.9141618013381958, "learning_rate": 9.994641753096233e-05, "loss": 1.0218481063842773, "memory(GiB)": 89.13, "step": 9865, "token_acc": 0.7145098039215686, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12806996486834551, "grad_norm": 0.8202970027923584, "learning_rate": 9.99461689910393e-05, "loss": 0.9731355667114258, "memory(GiB)": 89.13, "step": 9870, "token_acc": 0.7254135730102905, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12813484327000121, "grad_norm": 0.9698265790939331, "learning_rate": 9.994591987633894e-05, "loss": 0.952815055847168, "memory(GiB)": 89.13, "step": 9875, "token_acc": 0.7479216948243497, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12819972167165689, "grad_norm": 0.959522545337677, "learning_rate": 9.994567018686418e-05, "loss": 1.0122481346130372, "memory(GiB)": 89.13, "step": 9880, "token_acc": 0.7241418677220489, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12826460007331258, "grad_norm": 0.9861425161361694, "learning_rate": 9.994541992261786e-05, "loss": 0.9935207366943359, "memory(GiB)": 89.13, "step": 9885, "token_acc": 0.7299949181032798, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12832947847496828, "grad_norm": 0.8531480431556702, "learning_rate": 9.994516908360285e-05, "loss": 0.9693162918090821, "memory(GiB)": 89.13, "step": 9890, "token_acc": 0.74895314715289, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12839435687662398, "grad_norm": 0.9143854975700378, "learning_rate": 9.994491766982205e-05, "loss": 0.9873284339904785, "memory(GiB)": 89.13, "step": 9895, "token_acc": 0.7223819612590799, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12845923527827968, "grad_norm": 0.8649817109107971, "learning_rate": 9.994466568127836e-05, "loss": 1.0095159530639648, "memory(GiB)": 89.13, "step": 9900, "token_acc": 0.7381452931558047, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12852411367993538, "grad_norm": 0.9228850603103638, "learning_rate": 9.994441311797466e-05, "loss": 0.991269302368164, "memory(GiB)": 89.13, "step": 9905, "token_acc": 0.7254858411993337, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12858899208159108, "grad_norm": 0.9260993599891663, "learning_rate": 9.994415997991387e-05, "loss": 1.0194682121276855, "memory(GiB)": 89.13, "step": 9910, "token_acc": 0.7350842418235877, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12865387048324678, "grad_norm": 0.9183833599090576, "learning_rate": 9.994390626709891e-05, "loss": 1.0075714111328125, "memory(GiB)": 89.13, "step": 9915, "token_acc": 0.7574351613872292, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12871874888490248, "grad_norm": 0.974145233631134, "learning_rate": 9.994365197953269e-05, "loss": 1.0388721466064452, "memory(GiB)": 89.13, "step": 9920, "token_acc": 0.7284754231026354, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12878362728655818, "grad_norm": 0.9795641899108887, "learning_rate": 9.994339711721813e-05, "loss": 1.00936918258667, "memory(GiB)": 89.13, "step": 9925, "token_acc": 0.7313497484423417, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12884850568821388, "grad_norm": 0.9083439707756042, "learning_rate": 9.994314168015817e-05, "loss": 0.938314151763916, "memory(GiB)": 89.13, "step": 9930, "token_acc": 0.7482976568774662, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12891338408986955, "grad_norm": 1.1550798416137695, "learning_rate": 9.994288566835575e-05, "loss": 1.0080585479736328, "memory(GiB)": 89.13, "step": 9935, "token_acc": 0.7348598554923383, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12897826249152525, "grad_norm": 0.8679498434066772, "learning_rate": 9.994262908181382e-05, "loss": 0.9979192733764648, "memory(GiB)": 89.13, "step": 9940, "token_acc": 0.7240595658154416, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12904314089318095, "grad_norm": 0.9112741947174072, "learning_rate": 9.994237192053533e-05, "loss": 1.0213760375976562, "memory(GiB)": 89.13, "step": 9945, "token_acc": 0.7319148936170212, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12910801929483665, "grad_norm": 0.9341762065887451, "learning_rate": 9.994211418452323e-05, "loss": 1.0224124908447265, "memory(GiB)": 89.13, "step": 9950, "token_acc": 0.7313764801704153, "train_speed(iter/s)": 0.121524 }, { "epoch": 0.12917289769649234, "grad_norm": 0.9493143558502197, "learning_rate": 9.994185587378051e-05, "loss": 0.9894449234008789, "memory(GiB)": 89.13, "step": 9955, "token_acc": 0.7288765410661573, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12923777609814804, "grad_norm": 0.792855978012085, "learning_rate": 9.994159698831012e-05, "loss": 1.0174810409545898, "memory(GiB)": 89.13, "step": 9960, "token_acc": 0.7307762780186893, "train_speed(iter/s)": 0.121523 }, { "epoch": 0.12930265449980374, "grad_norm": 0.8459696173667908, "learning_rate": 9.994133752811505e-05, "loss": 0.9599979400634766, "memory(GiB)": 89.13, "step": 9965, "token_acc": 0.740225806451613, "train_speed(iter/s)": 0.12152 }, { "epoch": 0.12936753290145944, "grad_norm": 0.8148832321166992, "learning_rate": 9.994107749319827e-05, "loss": 0.9588704109191895, "memory(GiB)": 89.13, "step": 9970, "token_acc": 0.7534122273053802, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12943241130311514, "grad_norm": 0.8759375810623169, "learning_rate": 9.994081688356278e-05, "loss": 1.0261587142944335, "memory(GiB)": 89.13, "step": 9975, "token_acc": 0.7209876543209877, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12949728970477084, "grad_norm": 0.8147096633911133, "learning_rate": 9.99405556992116e-05, "loss": 1.0082990646362304, "memory(GiB)": 89.13, "step": 9980, "token_acc": 0.7149153796978567, "train_speed(iter/s)": 0.121521 }, { "epoch": 0.12956216810642654, "grad_norm": 0.7837299108505249, "learning_rate": 9.994029394014772e-05, "loss": 1.0110217094421388, "memory(GiB)": 89.13, "step": 9985, "token_acc": 0.7294381438273811, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.12962704650808224, "grad_norm": 0.7340269088745117, "learning_rate": 9.994003160637414e-05, "loss": 0.9854288101196289, "memory(GiB)": 89.13, "step": 9990, "token_acc": 0.7324874161073825, "train_speed(iter/s)": 0.121518 }, { "epoch": 0.1296919249097379, "grad_norm": 0.8307127952575684, "learning_rate": 9.993976869789389e-05, "loss": 0.9961939811706543, "memory(GiB)": 89.13, "step": 9995, "token_acc": 0.7551870051870052, "train_speed(iter/s)": 0.121516 }, { "epoch": 0.1297568033113936, "grad_norm": 0.920776903629303, "learning_rate": 9.993950521471e-05, "loss": 1.0097381591796875, "memory(GiB)": 89.13, "step": 10000, "token_acc": 0.7341006293681978, "train_speed(iter/s)": 0.121515 }, { "epoch": 0.1297568033113936, "eval_loss": 0.996848464012146, "eval_runtime": 2721.2249, "eval_samples_per_second": 18.308, "eval_steps_per_second": 1.144, "eval_token_acc": 0.7375438274667412, "step": 10000 }, { "epoch": 0.1298216817130493, "grad_norm": 0.9474276900291443, "learning_rate": 9.99392411568255e-05, "loss": 1.01248197555542, "memory(GiB)": 89.65, "step": 10005, "token_acc": 0.7514675336403865, "train_speed(iter/s)": 84.635181 }, { "epoch": 0.129886560114705, "grad_norm": 0.978905439376831, "learning_rate": 9.993897652424342e-05, "loss": 1.0092869758605958, "memory(GiB)": 89.65, "step": 10010, "token_acc": 0.7203376075312449, "train_speed(iter/s)": 62.421027 }, { "epoch": 0.1299514385163607, "grad_norm": 0.7718564867973328, "learning_rate": 9.993871131696683e-05, "loss": 0.9660980224609375, "memory(GiB)": 89.65, "step": 10015, "token_acc": 0.7298093865128944, "train_speed(iter/s)": 49.365932 }, { "epoch": 0.1300163169180164, "grad_norm": 0.9413799047470093, "learning_rate": 9.993844553499875e-05, "loss": 0.9759319305419922, "memory(GiB)": 89.65, "step": 10020, "token_acc": 0.7431080751402781, "train_speed(iter/s)": 40.918692 }, { "epoch": 0.1300811953196721, "grad_norm": 0.8331644535064697, "learning_rate": 9.993817917834225e-05, "loss": 1.03242769241333, "memory(GiB)": 89.65, "step": 10025, "token_acc": 0.7223330378250591, "train_speed(iter/s)": 35.11435 }, { "epoch": 0.1301460737213278, "grad_norm": 0.9229003190994263, "learning_rate": 9.993791224700041e-05, "loss": 0.9937202453613281, "memory(GiB)": 89.65, "step": 10030, "token_acc": 0.735932950342331, "train_speed(iter/s)": 30.741245 }, { "epoch": 0.1302109521229835, "grad_norm": 0.8319275379180908, "learning_rate": 9.993764474097628e-05, "loss": 0.9910884857177734, "memory(GiB)": 89.65, "step": 10035, "token_acc": 0.7215734861390739, "train_speed(iter/s)": 27.238039 }, { "epoch": 0.1302758305246392, "grad_norm": 0.8452498912811279, "learning_rate": 9.993737666027295e-05, "loss": 1.0017388343811036, "memory(GiB)": 89.65, "step": 10040, "token_acc": 0.7400201144979112, "train_speed(iter/s)": 24.490537 }, { "epoch": 0.1303407089262949, "grad_norm": 0.9060531854629517, "learning_rate": 9.993710800489351e-05, "loss": 1.0193645477294921, "memory(GiB)": 89.65, "step": 10045, "token_acc": 0.7282837702483335, "train_speed(iter/s)": 22.303302 }, { "epoch": 0.1304055873279506, "grad_norm": 0.8680469393730164, "learning_rate": 9.993683877484103e-05, "loss": 1.0582050323486327, "memory(GiB)": 89.65, "step": 10050, "token_acc": 0.7241317257513047, "train_speed(iter/s)": 20.4387 }, { "epoch": 0.13047046572960627, "grad_norm": 0.8820314407348633, "learning_rate": 9.993656897011863e-05, "loss": 0.9887815475463867, "memory(GiB)": 89.65, "step": 10055, "token_acc": 0.7492034933843098, "train_speed(iter/s)": 18.911974 }, { "epoch": 0.13053534413126197, "grad_norm": 0.8767476677894592, "learning_rate": 9.993629859072941e-05, "loss": 1.003624725341797, "memory(GiB)": 89.65, "step": 10060, "token_acc": 0.7287812624057165, "train_speed(iter/s)": 17.584315 }, { "epoch": 0.13060022253291767, "grad_norm": 0.9194981455802917, "learning_rate": 9.993602763667649e-05, "loss": 1.0142599105834962, "memory(GiB)": 89.65, "step": 10065, "token_acc": 0.7474939855653568, "train_speed(iter/s)": 16.398212 }, { "epoch": 0.13066510093457337, "grad_norm": 0.9499934911727905, "learning_rate": 9.993575610796298e-05, "loss": 0.9961313247680664, "memory(GiB)": 89.65, "step": 10070, "token_acc": 0.7234439060752889, "train_speed(iter/s)": 15.377168 }, { "epoch": 0.13072997933622907, "grad_norm": 0.9933329820632935, "learning_rate": 9.993548400459198e-05, "loss": 1.0308212280273437, "memory(GiB)": 89.65, "step": 10075, "token_acc": 0.7632349935272876, "train_speed(iter/s)": 14.467733 }, { "epoch": 0.13079485773788477, "grad_norm": 1.0553232431411743, "learning_rate": 9.993521132656665e-05, "loss": 1.0069877624511718, "memory(GiB)": 89.65, "step": 10080, "token_acc": 0.7262040756163172, "train_speed(iter/s)": 13.678988 }, { "epoch": 0.13085973613954047, "grad_norm": 0.972188413143158, "learning_rate": 9.993493807389013e-05, "loss": 0.9727532386779785, "memory(GiB)": 89.65, "step": 10085, "token_acc": 0.7585936701529656, "train_speed(iter/s)": 12.975183 }, { "epoch": 0.13092461454119617, "grad_norm": 0.9440441727638245, "learning_rate": 9.993466424656554e-05, "loss": 1.0074298858642579, "memory(GiB)": 89.65, "step": 10090, "token_acc": 0.7237317939019939, "train_speed(iter/s)": 12.331376 }, { "epoch": 0.13098949294285187, "grad_norm": 0.9859951138496399, "learning_rate": 9.993438984459607e-05, "loss": 1.021653938293457, "memory(GiB)": 89.65, "step": 10095, "token_acc": 0.7246426399026764, "train_speed(iter/s)": 11.738269 }, { "epoch": 0.13105437134450756, "grad_norm": 0.8645209074020386, "learning_rate": 9.993411486798486e-05, "loss": 0.9809858322143554, "memory(GiB)": 89.65, "step": 10100, "token_acc": 0.7525214853251175, "train_speed(iter/s)": 11.190813 }, { "epoch": 0.13111924974616326, "grad_norm": 0.8689084053039551, "learning_rate": 9.993383931673504e-05, "loss": 1.040860939025879, "memory(GiB)": 89.65, "step": 10105, "token_acc": 0.7315765966949531, "train_speed(iter/s)": 10.708719 }, { "epoch": 0.13118412814781896, "grad_norm": 0.8609697818756104, "learning_rate": 9.993356319084984e-05, "loss": 1.0260177612304688, "memory(GiB)": 89.65, "step": 10110, "token_acc": 0.7262624082865775, "train_speed(iter/s)": 10.28271 }, { "epoch": 0.13124900654947463, "grad_norm": 0.8845697045326233, "learning_rate": 9.993328649033237e-05, "loss": 0.9687250137329102, "memory(GiB)": 89.65, "step": 10115, "token_acc": 0.7759909399773499, "train_speed(iter/s)": 9.855174 }, { "epoch": 0.13131388495113033, "grad_norm": 0.8992764949798584, "learning_rate": 9.993300921518589e-05, "loss": 0.9882129669189453, "memory(GiB)": 89.65, "step": 10120, "token_acc": 0.7378735145643245, "train_speed(iter/s)": 9.46599 }, { "epoch": 0.13137876335278603, "grad_norm": 0.9828736782073975, "learning_rate": 9.993273136541355e-05, "loss": 0.9855409622192383, "memory(GiB)": 89.65, "step": 10125, "token_acc": 0.7465027554048326, "train_speed(iter/s)": 9.113622 }, { "epoch": 0.13144364175444173, "grad_norm": 0.8406286239624023, "learning_rate": 9.993245294101855e-05, "loss": 1.0191740036010741, "memory(GiB)": 89.65, "step": 10130, "token_acc": 0.7386500533507727, "train_speed(iter/s)": 8.794352 }, { "epoch": 0.13150852015609743, "grad_norm": 0.9013919234275818, "learning_rate": 9.993217394200407e-05, "loss": 0.9719731330871582, "memory(GiB)": 89.65, "step": 10135, "token_acc": 0.7491402843265856, "train_speed(iter/s)": 8.488016 }, { "epoch": 0.13157339855775313, "grad_norm": 0.9209729433059692, "learning_rate": 9.993189436837336e-05, "loss": 1.045686912536621, "memory(GiB)": 89.65, "step": 10140, "token_acc": 0.7419171866137266, "train_speed(iter/s)": 8.215008 }, { "epoch": 0.13163827695940883, "grad_norm": 0.908660352230072, "learning_rate": 9.993161422012965e-05, "loss": 1.013460922241211, "memory(GiB)": 89.65, "step": 10145, "token_acc": 0.7435668434272955, "train_speed(iter/s)": 7.948609 }, { "epoch": 0.13170315536106453, "grad_norm": 0.8469719886779785, "learning_rate": 9.99313334972761e-05, "loss": 0.9906751632690429, "memory(GiB)": 89.65, "step": 10150, "token_acc": 0.7407200357781754, "train_speed(iter/s)": 7.709103 }, { "epoch": 0.13176803376272023, "grad_norm": 0.9097334742546082, "learning_rate": 9.9931052199816e-05, "loss": 0.97965087890625, "memory(GiB)": 89.65, "step": 10155, "token_acc": 0.723060188542422, "train_speed(iter/s)": 7.479469 }, { "epoch": 0.13183291216437593, "grad_norm": 0.8219349384307861, "learning_rate": 9.993077032775255e-05, "loss": 1.0226629257202149, "memory(GiB)": 89.65, "step": 10160, "token_acc": 0.7443147104745096, "train_speed(iter/s)": 7.27161 }, { "epoch": 0.13189779056603163, "grad_norm": 0.8115010857582092, "learning_rate": 9.9930487881089e-05, "loss": 0.9691839218139648, "memory(GiB)": 89.65, "step": 10165, "token_acc": 0.7355535704315783, "train_speed(iter/s)": 7.068684 }, { "epoch": 0.13196266896768732, "grad_norm": 0.9057178497314453, "learning_rate": 9.993020485982863e-05, "loss": 1.0401418685913086, "memory(GiB)": 89.65, "step": 10170, "token_acc": 0.7351076476026945, "train_speed(iter/s)": 6.883104 }, { "epoch": 0.132027547369343, "grad_norm": 0.771722137928009, "learning_rate": 9.992992126397468e-05, "loss": 1.0353793144226073, "memory(GiB)": 89.65, "step": 10175, "token_acc": 0.7309610372896624, "train_speed(iter/s)": 6.690854 }, { "epoch": 0.1320924257709987, "grad_norm": 0.858666718006134, "learning_rate": 9.992963709353041e-05, "loss": 0.9775740623474121, "memory(GiB)": 89.65, "step": 10180, "token_acc": 0.7351813255523134, "train_speed(iter/s)": 6.521142 }, { "epoch": 0.1321573041726544, "grad_norm": 0.9676341414451599, "learning_rate": 9.992935234849908e-05, "loss": 1.026909637451172, "memory(GiB)": 89.65, "step": 10185, "token_acc": 0.7223573367170703, "train_speed(iter/s)": 6.347394 }, { "epoch": 0.1322221825743101, "grad_norm": 0.8409390449523926, "learning_rate": 9.992906702888396e-05, "loss": 0.9939444541931153, "memory(GiB)": 89.65, "step": 10190, "token_acc": 0.7266581536843687, "train_speed(iter/s)": 6.198804 }, { "epoch": 0.1322870609759658, "grad_norm": 0.9041054248809814, "learning_rate": 9.992878113468838e-05, "loss": 1.0478574752807617, "memory(GiB)": 89.65, "step": 10195, "token_acc": 0.7280032774486322, "train_speed(iter/s)": 6.051608 }, { "epoch": 0.1323519393776215, "grad_norm": 0.7961330413818359, "learning_rate": 9.992849466591558e-05, "loss": 0.9694439888000488, "memory(GiB)": 89.65, "step": 10200, "token_acc": 0.7439186200796107, "train_speed(iter/s)": 5.913339 }, { "epoch": 0.1324168177792772, "grad_norm": 0.9024392366409302, "learning_rate": 9.992820762256889e-05, "loss": 1.0323915481567383, "memory(GiB)": 89.65, "step": 10205, "token_acc": 0.7434925546315994, "train_speed(iter/s)": 5.78185 }, { "epoch": 0.1324816961809329, "grad_norm": 0.8298237323760986, "learning_rate": 9.992792000465158e-05, "loss": 0.9979724884033203, "memory(GiB)": 89.65, "step": 10210, "token_acc": 0.7520180012858061, "train_speed(iter/s)": 5.65072 }, { "epoch": 0.1325465745825886, "grad_norm": 0.9045537114143372, "learning_rate": 9.9927631812167e-05, "loss": 1.008887481689453, "memory(GiB)": 89.65, "step": 10215, "token_acc": 0.729598091514529, "train_speed(iter/s)": 5.531225 }, { "epoch": 0.1326114529842443, "grad_norm": 0.9601762294769287, "learning_rate": 9.992734304511844e-05, "loss": 1.0558420181274415, "memory(GiB)": 89.65, "step": 10220, "token_acc": 0.7128192886130341, "train_speed(iter/s)": 5.415586 }, { "epoch": 0.1326763313859, "grad_norm": 0.8752582669258118, "learning_rate": 9.992705370350922e-05, "loss": 0.9785861968994141, "memory(GiB)": 89.65, "step": 10225, "token_acc": 0.75, "train_speed(iter/s)": 5.303893 }, { "epoch": 0.1327412097875557, "grad_norm": 0.7911595106124878, "learning_rate": 9.992676378734269e-05, "loss": 1.0033147811889649, "memory(GiB)": 89.65, "step": 10230, "token_acc": 0.7438413574568128, "train_speed(iter/s)": 5.192315 }, { "epoch": 0.13280608818921136, "grad_norm": 0.9119812846183777, "learning_rate": 9.992647329662217e-05, "loss": 0.9817861557006836, "memory(GiB)": 89.65, "step": 10235, "token_acc": 0.7551884632992523, "train_speed(iter/s)": 5.083201 }, { "epoch": 0.13287096659086706, "grad_norm": 0.8077133297920227, "learning_rate": 9.992618223135101e-05, "loss": 0.9677708625793457, "memory(GiB)": 89.65, "step": 10240, "token_acc": 0.7230782113548819, "train_speed(iter/s)": 4.979464 }, { "epoch": 0.13293584499252276, "grad_norm": 0.9107375741004944, "learning_rate": 9.992589059153257e-05, "loss": 1.008047389984131, "memory(GiB)": 89.65, "step": 10245, "token_acc": 0.743955238793046, "train_speed(iter/s)": 4.887787 }, { "epoch": 0.13300072339417846, "grad_norm": 1.0062628984451294, "learning_rate": 9.992559837717018e-05, "loss": 1.0145712852478028, "memory(GiB)": 89.65, "step": 10250, "token_acc": 0.7474062093179218, "train_speed(iter/s)": 4.797006 }, { "epoch": 0.13306560179583415, "grad_norm": 0.9952890276908875, "learning_rate": 9.992530558826723e-05, "loss": 0.9880729675292969, "memory(GiB)": 89.65, "step": 10255, "token_acc": 0.7305027535974418, "train_speed(iter/s)": 4.709253 }, { "epoch": 0.13313048019748985, "grad_norm": 1.3083124160766602, "learning_rate": 9.992501222482707e-05, "loss": 1.0090291976928711, "memory(GiB)": 89.65, "step": 10260, "token_acc": 0.743716150534648, "train_speed(iter/s)": 4.625331 }, { "epoch": 0.13319535859914555, "grad_norm": 0.8370843529701233, "learning_rate": 9.992471828685307e-05, "loss": 1.0211344718933106, "memory(GiB)": 89.65, "step": 10265, "token_acc": 0.730587138863001, "train_speed(iter/s)": 4.544506 }, { "epoch": 0.13326023700080125, "grad_norm": 0.9422105550765991, "learning_rate": 9.992442377434863e-05, "loss": 1.0554420471191406, "memory(GiB)": 89.65, "step": 10270, "token_acc": 0.706876312862968, "train_speed(iter/s)": 4.463826 }, { "epoch": 0.13332511540245695, "grad_norm": 0.9161608219146729, "learning_rate": 9.992412868731715e-05, "loss": 1.0037424087524414, "memory(GiB)": 89.65, "step": 10275, "token_acc": 0.7364271232259518, "train_speed(iter/s)": 4.388594 }, { "epoch": 0.13338999380411265, "grad_norm": 0.9068742394447327, "learning_rate": 9.9923833025762e-05, "loss": 1.0479103088378907, "memory(GiB)": 89.65, "step": 10280, "token_acc": 0.744398513217772, "train_speed(iter/s)": 4.314901 }, { "epoch": 0.13345487220576835, "grad_norm": 0.9158126711845398, "learning_rate": 9.992353678968659e-05, "loss": 0.9903295516967774, "memory(GiB)": 89.65, "step": 10285, "token_acc": 0.738052530429212, "train_speed(iter/s)": 4.241573 }, { "epoch": 0.13351975060742405, "grad_norm": 0.9017661213874817, "learning_rate": 9.992323997909433e-05, "loss": 0.9794464111328125, "memory(GiB)": 89.65, "step": 10290, "token_acc": 0.7250163291966035, "train_speed(iter/s)": 4.17159 }, { "epoch": 0.13358462900907972, "grad_norm": 0.9026857614517212, "learning_rate": 9.992294259398864e-05, "loss": 0.9692108154296875, "memory(GiB)": 89.65, "step": 10295, "token_acc": 0.7465644086618545, "train_speed(iter/s)": 4.102737 }, { "epoch": 0.13364950741073542, "grad_norm": 0.8601269125938416, "learning_rate": 9.992264463437295e-05, "loss": 0.9798352241516113, "memory(GiB)": 89.65, "step": 10300, "token_acc": 0.7292297342751429, "train_speed(iter/s)": 4.03687 }, { "epoch": 0.13371438581239112, "grad_norm": 0.9227082133293152, "learning_rate": 9.992234610025067e-05, "loss": 0.9727728843688965, "memory(GiB)": 89.65, "step": 10305, "token_acc": 0.7386563876651983, "train_speed(iter/s)": 3.975057 }, { "epoch": 0.13377926421404682, "grad_norm": 0.8363973498344421, "learning_rate": 9.992204699162524e-05, "loss": 1.0194658279418944, "memory(GiB)": 89.65, "step": 10310, "token_acc": 0.7353637205473814, "train_speed(iter/s)": 3.916358 }, { "epoch": 0.13384414261570252, "grad_norm": 0.9175366759300232, "learning_rate": 9.992174730850011e-05, "loss": 1.0286431312561035, "memory(GiB)": 89.65, "step": 10315, "token_acc": 0.737706779725177, "train_speed(iter/s)": 3.857841 }, { "epoch": 0.13390902101735822, "grad_norm": 0.8647042512893677, "learning_rate": 9.992144705087872e-05, "loss": 1.0321964263916015, "memory(GiB)": 89.65, "step": 10320, "token_acc": 0.7337507656707276, "train_speed(iter/s)": 3.802297 }, { "epoch": 0.13397389941901391, "grad_norm": 0.951493501663208, "learning_rate": 9.992114621876453e-05, "loss": 0.9860708236694335, "memory(GiB)": 89.65, "step": 10325, "token_acc": 0.7357180346400742, "train_speed(iter/s)": 3.750041 }, { "epoch": 0.1340387778206696, "grad_norm": 0.8809897899627686, "learning_rate": 9.992084481216102e-05, "loss": 1.0122374534606933, "memory(GiB)": 89.65, "step": 10330, "token_acc": 0.7410400195047194, "train_speed(iter/s)": 3.697316 }, { "epoch": 0.1341036562223253, "grad_norm": 0.791668713092804, "learning_rate": 9.992054283107163e-05, "loss": 0.9619636535644531, "memory(GiB)": 89.65, "step": 10335, "token_acc": 0.7490973399159974, "train_speed(iter/s)": 3.644825 }, { "epoch": 0.134168534623981, "grad_norm": 0.8046497702598572, "learning_rate": 9.992024027549985e-05, "loss": 0.9652374267578125, "memory(GiB)": 89.65, "step": 10340, "token_acc": 0.7589046131225877, "train_speed(iter/s)": 3.594288 }, { "epoch": 0.1342334130256367, "grad_norm": 0.8983477354049683, "learning_rate": 9.991993714544914e-05, "loss": 0.9695562362670899, "memory(GiB)": 89.65, "step": 10345, "token_acc": 0.73530922742541, "train_speed(iter/s)": 3.545841 }, { "epoch": 0.1342982914272924, "grad_norm": 0.9120919704437256, "learning_rate": 9.991963344092302e-05, "loss": 1.0150216102600098, "memory(GiB)": 89.65, "step": 10350, "token_acc": 0.7450786056049213, "train_speed(iter/s)": 3.498401 }, { "epoch": 0.13436316982894808, "grad_norm": 0.9771372675895691, "learning_rate": 9.991932916192497e-05, "loss": 1.0526631355285645, "memory(GiB)": 89.65, "step": 10355, "token_acc": 0.7394130150425114, "train_speed(iter/s)": 3.453153 }, { "epoch": 0.13442804823060378, "grad_norm": 0.8911521434783936, "learning_rate": 9.99190243084585e-05, "loss": 1.0206520080566406, "memory(GiB)": 89.65, "step": 10360, "token_acc": 0.7296335976001714, "train_speed(iter/s)": 3.409058 }, { "epoch": 0.13449292663225948, "grad_norm": 0.9362576007843018, "learning_rate": 9.99187188805271e-05, "loss": 1.025360679626465, "memory(GiB)": 89.65, "step": 10365, "token_acc": 0.7325407608695652, "train_speed(iter/s)": 3.36577 }, { "epoch": 0.13455780503391518, "grad_norm": 0.8142469525337219, "learning_rate": 9.99184128781343e-05, "loss": 0.9814403533935547, "memory(GiB)": 89.65, "step": 10370, "token_acc": 0.7271027324072661, "train_speed(iter/s)": 3.323264 }, { "epoch": 0.13462268343557088, "grad_norm": 0.8281680941581726, "learning_rate": 9.991810630128363e-05, "loss": 1.0467193603515625, "memory(GiB)": 89.65, "step": 10375, "token_acc": 0.7163116057233704, "train_speed(iter/s)": 3.28147 }, { "epoch": 0.13468756183722658, "grad_norm": 0.9575887322425842, "learning_rate": 9.991779914997857e-05, "loss": 1.0348134994506837, "memory(GiB)": 89.65, "step": 10380, "token_acc": 0.7337191984836177, "train_speed(iter/s)": 3.239757 }, { "epoch": 0.13475244023888228, "grad_norm": 0.8023044466972351, "learning_rate": 9.991749142422273e-05, "loss": 1.0053152084350585, "memory(GiB)": 89.65, "step": 10385, "token_acc": 0.7410673756565722, "train_speed(iter/s)": 3.201484 }, { "epoch": 0.13481731864053798, "grad_norm": 0.7591987252235413, "learning_rate": 9.991718312401958e-05, "loss": 0.9979977607727051, "memory(GiB)": 89.65, "step": 10390, "token_acc": 0.7470086068154783, "train_speed(iter/s)": 3.162191 }, { "epoch": 0.13488219704219367, "grad_norm": 0.868319034576416, "learning_rate": 9.991687424937272e-05, "loss": 1.0638788223266602, "memory(GiB)": 89.65, "step": 10395, "token_acc": 0.7138519924098672, "train_speed(iter/s)": 3.123827 }, { "epoch": 0.13494707544384937, "grad_norm": 0.9396469593048096, "learning_rate": 9.991656480028567e-05, "loss": 0.9783455848693847, "memory(GiB)": 89.65, "step": 10400, "token_acc": 0.7770576131687242, "train_speed(iter/s)": 3.087162 }, { "epoch": 0.13501195384550507, "grad_norm": 0.8920696377754211, "learning_rate": 9.991625477676201e-05, "loss": 1.0446601867675782, "memory(GiB)": 89.65, "step": 10405, "token_acc": 0.7377833359387213, "train_speed(iter/s)": 3.051886 }, { "epoch": 0.13507683224716077, "grad_norm": 0.8691385984420776, "learning_rate": 9.99159441788053e-05, "loss": 0.9850933074951171, "memory(GiB)": 89.65, "step": 10410, "token_acc": 0.7484482078145743, "train_speed(iter/s)": 3.017581 }, { "epoch": 0.13514171064881644, "grad_norm": 0.7702266573905945, "learning_rate": 9.991563300641913e-05, "loss": 1.0288897514343263, "memory(GiB)": 89.65, "step": 10415, "token_acc": 0.712390342927572, "train_speed(iter/s)": 2.983449 }, { "epoch": 0.13520658905047214, "grad_norm": 1.0119677782058716, "learning_rate": 9.991532125960706e-05, "loss": 1.0542747497558593, "memory(GiB)": 89.65, "step": 10420, "token_acc": 0.7295972725335607, "train_speed(iter/s)": 2.951585 }, { "epoch": 0.13527146745212784, "grad_norm": 0.870326042175293, "learning_rate": 9.991500893837267e-05, "loss": 1.0000473022460938, "memory(GiB)": 89.65, "step": 10425, "token_acc": 0.729744505594635, "train_speed(iter/s)": 2.920777 }, { "epoch": 0.13533634585378354, "grad_norm": 0.9140814542770386, "learning_rate": 9.991469604271959e-05, "loss": 0.970792579650879, "memory(GiB)": 89.65, "step": 10430, "token_acc": 0.7446701877481784, "train_speed(iter/s)": 2.888212 }, { "epoch": 0.13540122425543924, "grad_norm": 0.8961354494094849, "learning_rate": 9.99143825726514e-05, "loss": 1.06123685836792, "memory(GiB)": 89.65, "step": 10435, "token_acc": 0.7243782327252455, "train_speed(iter/s)": 2.856838 }, { "epoch": 0.13546610265709494, "grad_norm": 0.8939909338951111, "learning_rate": 9.99140685281717e-05, "loss": 0.9953763008117675, "memory(GiB)": 89.65, "step": 10440, "token_acc": 0.7383920749376642, "train_speed(iter/s)": 2.827063 }, { "epoch": 0.13553098105875064, "grad_norm": 1.0260509252548218, "learning_rate": 9.991375390928411e-05, "loss": 0.9891544342041015, "memory(GiB)": 89.65, "step": 10445, "token_acc": 0.738141516066333, "train_speed(iter/s)": 2.796255 }, { "epoch": 0.13559585946040634, "grad_norm": 0.8681718707084656, "learning_rate": 9.991343871599226e-05, "loss": 1.016929817199707, "memory(GiB)": 89.65, "step": 10450, "token_acc": 0.74441319664943, "train_speed(iter/s)": 2.766095 }, { "epoch": 0.13566073786206204, "grad_norm": 0.8671702742576599, "learning_rate": 9.991312294829977e-05, "loss": 0.974364948272705, "memory(GiB)": 89.65, "step": 10455, "token_acc": 0.7463255082184408, "train_speed(iter/s)": 2.737455 }, { "epoch": 0.13572561626371774, "grad_norm": 0.9249074459075928, "learning_rate": 9.991280660621027e-05, "loss": 0.9849935531616211, "memory(GiB)": 89.65, "step": 10460, "token_acc": 0.7208009717672176, "train_speed(iter/s)": 2.709335 }, { "epoch": 0.13579049466537343, "grad_norm": 0.9150403141975403, "learning_rate": 9.99124896897274e-05, "loss": 1.0312816619873046, "memory(GiB)": 89.65, "step": 10465, "token_acc": 0.7287948355024684, "train_speed(iter/s)": 2.680068 }, { "epoch": 0.13585537306702913, "grad_norm": 0.907920241355896, "learning_rate": 9.99121721988548e-05, "loss": 1.0410511016845703, "memory(GiB)": 89.65, "step": 10470, "token_acc": 0.7353007349632519, "train_speed(iter/s)": 2.653695 }, { "epoch": 0.1359202514686848, "grad_norm": 0.8361615538597107, "learning_rate": 9.991185413359617e-05, "loss": 0.9623313903808594, "memory(GiB)": 89.65, "step": 10475, "token_acc": 0.7384971758583486, "train_speed(iter/s)": 2.626663 }, { "epoch": 0.1359851298703405, "grad_norm": 0.8343441486358643, "learning_rate": 9.991153549395511e-05, "loss": 1.0496618270874023, "memory(GiB)": 89.65, "step": 10480, "token_acc": 0.7426579925650557, "train_speed(iter/s)": 2.601954 }, { "epoch": 0.1360500082719962, "grad_norm": 0.8475773334503174, "learning_rate": 9.991121627993531e-05, "loss": 1.0168842315673827, "memory(GiB)": 89.65, "step": 10485, "token_acc": 0.7644601993542047, "train_speed(iter/s)": 2.576323 }, { "epoch": 0.1361148866736519, "grad_norm": 0.8403646945953369, "learning_rate": 9.991089649154045e-05, "loss": 0.9778810501098633, "memory(GiB)": 89.65, "step": 10490, "token_acc": 0.7498332666399893, "train_speed(iter/s)": 2.550761 }, { "epoch": 0.1361797650753076, "grad_norm": 0.9035772085189819, "learning_rate": 9.991057612877422e-05, "loss": 0.9786808967590332, "memory(GiB)": 89.65, "step": 10495, "token_acc": 0.7679841337299901, "train_speed(iter/s)": 2.526066 }, { "epoch": 0.1362446434769633, "grad_norm": 0.884563148021698, "learning_rate": 9.991025519164026e-05, "loss": 0.9863954544067383, "memory(GiB)": 89.65, "step": 10500, "token_acc": 0.7386520366567567, "train_speed(iter/s)": 2.501658 }, { "epoch": 0.136309521878619, "grad_norm": 0.9505849480628967, "learning_rate": 9.990993368014232e-05, "loss": 0.9527345657348633, "memory(GiB)": 89.65, "step": 10505, "token_acc": 0.7639676113360324, "train_speed(iter/s)": 2.479164 }, { "epoch": 0.1363744002802747, "grad_norm": 0.8215293288230896, "learning_rate": 9.990961159428407e-05, "loss": 0.9914504051208496, "memory(GiB)": 89.65, "step": 10510, "token_acc": 0.7277989552449527, "train_speed(iter/s)": 2.456733 }, { "epoch": 0.1364392786819304, "grad_norm": 0.9632096290588379, "learning_rate": 9.990928893406922e-05, "loss": 1.0036203384399414, "memory(GiB)": 89.65, "step": 10515, "token_acc": 0.7421234057221647, "train_speed(iter/s)": 2.434307 }, { "epoch": 0.1365041570835861, "grad_norm": 0.7657320499420166, "learning_rate": 9.990896569950149e-05, "loss": 0.9595584869384766, "memory(GiB)": 89.65, "step": 10520, "token_acc": 0.7183384352530789, "train_speed(iter/s)": 2.412427 }, { "epoch": 0.1365690354852418, "grad_norm": 1.0294530391693115, "learning_rate": 9.990864189058458e-05, "loss": 1.0051970481872559, "memory(GiB)": 89.65, "step": 10525, "token_acc": 0.7246201320260928, "train_speed(iter/s)": 2.391018 }, { "epoch": 0.1366339138868975, "grad_norm": 0.959503173828125, "learning_rate": 9.990831750732225e-05, "loss": 0.9949187278747559, "memory(GiB)": 89.65, "step": 10530, "token_acc": 0.756175103937393, "train_speed(iter/s)": 2.370265 }, { "epoch": 0.13669879228855317, "grad_norm": 0.8994614481925964, "learning_rate": 9.990799254971821e-05, "loss": 1.0704071044921875, "memory(GiB)": 89.65, "step": 10535, "token_acc": 0.7167932156592052, "train_speed(iter/s)": 2.349653 }, { "epoch": 0.13676367069020887, "grad_norm": 0.7976282238960266, "learning_rate": 9.99076670177762e-05, "loss": 1.0156156539916992, "memory(GiB)": 89.65, "step": 10540, "token_acc": 0.7496062992125985, "train_speed(iter/s)": 2.329834 }, { "epoch": 0.13682854909186457, "grad_norm": 0.8534359931945801, "learning_rate": 9.990734091149998e-05, "loss": 0.9806021690368653, "memory(GiB)": 89.65, "step": 10545, "token_acc": 0.7290625, "train_speed(iter/s)": 2.309677 }, { "epoch": 0.13689342749352026, "grad_norm": 0.8001965284347534, "learning_rate": 9.990701423089327e-05, "loss": 0.9815544128417969, "memory(GiB)": 89.65, "step": 10550, "token_acc": 0.7450563817617257, "train_speed(iter/s)": 2.290465 }, { "epoch": 0.13695830589517596, "grad_norm": 0.7878708243370056, "learning_rate": 9.990668697595986e-05, "loss": 1.0095593452453613, "memory(GiB)": 89.65, "step": 10555, "token_acc": 0.7188631081647426, "train_speed(iter/s)": 2.27186 }, { "epoch": 0.13702318429683166, "grad_norm": 0.9613059759140015, "learning_rate": 9.990635914670352e-05, "loss": 1.0430830001831055, "memory(GiB)": 89.65, "step": 10560, "token_acc": 0.7103251136520182, "train_speed(iter/s)": 2.253627 }, { "epoch": 0.13708806269848736, "grad_norm": 0.8865250945091248, "learning_rate": 9.990603074312802e-05, "loss": 1.0538095474243163, "memory(GiB)": 89.65, "step": 10565, "token_acc": 0.7051309346391313, "train_speed(iter/s)": 2.234402 }, { "epoch": 0.13715294110014306, "grad_norm": 0.7674691677093506, "learning_rate": 9.990570176523711e-05, "loss": 0.999210262298584, "memory(GiB)": 89.65, "step": 10570, "token_acc": 0.7385772869643837, "train_speed(iter/s)": 2.216142 }, { "epoch": 0.13721781950179876, "grad_norm": 0.7932493686676025, "learning_rate": 9.99053722130346e-05, "loss": 0.9457165718078613, "memory(GiB)": 89.65, "step": 10575, "token_acc": 0.7381080872716059, "train_speed(iter/s)": 2.198152 }, { "epoch": 0.13728269790345446, "grad_norm": 0.9236794710159302, "learning_rate": 9.990504208652427e-05, "loss": 1.0093671798706054, "memory(GiB)": 89.65, "step": 10580, "token_acc": 0.7307579210468279, "train_speed(iter/s)": 2.181482 }, { "epoch": 0.13734757630511016, "grad_norm": 0.9235591888427734, "learning_rate": 9.990471138570994e-05, "loss": 1.0277726173400878, "memory(GiB)": 89.65, "step": 10585, "token_acc": 0.7215580275149877, "train_speed(iter/s)": 2.164095 }, { "epoch": 0.13741245470676586, "grad_norm": 0.9731643199920654, "learning_rate": 9.99043801105954e-05, "loss": 0.9963740348815918, "memory(GiB)": 89.65, "step": 10590, "token_acc": 0.7173339982901111, "train_speed(iter/s)": 2.147576 }, { "epoch": 0.13747733310842153, "grad_norm": 0.7816506028175354, "learning_rate": 9.990404826118448e-05, "loss": 1.0415045738220214, "memory(GiB)": 89.65, "step": 10595, "token_acc": 0.7211076923076923, "train_speed(iter/s)": 2.131129 }, { "epoch": 0.13754221151007723, "grad_norm": 0.8717861771583557, "learning_rate": 9.990371583748095e-05, "loss": 1.0201337814331055, "memory(GiB)": 89.65, "step": 10600, "token_acc": 0.7197343776604802, "train_speed(iter/s)": 2.114134 }, { "epoch": 0.13760708991173293, "grad_norm": 0.9152875542640686, "learning_rate": 9.990338283948869e-05, "loss": 0.9510886192321777, "memory(GiB)": 89.65, "step": 10605, "token_acc": 0.75, "train_speed(iter/s)": 2.097072 }, { "epoch": 0.13767196831338863, "grad_norm": 0.9508010149002075, "learning_rate": 9.990304926721151e-05, "loss": 1.0119933128356933, "memory(GiB)": 89.65, "step": 10610, "token_acc": 0.7383484207613551, "train_speed(iter/s)": 2.081163 }, { "epoch": 0.13773684671504433, "grad_norm": 0.8589829206466675, "learning_rate": 9.990271512065325e-05, "loss": 0.9711455345153809, "memory(GiB)": 89.65, "step": 10615, "token_acc": 0.7522443000413177, "train_speed(iter/s)": 2.065383 }, { "epoch": 0.13780172511670002, "grad_norm": 0.9311808347702026, "learning_rate": 9.990238039981776e-05, "loss": 1.0231695175170898, "memory(GiB)": 89.65, "step": 10620, "token_acc": 0.7360499393098665, "train_speed(iter/s)": 2.050554 }, { "epoch": 0.13786660351835572, "grad_norm": 0.8248364925384521, "learning_rate": 9.990204510470889e-05, "loss": 0.9963115692138672, "memory(GiB)": 89.65, "step": 10625, "token_acc": 0.7310218517115069, "train_speed(iter/s)": 2.035222 }, { "epoch": 0.13793148192001142, "grad_norm": 0.9316691160202026, "learning_rate": 9.99017092353305e-05, "loss": 1.0237786293029785, "memory(GiB)": 89.65, "step": 10630, "token_acc": 0.74963530269876, "train_speed(iter/s)": 2.019355 }, { "epoch": 0.13799636032166712, "grad_norm": 0.8039060831069946, "learning_rate": 9.990137279168644e-05, "loss": 0.9759124755859375, "memory(GiB)": 89.65, "step": 10635, "token_acc": 0.7181387624554065, "train_speed(iter/s)": 2.00525 }, { "epoch": 0.13806123872332282, "grad_norm": 0.893413245677948, "learning_rate": 9.99010357737806e-05, "loss": 0.9381406784057618, "memory(GiB)": 89.65, "step": 10640, "token_acc": 0.7737012539616922, "train_speed(iter/s)": 1.990265 }, { "epoch": 0.13812611712497852, "grad_norm": 0.8452526926994324, "learning_rate": 9.990069818161685e-05, "loss": 1.0136354446411133, "memory(GiB)": 89.65, "step": 10645, "token_acc": 0.7725560720708493, "train_speed(iter/s)": 1.976488 }, { "epoch": 0.13819099552663422, "grad_norm": 0.9077231884002686, "learning_rate": 9.990036001519908e-05, "loss": 0.9692028045654297, "memory(GiB)": 89.65, "step": 10650, "token_acc": 0.7417165806876017, "train_speed(iter/s)": 1.961687 }, { "epoch": 0.1382558739282899, "grad_norm": 0.8620310425758362, "learning_rate": 9.990002127453117e-05, "loss": 0.9833439826965332, "memory(GiB)": 89.65, "step": 10655, "token_acc": 0.7253027635030725, "train_speed(iter/s)": 1.948751 }, { "epoch": 0.1383207523299456, "grad_norm": 0.7645382285118103, "learning_rate": 9.989968195961703e-05, "loss": 1.0239444732666017, "memory(GiB)": 89.65, "step": 10660, "token_acc": 0.7454007510526116, "train_speed(iter/s)": 1.934795 }, { "epoch": 0.1383856307316013, "grad_norm": 0.8854386806488037, "learning_rate": 9.989934207046057e-05, "loss": 0.9917402267456055, "memory(GiB)": 89.65, "step": 10665, "token_acc": 0.7242004909110312, "train_speed(iter/s)": 1.92169 }, { "epoch": 0.138450509133257, "grad_norm": 0.820000171661377, "learning_rate": 9.989900160706569e-05, "loss": 1.0043498039245606, "memory(GiB)": 89.65, "step": 10670, "token_acc": 0.7476571263161703, "train_speed(iter/s)": 1.909181 }, { "epoch": 0.1385153875349127, "grad_norm": 0.8518370985984802, "learning_rate": 9.98986605694363e-05, "loss": 0.9850224494934082, "memory(GiB)": 89.65, "step": 10675, "token_acc": 0.7270887305699482, "train_speed(iter/s)": 1.895969 }, { "epoch": 0.1385802659365684, "grad_norm": 0.8706509470939636, "learning_rate": 9.989831895757636e-05, "loss": 1.0459396362304687, "memory(GiB)": 89.65, "step": 10680, "token_acc": 0.7536581859997126, "train_speed(iter/s)": 1.882741 }, { "epoch": 0.13864514433822409, "grad_norm": 0.8130541443824768, "learning_rate": 9.989797677148975e-05, "loss": 1.0302618026733399, "memory(GiB)": 89.65, "step": 10685, "token_acc": 0.7373554013169603, "train_speed(iter/s)": 1.869491 }, { "epoch": 0.13871002273987978, "grad_norm": 0.8312078714370728, "learning_rate": 9.989763401118044e-05, "loss": 1.0123772621154785, "memory(GiB)": 89.65, "step": 10690, "token_acc": 0.7225472047389856, "train_speed(iter/s)": 1.857724 }, { "epoch": 0.13877490114153548, "grad_norm": 0.9218257069587708, "learning_rate": 9.98972906766524e-05, "loss": 1.0143428802490235, "memory(GiB)": 89.65, "step": 10695, "token_acc": 0.7297555115884681, "train_speed(iter/s)": 1.84539 }, { "epoch": 0.13883977954319118, "grad_norm": 1.000017523765564, "learning_rate": 9.989694676790951e-05, "loss": 1.006447696685791, "memory(GiB)": 89.65, "step": 10700, "token_acc": 0.7269178393100317, "train_speed(iter/s)": 1.833652 }, { "epoch": 0.13890465794484688, "grad_norm": 0.7589574456214905, "learning_rate": 9.989660228495578e-05, "loss": 0.9956985473632812, "memory(GiB)": 89.65, "step": 10705, "token_acc": 0.7440840840840841, "train_speed(iter/s)": 1.821459 }, { "epoch": 0.13896953634650258, "grad_norm": 1.007556676864624, "learning_rate": 9.989625722779517e-05, "loss": 0.9875019073486329, "memory(GiB)": 89.65, "step": 10710, "token_acc": 0.7345041701877503, "train_speed(iter/s)": 1.810594 }, { "epoch": 0.13903441474815825, "grad_norm": 0.8517904877662659, "learning_rate": 9.989591159643163e-05, "loss": 0.9990121841430664, "memory(GiB)": 89.65, "step": 10715, "token_acc": 0.7515527950310559, "train_speed(iter/s)": 1.799174 }, { "epoch": 0.13909929314981395, "grad_norm": 0.8693971633911133, "learning_rate": 9.989556539086916e-05, "loss": 1.0631518363952637, "memory(GiB)": 89.65, "step": 10720, "token_acc": 0.7106888914791933, "train_speed(iter/s)": 1.787586 }, { "epoch": 0.13916417155146965, "grad_norm": 0.8346068263053894, "learning_rate": 9.989521861111173e-05, "loss": 0.9976446151733398, "memory(GiB)": 89.65, "step": 10725, "token_acc": 0.762647891413225, "train_speed(iter/s)": 1.775564 }, { "epoch": 0.13922904995312535, "grad_norm": 0.9065845608711243, "learning_rate": 9.989487125716334e-05, "loss": 1.0052069664001464, "memory(GiB)": 89.65, "step": 10730, "token_acc": 0.7399137552989329, "train_speed(iter/s)": 1.764536 }, { "epoch": 0.13929392835478105, "grad_norm": 0.8840484619140625, "learning_rate": 9.989452332902798e-05, "loss": 1.0028379440307618, "memory(GiB)": 89.65, "step": 10735, "token_acc": 0.7369745516410293, "train_speed(iter/s)": 1.753642 }, { "epoch": 0.13935880675643675, "grad_norm": 0.8955674171447754, "learning_rate": 9.989417482670966e-05, "loss": 0.9595216751098633, "memory(GiB)": 89.65, "step": 10740, "token_acc": 0.7376470184823235, "train_speed(iter/s)": 1.742574 }, { "epoch": 0.13942368515809245, "grad_norm": 0.788733184337616, "learning_rate": 9.989382575021239e-05, "loss": 0.9555721282958984, "memory(GiB)": 89.65, "step": 10745, "token_acc": 0.7568096796820201, "train_speed(iter/s)": 1.730914 }, { "epoch": 0.13948856355974815, "grad_norm": 0.9022270441055298, "learning_rate": 9.989347609954019e-05, "loss": 1.0615378379821778, "memory(GiB)": 89.65, "step": 10750, "token_acc": 0.7211158465710965, "train_speed(iter/s)": 1.720732 }, { "epoch": 0.13955344196140385, "grad_norm": 0.8313274383544922, "learning_rate": 9.989312587469706e-05, "loss": 1.001223373413086, "memory(GiB)": 89.65, "step": 10755, "token_acc": 0.740621048194464, "train_speed(iter/s)": 1.710624 }, { "epoch": 0.13961832036305954, "grad_norm": 0.8202585577964783, "learning_rate": 9.989277507568706e-05, "loss": 1.0202961921691895, "memory(GiB)": 89.65, "step": 10760, "token_acc": 0.7377977144077753, "train_speed(iter/s)": 1.700066 }, { "epoch": 0.13968319876471524, "grad_norm": 0.879790186882019, "learning_rate": 9.989242370251421e-05, "loss": 1.019002342224121, "memory(GiB)": 89.65, "step": 10765, "token_acc": 0.7494791666666667, "train_speed(iter/s)": 1.68943 }, { "epoch": 0.13974807716637094, "grad_norm": 0.8370912671089172, "learning_rate": 9.989207175518256e-05, "loss": 1.018333911895752, "memory(GiB)": 89.65, "step": 10770, "token_acc": 0.7326256427268204, "train_speed(iter/s)": 1.67873 }, { "epoch": 0.13981295556802661, "grad_norm": 0.8516767024993896, "learning_rate": 9.989171923369616e-05, "loss": 1.0276076316833496, "memory(GiB)": 89.65, "step": 10775, "token_acc": 0.7419257369233868, "train_speed(iter/s)": 1.668959 }, { "epoch": 0.1398778339696823, "grad_norm": 0.830082893371582, "learning_rate": 9.989136613805908e-05, "loss": 0.9470907211303711, "memory(GiB)": 89.65, "step": 10780, "token_acc": 0.7299320431887958, "train_speed(iter/s)": 1.659308 }, { "epoch": 0.139942712371338, "grad_norm": 0.8066754937171936, "learning_rate": 9.989101246827535e-05, "loss": 0.957858943939209, "memory(GiB)": 89.65, "step": 10785, "token_acc": 0.7364028987629017, "train_speed(iter/s)": 1.649669 }, { "epoch": 0.1400075907729937, "grad_norm": 0.890590250492096, "learning_rate": 9.989065822434907e-05, "loss": 1.0168228149414062, "memory(GiB)": 89.65, "step": 10790, "token_acc": 0.7628890379077748, "train_speed(iter/s)": 1.641063 }, { "epoch": 0.1400724691746494, "grad_norm": 0.8830333948135376, "learning_rate": 9.989030340628429e-05, "loss": 1.0051651000976562, "memory(GiB)": 89.65, "step": 10795, "token_acc": 0.7343332502048523, "train_speed(iter/s)": 1.631982 }, { "epoch": 0.1401373475763051, "grad_norm": 0.9481087923049927, "learning_rate": 9.988994801408513e-05, "loss": 0.978604793548584, "memory(GiB)": 89.65, "step": 10800, "token_acc": 0.7424652314726614, "train_speed(iter/s)": 1.622585 }, { "epoch": 0.1402022259779608, "grad_norm": 0.9209172129631042, "learning_rate": 9.988959204775565e-05, "loss": 0.9828184127807618, "memory(GiB)": 89.65, "step": 10805, "token_acc": 0.7527580535162675, "train_speed(iter/s)": 1.61347 }, { "epoch": 0.1402671043796165, "grad_norm": 0.9246165752410889, "learning_rate": 9.988923550729993e-05, "loss": 1.0410100936889648, "memory(GiB)": 89.65, "step": 10810, "token_acc": 0.75066181336863, "train_speed(iter/s)": 1.604723 }, { "epoch": 0.1403319827812722, "grad_norm": 0.9413055181503296, "learning_rate": 9.988887839272214e-05, "loss": 1.003919792175293, "memory(GiB)": 89.65, "step": 10815, "token_acc": 0.7151149291850476, "train_speed(iter/s)": 1.595807 }, { "epoch": 0.1403968611829279, "grad_norm": 0.9192569255828857, "learning_rate": 9.988852070402633e-05, "loss": 1.0141987800598145, "memory(GiB)": 89.65, "step": 10820, "token_acc": 0.7346793831168831, "train_speed(iter/s)": 1.58697 }, { "epoch": 0.1404617395845836, "grad_norm": 0.8956695199012756, "learning_rate": 9.988816244121664e-05, "loss": 1.001786231994629, "memory(GiB)": 89.65, "step": 10825, "token_acc": 0.7492132104804156, "train_speed(iter/s)": 1.578258 }, { "epoch": 0.14052661798623928, "grad_norm": 0.7856860756874084, "learning_rate": 9.988780360429719e-05, "loss": 0.9494283676147461, "memory(GiB)": 89.65, "step": 10830, "token_acc": 0.7421595145843994, "train_speed(iter/s)": 1.569382 }, { "epoch": 0.14059149638789498, "grad_norm": 0.8559401035308838, "learning_rate": 9.988744419327209e-05, "loss": 1.0244171142578125, "memory(GiB)": 89.65, "step": 10835, "token_acc": 0.7111601028827204, "train_speed(iter/s)": 1.560853 }, { "epoch": 0.14065637478955068, "grad_norm": 0.8992999196052551, "learning_rate": 9.98870842081455e-05, "loss": 1.0842857360839844, "memory(GiB)": 89.65, "step": 10840, "token_acc": 0.7218466795424157, "train_speed(iter/s)": 1.552596 }, { "epoch": 0.14072125319120637, "grad_norm": 0.8095808029174805, "learning_rate": 9.988672364892157e-05, "loss": 0.987132453918457, "memory(GiB)": 89.65, "step": 10845, "token_acc": 0.7405088283409489, "train_speed(iter/s)": 1.543959 }, { "epoch": 0.14078613159286207, "grad_norm": 0.7685208916664124, "learning_rate": 9.988636251560442e-05, "loss": 1.0130138397216797, "memory(GiB)": 89.65, "step": 10850, "token_acc": 0.733588543882036, "train_speed(iter/s)": 1.5354 }, { "epoch": 0.14085100999451777, "grad_norm": 0.9116196632385254, "learning_rate": 9.988600080819823e-05, "loss": 0.9773868560791016, "memory(GiB)": 89.65, "step": 10855, "token_acc": 0.7296685415119035, "train_speed(iter/s)": 1.527261 }, { "epoch": 0.14091588839617347, "grad_norm": 0.8475781083106995, "learning_rate": 9.988563852670716e-05, "loss": 1.0219215393066405, "memory(GiB)": 89.65, "step": 10860, "token_acc": 0.7458472945034577, "train_speed(iter/s)": 1.519451 }, { "epoch": 0.14098076679782917, "grad_norm": 0.942100465297699, "learning_rate": 9.988527567113534e-05, "loss": 1.0023979187011718, "memory(GiB)": 89.65, "step": 10865, "token_acc": 0.7278837536807067, "train_speed(iter/s)": 1.511696 }, { "epoch": 0.14104564519948487, "grad_norm": 0.854900598526001, "learning_rate": 9.988491224148702e-05, "loss": 1.0000720977783204, "memory(GiB)": 89.65, "step": 10870, "token_acc": 0.730160831292607, "train_speed(iter/s)": 1.503676 }, { "epoch": 0.14111052360114057, "grad_norm": 0.8536474704742432, "learning_rate": 9.988454823776632e-05, "loss": 1.0511940956115722, "memory(GiB)": 89.65, "step": 10875, "token_acc": 0.7377346234059284, "train_speed(iter/s)": 1.49571 }, { "epoch": 0.14117540200279627, "grad_norm": 0.7913988828659058, "learning_rate": 9.988418365997745e-05, "loss": 1.0355926513671876, "memory(GiB)": 89.65, "step": 10880, "token_acc": 0.7268547987835066, "train_speed(iter/s)": 1.488526 }, { "epoch": 0.14124028040445197, "grad_norm": 0.8551129698753357, "learning_rate": 9.98838185081246e-05, "loss": 0.92763671875, "memory(GiB)": 89.65, "step": 10885, "token_acc": 0.7348248738498071, "train_speed(iter/s)": 1.480779 }, { "epoch": 0.14130515880610764, "grad_norm": 0.8186479806900024, "learning_rate": 9.988345278221198e-05, "loss": 1.0143579483032226, "memory(GiB)": 89.65, "step": 10890, "token_acc": 0.7330147327407768, "train_speed(iter/s)": 1.473132 }, { "epoch": 0.14137003720776334, "grad_norm": 0.9110378623008728, "learning_rate": 9.98830864822438e-05, "loss": 1.0120784759521484, "memory(GiB)": 89.65, "step": 10895, "token_acc": 0.7447784154783741, "train_speed(iter/s)": 1.465685 }, { "epoch": 0.14143491560941904, "grad_norm": 0.8474167585372925, "learning_rate": 9.988271960822426e-05, "loss": 0.981113052368164, "memory(GiB)": 89.65, "step": 10900, "token_acc": 0.7418103290509133, "train_speed(iter/s)": 1.458073 }, { "epoch": 0.14149979401107474, "grad_norm": 0.8545054197311401, "learning_rate": 9.98823521601576e-05, "loss": 0.9801860809326172, "memory(GiB)": 89.65, "step": 10905, "token_acc": 0.7315970515970516, "train_speed(iter/s)": 1.450792 }, { "epoch": 0.14156467241273044, "grad_norm": 0.7984640002250671, "learning_rate": 9.988198413804806e-05, "loss": 1.0645432472229004, "memory(GiB)": 89.65, "step": 10910, "token_acc": 0.709526421649296, "train_speed(iter/s)": 1.443653 }, { "epoch": 0.14162955081438613, "grad_norm": 1.022268295288086, "learning_rate": 9.988161554189983e-05, "loss": 1.036087989807129, "memory(GiB)": 89.65, "step": 10915, "token_acc": 0.7013815090329437, "train_speed(iter/s)": 1.436953 }, { "epoch": 0.14169442921604183, "grad_norm": 0.9118217825889587, "learning_rate": 9.988124637171719e-05, "loss": 1.042335605621338, "memory(GiB)": 89.65, "step": 10920, "token_acc": 0.7258550225855023, "train_speed(iter/s)": 1.42994 }, { "epoch": 0.14175930761769753, "grad_norm": 0.8458410501480103, "learning_rate": 9.988087662750438e-05, "loss": 0.9615331649780273, "memory(GiB)": 89.65, "step": 10925, "token_acc": 0.7562612744892394, "train_speed(iter/s)": 1.423343 }, { "epoch": 0.14182418601935323, "grad_norm": 0.833785891532898, "learning_rate": 9.988050630926564e-05, "loss": 1.0141176223754882, "memory(GiB)": 89.65, "step": 10930, "token_acc": 0.7317146411565355, "train_speed(iter/s)": 1.416561 }, { "epoch": 0.14188906442100893, "grad_norm": 0.8850396275520325, "learning_rate": 9.988013541700526e-05, "loss": 0.9644658088684082, "memory(GiB)": 89.65, "step": 10935, "token_acc": 0.7555148379679528, "train_speed(iter/s)": 1.409837 }, { "epoch": 0.14195394282266463, "grad_norm": 0.9105612635612488, "learning_rate": 9.987976395072749e-05, "loss": 0.9807582855224609, "memory(GiB)": 89.65, "step": 10940, "token_acc": 0.7519788033270727, "train_speed(iter/s)": 1.403217 }, { "epoch": 0.14201882122432033, "grad_norm": 0.9694616794586182, "learning_rate": 9.987939191043661e-05, "loss": 1.0013189315795898, "memory(GiB)": 89.65, "step": 10945, "token_acc": 0.7270797399495821, "train_speed(iter/s)": 1.39653 }, { "epoch": 0.142083699625976, "grad_norm": 0.8435583710670471, "learning_rate": 9.987901929613688e-05, "loss": 0.9964953422546386, "memory(GiB)": 89.65, "step": 10950, "token_acc": 0.722188394325032, "train_speed(iter/s)": 1.38984 }, { "epoch": 0.1421485780276317, "grad_norm": 0.8198471665382385, "learning_rate": 9.987864610783261e-05, "loss": 1.0123509407043456, "memory(GiB)": 89.65, "step": 10955, "token_acc": 0.7462205992068167, "train_speed(iter/s)": 1.383454 }, { "epoch": 0.1422134564292874, "grad_norm": 0.7666153907775879, "learning_rate": 9.98782723455281e-05, "loss": 0.9642436027526855, "memory(GiB)": 89.65, "step": 10960, "token_acc": 0.7473619764468614, "train_speed(iter/s)": 1.376977 }, { "epoch": 0.1422783348309431, "grad_norm": 0.9223549962043762, "learning_rate": 9.987789800922763e-05, "loss": 1.0219473838806152, "memory(GiB)": 89.65, "step": 10965, "token_acc": 0.7493220637436169, "train_speed(iter/s)": 1.370591 }, { "epoch": 0.1423432132325988, "grad_norm": 0.8070237040519714, "learning_rate": 9.987752309893554e-05, "loss": 1.0558242797851562, "memory(GiB)": 89.65, "step": 10970, "token_acc": 0.7061737804878049, "train_speed(iter/s)": 1.364227 }, { "epoch": 0.1424080916342545, "grad_norm": 0.90378737449646, "learning_rate": 9.98771476146561e-05, "loss": 1.0318693161010741, "memory(GiB)": 89.65, "step": 10975, "token_acc": 0.7380264660441996, "train_speed(iter/s)": 1.357781 }, { "epoch": 0.1424729700359102, "grad_norm": 0.821463942527771, "learning_rate": 9.987677155639368e-05, "loss": 0.9749593734741211, "memory(GiB)": 89.65, "step": 10980, "token_acc": 0.7324689854881701, "train_speed(iter/s)": 1.351456 }, { "epoch": 0.1425378484375659, "grad_norm": 0.8374069333076477, "learning_rate": 9.987639492415256e-05, "loss": 1.0367549896240233, "memory(GiB)": 89.65, "step": 10985, "token_acc": 0.7338328224921482, "train_speed(iter/s)": 1.345511 }, { "epoch": 0.1426027268392216, "grad_norm": 0.8561365604400635, "learning_rate": 9.987601771793711e-05, "loss": 1.029349136352539, "memory(GiB)": 89.65, "step": 10990, "token_acc": 0.7214576962283384, "train_speed(iter/s)": 1.339448 }, { "epoch": 0.1426676052408773, "grad_norm": 0.852005660533905, "learning_rate": 9.987563993775165e-05, "loss": 0.9994305610656739, "memory(GiB)": 89.65, "step": 10995, "token_acc": 0.7365972150945979, "train_speed(iter/s)": 1.333079 }, { "epoch": 0.142732483642533, "grad_norm": 0.893486499786377, "learning_rate": 9.987526158360054e-05, "loss": 0.9737709045410157, "memory(GiB)": 89.65, "step": 11000, "token_acc": 0.7457936810049486, "train_speed(iter/s)": 1.327099 }, { "epoch": 0.1427973620441887, "grad_norm": 0.7505319118499756, "learning_rate": 9.987488265548813e-05, "loss": 0.9569240570068359, "memory(GiB)": 89.65, "step": 11005, "token_acc": 0.7314032672198861, "train_speed(iter/s)": 1.321087 }, { "epoch": 0.14286224044584436, "grad_norm": 0.9518632888793945, "learning_rate": 9.987450315341879e-05, "loss": 0.9860628128051758, "memory(GiB)": 89.65, "step": 11010, "token_acc": 0.7215165511932256, "train_speed(iter/s)": 1.315377 }, { "epoch": 0.14292711884750006, "grad_norm": 0.9990958571434021, "learning_rate": 9.987412307739687e-05, "loss": 1.0073835372924804, "memory(GiB)": 89.65, "step": 11015, "token_acc": 0.7493001220300051, "train_speed(iter/s)": 1.309726 }, { "epoch": 0.14299199724915576, "grad_norm": 0.8777619004249573, "learning_rate": 9.987374242742676e-05, "loss": 1.0308368682861329, "memory(GiB)": 89.65, "step": 11020, "token_acc": 0.744743513271697, "train_speed(iter/s)": 1.303816 }, { "epoch": 0.14305687565081146, "grad_norm": 0.9369611144065857, "learning_rate": 9.98733612035128e-05, "loss": 1.0039113998413085, "memory(GiB)": 89.65, "step": 11025, "token_acc": 0.7579097212655554, "train_speed(iter/s)": 1.298302 }, { "epoch": 0.14312175405246716, "grad_norm": 0.7967137694358826, "learning_rate": 9.987297940565943e-05, "loss": 0.9887954711914062, "memory(GiB)": 89.65, "step": 11030, "token_acc": 0.7213264481680542, "train_speed(iter/s)": 1.292527 }, { "epoch": 0.14318663245412286, "grad_norm": 0.9230301380157471, "learning_rate": 9.987259703387104e-05, "loss": 1.0157033920288085, "memory(GiB)": 89.65, "step": 11035, "token_acc": 0.7470194092296077, "train_speed(iter/s)": 1.28699 }, { "epoch": 0.14325151085577856, "grad_norm": 0.8272759318351746, "learning_rate": 9.9872214088152e-05, "loss": 0.9761556625366211, "memory(GiB)": 89.65, "step": 11040, "token_acc": 0.7572201577677815, "train_speed(iter/s)": 1.281743 }, { "epoch": 0.14331638925743426, "grad_norm": 0.9077045917510986, "learning_rate": 9.987183056850672e-05, "loss": 1.0172224044799805, "memory(GiB)": 89.65, "step": 11045, "token_acc": 0.7432089951811529, "train_speed(iter/s)": 1.276455 }, { "epoch": 0.14338126765908996, "grad_norm": 0.9393822550773621, "learning_rate": 9.987144647493963e-05, "loss": 1.0139591217041015, "memory(GiB)": 89.65, "step": 11050, "token_acc": 0.720871409921671, "train_speed(iter/s)": 1.271213 }, { "epoch": 0.14344614606074566, "grad_norm": 0.8720694184303284, "learning_rate": 9.987106180745513e-05, "loss": 1.0258563995361327, "memory(GiB)": 89.65, "step": 11055, "token_acc": 0.7338206666401812, "train_speed(iter/s)": 1.26626 }, { "epoch": 0.14351102446240135, "grad_norm": 0.9940328598022461, "learning_rate": 9.987067656605769e-05, "loss": 1.0217982292175294, "memory(GiB)": 89.65, "step": 11060, "token_acc": 0.724487211952393, "train_speed(iter/s)": 1.261202 }, { "epoch": 0.14357590286405705, "grad_norm": 0.9693887233734131, "learning_rate": 9.98702907507517e-05, "loss": 0.9646778106689453, "memory(GiB)": 89.65, "step": 11065, "token_acc": 0.7345264469675991, "train_speed(iter/s)": 1.2559 }, { "epoch": 0.14364078126571272, "grad_norm": 0.8984770178794861, "learning_rate": 9.986990436154162e-05, "loss": 0.997279167175293, "memory(GiB)": 89.65, "step": 11070, "token_acc": 0.731341413761784, "train_speed(iter/s)": 1.250504 }, { "epoch": 0.14370565966736842, "grad_norm": 0.8245874643325806, "learning_rate": 9.986951739843187e-05, "loss": 1.04835205078125, "memory(GiB)": 89.65, "step": 11075, "token_acc": 0.7153344208809136, "train_speed(iter/s)": 1.24513 }, { "epoch": 0.14377053806902412, "grad_norm": 0.9428223371505737, "learning_rate": 9.986912986142694e-05, "loss": 0.9594949722290039, "memory(GiB)": 89.65, "step": 11080, "token_acc": 0.7394315133057792, "train_speed(iter/s)": 1.239846 }, { "epoch": 0.14383541647067982, "grad_norm": 0.7806690335273743, "learning_rate": 9.986874175053126e-05, "loss": 0.9756921768188477, "memory(GiB)": 89.65, "step": 11085, "token_acc": 0.7400879702254621, "train_speed(iter/s)": 1.234612 }, { "epoch": 0.14390029487233552, "grad_norm": 0.8142969608306885, "learning_rate": 9.986835306574933e-05, "loss": 1.0093844413757325, "memory(GiB)": 89.65, "step": 11090, "token_acc": 0.7249818709209572, "train_speed(iter/s)": 1.229418 }, { "epoch": 0.14396517327399122, "grad_norm": 0.8286635875701904, "learning_rate": 9.986796380708558e-05, "loss": 1.0041205406188964, "memory(GiB)": 89.65, "step": 11095, "token_acc": 0.7309599645204209, "train_speed(iter/s)": 1.224623 }, { "epoch": 0.14403005167564692, "grad_norm": 0.8577960729598999, "learning_rate": 9.986757397454453e-05, "loss": 1.00836124420166, "memory(GiB)": 89.65, "step": 11100, "token_acc": 0.7344799701418263, "train_speed(iter/s)": 1.219857 }, { "epoch": 0.14409493007730262, "grad_norm": 0.9153376817703247, "learning_rate": 9.986718356813064e-05, "loss": 0.9528297424316406, "memory(GiB)": 89.65, "step": 11105, "token_acc": 0.7373430176392716, "train_speed(iter/s)": 1.21487 }, { "epoch": 0.14415980847895832, "grad_norm": 0.8713948130607605, "learning_rate": 9.986679258784842e-05, "loss": 1.0444272994995116, "memory(GiB)": 89.65, "step": 11110, "token_acc": 0.7315803983734234, "train_speed(iter/s)": 1.209918 }, { "epoch": 0.14422468688061402, "grad_norm": 0.8649516105651855, "learning_rate": 9.986640103370235e-05, "loss": 0.9780693054199219, "memory(GiB)": 89.65, "step": 11115, "token_acc": 0.7277316506144138, "train_speed(iter/s)": 1.205134 }, { "epoch": 0.14428956528226972, "grad_norm": 0.9399420619010925, "learning_rate": 9.986600890569694e-05, "loss": 1.0130836486816406, "memory(GiB)": 89.65, "step": 11120, "token_acc": 0.7386477582008308, "train_speed(iter/s)": 1.200063 }, { "epoch": 0.14435444368392542, "grad_norm": 0.8525856733322144, "learning_rate": 9.986561620383672e-05, "loss": 1.0022351264953613, "memory(GiB)": 89.65, "step": 11125, "token_acc": 0.7469948311095084, "train_speed(iter/s)": 1.194989 }, { "epoch": 0.1444193220855811, "grad_norm": 1.017491340637207, "learning_rate": 9.986522292812619e-05, "loss": 1.011376953125, "memory(GiB)": 89.65, "step": 11130, "token_acc": 0.7331844130057086, "train_speed(iter/s)": 1.19046 }, { "epoch": 0.14448420048723679, "grad_norm": 0.8444634079933167, "learning_rate": 9.98648290785699e-05, "loss": 0.9932987213134765, "memory(GiB)": 89.65, "step": 11135, "token_acc": 0.7516857337677096, "train_speed(iter/s)": 1.186061 }, { "epoch": 0.14454907888889248, "grad_norm": 0.8312439322471619, "learning_rate": 9.986443465517235e-05, "loss": 1.0120155334472656, "memory(GiB)": 89.65, "step": 11140, "token_acc": 0.7351019248526174, "train_speed(iter/s)": 1.18136 }, { "epoch": 0.14461395729054818, "grad_norm": 0.8355777263641357, "learning_rate": 9.98640396579381e-05, "loss": 1.0293837547302247, "memory(GiB)": 89.65, "step": 11145, "token_acc": 0.7280473978714844, "train_speed(iter/s)": 1.176895 }, { "epoch": 0.14467883569220388, "grad_norm": 0.892694890499115, "learning_rate": 9.98636440868717e-05, "loss": 0.9592779159545899, "memory(GiB)": 89.65, "step": 11150, "token_acc": 0.7461287693561532, "train_speed(iter/s)": 1.172286 }, { "epoch": 0.14474371409385958, "grad_norm": 0.9577929973602295, "learning_rate": 9.986324794197767e-05, "loss": 0.987640380859375, "memory(GiB)": 89.65, "step": 11155, "token_acc": 0.7621567678722632, "train_speed(iter/s)": 1.167723 }, { "epoch": 0.14480859249551528, "grad_norm": 0.9192133545875549, "learning_rate": 9.986285122326061e-05, "loss": 1.066562557220459, "memory(GiB)": 89.65, "step": 11160, "token_acc": 0.7421923797626483, "train_speed(iter/s)": 1.163146 }, { "epoch": 0.14487347089717098, "grad_norm": 0.8206189274787903, "learning_rate": 9.986245393072507e-05, "loss": 0.9834689140319824, "memory(GiB)": 89.65, "step": 11165, "token_acc": 0.7339020180562932, "train_speed(iter/s)": 1.158672 }, { "epoch": 0.14493834929882668, "grad_norm": 0.9090688228607178, "learning_rate": 9.986205606437561e-05, "loss": 0.9850709915161133, "memory(GiB)": 89.65, "step": 11170, "token_acc": 0.7432899720072452, "train_speed(iter/s)": 1.154459 }, { "epoch": 0.14500322770048238, "grad_norm": 0.8390305042266846, "learning_rate": 9.986165762421683e-05, "loss": 0.9426128387451171, "memory(GiB)": 89.65, "step": 11175, "token_acc": 0.741046741277156, "train_speed(iter/s)": 1.149981 }, { "epoch": 0.14506810610213808, "grad_norm": 0.9441995024681091, "learning_rate": 9.986125861025329e-05, "loss": 1.0205974578857422, "memory(GiB)": 89.65, "step": 11180, "token_acc": 0.7461347968676796, "train_speed(iter/s)": 1.145382 }, { "epoch": 0.14513298450379378, "grad_norm": 0.8658545613288879, "learning_rate": 9.98608590224896e-05, "loss": 0.9670056343078614, "memory(GiB)": 89.65, "step": 11185, "token_acc": 0.7250496172384463, "train_speed(iter/s)": 1.141203 }, { "epoch": 0.14519786290544945, "grad_norm": 0.749365508556366, "learning_rate": 9.986045886093034e-05, "loss": 1.0250597953796388, "memory(GiB)": 89.65, "step": 11190, "token_acc": 0.7317865711075344, "train_speed(iter/s)": 1.136872 }, { "epoch": 0.14526274130710515, "grad_norm": 0.8182034492492676, "learning_rate": 9.986005812558015e-05, "loss": 1.032149887084961, "memory(GiB)": 89.65, "step": 11195, "token_acc": 0.7228997920739845, "train_speed(iter/s)": 1.132748 }, { "epoch": 0.14532761970876085, "grad_norm": 0.9782443642616272, "learning_rate": 9.985965681644361e-05, "loss": 1.0063515663146974, "memory(GiB)": 89.65, "step": 11200, "token_acc": 0.7196435343193023, "train_speed(iter/s)": 1.128866 }, { "epoch": 0.14539249811041655, "grad_norm": 0.7981781363487244, "learning_rate": 9.985925493352535e-05, "loss": 0.9763326644897461, "memory(GiB)": 89.65, "step": 11205, "token_acc": 0.7584841264730504, "train_speed(iter/s)": 1.124472 }, { "epoch": 0.14545737651207225, "grad_norm": 0.8415031433105469, "learning_rate": 9.985885247682999e-05, "loss": 1.030357551574707, "memory(GiB)": 89.65, "step": 11210, "token_acc": 0.7459962530972382, "train_speed(iter/s)": 1.120343 }, { "epoch": 0.14552225491372794, "grad_norm": 0.9187615513801575, "learning_rate": 9.985844944636217e-05, "loss": 1.0136693954467773, "memory(GiB)": 89.65, "step": 11215, "token_acc": 0.7365969206474536, "train_speed(iter/s)": 1.116636 }, { "epoch": 0.14558713331538364, "grad_norm": 0.9417755603790283, "learning_rate": 9.985804584212651e-05, "loss": 0.9909130096435547, "memory(GiB)": 89.65, "step": 11220, "token_acc": 0.7323900806328921, "train_speed(iter/s)": 1.112732 }, { "epoch": 0.14565201171703934, "grad_norm": 0.819762110710144, "learning_rate": 9.985764166412768e-05, "loss": 0.9970102310180664, "memory(GiB)": 89.65, "step": 11225, "token_acc": 0.7512913775712932, "train_speed(iter/s)": 1.10855 }, { "epoch": 0.14571689011869504, "grad_norm": 0.9144452810287476, "learning_rate": 9.985723691237033e-05, "loss": 0.9549032211303711, "memory(GiB)": 89.65, "step": 11230, "token_acc": 0.7396458814472672, "train_speed(iter/s)": 1.10471 }, { "epoch": 0.14578176852035074, "grad_norm": 0.9265987277030945, "learning_rate": 9.985683158685908e-05, "loss": 0.9703798294067383, "memory(GiB)": 89.65, "step": 11235, "token_acc": 0.7227940063568942, "train_speed(iter/s)": 1.100927 }, { "epoch": 0.14584664692200644, "grad_norm": 0.7433079481124878, "learning_rate": 9.985642568759863e-05, "loss": 1.0286540031433105, "memory(GiB)": 89.65, "step": 11240, "token_acc": 0.7469969702370344, "train_speed(iter/s)": 1.097089 }, { "epoch": 0.14591152532366214, "grad_norm": 0.8036376237869263, "learning_rate": 9.985601921459365e-05, "loss": 0.9840673446655274, "memory(GiB)": 89.65, "step": 11245, "token_acc": 0.7301528939547139, "train_speed(iter/s)": 1.093242 }, { "epoch": 0.1459764037253178, "grad_norm": 0.8410695195198059, "learning_rate": 9.98556121678488e-05, "loss": 0.9872941017150879, "memory(GiB)": 89.65, "step": 11250, "token_acc": 0.7257237014567651, "train_speed(iter/s)": 1.089165 }, { "epoch": 0.1460412821269735, "grad_norm": 0.9278035759925842, "learning_rate": 9.985520454736879e-05, "loss": 1.0451773643493651, "memory(GiB)": 89.65, "step": 11255, "token_acc": 0.7210085682516842, "train_speed(iter/s)": 1.085408 }, { "epoch": 0.1461061605286292, "grad_norm": 0.7803704738616943, "learning_rate": 9.985479635315826e-05, "loss": 1.019361400604248, "memory(GiB)": 89.65, "step": 11260, "token_acc": 0.7374517374517374, "train_speed(iter/s)": 1.081632 }, { "epoch": 0.1461710389302849, "grad_norm": 0.8923273086547852, "learning_rate": 9.985438758522197e-05, "loss": 0.9890348434448242, "memory(GiB)": 89.65, "step": 11265, "token_acc": 0.7227594303191325, "train_speed(iter/s)": 1.078021 }, { "epoch": 0.1462359173319406, "grad_norm": 0.8246127963066101, "learning_rate": 9.98539782435646e-05, "loss": 0.9788227081298828, "memory(GiB)": 89.65, "step": 11270, "token_acc": 0.7462182004439133, "train_speed(iter/s)": 1.074217 }, { "epoch": 0.1463007957335963, "grad_norm": 0.923599362373352, "learning_rate": 9.985356832819085e-05, "loss": 1.000249481201172, "memory(GiB)": 89.65, "step": 11275, "token_acc": 0.7587901069518717, "train_speed(iter/s)": 1.070545 }, { "epoch": 0.146365674135252, "grad_norm": 0.8375346660614014, "learning_rate": 9.985315783910544e-05, "loss": 0.9867733001708985, "memory(GiB)": 89.65, "step": 11280, "token_acc": 0.7282947789746307, "train_speed(iter/s)": 1.0669 }, { "epoch": 0.1464305525369077, "grad_norm": 0.842505156993866, "learning_rate": 9.98527467763131e-05, "loss": 0.9995162963867188, "memory(GiB)": 89.65, "step": 11285, "token_acc": 0.7412477558348295, "train_speed(iter/s)": 1.063211 }, { "epoch": 0.1464954309385634, "grad_norm": 0.8458927869796753, "learning_rate": 9.985233513981856e-05, "loss": 1.0399608612060547, "memory(GiB)": 89.65, "step": 11290, "token_acc": 0.735251573556433, "train_speed(iter/s)": 1.059594 }, { "epoch": 0.1465603093402191, "grad_norm": 0.7843220233917236, "learning_rate": 9.985192292962655e-05, "loss": 1.0165397644042968, "memory(GiB)": 89.65, "step": 11295, "token_acc": 0.752990645066658, "train_speed(iter/s)": 1.056111 }, { "epoch": 0.1466251877418748, "grad_norm": 0.8227854371070862, "learning_rate": 9.985151014574183e-05, "loss": 1.0133460998535155, "memory(GiB)": 89.65, "step": 11300, "token_acc": 0.7319390402075227, "train_speed(iter/s)": 1.052549 }, { "epoch": 0.1466900661435305, "grad_norm": 0.8442780375480652, "learning_rate": 9.985109678816913e-05, "loss": 0.9771812438964844, "memory(GiB)": 89.65, "step": 11305, "token_acc": 0.7489231758277082, "train_speed(iter/s)": 1.049032 }, { "epoch": 0.14675494454518617, "grad_norm": 0.9897001385688782, "learning_rate": 9.985068285691321e-05, "loss": 1.012002182006836, "memory(GiB)": 89.65, "step": 11310, "token_acc": 0.7148889151967168, "train_speed(iter/s)": 1.04556 }, { "epoch": 0.14681982294684187, "grad_norm": 0.7866716384887695, "learning_rate": 9.985026835197886e-05, "loss": 0.9793779373168945, "memory(GiB)": 89.65, "step": 11315, "token_acc": 0.7616869390442527, "train_speed(iter/s)": 1.041951 }, { "epoch": 0.14688470134849757, "grad_norm": 0.9710432291030884, "learning_rate": 9.984985327337082e-05, "loss": 1.0516633987426758, "memory(GiB)": 89.65, "step": 11320, "token_acc": 0.7088607594936709, "train_speed(iter/s)": 1.038493 }, { "epoch": 0.14694957975015327, "grad_norm": 0.82063889503479, "learning_rate": 9.984943762109388e-05, "loss": 0.9924193382263183, "memory(GiB)": 89.65, "step": 11325, "token_acc": 0.7391802491037439, "train_speed(iter/s)": 1.034945 }, { "epoch": 0.14701445815180897, "grad_norm": 0.7823387384414673, "learning_rate": 9.984902139515282e-05, "loss": 1.0116644859313966, "memory(GiB)": 89.65, "step": 11330, "token_acc": 0.7312647265828208, "train_speed(iter/s)": 1.0314 }, { "epoch": 0.14707933655346467, "grad_norm": 1.0118294954299927, "learning_rate": 9.984860459555243e-05, "loss": 1.049652671813965, "memory(GiB)": 89.65, "step": 11335, "token_acc": 0.7348653151902249, "train_speed(iter/s)": 1.028196 }, { "epoch": 0.14714421495512037, "grad_norm": 0.7732745409011841, "learning_rate": 9.984818722229749e-05, "loss": 0.9876100540161132, "memory(GiB)": 89.65, "step": 11340, "token_acc": 0.7701224608452342, "train_speed(iter/s)": 1.02473 }, { "epoch": 0.14720909335677607, "grad_norm": 0.8971213102340698, "learning_rate": 9.984776927539284e-05, "loss": 0.9962034225463867, "memory(GiB)": 89.65, "step": 11345, "token_acc": 0.742780465678863, "train_speed(iter/s)": 1.021387 }, { "epoch": 0.14727397175843177, "grad_norm": 0.8258897662162781, "learning_rate": 9.984735075484326e-05, "loss": 0.9804220199584961, "memory(GiB)": 89.65, "step": 11350, "token_acc": 0.7397802523029114, "train_speed(iter/s)": 1.018159 }, { "epoch": 0.14733885016008746, "grad_norm": 0.7588297724723816, "learning_rate": 9.984693166065359e-05, "loss": 0.942162036895752, "memory(GiB)": 89.65, "step": 11355, "token_acc": 0.7344626235763427, "train_speed(iter/s)": 1.014856 }, { "epoch": 0.14740372856174316, "grad_norm": 0.8558369874954224, "learning_rate": 9.984651199282862e-05, "loss": 1.0122775077819823, "memory(GiB)": 89.65, "step": 11360, "token_acc": 0.7347581115212746, "train_speed(iter/s)": 1.011597 }, { "epoch": 0.14746860696339886, "grad_norm": 0.9913255572319031, "learning_rate": 9.98460917513732e-05, "loss": 1.01586856842041, "memory(GiB)": 89.65, "step": 11365, "token_acc": 0.7207599594488041, "train_speed(iter/s)": 1.008419 }, { "epoch": 0.14753348536505453, "grad_norm": 0.8753107190132141, "learning_rate": 9.984567093629218e-05, "loss": 1.004833984375, "memory(GiB)": 89.65, "step": 11370, "token_acc": 0.7306504631935452, "train_speed(iter/s)": 1.005184 }, { "epoch": 0.14759836376671023, "grad_norm": 0.9617569446563721, "learning_rate": 9.984524954759039e-05, "loss": 0.9873805999755859, "memory(GiB)": 89.65, "step": 11375, "token_acc": 0.7346322307808488, "train_speed(iter/s)": 1.001938 }, { "epoch": 0.14766324216836593, "grad_norm": 0.8542808294296265, "learning_rate": 9.984482758527265e-05, "loss": 1.0105413436889648, "memory(GiB)": 89.65, "step": 11380, "token_acc": 0.7172599576678853, "train_speed(iter/s)": 0.998587 }, { "epoch": 0.14772812057002163, "grad_norm": 0.8509424328804016, "learning_rate": 9.984440504934385e-05, "loss": 1.0020825386047363, "memory(GiB)": 89.65, "step": 11385, "token_acc": 0.7340930304915992, "train_speed(iter/s)": 0.995605 }, { "epoch": 0.14779299897167733, "grad_norm": 0.8719891905784607, "learning_rate": 9.984398193980886e-05, "loss": 1.0089462280273438, "memory(GiB)": 89.65, "step": 11390, "token_acc": 0.738434803451582, "train_speed(iter/s)": 0.992469 }, { "epoch": 0.14785787737333303, "grad_norm": 0.9029290676116943, "learning_rate": 9.984355825667251e-05, "loss": 0.9573274612426758, "memory(GiB)": 89.65, "step": 11395, "token_acc": 0.7524414421029444, "train_speed(iter/s)": 0.989293 }, { "epoch": 0.14792275577498873, "grad_norm": 0.8071057796478271, "learning_rate": 9.984313399993972e-05, "loss": 0.9573493003845215, "memory(GiB)": 89.65, "step": 11400, "token_acc": 0.7550128356344966, "train_speed(iter/s)": 0.986254 }, { "epoch": 0.14798763417664443, "grad_norm": 0.8420518040657043, "learning_rate": 9.984270916961534e-05, "loss": 0.9894875526428223, "memory(GiB)": 89.65, "step": 11405, "token_acc": 0.7376317420954742, "train_speed(iter/s)": 0.983279 }, { "epoch": 0.14805251257830013, "grad_norm": 0.8548696637153625, "learning_rate": 9.984228376570428e-05, "loss": 0.952580451965332, "memory(GiB)": 89.65, "step": 11410, "token_acc": 0.7407482060791744, "train_speed(iter/s)": 0.980164 }, { "epoch": 0.14811739097995583, "grad_norm": 0.8435042500495911, "learning_rate": 9.984185778821143e-05, "loss": 1.0082452774047852, "memory(GiB)": 89.65, "step": 11415, "token_acc": 0.7122242768458592, "train_speed(iter/s)": 0.977106 }, { "epoch": 0.14818226938161153, "grad_norm": 0.8095998764038086, "learning_rate": 9.984143123714167e-05, "loss": 1.0179845809936523, "memory(GiB)": 89.65, "step": 11420, "token_acc": 0.7260856914235834, "train_speed(iter/s)": 0.973947 }, { "epoch": 0.14824714778326722, "grad_norm": 0.811138927936554, "learning_rate": 9.984100411249994e-05, "loss": 1.0170206069946288, "memory(GiB)": 89.65, "step": 11425, "token_acc": 0.7354170452480465, "train_speed(iter/s)": 0.970953 }, { "epoch": 0.1483120261849229, "grad_norm": 0.7948095202445984, "learning_rate": 9.984057641429113e-05, "loss": 1.0389434814453125, "memory(GiB)": 89.65, "step": 11430, "token_acc": 0.7240778001341381, "train_speed(iter/s)": 0.96791 }, { "epoch": 0.1483769045865786, "grad_norm": 0.7992091178894043, "learning_rate": 9.984014814252019e-05, "loss": 0.972836685180664, "memory(GiB)": 89.65, "step": 11435, "token_acc": 0.7351773635237702, "train_speed(iter/s)": 0.965064 }, { "epoch": 0.1484417829882343, "grad_norm": 0.9255802631378174, "learning_rate": 9.983971929719203e-05, "loss": 1.0230863571166993, "memory(GiB)": 89.65, "step": 11440, "token_acc": 0.7370208947570824, "train_speed(iter/s)": 0.962076 }, { "epoch": 0.14850666138989, "grad_norm": 0.7852320671081543, "learning_rate": 9.983928987831158e-05, "loss": 0.9530531883239746, "memory(GiB)": 89.65, "step": 11445, "token_acc": 0.7657789613848203, "train_speed(iter/s)": 0.959 }, { "epoch": 0.1485715397915457, "grad_norm": 0.8679429888725281, "learning_rate": 9.98388598858838e-05, "loss": 0.9946646690368652, "memory(GiB)": 89.65, "step": 11450, "token_acc": 0.7480524017467249, "train_speed(iter/s)": 0.956241 }, { "epoch": 0.1486364181932014, "grad_norm": 0.9333191514015198, "learning_rate": 9.983842931991362e-05, "loss": 1.0557472229003906, "memory(GiB)": 89.65, "step": 11455, "token_acc": 0.7353252462753399, "train_speed(iter/s)": 0.953432 }, { "epoch": 0.1487012965948571, "grad_norm": 0.7802304625511169, "learning_rate": 9.9837998180406e-05, "loss": 1.0198684692382813, "memory(GiB)": 89.65, "step": 11460, "token_acc": 0.7306772585417258, "train_speed(iter/s)": 0.950675 }, { "epoch": 0.1487661749965128, "grad_norm": 0.8128629922866821, "learning_rate": 9.983756646736591e-05, "loss": 0.9459741592407227, "memory(GiB)": 89.65, "step": 11465, "token_acc": 0.7553205639797819, "train_speed(iter/s)": 0.947736 }, { "epoch": 0.1488310533981685, "grad_norm": 0.8553788065910339, "learning_rate": 9.983713418079832e-05, "loss": 0.9990436553955078, "memory(GiB)": 89.65, "step": 11470, "token_acc": 0.7377898527832185, "train_speed(iter/s)": 0.94473 }, { "epoch": 0.1488959317998242, "grad_norm": 0.8890400528907776, "learning_rate": 9.983670132070818e-05, "loss": 1.012380313873291, "memory(GiB)": 89.65, "step": 11475, "token_acc": 0.7418940367319175, "train_speed(iter/s)": 0.941976 }, { "epoch": 0.1489608102014799, "grad_norm": 0.7536320686340332, "learning_rate": 9.98362678871005e-05, "loss": 1.0359912872314454, "memory(GiB)": 89.65, "step": 11480, "token_acc": 0.7507697630167791, "train_speed(iter/s)": 0.939308 }, { "epoch": 0.1490256886031356, "grad_norm": 0.8425878882408142, "learning_rate": 9.983583387998025e-05, "loss": 1.0145814895629883, "memory(GiB)": 89.65, "step": 11485, "token_acc": 0.7540558282016817, "train_speed(iter/s)": 0.936467 }, { "epoch": 0.14909056700479126, "grad_norm": 0.7873908877372742, "learning_rate": 9.983539929935243e-05, "loss": 0.9419000625610352, "memory(GiB)": 89.65, "step": 11490, "token_acc": 0.7766561273856234, "train_speed(iter/s)": 0.93374 }, { "epoch": 0.14915544540644696, "grad_norm": 0.842592716217041, "learning_rate": 9.983496414522207e-05, "loss": 0.9743213653564453, "memory(GiB)": 89.65, "step": 11495, "token_acc": 0.7505557711987014, "train_speed(iter/s)": 0.931148 }, { "epoch": 0.14922032380810266, "grad_norm": 0.864989697933197, "learning_rate": 9.983452841759413e-05, "loss": 1.0022513389587402, "memory(GiB)": 89.65, "step": 11500, "token_acc": 0.73815170070435, "train_speed(iter/s)": 0.928486 }, { "epoch": 0.14928520220975836, "grad_norm": 0.8516853451728821, "learning_rate": 9.983409211647364e-05, "loss": 0.9713436126708984, "memory(GiB)": 89.65, "step": 11505, "token_acc": 0.7616620678461289, "train_speed(iter/s)": 0.925721 }, { "epoch": 0.14935008061141405, "grad_norm": 0.9449465274810791, "learning_rate": 9.983365524186563e-05, "loss": 1.0304130554199218, "memory(GiB)": 89.65, "step": 11510, "token_acc": 0.74039709317415, "train_speed(iter/s)": 0.92307 }, { "epoch": 0.14941495901306975, "grad_norm": 0.8331295251846313, "learning_rate": 9.983321779377511e-05, "loss": 1.0388910293579101, "memory(GiB)": 89.65, "step": 11515, "token_acc": 0.7249949889757467, "train_speed(iter/s)": 0.920503 }, { "epoch": 0.14947983741472545, "grad_norm": 0.8884211182594299, "learning_rate": 9.983277977220715e-05, "loss": 0.9721891403198242, "memory(GiB)": 89.65, "step": 11520, "token_acc": 0.7448152134912092, "train_speed(iter/s)": 0.917893 }, { "epoch": 0.14954471581638115, "grad_norm": 0.8591482639312744, "learning_rate": 9.983234117716675e-05, "loss": 0.9985775947570801, "memory(GiB)": 89.65, "step": 11525, "token_acc": 0.7284931051669713, "train_speed(iter/s)": 0.915348 }, { "epoch": 0.14960959421803685, "grad_norm": 0.9437199234962463, "learning_rate": 9.983190200865897e-05, "loss": 1.0418109893798828, "memory(GiB)": 89.65, "step": 11530, "token_acc": 0.7316905952572995, "train_speed(iter/s)": 0.912875 }, { "epoch": 0.14967447261969255, "grad_norm": 0.698279082775116, "learning_rate": 9.983146226668889e-05, "loss": 0.9843629837036133, "memory(GiB)": 89.65, "step": 11535, "token_acc": 0.7338801002024639, "train_speed(iter/s)": 0.910171 }, { "epoch": 0.14973935102134825, "grad_norm": 0.8693399429321289, "learning_rate": 9.983102195126153e-05, "loss": 1.0418472290039062, "memory(GiB)": 89.65, "step": 11540, "token_acc": 0.7262383592419466, "train_speed(iter/s)": 0.907582 }, { "epoch": 0.14980422942300395, "grad_norm": 0.8935892581939697, "learning_rate": 9.983058106238199e-05, "loss": 1.0406333923339843, "memory(GiB)": 89.65, "step": 11545, "token_acc": 0.7249090608465608, "train_speed(iter/s)": 0.905141 }, { "epoch": 0.14986910782465962, "grad_norm": 0.8744311928749084, "learning_rate": 9.983013960005532e-05, "loss": 0.9648751258850098, "memory(GiB)": 89.65, "step": 11550, "token_acc": 0.7375706957779824, "train_speed(iter/s)": 0.902683 }, { "epoch": 0.14993398622631532, "grad_norm": 0.7825308442115784, "learning_rate": 9.982969756428662e-05, "loss": 0.9925728797912597, "memory(GiB)": 89.65, "step": 11555, "token_acc": 0.736940893154372, "train_speed(iter/s)": 0.900143 }, { "epoch": 0.14999886462797102, "grad_norm": 0.8374772667884827, "learning_rate": 9.982925495508096e-05, "loss": 0.9726966857910156, "memory(GiB)": 89.65, "step": 11560, "token_acc": 0.7387676971973418, "train_speed(iter/s)": 0.89756 }, { "epoch": 0.15006374302962672, "grad_norm": 0.8958409428596497, "learning_rate": 9.982881177244345e-05, "loss": 0.9562368392944336, "memory(GiB)": 89.65, "step": 11565, "token_acc": 0.7248215701823949, "train_speed(iter/s)": 0.895238 }, { "epoch": 0.15012862143128242, "grad_norm": 0.8702595829963684, "learning_rate": 9.982836801637916e-05, "loss": 0.9735102653503418, "memory(GiB)": 89.65, "step": 11570, "token_acc": 0.731433673667456, "train_speed(iter/s)": 0.892661 }, { "epoch": 0.15019349983293812, "grad_norm": 0.8246186375617981, "learning_rate": 9.982792368689324e-05, "loss": 1.0327926635742188, "memory(GiB)": 89.65, "step": 11575, "token_acc": 0.7479435957696827, "train_speed(iter/s)": 0.890146 }, { "epoch": 0.15025837823459381, "grad_norm": 1.0249217748641968, "learning_rate": 9.982747878399077e-05, "loss": 0.9798608779907226, "memory(GiB)": 89.65, "step": 11580, "token_acc": 0.7474832859448244, "train_speed(iter/s)": 0.8879 }, { "epoch": 0.1503232566362495, "grad_norm": 0.8138152360916138, "learning_rate": 9.98270333076769e-05, "loss": 1.0438525199890136, "memory(GiB)": 89.65, "step": 11585, "token_acc": 0.7255369159931738, "train_speed(iter/s)": 0.885574 }, { "epoch": 0.1503881350379052, "grad_norm": 0.801413893699646, "learning_rate": 9.98265872579567e-05, "loss": 1.0400896072387695, "memory(GiB)": 89.65, "step": 11590, "token_acc": 0.7330848300011296, "train_speed(iter/s)": 0.883173 }, { "epoch": 0.1504530134395609, "grad_norm": 0.9415481090545654, "learning_rate": 9.982614063483538e-05, "loss": 1.0136597633361817, "memory(GiB)": 89.65, "step": 11595, "token_acc": 0.7047048509643483, "train_speed(iter/s)": 0.880889 }, { "epoch": 0.1505178918412166, "grad_norm": 0.8599269986152649, "learning_rate": 9.982569343831802e-05, "loss": 0.9593742370605469, "memory(GiB)": 89.65, "step": 11600, "token_acc": 0.7528339559448667, "train_speed(iter/s)": 0.878476 }, { "epoch": 0.1505827702428723, "grad_norm": 0.7338165044784546, "learning_rate": 9.982524566840978e-05, "loss": 0.9931305885314942, "memory(GiB)": 89.65, "step": 11605, "token_acc": 0.7393049661658447, "train_speed(iter/s)": 0.876028 }, { "epoch": 0.15064764864452798, "grad_norm": 0.8761139512062073, "learning_rate": 9.982479732511584e-05, "loss": 0.9958646774291993, "memory(GiB)": 89.65, "step": 11610, "token_acc": 0.7401095055781977, "train_speed(iter/s)": 0.873635 }, { "epoch": 0.15071252704618368, "grad_norm": 0.8065568804740906, "learning_rate": 9.982434840844132e-05, "loss": 1.034571075439453, "memory(GiB)": 89.65, "step": 11615, "token_acc": 0.7452290076335878, "train_speed(iter/s)": 0.871384 }, { "epoch": 0.15077740544783938, "grad_norm": 0.816571056842804, "learning_rate": 9.982389891839142e-05, "loss": 0.952485179901123, "memory(GiB)": 89.65, "step": 11620, "token_acc": 0.7413765076888065, "train_speed(iter/s)": 0.869023 }, { "epoch": 0.15084228384949508, "grad_norm": 0.8653086423873901, "learning_rate": 9.982344885497131e-05, "loss": 1.0195497512817382, "memory(GiB)": 89.65, "step": 11625, "token_acc": 0.7489132009853644, "train_speed(iter/s)": 0.866702 }, { "epoch": 0.15090716225115078, "grad_norm": 0.9207912683486938, "learning_rate": 9.982299821818612e-05, "loss": 0.9964292526245118, "memory(GiB)": 89.65, "step": 11630, "token_acc": 0.7457965232259903, "train_speed(iter/s)": 0.864378 }, { "epoch": 0.15097204065280648, "grad_norm": 0.8855961561203003, "learning_rate": 9.98225470080411e-05, "loss": 0.9660188674926757, "memory(GiB)": 89.65, "step": 11635, "token_acc": 0.741400172860847, "train_speed(iter/s)": 0.862189 }, { "epoch": 0.15103691905446218, "grad_norm": 0.8676099181175232, "learning_rate": 9.98220952245414e-05, "loss": 1.01988525390625, "memory(GiB)": 89.65, "step": 11640, "token_acc": 0.757940079722258, "train_speed(iter/s)": 0.859822 }, { "epoch": 0.15110179745611788, "grad_norm": 0.8177076578140259, "learning_rate": 9.982164286769224e-05, "loss": 0.9801776885986329, "memory(GiB)": 89.65, "step": 11645, "token_acc": 0.7355141613863654, "train_speed(iter/s)": 0.857613 }, { "epoch": 0.15116667585777357, "grad_norm": 0.8037128448486328, "learning_rate": 9.982118993749883e-05, "loss": 0.9640499114990234, "memory(GiB)": 89.65, "step": 11650, "token_acc": 0.736090142504695, "train_speed(iter/s)": 0.85532 }, { "epoch": 0.15123155425942927, "grad_norm": 0.8214165568351746, "learning_rate": 9.982073643396637e-05, "loss": 0.9960202217102051, "memory(GiB)": 89.65, "step": 11655, "token_acc": 0.7614904895541004, "train_speed(iter/s)": 0.853126 }, { "epoch": 0.15129643266108497, "grad_norm": 0.8273711800575256, "learning_rate": 9.982028235710007e-05, "loss": 1.0139478683471679, "memory(GiB)": 89.65, "step": 11660, "token_acc": 0.7395633290985907, "train_speed(iter/s)": 0.85093 }, { "epoch": 0.15136131106274067, "grad_norm": 0.8915349245071411, "learning_rate": 9.981982770690517e-05, "loss": 0.9480093955993653, "memory(GiB)": 89.65, "step": 11665, "token_acc": 0.7479529756186817, "train_speed(iter/s)": 0.848749 }, { "epoch": 0.15142618946439634, "grad_norm": 0.9214180707931519, "learning_rate": 9.981937248338691e-05, "loss": 1.0277477264404298, "memory(GiB)": 89.65, "step": 11670, "token_acc": 0.7297875426002466, "train_speed(iter/s)": 0.846732 }, { "epoch": 0.15149106786605204, "grad_norm": 0.8246386647224426, "learning_rate": 9.98189166865505e-05, "loss": 1.0075170516967773, "memory(GiB)": 89.65, "step": 11675, "token_acc": 0.7510616488504905, "train_speed(iter/s)": 0.844654 }, { "epoch": 0.15155594626770774, "grad_norm": 0.9690290093421936, "learning_rate": 9.981846031640123e-05, "loss": 1.0064523696899415, "memory(GiB)": 89.65, "step": 11680, "token_acc": 0.7246730852133927, "train_speed(iter/s)": 0.842499 }, { "epoch": 0.15162082466936344, "grad_norm": 0.9173450469970703, "learning_rate": 9.98180033729443e-05, "loss": 1.029994583129883, "memory(GiB)": 89.65, "step": 11685, "token_acc": 0.7305945792535831, "train_speed(iter/s)": 0.840336 }, { "epoch": 0.15168570307101914, "grad_norm": 0.8684065341949463, "learning_rate": 9.9817545856185e-05, "loss": 0.9974212646484375, "memory(GiB)": 89.65, "step": 11690, "token_acc": 0.7461108723678556, "train_speed(iter/s)": 0.838332 }, { "epoch": 0.15175058147267484, "grad_norm": 0.8222112059593201, "learning_rate": 9.981708776612859e-05, "loss": 0.9826931953430176, "memory(GiB)": 89.65, "step": 11695, "token_acc": 0.7508097526250223, "train_speed(iter/s)": 0.836139 }, { "epoch": 0.15181545987433054, "grad_norm": 0.8445353507995605, "learning_rate": 9.981662910278035e-05, "loss": 0.9727514266967774, "memory(GiB)": 89.65, "step": 11700, "token_acc": 0.7471706326175276, "train_speed(iter/s)": 0.833931 }, { "epoch": 0.15188033827598624, "grad_norm": 0.8245111703872681, "learning_rate": 9.981616986614553e-05, "loss": 1.0137698173522949, "memory(GiB)": 89.65, "step": 11705, "token_acc": 0.7502847380410023, "train_speed(iter/s)": 0.831837 }, { "epoch": 0.15194521667764194, "grad_norm": 0.8805167078971863, "learning_rate": 9.981571005622945e-05, "loss": 0.9826539993286133, "memory(GiB)": 89.65, "step": 11710, "token_acc": 0.7595706414684975, "train_speed(iter/s)": 0.829598 }, { "epoch": 0.15201009507929764, "grad_norm": 0.8710799813270569, "learning_rate": 9.981524967303736e-05, "loss": 1.0116549491882325, "memory(GiB)": 89.65, "step": 11715, "token_acc": 0.7252573238321457, "train_speed(iter/s)": 0.827512 }, { "epoch": 0.15207497348095333, "grad_norm": 0.8875444531440735, "learning_rate": 9.981478871657461e-05, "loss": 1.0096197128295898, "memory(GiB)": 89.65, "step": 11720, "token_acc": 0.7415124204541784, "train_speed(iter/s)": 0.825487 }, { "epoch": 0.152139851882609, "grad_norm": 0.7685894966125488, "learning_rate": 9.981432718684647e-05, "loss": 0.9799994468688965, "memory(GiB)": 89.65, "step": 11725, "token_acc": 0.7205819687100505, "train_speed(iter/s)": 0.823371 }, { "epoch": 0.1522047302842647, "grad_norm": 0.7621815204620361, "learning_rate": 9.981386508385825e-05, "loss": 1.0792470932006837, "memory(GiB)": 89.65, "step": 11730, "token_acc": 0.7348694811472768, "train_speed(iter/s)": 0.821435 }, { "epoch": 0.1522696086859204, "grad_norm": 0.8132389187812805, "learning_rate": 9.98134024076153e-05, "loss": 0.9757878303527832, "memory(GiB)": 89.65, "step": 11735, "token_acc": 0.735532778133675, "train_speed(iter/s)": 0.819436 }, { "epoch": 0.1523344870875761, "grad_norm": 0.7758135795593262, "learning_rate": 9.981293915812289e-05, "loss": 0.9427996635437011, "memory(GiB)": 89.65, "step": 11740, "token_acc": 0.7455890722822994, "train_speed(iter/s)": 0.81732 }, { "epoch": 0.1523993654892318, "grad_norm": 0.8842960596084595, "learning_rate": 9.981247533538641e-05, "loss": 1.0566612243652345, "memory(GiB)": 89.65, "step": 11745, "token_acc": 0.7388692468946528, "train_speed(iter/s)": 0.815401 }, { "epoch": 0.1524642438908875, "grad_norm": 0.9601258635520935, "learning_rate": 9.981201093941115e-05, "loss": 1.030613899230957, "memory(GiB)": 89.65, "step": 11750, "token_acc": 0.7052435233160622, "train_speed(iter/s)": 0.813426 }, { "epoch": 0.1525291222925432, "grad_norm": 0.8319382667541504, "learning_rate": 9.981154597020248e-05, "loss": 0.9890401840209961, "memory(GiB)": 89.65, "step": 11755, "token_acc": 0.7498359580052494, "train_speed(iter/s)": 0.811427 }, { "epoch": 0.1525940006941989, "grad_norm": 0.9192413687705994, "learning_rate": 9.981108042776575e-05, "loss": 1.0115232467651367, "memory(GiB)": 89.65, "step": 11760, "token_acc": 0.7343154868846569, "train_speed(iter/s)": 0.809494 }, { "epoch": 0.1526588790958546, "grad_norm": 0.8236131072044373, "learning_rate": 9.981061431210629e-05, "loss": 0.9456027030944825, "memory(GiB)": 89.65, "step": 11765, "token_acc": 0.7550618108920816, "train_speed(iter/s)": 0.80753 }, { "epoch": 0.1527237574975103, "grad_norm": 0.8751246333122253, "learning_rate": 9.981014762322953e-05, "loss": 1.0027145385742187, "memory(GiB)": 89.65, "step": 11770, "token_acc": 0.7403470442418386, "train_speed(iter/s)": 0.805663 }, { "epoch": 0.152788635899166, "grad_norm": 0.9033451080322266, "learning_rate": 9.980968036114075e-05, "loss": 0.9990098953247071, "memory(GiB)": 89.65, "step": 11775, "token_acc": 0.7323640052411344, "train_speed(iter/s)": 0.803787 }, { "epoch": 0.1528535143008217, "grad_norm": 0.797736406326294, "learning_rate": 9.980921252584541e-05, "loss": 0.9493623733520508, "memory(GiB)": 89.65, "step": 11780, "token_acc": 0.7416699333594668, "train_speed(iter/s)": 0.801882 }, { "epoch": 0.15291839270247737, "grad_norm": 0.8449960350990295, "learning_rate": 9.980874411734884e-05, "loss": 0.951866340637207, "memory(GiB)": 89.65, "step": 11785, "token_acc": 0.7470979468748709, "train_speed(iter/s)": 0.799938 }, { "epoch": 0.15298327110413307, "grad_norm": 0.8947533965110779, "learning_rate": 9.980827513565645e-05, "loss": 1.0260927200317382, "memory(GiB)": 89.65, "step": 11790, "token_acc": 0.7173045050474612, "train_speed(iter/s)": 0.798022 }, { "epoch": 0.15304814950578877, "grad_norm": 0.8546721339225769, "learning_rate": 9.980780558077363e-05, "loss": 1.0037089347839356, "memory(GiB)": 89.65, "step": 11795, "token_acc": 0.7512010113780025, "train_speed(iter/s)": 0.796088 }, { "epoch": 0.15311302790744447, "grad_norm": 0.868499219417572, "learning_rate": 9.98073354527058e-05, "loss": 0.942524528503418, "memory(GiB)": 89.65, "step": 11800, "token_acc": 0.7710223093895563, "train_speed(iter/s)": 0.794222 }, { "epoch": 0.15317790630910016, "grad_norm": 0.8317728638648987, "learning_rate": 9.980686475145835e-05, "loss": 1.000296401977539, "memory(GiB)": 89.65, "step": 11805, "token_acc": 0.7363340399757723, "train_speed(iter/s)": 0.792412 }, { "epoch": 0.15324278471075586, "grad_norm": 0.855655312538147, "learning_rate": 9.980639347703672e-05, "loss": 0.9931121826171875, "memory(GiB)": 89.65, "step": 11810, "token_acc": 0.7467733021229559, "train_speed(iter/s)": 0.790636 }, { "epoch": 0.15330766311241156, "grad_norm": 0.8695439100265503, "learning_rate": 9.980592162944631e-05, "loss": 1.0339475631713868, "memory(GiB)": 89.65, "step": 11815, "token_acc": 0.7303775268453279, "train_speed(iter/s)": 0.788814 }, { "epoch": 0.15337254151406726, "grad_norm": 1.0520262718200684, "learning_rate": 9.980544920869255e-05, "loss": 0.97854642868042, "memory(GiB)": 89.65, "step": 11820, "token_acc": 0.7491001147106523, "train_speed(iter/s)": 0.78696 }, { "epoch": 0.15343741991572296, "grad_norm": 0.8453856706619263, "learning_rate": 9.98049762147809e-05, "loss": 1.0050260543823242, "memory(GiB)": 89.65, "step": 11825, "token_acc": 0.7404766897322989, "train_speed(iter/s)": 0.785083 }, { "epoch": 0.15350229831737866, "grad_norm": 0.835171639919281, "learning_rate": 9.980450264771679e-05, "loss": 0.9852148056030273, "memory(GiB)": 89.65, "step": 11830, "token_acc": 0.7492959919762373, "train_speed(iter/s)": 0.783318 }, { "epoch": 0.15356717671903436, "grad_norm": 0.8474327921867371, "learning_rate": 9.980402850750565e-05, "loss": 0.9774065017700195, "memory(GiB)": 89.65, "step": 11835, "token_acc": 0.7397181146025879, "train_speed(iter/s)": 0.781554 }, { "epoch": 0.15363205512069006, "grad_norm": 1.0029469728469849, "learning_rate": 9.980355379415298e-05, "loss": 0.9887434005737304, "memory(GiB)": 89.65, "step": 11840, "token_acc": 0.7260924750679963, "train_speed(iter/s)": 0.779766 }, { "epoch": 0.15369693352234573, "grad_norm": 0.8315956592559814, "learning_rate": 9.98030785076642e-05, "loss": 0.9914806365966797, "memory(GiB)": 89.65, "step": 11845, "token_acc": 0.7442048790363397, "train_speed(iter/s)": 0.777977 }, { "epoch": 0.15376181192400143, "grad_norm": 0.8284024596214294, "learning_rate": 9.980260264804481e-05, "loss": 0.979710578918457, "memory(GiB)": 89.65, "step": 11850, "token_acc": 0.7485918344807274, "train_speed(iter/s)": 0.776191 }, { "epoch": 0.15382669032565713, "grad_norm": 0.8075039386749268, "learning_rate": 9.980212621530027e-05, "loss": 0.9772393226623535, "memory(GiB)": 89.65, "step": 11855, "token_acc": 0.7466502981410031, "train_speed(iter/s)": 0.774442 }, { "epoch": 0.15389156872731283, "grad_norm": 1.048107624053955, "learning_rate": 9.980164920943606e-05, "loss": 1.0083604812622071, "memory(GiB)": 89.65, "step": 11860, "token_acc": 0.7200420357303708, "train_speed(iter/s)": 0.772603 }, { "epoch": 0.15395644712896853, "grad_norm": 0.8190948963165283, "learning_rate": 9.980117163045768e-05, "loss": 1.0035894393920899, "memory(GiB)": 89.65, "step": 11865, "token_acc": 0.7346771358256182, "train_speed(iter/s)": 0.770932 }, { "epoch": 0.15402132553062423, "grad_norm": 0.8086226582527161, "learning_rate": 9.980069347837062e-05, "loss": 0.9748990058898925, "memory(GiB)": 89.65, "step": 11870, "token_acc": 0.7628712871287129, "train_speed(iter/s)": 0.769239 }, { "epoch": 0.15408620393227992, "grad_norm": 0.7796744704246521, "learning_rate": 9.98002147531804e-05, "loss": 1.0204444885253907, "memory(GiB)": 89.65, "step": 11875, "token_acc": 0.7262302712309059, "train_speed(iter/s)": 0.767514 }, { "epoch": 0.15415108233393562, "grad_norm": 0.9484221935272217, "learning_rate": 9.97997354548925e-05, "loss": 1.0068728446960449, "memory(GiB)": 89.65, "step": 11880, "token_acc": 0.7048076585896133, "train_speed(iter/s)": 0.765759 }, { "epoch": 0.15421596073559132, "grad_norm": 0.7806928753852844, "learning_rate": 9.979925558351245e-05, "loss": 0.9795634269714355, "memory(GiB)": 89.65, "step": 11885, "token_acc": 0.7305068646921682, "train_speed(iter/s)": 0.764057 }, { "epoch": 0.15428083913724702, "grad_norm": 0.8836185932159424, "learning_rate": 9.979877513904577e-05, "loss": 1.0256362915039063, "memory(GiB)": 89.65, "step": 11890, "token_acc": 0.7317759853903515, "train_speed(iter/s)": 0.762328 }, { "epoch": 0.15434571753890272, "grad_norm": 0.8095592856407166, "learning_rate": 9.9798294121498e-05, "loss": 1.0057620048522948, "memory(GiB)": 89.65, "step": 11895, "token_acc": 0.7216947134217041, "train_speed(iter/s)": 0.76063 }, { "epoch": 0.15441059594055842, "grad_norm": 0.7873480319976807, "learning_rate": 9.979781253087466e-05, "loss": 0.9637145042419434, "memory(GiB)": 89.65, "step": 11900, "token_acc": 0.753033791772979, "train_speed(iter/s)": 0.758842 }, { "epoch": 0.1544754743422141, "grad_norm": 0.7884849905967712, "learning_rate": 9.979733036718129e-05, "loss": 0.9351298332214355, "memory(GiB)": 89.65, "step": 11905, "token_acc": 0.7636037413965527, "train_speed(iter/s)": 0.757042 }, { "epoch": 0.1545403527438698, "grad_norm": 0.9650403261184692, "learning_rate": 9.979684763042346e-05, "loss": 0.9296902656555176, "memory(GiB)": 89.65, "step": 11910, "token_acc": 0.7575963718820862, "train_speed(iter/s)": 0.755369 }, { "epoch": 0.1546052311455255, "grad_norm": 0.7873780131340027, "learning_rate": 9.97963643206067e-05, "loss": 0.9949085235595703, "memory(GiB)": 89.65, "step": 11915, "token_acc": 0.7391599375720974, "train_speed(iter/s)": 0.753719 }, { "epoch": 0.1546701095471812, "grad_norm": 0.846389651298523, "learning_rate": 9.97958804377366e-05, "loss": 0.9584752082824707, "memory(GiB)": 89.65, "step": 11920, "token_acc": 0.7644558474381558, "train_speed(iter/s)": 0.752025 }, { "epoch": 0.1547349879488369, "grad_norm": 0.8762677907943726, "learning_rate": 9.979539598181871e-05, "loss": 0.9948715209960938, "memory(GiB)": 89.65, "step": 11925, "token_acc": 0.7272765055483978, "train_speed(iter/s)": 0.750422 }, { "epoch": 0.1547998663504926, "grad_norm": 0.7947695851325989, "learning_rate": 9.97949109528586e-05, "loss": 0.9744772911071777, "memory(GiB)": 89.65, "step": 11930, "token_acc": 0.7515548691405435, "train_speed(iter/s)": 0.748812 }, { "epoch": 0.1548647447521483, "grad_norm": 0.8965061902999878, "learning_rate": 9.979442535086186e-05, "loss": 1.0522842407226562, "memory(GiB)": 89.65, "step": 11935, "token_acc": 0.7111507185028406, "train_speed(iter/s)": 0.747163 }, { "epoch": 0.15492962315380399, "grad_norm": 0.9135346412658691, "learning_rate": 9.979393917583408e-05, "loss": 0.9989492416381835, "memory(GiB)": 89.65, "step": 11940, "token_acc": 0.7399597585513078, "train_speed(iter/s)": 0.745611 }, { "epoch": 0.15499450155545968, "grad_norm": 0.829170286655426, "learning_rate": 9.979345242778087e-05, "loss": 1.01464786529541, "memory(GiB)": 89.65, "step": 11945, "token_acc": 0.7245076842166773, "train_speed(iter/s)": 0.743978 }, { "epoch": 0.15505937995711538, "grad_norm": 1.0232104063034058, "learning_rate": 9.979296510670779e-05, "loss": 1.0202523231506349, "memory(GiB)": 89.65, "step": 11950, "token_acc": 0.7288555227218273, "train_speed(iter/s)": 0.742463 }, { "epoch": 0.15512425835877108, "grad_norm": 0.8123746514320374, "learning_rate": 9.979247721262048e-05, "loss": 1.0138931274414062, "memory(GiB)": 89.65, "step": 11955, "token_acc": 0.7269519188354654, "train_speed(iter/s)": 0.740899 }, { "epoch": 0.15518913676042678, "grad_norm": 0.836601197719574, "learning_rate": 9.979198874552454e-05, "loss": 1.013938045501709, "memory(GiB)": 89.65, "step": 11960, "token_acc": 0.7589563862928349, "train_speed(iter/s)": 0.739309 }, { "epoch": 0.15525401516208245, "grad_norm": 0.7755442261695862, "learning_rate": 9.97914997054256e-05, "loss": 0.9837608337402344, "memory(GiB)": 89.65, "step": 11965, "token_acc": 0.7405995127634785, "train_speed(iter/s)": 0.737693 }, { "epoch": 0.15531889356373815, "grad_norm": 0.8208988308906555, "learning_rate": 9.97910100923293e-05, "loss": 1.0138186454772948, "memory(GiB)": 89.65, "step": 11970, "token_acc": 0.7267738803765142, "train_speed(iter/s)": 0.736222 }, { "epoch": 0.15538377196539385, "grad_norm": 0.7927229404449463, "learning_rate": 9.979051990624125e-05, "loss": 0.948983097076416, "memory(GiB)": 89.65, "step": 11975, "token_acc": 0.7314828920508211, "train_speed(iter/s)": 0.734594 }, { "epoch": 0.15544865036704955, "grad_norm": 0.8452488780021667, "learning_rate": 9.97900291471671e-05, "loss": 0.989352798461914, "memory(GiB)": 89.65, "step": 11980, "token_acc": 0.7269230769230769, "train_speed(iter/s)": 0.733056 }, { "epoch": 0.15551352876870525, "grad_norm": 0.7811840176582336, "learning_rate": 9.97895378151125e-05, "loss": 0.9841194152832031, "memory(GiB)": 89.65, "step": 11985, "token_acc": 0.7602270959713797, "train_speed(iter/s)": 0.73146 }, { "epoch": 0.15557840717036095, "grad_norm": 0.8941458463668823, "learning_rate": 9.97890459100831e-05, "loss": 0.9942577362060547, "memory(GiB)": 89.65, "step": 11990, "token_acc": 0.7439540724793685, "train_speed(iter/s)": 0.729957 }, { "epoch": 0.15564328557201665, "grad_norm": 0.8702526688575745, "learning_rate": 9.978855343208457e-05, "loss": 1.018487548828125, "memory(GiB)": 89.65, "step": 11995, "token_acc": 0.7442274269635024, "train_speed(iter/s)": 0.728481 }, { "epoch": 0.15570816397367235, "grad_norm": 0.893623411655426, "learning_rate": 9.978806038112257e-05, "loss": 0.9752307891845703, "memory(GiB)": 89.65, "step": 12000, "token_acc": 0.723810539790911, "train_speed(iter/s)": 0.726967 }, { "epoch": 0.15577304237532805, "grad_norm": 0.8318555951118469, "learning_rate": 9.978756675720277e-05, "loss": 1.0210128784179688, "memory(GiB)": 89.65, "step": 12005, "token_acc": 0.7302386437978212, "train_speed(iter/s)": 0.725437 }, { "epoch": 0.15583792077698375, "grad_norm": 0.8656463027000427, "learning_rate": 9.978707256033086e-05, "loss": 0.9969749450683594, "memory(GiB)": 89.65, "step": 12010, "token_acc": 0.7266663834487447, "train_speed(iter/s)": 0.7239 }, { "epoch": 0.15590279917863945, "grad_norm": 0.8347519040107727, "learning_rate": 9.978657779051252e-05, "loss": 0.9960886001586914, "memory(GiB)": 89.65, "step": 12015, "token_acc": 0.7351026229304892, "train_speed(iter/s)": 0.722426 }, { "epoch": 0.15596767758029514, "grad_norm": 0.8784888982772827, "learning_rate": 9.978608244775347e-05, "loss": 0.9552192687988281, "memory(GiB)": 89.65, "step": 12020, "token_acc": 0.7342813803564657, "train_speed(iter/s)": 0.720871 }, { "epoch": 0.15603255598195082, "grad_norm": 0.8283236026763916, "learning_rate": 9.978558653205937e-05, "loss": 0.9858705520629882, "memory(GiB)": 89.65, "step": 12025, "token_acc": 0.7512092006735696, "train_speed(iter/s)": 0.719443 }, { "epoch": 0.15609743438360651, "grad_norm": 0.918168842792511, "learning_rate": 9.978509004343593e-05, "loss": 1.0224075317382812, "memory(GiB)": 89.65, "step": 12030, "token_acc": 0.72028750759263, "train_speed(iter/s)": 0.717984 }, { "epoch": 0.15616231278526221, "grad_norm": 0.878878653049469, "learning_rate": 9.978459298188888e-05, "loss": 1.0057310104370116, "memory(GiB)": 89.65, "step": 12035, "token_acc": 0.7267102761576386, "train_speed(iter/s)": 0.716531 }, { "epoch": 0.1562271911869179, "grad_norm": 0.897991955280304, "learning_rate": 9.978409534742397e-05, "loss": 1.0540311813354493, "memory(GiB)": 89.65, "step": 12040, "token_acc": 0.7261900640470833, "train_speed(iter/s)": 0.715073 }, { "epoch": 0.1562920695885736, "grad_norm": 0.891696572303772, "learning_rate": 9.978359714004687e-05, "loss": 0.9561830520629883, "memory(GiB)": 89.65, "step": 12045, "token_acc": 0.7491285342776517, "train_speed(iter/s)": 0.713543 }, { "epoch": 0.1563569479902293, "grad_norm": 0.7827072143554688, "learning_rate": 9.978309835976334e-05, "loss": 0.9770351409912109, "memory(GiB)": 89.65, "step": 12050, "token_acc": 0.7404642285778439, "train_speed(iter/s)": 0.712101 }, { "epoch": 0.156421826391885, "grad_norm": 0.9553587436676025, "learning_rate": 9.978259900657912e-05, "loss": 1.0212980270385743, "memory(GiB)": 89.65, "step": 12055, "token_acc": 0.730448770829896, "train_speed(iter/s)": 0.710692 }, { "epoch": 0.1564867047935407, "grad_norm": 0.8637094497680664, "learning_rate": 9.978209908049996e-05, "loss": 0.9801783561706543, "memory(GiB)": 89.65, "step": 12060, "token_acc": 0.7487043247245329, "train_speed(iter/s)": 0.709206 }, { "epoch": 0.1565515831951964, "grad_norm": 0.8762169480323792, "learning_rate": 9.97815985815316e-05, "loss": 1.0146636962890625, "memory(GiB)": 89.65, "step": 12065, "token_acc": 0.7462370215641703, "train_speed(iter/s)": 0.707792 }, { "epoch": 0.1566164615968521, "grad_norm": 0.8953553438186646, "learning_rate": 9.978109750967981e-05, "loss": 0.9447284698486328, "memory(GiB)": 89.65, "step": 12070, "token_acc": 0.7481762581976273, "train_speed(iter/s)": 0.706406 }, { "epoch": 0.1566813399985078, "grad_norm": 0.8500825762748718, "learning_rate": 9.978059586495035e-05, "loss": 0.9691539764404297, "memory(GiB)": 89.65, "step": 12075, "token_acc": 0.7391107224974902, "train_speed(iter/s)": 0.705017 }, { "epoch": 0.1567462184001635, "grad_norm": 0.8012600541114807, "learning_rate": 9.978009364734901e-05, "loss": 0.9921768188476563, "memory(GiB)": 89.65, "step": 12080, "token_acc": 0.760318451451764, "train_speed(iter/s)": 0.703686 }, { "epoch": 0.15681109680181918, "grad_norm": 0.7659975290298462, "learning_rate": 9.977959085688155e-05, "loss": 0.9687977790832519, "memory(GiB)": 89.65, "step": 12085, "token_acc": 0.7573980096223555, "train_speed(iter/s)": 0.702364 }, { "epoch": 0.15687597520347488, "grad_norm": 0.9078981876373291, "learning_rate": 9.977908749355375e-05, "loss": 1.0274668693542481, "memory(GiB)": 89.65, "step": 12090, "token_acc": 0.7226200428919342, "train_speed(iter/s)": 0.701003 }, { "epoch": 0.15694085360513058, "grad_norm": 0.9163715839385986, "learning_rate": 9.977858355737142e-05, "loss": 0.9990825653076172, "memory(GiB)": 89.65, "step": 12095, "token_acc": 0.7267295231443759, "train_speed(iter/s)": 0.699625 }, { "epoch": 0.15700573200678627, "grad_norm": 0.818399965763092, "learning_rate": 9.977807904834036e-05, "loss": 1.0023566246032716, "memory(GiB)": 89.65, "step": 12100, "token_acc": 0.7428745308854854, "train_speed(iter/s)": 0.698208 }, { "epoch": 0.15707061040844197, "grad_norm": 0.8085047602653503, "learning_rate": 9.977757396646637e-05, "loss": 0.9970157623291016, "memory(GiB)": 89.65, "step": 12105, "token_acc": 0.7402664098860536, "train_speed(iter/s)": 0.696881 }, { "epoch": 0.15713548881009767, "grad_norm": 0.758371889591217, "learning_rate": 9.977706831175526e-05, "loss": 0.9826669692993164, "memory(GiB)": 89.65, "step": 12110, "token_acc": 0.7272316967020214, "train_speed(iter/s)": 0.695555 }, { "epoch": 0.15720036721175337, "grad_norm": 0.7939783334732056, "learning_rate": 9.977656208421285e-05, "loss": 0.9834707260131836, "memory(GiB)": 89.65, "step": 12115, "token_acc": 0.7139595808383233, "train_speed(iter/s)": 0.694169 }, { "epoch": 0.15726524561340907, "grad_norm": 0.8230713605880737, "learning_rate": 9.977605528384497e-05, "loss": 0.9740758895874023, "memory(GiB)": 89.65, "step": 12120, "token_acc": 0.7595779282175409, "train_speed(iter/s)": 0.692845 }, { "epoch": 0.15733012401506477, "grad_norm": 0.8876591920852661, "learning_rate": 9.977554791065745e-05, "loss": 1.0030418395996095, "memory(GiB)": 89.65, "step": 12125, "token_acc": 0.7333767480120648, "train_speed(iter/s)": 0.691486 }, { "epoch": 0.15739500241672047, "grad_norm": 0.9183725118637085, "learning_rate": 9.977503996465614e-05, "loss": 1.01635799407959, "memory(GiB)": 89.65, "step": 12130, "token_acc": 0.7406547985932345, "train_speed(iter/s)": 0.690134 }, { "epoch": 0.15745988081837617, "grad_norm": 0.8065356612205505, "learning_rate": 9.977453144584685e-05, "loss": 0.9483999252319336, "memory(GiB)": 89.65, "step": 12135, "token_acc": 0.7428707791433004, "train_speed(iter/s)": 0.68885 }, { "epoch": 0.15752475922003187, "grad_norm": 0.8357574343681335, "learning_rate": 9.977402235423546e-05, "loss": 0.9628875732421875, "memory(GiB)": 89.65, "step": 12140, "token_acc": 0.7353262593859061, "train_speed(iter/s)": 0.687542 }, { "epoch": 0.15758963762168754, "grad_norm": 0.8454754948616028, "learning_rate": 9.977351268982784e-05, "loss": 0.9937101364135742, "memory(GiB)": 89.65, "step": 12145, "token_acc": 0.7434680170200795, "train_speed(iter/s)": 0.686238 }, { "epoch": 0.15765451602334324, "grad_norm": 0.9043933749198914, "learning_rate": 9.977300245262982e-05, "loss": 1.0019752502441406, "memory(GiB)": 89.65, "step": 12150, "token_acc": 0.731218135086167, "train_speed(iter/s)": 0.685042 }, { "epoch": 0.15771939442499894, "grad_norm": 0.8683270812034607, "learning_rate": 9.977249164264731e-05, "loss": 0.9946222305297852, "memory(GiB)": 89.65, "step": 12155, "token_acc": 0.7369352522790262, "train_speed(iter/s)": 0.683674 }, { "epoch": 0.15778427282665464, "grad_norm": 0.9211680293083191, "learning_rate": 9.977198025988615e-05, "loss": 1.0005688667297363, "memory(GiB)": 89.65, "step": 12160, "token_acc": 0.7268345278112514, "train_speed(iter/s)": 0.682376 }, { "epoch": 0.15784915122831034, "grad_norm": 0.9463661909103394, "learning_rate": 9.977146830435226e-05, "loss": 1.0642868041992188, "memory(GiB)": 89.65, "step": 12165, "token_acc": 0.7310297639411564, "train_speed(iter/s)": 0.681095 }, { "epoch": 0.15791402962996604, "grad_norm": 0.8194654583930969, "learning_rate": 9.977095577605151e-05, "loss": 1.0051300048828125, "memory(GiB)": 89.65, "step": 12170, "token_acc": 0.7492871243693793, "train_speed(iter/s)": 0.679839 }, { "epoch": 0.15797890803162173, "grad_norm": 0.9691696166992188, "learning_rate": 9.97704426749898e-05, "loss": 1.0539506912231444, "memory(GiB)": 89.65, "step": 12175, "token_acc": 0.7184598378776713, "train_speed(iter/s)": 0.67857 }, { "epoch": 0.15804378643327743, "grad_norm": 0.7478305697441101, "learning_rate": 9.976992900117305e-05, "loss": 1.018069076538086, "memory(GiB)": 89.65, "step": 12180, "token_acc": 0.7537985061031153, "train_speed(iter/s)": 0.677289 }, { "epoch": 0.15810866483493313, "grad_norm": 0.8820475935935974, "learning_rate": 9.976941475460715e-05, "loss": 0.9990007400512695, "memory(GiB)": 89.65, "step": 12185, "token_acc": 0.7430401104379171, "train_speed(iter/s)": 0.676045 }, { "epoch": 0.15817354323658883, "grad_norm": 0.8073982000350952, "learning_rate": 9.976889993529803e-05, "loss": 0.9674321174621582, "memory(GiB)": 89.65, "step": 12190, "token_acc": 0.7516183169503716, "train_speed(iter/s)": 0.674827 }, { "epoch": 0.15823842163824453, "grad_norm": 0.804432213306427, "learning_rate": 9.976838454325161e-05, "loss": 0.9766761779785156, "memory(GiB)": 89.65, "step": 12195, "token_acc": 0.7516200350689944, "train_speed(iter/s)": 0.673643 }, { "epoch": 0.15830330003990023, "grad_norm": 0.9186269044876099, "learning_rate": 9.976786857847384e-05, "loss": 0.9919975280761719, "memory(GiB)": 89.65, "step": 12200, "token_acc": 0.7550077041602465, "train_speed(iter/s)": 0.672355 }, { "epoch": 0.1583681784415559, "grad_norm": 0.7610302567481995, "learning_rate": 9.976735204097063e-05, "loss": 1.0014554977416992, "memory(GiB)": 89.65, "step": 12205, "token_acc": 0.7650013020348946, "train_speed(iter/s)": 0.671112 }, { "epoch": 0.1584330568432116, "grad_norm": 0.912847638130188, "learning_rate": 9.976683493074793e-05, "loss": 1.0127182006835938, "memory(GiB)": 89.65, "step": 12210, "token_acc": 0.7493830454211966, "train_speed(iter/s)": 0.66984 }, { "epoch": 0.1584979352448673, "grad_norm": 0.8801739811897278, "learning_rate": 9.97663172478117e-05, "loss": 1.005162239074707, "memory(GiB)": 89.65, "step": 12215, "token_acc": 0.7383436321322526, "train_speed(iter/s)": 0.668598 }, { "epoch": 0.158562813646523, "grad_norm": 0.8733416795730591, "learning_rate": 9.97657989921679e-05, "loss": 1.017906093597412, "memory(GiB)": 89.65, "step": 12220, "token_acc": 0.7321359846055163, "train_speed(iter/s)": 0.667432 }, { "epoch": 0.1586276920481787, "grad_norm": 0.7794143557548523, "learning_rate": 9.976528016382248e-05, "loss": 0.9923826217651367, "memory(GiB)": 89.65, "step": 12225, "token_acc": 0.7449045826301441, "train_speed(iter/s)": 0.666214 }, { "epoch": 0.1586925704498344, "grad_norm": 0.861871600151062, "learning_rate": 9.976476076278143e-05, "loss": 1.005327033996582, "memory(GiB)": 89.65, "step": 12230, "token_acc": 0.742590027700831, "train_speed(iter/s)": 0.664913 }, { "epoch": 0.1587574488514901, "grad_norm": 0.9127099514007568, "learning_rate": 9.976424078905072e-05, "loss": 1.0374626159667968, "memory(GiB)": 89.65, "step": 12235, "token_acc": 0.7442108566049577, "train_speed(iter/s)": 0.663763 }, { "epoch": 0.1588223272531458, "grad_norm": 0.9121654629707336, "learning_rate": 9.976372024263631e-05, "loss": 0.9934666633605957, "memory(GiB)": 89.65, "step": 12240, "token_acc": 0.7359194063026566, "train_speed(iter/s)": 0.662532 }, { "epoch": 0.1588872056548015, "grad_norm": 0.9504805207252502, "learning_rate": 9.976319912354422e-05, "loss": 1.0059874534606934, "memory(GiB)": 89.65, "step": 12245, "token_acc": 0.7126647642342709, "train_speed(iter/s)": 0.66126 }, { "epoch": 0.1589520840564572, "grad_norm": 0.865321159362793, "learning_rate": 9.976267743178043e-05, "loss": 0.9978589057922364, "memory(GiB)": 89.65, "step": 12250, "token_acc": 0.7366982366982368, "train_speed(iter/s)": 0.660041 }, { "epoch": 0.1590169624581129, "grad_norm": 0.8383451104164124, "learning_rate": 9.976215516735096e-05, "loss": 1.022425365447998, "memory(GiB)": 89.65, "step": 12255, "token_acc": 0.7301577742984463, "train_speed(iter/s)": 0.658838 }, { "epoch": 0.1590818408597686, "grad_norm": 0.8721836805343628, "learning_rate": 9.976163233026181e-05, "loss": 1.0285841941833496, "memory(GiB)": 89.65, "step": 12260, "token_acc": 0.7253884252663003, "train_speed(iter/s)": 0.657618 }, { "epoch": 0.15914671926142426, "grad_norm": 0.8898820877075195, "learning_rate": 9.9761108920519e-05, "loss": 1.0094358444213867, "memory(GiB)": 89.65, "step": 12265, "token_acc": 0.7321178120617111, "train_speed(iter/s)": 0.656368 }, { "epoch": 0.15921159766307996, "grad_norm": 0.8814337253570557, "learning_rate": 9.976058493812854e-05, "loss": 1.030186367034912, "memory(GiB)": 89.65, "step": 12270, "token_acc": 0.7408204661174814, "train_speed(iter/s)": 0.65523 }, { "epoch": 0.15927647606473566, "grad_norm": 0.9298816919326782, "learning_rate": 9.976006038309648e-05, "loss": 0.9553842544555664, "memory(GiB)": 89.65, "step": 12275, "token_acc": 0.7365583401502194, "train_speed(iter/s)": 0.654012 }, { "epoch": 0.15934135446639136, "grad_norm": 0.9319794178009033, "learning_rate": 9.975953525542885e-05, "loss": 0.9611116409301758, "memory(GiB)": 89.65, "step": 12280, "token_acc": 0.7414915254237288, "train_speed(iter/s)": 0.652883 }, { "epoch": 0.15940623286804706, "grad_norm": 0.9038590788841248, "learning_rate": 9.975900955513167e-05, "loss": 1.0008631706237794, "memory(GiB)": 89.65, "step": 12285, "token_acc": 0.7473142635257571, "train_speed(iter/s)": 0.651739 }, { "epoch": 0.15947111126970276, "grad_norm": 0.8827272057533264, "learning_rate": 9.975848328221103e-05, "loss": 1.0604118347167968, "memory(GiB)": 89.65, "step": 12290, "token_acc": 0.7112329380174536, "train_speed(iter/s)": 0.650599 }, { "epoch": 0.15953598967135846, "grad_norm": 0.8409236073493958, "learning_rate": 9.975795643667296e-05, "loss": 0.9486454963684082, "memory(GiB)": 89.65, "step": 12295, "token_acc": 0.7549826831340261, "train_speed(iter/s)": 0.64944 }, { "epoch": 0.15960086807301416, "grad_norm": 1.036346673965454, "learning_rate": 9.975742901852353e-05, "loss": 0.9884014129638672, "memory(GiB)": 89.65, "step": 12300, "token_acc": 0.737280363889806, "train_speed(iter/s)": 0.6482 }, { "epoch": 0.15966574647466986, "grad_norm": 0.8050718903541565, "learning_rate": 9.975690102776882e-05, "loss": 1.0009752273559571, "memory(GiB)": 89.65, "step": 12305, "token_acc": 0.7412793619944376, "train_speed(iter/s)": 0.647119 }, { "epoch": 0.15973062487632556, "grad_norm": 0.8703398108482361, "learning_rate": 9.97563724644149e-05, "loss": 1.0133806228637696, "memory(GiB)": 89.65, "step": 12310, "token_acc": 0.7111517367458866, "train_speed(iter/s)": 0.645987 }, { "epoch": 0.15979550327798125, "grad_norm": 0.8655109405517578, "learning_rate": 9.975584332846783e-05, "loss": 0.9810939788818359, "memory(GiB)": 89.65, "step": 12315, "token_acc": 0.7510276456838881, "train_speed(iter/s)": 0.644917 }, { "epoch": 0.15986038167963695, "grad_norm": 0.7936140894889832, "learning_rate": 9.975531361993372e-05, "loss": 0.9501228332519531, "memory(GiB)": 89.65, "step": 12320, "token_acc": 0.7454102209113345, "train_speed(iter/s)": 0.643814 }, { "epoch": 0.15992526008129263, "grad_norm": 0.9384147524833679, "learning_rate": 9.975478333881866e-05, "loss": 1.004969596862793, "memory(GiB)": 89.65, "step": 12325, "token_acc": 0.7486373478682894, "train_speed(iter/s)": 0.642705 }, { "epoch": 0.15999013848294832, "grad_norm": 0.7653191685676575, "learning_rate": 9.975425248512877e-05, "loss": 0.9987712860107422, "memory(GiB)": 89.65, "step": 12330, "token_acc": 0.750348358663599, "train_speed(iter/s)": 0.641588 }, { "epoch": 0.16005501688460402, "grad_norm": 0.8718465566635132, "learning_rate": 9.975372105887013e-05, "loss": 0.9366028785705567, "memory(GiB)": 89.65, "step": 12335, "token_acc": 0.7536000593824228, "train_speed(iter/s)": 0.640487 }, { "epoch": 0.16011989528625972, "grad_norm": 0.8836268782615662, "learning_rate": 9.975318906004889e-05, "loss": 0.9768154144287109, "memory(GiB)": 89.65, "step": 12340, "token_acc": 0.7243575978813256, "train_speed(iter/s)": 0.639414 }, { "epoch": 0.16018477368791542, "grad_norm": 0.7163830399513245, "learning_rate": 9.975265648867114e-05, "loss": 0.9382209777832031, "memory(GiB)": 89.65, "step": 12345, "token_acc": 0.7543074826127983, "train_speed(iter/s)": 0.638319 }, { "epoch": 0.16024965208957112, "grad_norm": 0.8144587874412537, "learning_rate": 9.975212334474302e-05, "loss": 0.9728004455566406, "memory(GiB)": 89.65, "step": 12350, "token_acc": 0.7576553241315687, "train_speed(iter/s)": 0.637188 }, { "epoch": 0.16031453049122682, "grad_norm": 0.8424519896507263, "learning_rate": 9.975158962827066e-05, "loss": 0.9767007827758789, "memory(GiB)": 89.65, "step": 12355, "token_acc": 0.7302601739990929, "train_speed(iter/s)": 0.636092 }, { "epoch": 0.16037940889288252, "grad_norm": 0.8706042766571045, "learning_rate": 9.975105533926021e-05, "loss": 0.9761459350585937, "memory(GiB)": 89.65, "step": 12360, "token_acc": 0.7358733427725576, "train_speed(iter/s)": 0.635 }, { "epoch": 0.16044428729453822, "grad_norm": 0.8155844807624817, "learning_rate": 9.975052047771782e-05, "loss": 0.9631487846374511, "memory(GiB)": 89.65, "step": 12365, "token_acc": 0.7506645580268515, "train_speed(iter/s)": 0.633934 }, { "epoch": 0.16050916569619392, "grad_norm": 0.7988657355308533, "learning_rate": 9.974998504364966e-05, "loss": 0.9826810836791993, "memory(GiB)": 89.65, "step": 12370, "token_acc": 0.7470903918272339, "train_speed(iter/s)": 0.632881 }, { "epoch": 0.16057404409784962, "grad_norm": 0.793876588344574, "learning_rate": 9.974944903706184e-05, "loss": 0.9893243789672852, "memory(GiB)": 89.65, "step": 12375, "token_acc": 0.7466904373553844, "train_speed(iter/s)": 0.631795 }, { "epoch": 0.16063892249950532, "grad_norm": 0.8099923133850098, "learning_rate": 9.974891245796058e-05, "loss": 1.0069005966186524, "memory(GiB)": 89.65, "step": 12380, "token_acc": 0.727361772553788, "train_speed(iter/s)": 0.630757 }, { "epoch": 0.160703800901161, "grad_norm": 0.897159218788147, "learning_rate": 9.974837530635204e-05, "loss": 1.0647315979003906, "memory(GiB)": 89.65, "step": 12385, "token_acc": 0.7271915584415585, "train_speed(iter/s)": 0.629731 }, { "epoch": 0.16076867930281669, "grad_norm": 0.8911254405975342, "learning_rate": 9.974783758224239e-05, "loss": 0.9537185668945313, "memory(GiB)": 89.65, "step": 12390, "token_acc": 0.7410408729902499, "train_speed(iter/s)": 0.628658 }, { "epoch": 0.16083355770447239, "grad_norm": 0.8205724358558655, "learning_rate": 9.974729928563785e-05, "loss": 0.9892360687255859, "memory(GiB)": 89.65, "step": 12395, "token_acc": 0.7343122316110264, "train_speed(iter/s)": 0.627611 }, { "epoch": 0.16089843610612808, "grad_norm": 0.86163330078125, "learning_rate": 9.974676041654458e-05, "loss": 1.029021644592285, "memory(GiB)": 89.65, "step": 12400, "token_acc": 0.7217217217217218, "train_speed(iter/s)": 0.626522 }, { "epoch": 0.16096331450778378, "grad_norm": 0.8283206224441528, "learning_rate": 9.974622097496878e-05, "loss": 1.0098384857177733, "memory(GiB)": 89.65, "step": 12405, "token_acc": 0.7417640807651434, "train_speed(iter/s)": 0.625477 }, { "epoch": 0.16102819290943948, "grad_norm": 0.9119144678115845, "learning_rate": 9.974568096091667e-05, "loss": 0.9955626487731933, "memory(GiB)": 89.65, "step": 12410, "token_acc": 0.7579565856047005, "train_speed(iter/s)": 0.624506 }, { "epoch": 0.16109307131109518, "grad_norm": 0.8268982768058777, "learning_rate": 9.974514037439448e-05, "loss": 1.0220434188842773, "memory(GiB)": 89.65, "step": 12415, "token_acc": 0.7269423326534397, "train_speed(iter/s)": 0.623477 }, { "epoch": 0.16115794971275088, "grad_norm": 0.9629665613174438, "learning_rate": 9.974459921540842e-05, "loss": 0.9824756622314453, "memory(GiB)": 89.65, "step": 12420, "token_acc": 0.7323094004441155, "train_speed(iter/s)": 0.622443 }, { "epoch": 0.16122282811440658, "grad_norm": 0.7875502109527588, "learning_rate": 9.974405748396469e-05, "loss": 0.9696428298950195, "memory(GiB)": 89.65, "step": 12425, "token_acc": 0.727834402424147, "train_speed(iter/s)": 0.621453 }, { "epoch": 0.16128770651606228, "grad_norm": 0.8366703391075134, "learning_rate": 9.974351518006957e-05, "loss": 1.0153316497802733, "memory(GiB)": 89.65, "step": 12430, "token_acc": 0.7503537231417196, "train_speed(iter/s)": 0.620439 }, { "epoch": 0.16135258491771798, "grad_norm": 0.7565732598304749, "learning_rate": 9.974297230372926e-05, "loss": 0.9781785011291504, "memory(GiB)": 89.65, "step": 12435, "token_acc": 0.7239699184408431, "train_speed(iter/s)": 0.619417 }, { "epoch": 0.16141746331937368, "grad_norm": 0.9033042192459106, "learning_rate": 9.974242885495004e-05, "loss": 0.9938219070434571, "memory(GiB)": 89.65, "step": 12440, "token_acc": 0.7372829086187307, "train_speed(iter/s)": 0.618336 }, { "epoch": 0.16148234172102935, "grad_norm": 0.8911659717559814, "learning_rate": 9.974188483373816e-05, "loss": 1.044062328338623, "memory(GiB)": 89.65, "step": 12445, "token_acc": 0.7421150908164997, "train_speed(iter/s)": 0.617265 }, { "epoch": 0.16154722012268505, "grad_norm": 0.8261591196060181, "learning_rate": 9.974134024009984e-05, "loss": 0.9976621627807617, "memory(GiB)": 89.65, "step": 12450, "token_acc": 0.7142216788916056, "train_speed(iter/s)": 0.616279 }, { "epoch": 0.16161209852434075, "grad_norm": 0.8576469421386719, "learning_rate": 9.974079507404141e-05, "loss": 1.0230194091796876, "memory(GiB)": 89.65, "step": 12455, "token_acc": 0.74651073753089, "train_speed(iter/s)": 0.615303 }, { "epoch": 0.16167697692599645, "grad_norm": 0.8126233220100403, "learning_rate": 9.97402493355691e-05, "loss": 0.9919380187988281, "memory(GiB)": 89.65, "step": 12460, "token_acc": 0.7473855617875882, "train_speed(iter/s)": 0.614336 }, { "epoch": 0.16174185532765215, "grad_norm": 0.8548097014427185, "learning_rate": 9.97397030246892e-05, "loss": 1.0545181274414062, "memory(GiB)": 89.65, "step": 12465, "token_acc": 0.7263276907132313, "train_speed(iter/s)": 0.613367 }, { "epoch": 0.16180673372930784, "grad_norm": 0.7684264183044434, "learning_rate": 9.973915614140799e-05, "loss": 0.9994182586669922, "memory(GiB)": 89.65, "step": 12470, "token_acc": 0.7605663567202029, "train_speed(iter/s)": 0.612373 }, { "epoch": 0.16187161213096354, "grad_norm": 0.8494448661804199, "learning_rate": 9.973860868573178e-05, "loss": 1.0425616264343263, "memory(GiB)": 89.65, "step": 12475, "token_acc": 0.6988087210609125, "train_speed(iter/s)": 0.611297 }, { "epoch": 0.16193649053261924, "grad_norm": 0.8744799494743347, "learning_rate": 9.973806065766686e-05, "loss": 1.0034477233886718, "memory(GiB)": 89.65, "step": 12480, "token_acc": 0.7440226139531232, "train_speed(iter/s)": 0.610375 }, { "epoch": 0.16200136893427494, "grad_norm": 0.881893515586853, "learning_rate": 9.973751205721955e-05, "loss": 0.9722704887390137, "memory(GiB)": 89.65, "step": 12485, "token_acc": 0.7389444164809089, "train_speed(iter/s)": 0.609427 }, { "epoch": 0.16206624733593064, "grad_norm": 0.7690728902816772, "learning_rate": 9.973696288439614e-05, "loss": 0.952728271484375, "memory(GiB)": 89.65, "step": 12490, "token_acc": 0.7454561813928379, "train_speed(iter/s)": 0.60842 }, { "epoch": 0.16213112573758634, "grad_norm": 0.8765332102775574, "learning_rate": 9.973641313920296e-05, "loss": 0.982606315612793, "memory(GiB)": 89.65, "step": 12495, "token_acc": 0.74338397222948, "train_speed(iter/s)": 0.607478 }, { "epoch": 0.16219600413924204, "grad_norm": 0.9641931056976318, "learning_rate": 9.973586282164636e-05, "loss": 1.0406500816345214, "memory(GiB)": 89.65, "step": 12500, "token_acc": 0.7242202027999017, "train_speed(iter/s)": 0.606569 }, { "epoch": 0.1622608825408977, "grad_norm": 0.8449525237083435, "learning_rate": 9.973531193173262e-05, "loss": 0.9360445976257324, "memory(GiB)": 89.65, "step": 12505, "token_acc": 0.7360503574095807, "train_speed(iter/s)": 0.605649 }, { "epoch": 0.1623257609425534, "grad_norm": 0.9046818017959595, "learning_rate": 9.973476046946814e-05, "loss": 0.9807109832763672, "memory(GiB)": 89.65, "step": 12510, "token_acc": 0.7365179186767746, "train_speed(iter/s)": 0.604629 }, { "epoch": 0.1623906393442091, "grad_norm": 0.8842944502830505, "learning_rate": 9.973420843485922e-05, "loss": 1.0587944030761718, "memory(GiB)": 89.65, "step": 12515, "token_acc": 0.7208945583723558, "train_speed(iter/s)": 0.60371 }, { "epoch": 0.1624555177458648, "grad_norm": 0.809508740901947, "learning_rate": 9.973365582791223e-05, "loss": 0.940915870666504, "memory(GiB)": 89.65, "step": 12520, "token_acc": 0.7562620074672853, "train_speed(iter/s)": 0.602728 }, { "epoch": 0.1625203961475205, "grad_norm": 0.9235515594482422, "learning_rate": 9.973310264863353e-05, "loss": 1.0286104202270507, "memory(GiB)": 89.65, "step": 12525, "token_acc": 0.7135934349686959, "train_speed(iter/s)": 0.601771 }, { "epoch": 0.1625852745491762, "grad_norm": 0.8006936311721802, "learning_rate": 9.97325488970295e-05, "loss": 1.0212116241455078, "memory(GiB)": 89.65, "step": 12530, "token_acc": 0.7212578356409727, "train_speed(iter/s)": 0.600881 }, { "epoch": 0.1626501529508319, "grad_norm": 0.8512935042381287, "learning_rate": 9.973199457310649e-05, "loss": 1.030265998840332, "memory(GiB)": 89.65, "step": 12535, "token_acc": 0.7310330405687997, "train_speed(iter/s)": 0.599948 }, { "epoch": 0.1627150313524876, "grad_norm": 0.8067015409469604, "learning_rate": 9.973143967687089e-05, "loss": 0.9633482933044434, "memory(GiB)": 89.65, "step": 12540, "token_acc": 0.7443100877919496, "train_speed(iter/s)": 0.59904 }, { "epoch": 0.1627799097541433, "grad_norm": 0.931668221950531, "learning_rate": 9.973088420832908e-05, "loss": 0.9914508819580078, "memory(GiB)": 89.65, "step": 12545, "token_acc": 0.7379227053140096, "train_speed(iter/s)": 0.598154 }, { "epoch": 0.162844788155799, "grad_norm": 0.8246443867683411, "learning_rate": 9.973032816748744e-05, "loss": 1.0105836868286133, "memory(GiB)": 89.65, "step": 12550, "token_acc": 0.7255052443080071, "train_speed(iter/s)": 0.597303 }, { "epoch": 0.1629096665574547, "grad_norm": 0.9011732339859009, "learning_rate": 9.97297715543524e-05, "loss": 1.0056068420410156, "memory(GiB)": 89.65, "step": 12555, "token_acc": 0.7379247402415052, "train_speed(iter/s)": 0.596368 }, { "epoch": 0.1629745449591104, "grad_norm": 0.7797217965126038, "learning_rate": 9.972921436893034e-05, "loss": 0.9595612525939942, "memory(GiB)": 89.65, "step": 12560, "token_acc": 0.7405359248566665, "train_speed(iter/s)": 0.595438 }, { "epoch": 0.16303942336076607, "grad_norm": 0.8748644590377808, "learning_rate": 9.97286566112277e-05, "loss": 1.0150222778320312, "memory(GiB)": 89.65, "step": 12565, "token_acc": 0.7375722995153978, "train_speed(iter/s)": 0.59453 }, { "epoch": 0.16310430176242177, "grad_norm": 0.8378843069076538, "learning_rate": 9.972809828125087e-05, "loss": 0.9841838836669922, "memory(GiB)": 89.65, "step": 12570, "token_acc": 0.7398658501020706, "train_speed(iter/s)": 0.593636 }, { "epoch": 0.16316918016407747, "grad_norm": 0.808856725692749, "learning_rate": 9.972753937900628e-05, "loss": 0.9888036727905274, "memory(GiB)": 89.65, "step": 12575, "token_acc": 0.7493064144132099, "train_speed(iter/s)": 0.592736 }, { "epoch": 0.16323405856573317, "grad_norm": 0.8524967432022095, "learning_rate": 9.972697990450037e-05, "loss": 1.0173895835876465, "memory(GiB)": 89.65, "step": 12580, "token_acc": 0.7436401107747794, "train_speed(iter/s)": 0.591828 }, { "epoch": 0.16329893696738887, "grad_norm": 0.9117621183395386, "learning_rate": 9.972641985773959e-05, "loss": 1.0026898384094238, "memory(GiB)": 89.65, "step": 12585, "token_acc": 0.7390134245187436, "train_speed(iter/s)": 0.590927 }, { "epoch": 0.16336381536904457, "grad_norm": 0.8672768473625183, "learning_rate": 9.972585923873035e-05, "loss": 0.9852417945861817, "memory(GiB)": 89.65, "step": 12590, "token_acc": 0.7508871051090364, "train_speed(iter/s)": 0.590021 }, { "epoch": 0.16342869377070027, "grad_norm": 0.8646494150161743, "learning_rate": 9.972529804747912e-05, "loss": 1.0004241943359375, "memory(GiB)": 89.65, "step": 12595, "token_acc": 0.7645349948396468, "train_speed(iter/s)": 0.589097 }, { "epoch": 0.16349357217235597, "grad_norm": 0.8587789535522461, "learning_rate": 9.972473628399238e-05, "loss": 0.9300917625427246, "memory(GiB)": 89.65, "step": 12600, "token_acc": 0.7859891394253246, "train_speed(iter/s)": 0.588227 }, { "epoch": 0.16355845057401167, "grad_norm": 0.8804176449775696, "learning_rate": 9.972417394827657e-05, "loss": 0.9550841331481934, "memory(GiB)": 89.65, "step": 12605, "token_acc": 0.7570344917012448, "train_speed(iter/s)": 0.587313 }, { "epoch": 0.16362332897566736, "grad_norm": 0.8479302525520325, "learning_rate": 9.972361104033817e-05, "loss": 1.0001083374023438, "memory(GiB)": 89.65, "step": 12610, "token_acc": 0.7396248420643405, "train_speed(iter/s)": 0.586431 }, { "epoch": 0.16368820737732306, "grad_norm": 0.8612625598907471, "learning_rate": 9.972304756018362e-05, "loss": 0.9987253189086914, "memory(GiB)": 89.65, "step": 12615, "token_acc": 0.7368887177512514, "train_speed(iter/s)": 0.585533 }, { "epoch": 0.16375308577897874, "grad_norm": 1.0043373107910156, "learning_rate": 9.972248350781948e-05, "loss": 0.9860608100891113, "memory(GiB)": 89.65, "step": 12620, "token_acc": 0.7366282531787454, "train_speed(iter/s)": 0.584653 }, { "epoch": 0.16381796418063443, "grad_norm": 0.7590059041976929, "learning_rate": 9.972191888325219e-05, "loss": 0.9297431945800781, "memory(GiB)": 89.65, "step": 12625, "token_acc": 0.7357228012152444, "train_speed(iter/s)": 0.583794 }, { "epoch": 0.16388284258229013, "grad_norm": 0.8083802461624146, "learning_rate": 9.972135368648825e-05, "loss": 0.9647595405578613, "memory(GiB)": 89.65, "step": 12630, "token_acc": 0.7498837929965912, "train_speed(iter/s)": 0.582925 }, { "epoch": 0.16394772098394583, "grad_norm": 0.845586359500885, "learning_rate": 9.972078791753417e-05, "loss": 0.999808406829834, "memory(GiB)": 89.65, "step": 12635, "token_acc": 0.7505646982897709, "train_speed(iter/s)": 0.582022 }, { "epoch": 0.16401259938560153, "grad_norm": 0.8801283836364746, "learning_rate": 9.972022157639647e-05, "loss": 1.034379768371582, "memory(GiB)": 89.65, "step": 12640, "token_acc": 0.7362881806108897, "train_speed(iter/s)": 0.581178 }, { "epoch": 0.16407747778725723, "grad_norm": 1.026223063468933, "learning_rate": 9.971965466308165e-05, "loss": 0.9839878082275391, "memory(GiB)": 89.65, "step": 12645, "token_acc": 0.7429586189807549, "train_speed(iter/s)": 0.580348 }, { "epoch": 0.16414235618891293, "grad_norm": 0.7887604236602783, "learning_rate": 9.971908717759626e-05, "loss": 0.9520496368408203, "memory(GiB)": 89.65, "step": 12650, "token_acc": 0.7415135567660727, "train_speed(iter/s)": 0.579473 }, { "epoch": 0.16420723459056863, "grad_norm": 0.8227635622024536, "learning_rate": 9.971851911994681e-05, "loss": 0.924985122680664, "memory(GiB)": 89.65, "step": 12655, "token_acc": 0.7613028516915948, "train_speed(iter/s)": 0.578618 }, { "epoch": 0.16427211299222433, "grad_norm": 0.841041624546051, "learning_rate": 9.971795049013984e-05, "loss": 0.9870044708251953, "memory(GiB)": 89.65, "step": 12660, "token_acc": 0.7342572062084257, "train_speed(iter/s)": 0.577741 }, { "epoch": 0.16433699139388003, "grad_norm": 0.81275475025177, "learning_rate": 9.971738128818189e-05, "loss": 1.0466704368591309, "memory(GiB)": 89.65, "step": 12665, "token_acc": 0.7136670487835944, "train_speed(iter/s)": 0.576877 }, { "epoch": 0.16440186979553573, "grad_norm": 0.8278107643127441, "learning_rate": 9.97168115140795e-05, "loss": 0.9627665519714356, "memory(GiB)": 89.65, "step": 12670, "token_acc": 0.7401957564212047, "train_speed(iter/s)": 0.575993 }, { "epoch": 0.16446674819719143, "grad_norm": 0.8071231842041016, "learning_rate": 9.971624116783927e-05, "loss": 0.9407631874084472, "memory(GiB)": 89.65, "step": 12675, "token_acc": 0.7510538719421117, "train_speed(iter/s)": 0.575132 }, { "epoch": 0.1645316265988471, "grad_norm": 0.8914564251899719, "learning_rate": 9.971567024946773e-05, "loss": 0.9883912086486817, "memory(GiB)": 89.65, "step": 12680, "token_acc": 0.7327798967915639, "train_speed(iter/s)": 0.574283 }, { "epoch": 0.1645965050005028, "grad_norm": 0.8695298433303833, "learning_rate": 9.971509875897146e-05, "loss": 0.9491628646850586, "memory(GiB)": 89.65, "step": 12685, "token_acc": 0.7262785258233769, "train_speed(iter/s)": 0.573422 }, { "epoch": 0.1646613834021585, "grad_norm": 0.9896183013916016, "learning_rate": 9.971452669635702e-05, "loss": 0.9792612075805665, "memory(GiB)": 89.65, "step": 12690, "token_acc": 0.7369847176559847, "train_speed(iter/s)": 0.57258 }, { "epoch": 0.1647262618038142, "grad_norm": 0.7586923837661743, "learning_rate": 9.971395406163102e-05, "loss": 0.9657570838928222, "memory(GiB)": 89.65, "step": 12695, "token_acc": 0.7393894542090657, "train_speed(iter/s)": 0.571695 }, { "epoch": 0.1647911402054699, "grad_norm": 0.8881884813308716, "learning_rate": 9.971338085480003e-05, "loss": 1.0360107421875, "memory(GiB)": 89.65, "step": 12700, "token_acc": 0.7110767417881868, "train_speed(iter/s)": 0.570852 }, { "epoch": 0.1648560186071256, "grad_norm": 0.7926466464996338, "learning_rate": 9.971280707587065e-05, "loss": 1.009310531616211, "memory(GiB)": 89.65, "step": 12705, "token_acc": 0.7428485622043697, "train_speed(iter/s)": 0.569987 }, { "epoch": 0.1649208970087813, "grad_norm": 0.838157594203949, "learning_rate": 9.971223272484948e-05, "loss": 1.0461606979370117, "memory(GiB)": 89.65, "step": 12710, "token_acc": 0.7185748058088484, "train_speed(iter/s)": 0.569141 }, { "epoch": 0.164985775410437, "grad_norm": 0.8935750126838684, "learning_rate": 9.971165780174316e-05, "loss": 1.0289615631103515, "memory(GiB)": 89.65, "step": 12715, "token_acc": 0.7629564418586848, "train_speed(iter/s)": 0.568316 }, { "epoch": 0.1650506538120927, "grad_norm": 0.8710665106773376, "learning_rate": 9.971108230655825e-05, "loss": 0.952393913269043, "memory(GiB)": 89.65, "step": 12720, "token_acc": 0.7446013630018885, "train_speed(iter/s)": 0.567501 }, { "epoch": 0.1651155322137484, "grad_norm": 0.8375855088233948, "learning_rate": 9.971050623930142e-05, "loss": 0.9702272415161133, "memory(GiB)": 89.65, "step": 12725, "token_acc": 0.7254223258963843, "train_speed(iter/s)": 0.566608 }, { "epoch": 0.1651804106154041, "grad_norm": 0.9207974672317505, "learning_rate": 9.970992959997929e-05, "loss": 1.0258476257324218, "memory(GiB)": 89.65, "step": 12730, "token_acc": 0.7308590346750304, "train_speed(iter/s)": 0.565815 }, { "epoch": 0.1652452890170598, "grad_norm": 0.8744789958000183, "learning_rate": 9.970935238859849e-05, "loss": 1.0394783973693849, "memory(GiB)": 89.65, "step": 12735, "token_acc": 0.7405816259087905, "train_speed(iter/s)": 0.564967 }, { "epoch": 0.16531016741871546, "grad_norm": 0.7867351770401001, "learning_rate": 9.970877460516566e-05, "loss": 0.9714565277099609, "memory(GiB)": 89.65, "step": 12740, "token_acc": 0.7251757251757251, "train_speed(iter/s)": 0.564131 }, { "epoch": 0.16537504582037116, "grad_norm": 0.9947208762168884, "learning_rate": 9.970819624968744e-05, "loss": 1.0171531677246093, "memory(GiB)": 89.65, "step": 12745, "token_acc": 0.7234432234432234, "train_speed(iter/s)": 0.563366 }, { "epoch": 0.16543992422202686, "grad_norm": 0.8170895576477051, "learning_rate": 9.97076173221705e-05, "loss": 0.9207340240478515, "memory(GiB)": 89.65, "step": 12750, "token_acc": 0.7543911198171727, "train_speed(iter/s)": 0.562522 }, { "epoch": 0.16550480262368256, "grad_norm": 0.8977817893028259, "learning_rate": 9.97070378226215e-05, "loss": 0.9784250259399414, "memory(GiB)": 89.65, "step": 12755, "token_acc": 0.7666612284851946, "train_speed(iter/s)": 0.561713 }, { "epoch": 0.16556968102533826, "grad_norm": 0.7872916460037231, "learning_rate": 9.97064577510471e-05, "loss": 0.9791540145874024, "memory(GiB)": 89.65, "step": 12760, "token_acc": 0.7513599878266823, "train_speed(iter/s)": 0.560973 }, { "epoch": 0.16563455942699395, "grad_norm": 0.9617129564285278, "learning_rate": 9.9705877107454e-05, "loss": 1.0413125038146973, "memory(GiB)": 89.65, "step": 12765, "token_acc": 0.7217989842288158, "train_speed(iter/s)": 0.560203 }, { "epoch": 0.16569943782864965, "grad_norm": 0.9380966424942017, "learning_rate": 9.970529589184887e-05, "loss": 0.957160758972168, "memory(GiB)": 89.65, "step": 12770, "token_acc": 0.7368894735020249, "train_speed(iter/s)": 0.559411 }, { "epoch": 0.16576431623030535, "grad_norm": 0.8927614688873291, "learning_rate": 9.970471410423838e-05, "loss": 0.9716377258300781, "memory(GiB)": 89.65, "step": 12775, "token_acc": 0.7277564396208464, "train_speed(iter/s)": 0.558626 }, { "epoch": 0.16582919463196105, "grad_norm": 0.9620044231414795, "learning_rate": 9.970413174462924e-05, "loss": 1.0178229331970214, "memory(GiB)": 89.65, "step": 12780, "token_acc": 0.7233260060872506, "train_speed(iter/s)": 0.557909 }, { "epoch": 0.16589407303361675, "grad_norm": 0.8084186315536499, "learning_rate": 9.970354881302815e-05, "loss": 0.9589617729187012, "memory(GiB)": 89.65, "step": 12785, "token_acc": 0.7480649458184829, "train_speed(iter/s)": 0.557126 }, { "epoch": 0.16595895143527245, "grad_norm": 0.8856860399246216, "learning_rate": 9.970296530944183e-05, "loss": 0.998080062866211, "memory(GiB)": 89.65, "step": 12790, "token_acc": 0.7432929320441236, "train_speed(iter/s)": 0.556338 }, { "epoch": 0.16602382983692815, "grad_norm": 0.8421514630317688, "learning_rate": 9.970238123387696e-05, "loss": 0.9766743659973145, "memory(GiB)": 89.65, "step": 12795, "token_acc": 0.7682632541133455, "train_speed(iter/s)": 0.555556 }, { "epoch": 0.16608870823858382, "grad_norm": 0.8936494588851929, "learning_rate": 9.97017965863403e-05, "loss": 0.9785603523254395, "memory(GiB)": 89.65, "step": 12800, "token_acc": 0.7402683258798367, "train_speed(iter/s)": 0.554776 }, { "epoch": 0.16615358664023952, "grad_norm": 0.8481636047363281, "learning_rate": 9.970121136683857e-05, "loss": 1.0175621032714843, "memory(GiB)": 89.65, "step": 12805, "token_acc": 0.7350346138183793, "train_speed(iter/s)": 0.554032 }, { "epoch": 0.16621846504189522, "grad_norm": 0.7939226627349854, "learning_rate": 9.97006255753785e-05, "loss": 0.990912914276123, "memory(GiB)": 89.65, "step": 12810, "token_acc": 0.7301266995269443, "train_speed(iter/s)": 0.55325 }, { "epoch": 0.16628334344355092, "grad_norm": 0.8840574026107788, "learning_rate": 9.970003921196683e-05, "loss": 1.028106117248535, "memory(GiB)": 89.65, "step": 12815, "token_acc": 0.7267703940671805, "train_speed(iter/s)": 0.552475 }, { "epoch": 0.16634822184520662, "grad_norm": 0.8331611752510071, "learning_rate": 9.969945227661028e-05, "loss": 1.0386205673217774, "memory(GiB)": 89.65, "step": 12820, "token_acc": 0.7360802634602129, "train_speed(iter/s)": 0.551723 }, { "epoch": 0.16641310024686232, "grad_norm": 0.8283082246780396, "learning_rate": 9.969886476931567e-05, "loss": 0.9765869140625, "memory(GiB)": 89.65, "step": 12825, "token_acc": 0.7488669977475644, "train_speed(iter/s)": 0.550904 }, { "epoch": 0.16647797864851802, "grad_norm": 0.7913424968719482, "learning_rate": 9.969827669008968e-05, "loss": 0.9619653701782227, "memory(GiB)": 89.65, "step": 12830, "token_acc": 0.7418536430666772, "train_speed(iter/s)": 0.550169 }, { "epoch": 0.16654285705017371, "grad_norm": 0.7894279360771179, "learning_rate": 9.969768803893915e-05, "loss": 0.9955771446228028, "memory(GiB)": 89.65, "step": 12835, "token_acc": 0.733422271050773, "train_speed(iter/s)": 0.549405 }, { "epoch": 0.16660773545182941, "grad_norm": 0.9217360615730286, "learning_rate": 9.969709881587083e-05, "loss": 1.0095422744750977, "memory(GiB)": 89.65, "step": 12840, "token_acc": 0.7513812154696132, "train_speed(iter/s)": 0.548657 }, { "epoch": 0.1666726138534851, "grad_norm": 0.791466236114502, "learning_rate": 9.969650902089147e-05, "loss": 0.978700065612793, "memory(GiB)": 89.65, "step": 12845, "token_acc": 0.7496476888387824, "train_speed(iter/s)": 0.547926 }, { "epoch": 0.1667374922551408, "grad_norm": 0.9135726690292358, "learning_rate": 9.969591865400791e-05, "loss": 1.0683155059814453, "memory(GiB)": 89.65, "step": 12850, "token_acc": 0.7186528175146163, "train_speed(iter/s)": 0.547159 }, { "epoch": 0.1668023706567965, "grad_norm": 0.7710577249526978, "learning_rate": 9.969532771522689e-05, "loss": 0.9870573043823242, "memory(GiB)": 89.65, "step": 12855, "token_acc": 0.7266250243997657, "train_speed(iter/s)": 0.546324 }, { "epoch": 0.16686724905845218, "grad_norm": 0.7889006733894348, "learning_rate": 9.969473620455526e-05, "loss": 0.9960948944091796, "memory(GiB)": 89.65, "step": 12860, "token_acc": 0.7277690575308811, "train_speed(iter/s)": 0.545593 }, { "epoch": 0.16693212746010788, "grad_norm": 0.8602114319801331, "learning_rate": 9.96941441219998e-05, "loss": 0.989738655090332, "memory(GiB)": 89.65, "step": 12865, "token_acc": 0.7528934214498977, "train_speed(iter/s)": 0.544862 }, { "epoch": 0.16699700586176358, "grad_norm": 0.8424610495567322, "learning_rate": 9.969355146756731e-05, "loss": 1.005305290222168, "memory(GiB)": 89.65, "step": 12870, "token_acc": 0.7455029980013325, "train_speed(iter/s)": 0.544113 }, { "epoch": 0.16706188426341928, "grad_norm": 0.7989156246185303, "learning_rate": 9.969295824126465e-05, "loss": 0.9750544548034668, "memory(GiB)": 89.65, "step": 12875, "token_acc": 0.7454018135761225, "train_speed(iter/s)": 0.543401 }, { "epoch": 0.16712676266507498, "grad_norm": 0.7285611629486084, "learning_rate": 9.969236444309862e-05, "loss": 1.0192940711975098, "memory(GiB)": 89.65, "step": 12880, "token_acc": 0.7398200757575758, "train_speed(iter/s)": 0.54266 }, { "epoch": 0.16719164106673068, "grad_norm": 0.8928941488265991, "learning_rate": 9.969177007307604e-05, "loss": 1.0189278602600098, "memory(GiB)": 89.65, "step": 12885, "token_acc": 0.7293928427824689, "train_speed(iter/s)": 0.541927 }, { "epoch": 0.16725651946838638, "grad_norm": 0.9452711343765259, "learning_rate": 9.969117513120379e-05, "loss": 0.9659904479980469, "memory(GiB)": 89.65, "step": 12890, "token_acc": 0.7391304347826086, "train_speed(iter/s)": 0.541192 }, { "epoch": 0.16732139787004208, "grad_norm": 0.8216612339019775, "learning_rate": 9.969057961748869e-05, "loss": 1.014315700531006, "memory(GiB)": 89.65, "step": 12895, "token_acc": 0.7244094488188977, "train_speed(iter/s)": 0.54049 }, { "epoch": 0.16738627627169778, "grad_norm": 0.8794660568237305, "learning_rate": 9.968998353193759e-05, "loss": 0.9446299552917481, "memory(GiB)": 89.65, "step": 12900, "token_acc": 0.7350214964698458, "train_speed(iter/s)": 0.53979 }, { "epoch": 0.16745115467335348, "grad_norm": 0.8392118215560913, "learning_rate": 9.968938687455736e-05, "loss": 0.9784174919128418, "memory(GiB)": 89.65, "step": 12905, "token_acc": 0.7252333743396773, "train_speed(iter/s)": 0.539077 }, { "epoch": 0.16751603307500917, "grad_norm": 0.7252846956253052, "learning_rate": 9.968878964535487e-05, "loss": 1.019363784790039, "memory(GiB)": 89.65, "step": 12910, "token_acc": 0.7400018938796123, "train_speed(iter/s)": 0.53834 }, { "epoch": 0.16758091147666487, "grad_norm": 0.7840639352798462, "learning_rate": 9.968819184433698e-05, "loss": 1.0102224349975586, "memory(GiB)": 89.65, "step": 12915, "token_acc": 0.727164077172614, "train_speed(iter/s)": 0.537628 }, { "epoch": 0.16764578987832054, "grad_norm": 0.9453591704368591, "learning_rate": 9.968759347151058e-05, "loss": 1.0078742980957032, "memory(GiB)": 89.65, "step": 12920, "token_acc": 0.7303989511144409, "train_speed(iter/s)": 0.536933 }, { "epoch": 0.16771066827997624, "grad_norm": 0.8637279272079468, "learning_rate": 9.968699452688255e-05, "loss": 1.0290350914001465, "memory(GiB)": 89.65, "step": 12925, "token_acc": 0.7380340224703272, "train_speed(iter/s)": 0.536242 }, { "epoch": 0.16777554668163194, "grad_norm": 1.0135245323181152, "learning_rate": 9.968639501045977e-05, "loss": 0.9875469207763672, "memory(GiB)": 89.65, "step": 12930, "token_acc": 0.7223753045006189, "train_speed(iter/s)": 0.535542 }, { "epoch": 0.16784042508328764, "grad_norm": 0.8483598232269287, "learning_rate": 9.968579492224919e-05, "loss": 0.9416542053222656, "memory(GiB)": 89.65, "step": 12935, "token_acc": 0.7497964633471196, "train_speed(iter/s)": 0.534787 }, { "epoch": 0.16790530348494334, "grad_norm": 0.8090547919273376, "learning_rate": 9.968519426225765e-05, "loss": 0.9862329483032226, "memory(GiB)": 89.65, "step": 12940, "token_acc": 0.7342807793703221, "train_speed(iter/s)": 0.534057 }, { "epoch": 0.16797018188659904, "grad_norm": 0.9271068572998047, "learning_rate": 9.968459303049211e-05, "loss": 1.0681731224060058, "memory(GiB)": 89.65, "step": 12945, "token_acc": 0.7288455489147531, "train_speed(iter/s)": 0.533399 }, { "epoch": 0.16803506028825474, "grad_norm": 0.8745424151420593, "learning_rate": 9.968399122695945e-05, "loss": 0.9986076354980469, "memory(GiB)": 89.65, "step": 12950, "token_acc": 0.7207566760764463, "train_speed(iter/s)": 0.532709 }, { "epoch": 0.16809993868991044, "grad_norm": 0.8892561793327332, "learning_rate": 9.968338885166663e-05, "loss": 0.9613723754882812, "memory(GiB)": 89.65, "step": 12955, "token_acc": 0.7526963262554769, "train_speed(iter/s)": 0.532063 }, { "epoch": 0.16816481709156614, "grad_norm": 0.8005008697509766, "learning_rate": 9.968278590462057e-05, "loss": 0.9920478820800781, "memory(GiB)": 89.65, "step": 12960, "token_acc": 0.7387092178372988, "train_speed(iter/s)": 0.531358 }, { "epoch": 0.16822969549322184, "grad_norm": 0.9120790362358093, "learning_rate": 9.96821823858282e-05, "loss": 0.9322877883911133, "memory(GiB)": 89.65, "step": 12965, "token_acc": 0.7588477892515746, "train_speed(iter/s)": 0.530656 }, { "epoch": 0.16829457389487754, "grad_norm": 0.7961069345474243, "learning_rate": 9.968157829529648e-05, "loss": 0.9735231399536133, "memory(GiB)": 89.65, "step": 12970, "token_acc": 0.7485125308011299, "train_speed(iter/s)": 0.529992 }, { "epoch": 0.16835945229653324, "grad_norm": 0.9007936120033264, "learning_rate": 9.968097363303236e-05, "loss": 1.0533744812011718, "memory(GiB)": 89.65, "step": 12975, "token_acc": 0.7215806506557656, "train_speed(iter/s)": 0.529298 }, { "epoch": 0.1684243306981889, "grad_norm": 0.8298960328102112, "learning_rate": 9.968036839904278e-05, "loss": 0.9769752502441407, "memory(GiB)": 89.65, "step": 12980, "token_acc": 0.7503689710136372, "train_speed(iter/s)": 0.528608 }, { "epoch": 0.1684892090998446, "grad_norm": 0.8589034080505371, "learning_rate": 9.967976259333473e-05, "loss": 0.981197738647461, "memory(GiB)": 89.65, "step": 12985, "token_acc": 0.7477611411985416, "train_speed(iter/s)": 0.527956 }, { "epoch": 0.1685540875015003, "grad_norm": 0.8491289615631104, "learning_rate": 9.967915621591515e-05, "loss": 0.9695119857788086, "memory(GiB)": 89.65, "step": 12990, "token_acc": 0.7262769074003311, "train_speed(iter/s)": 0.527308 }, { "epoch": 0.168618965903156, "grad_norm": 0.9037366509437561, "learning_rate": 9.967854926679106e-05, "loss": 0.9984892845153809, "memory(GiB)": 89.65, "step": 12995, "token_acc": 0.7319199457259159, "train_speed(iter/s)": 0.526635 }, { "epoch": 0.1686838443048117, "grad_norm": 0.8766149878501892, "learning_rate": 9.96779417459694e-05, "loss": 0.9822019577026367, "memory(GiB)": 89.65, "step": 13000, "token_acc": 0.7541957225317807, "train_speed(iter/s)": 0.52595 }, { "epoch": 0.1687487227064674, "grad_norm": 0.8785683512687683, "learning_rate": 9.96773336534572e-05, "loss": 1.0481533050537108, "memory(GiB)": 89.65, "step": 13005, "token_acc": 0.7203978695636838, "train_speed(iter/s)": 0.525265 }, { "epoch": 0.1688136011081231, "grad_norm": 0.8732132315635681, "learning_rate": 9.967672498926144e-05, "loss": 1.0246068954467773, "memory(GiB)": 89.65, "step": 13010, "token_acc": 0.718722073279714, "train_speed(iter/s)": 0.524608 }, { "epoch": 0.1688784795097788, "grad_norm": 0.9427456259727478, "learning_rate": 9.967611575338913e-05, "loss": 0.9871608734130859, "memory(GiB)": 89.65, "step": 13015, "token_acc": 0.7458384440161205, "train_speed(iter/s)": 0.523928 }, { "epoch": 0.1689433579114345, "grad_norm": 0.7587016820907593, "learning_rate": 9.967550594584727e-05, "loss": 1.0053221702575683, "memory(GiB)": 89.65, "step": 13020, "token_acc": 0.730045464125493, "train_speed(iter/s)": 0.523247 }, { "epoch": 0.1690082363130902, "grad_norm": 0.8706302642822266, "learning_rate": 9.967489556664289e-05, "loss": 0.9290781021118164, "memory(GiB)": 89.65, "step": 13025, "token_acc": 0.7392048974237511, "train_speed(iter/s)": 0.522578 }, { "epoch": 0.1690731147147459, "grad_norm": 0.9370279908180237, "learning_rate": 9.9674284615783e-05, "loss": 0.9870082855224609, "memory(GiB)": 89.65, "step": 13030, "token_acc": 0.731066231066231, "train_speed(iter/s)": 0.521926 }, { "epoch": 0.1691379931164016, "grad_norm": 0.8251950144767761, "learning_rate": 9.967367309327465e-05, "loss": 0.9634206771850586, "memory(GiB)": 89.65, "step": 13035, "token_acc": 0.7622973925299507, "train_speed(iter/s)": 0.521293 }, { "epoch": 0.16920287151805727, "grad_norm": 0.8637605309486389, "learning_rate": 9.967306099912485e-05, "loss": 1.0038573265075683, "memory(GiB)": 89.65, "step": 13040, "token_acc": 0.7291014751899866, "train_speed(iter/s)": 0.520635 }, { "epoch": 0.16926774991971297, "grad_norm": 0.8224723935127258, "learning_rate": 9.967244833334067e-05, "loss": 0.9728996276855468, "memory(GiB)": 89.65, "step": 13045, "token_acc": 0.7423146473779385, "train_speed(iter/s)": 0.519961 }, { "epoch": 0.16933262832136867, "grad_norm": 0.9412451982498169, "learning_rate": 9.967183509592915e-05, "loss": 1.0183990478515625, "memory(GiB)": 89.65, "step": 13050, "token_acc": 0.7392595063375583, "train_speed(iter/s)": 0.519284 }, { "epoch": 0.16939750672302437, "grad_norm": 0.8830515742301941, "learning_rate": 9.967122128689736e-05, "loss": 0.9764277458190918, "memory(GiB)": 89.65, "step": 13055, "token_acc": 0.7854527198629733, "train_speed(iter/s)": 0.518595 }, { "epoch": 0.16946238512468006, "grad_norm": 0.8182381987571716, "learning_rate": 9.967060690625234e-05, "loss": 0.9778825759887695, "memory(GiB)": 89.65, "step": 13060, "token_acc": 0.7315132954884972, "train_speed(iter/s)": 0.517927 }, { "epoch": 0.16952726352633576, "grad_norm": 0.8951712250709534, "learning_rate": 9.966999195400117e-05, "loss": 0.9691059112548828, "memory(GiB)": 89.65, "step": 13065, "token_acc": 0.7545638142002931, "train_speed(iter/s)": 0.517299 }, { "epoch": 0.16959214192799146, "grad_norm": 1.0234603881835938, "learning_rate": 9.966937643015095e-05, "loss": 0.9895990371704102, "memory(GiB)": 89.65, "step": 13070, "token_acc": 0.7571762471696735, "train_speed(iter/s)": 0.516647 }, { "epoch": 0.16965702032964716, "grad_norm": 0.9245968461036682, "learning_rate": 9.966876033470872e-05, "loss": 1.0278876304626465, "memory(GiB)": 89.65, "step": 13075, "token_acc": 0.7312775330396476, "train_speed(iter/s)": 0.516004 }, { "epoch": 0.16972189873130286, "grad_norm": 0.8171122670173645, "learning_rate": 9.96681436676816e-05, "loss": 0.9813455581665039, "memory(GiB)": 89.65, "step": 13080, "token_acc": 0.7339721651836641, "train_speed(iter/s)": 0.515383 }, { "epoch": 0.16978677713295856, "grad_norm": 0.9787347316741943, "learning_rate": 9.966752642907669e-05, "loss": 0.9933586120605469, "memory(GiB)": 89.65, "step": 13085, "token_acc": 0.7364088749204923, "train_speed(iter/s)": 0.514738 }, { "epoch": 0.16985165553461426, "grad_norm": 0.9776408076286316, "learning_rate": 9.966690861890107e-05, "loss": 0.9919440269470214, "memory(GiB)": 89.65, "step": 13090, "token_acc": 0.7368421052631579, "train_speed(iter/s)": 0.514115 }, { "epoch": 0.16991653393626996, "grad_norm": 0.8130245804786682, "learning_rate": 9.966629023716186e-05, "loss": 0.9617788314819335, "memory(GiB)": 89.65, "step": 13095, "token_acc": 0.7518669061347953, "train_speed(iter/s)": 0.513481 }, { "epoch": 0.16998141233792563, "grad_norm": 0.812796950340271, "learning_rate": 9.96656712838662e-05, "loss": 1.0319127082824706, "memory(GiB)": 89.65, "step": 13100, "token_acc": 0.7424328954882924, "train_speed(iter/s)": 0.512827 }, { "epoch": 0.17004629073958133, "grad_norm": 0.8506656289100647, "learning_rate": 9.966505175902117e-05, "loss": 1.0470962524414062, "memory(GiB)": 89.65, "step": 13105, "token_acc": 0.7109375, "train_speed(iter/s)": 0.512209 }, { "epoch": 0.17011116914123703, "grad_norm": 0.9087045192718506, "learning_rate": 9.966443166263391e-05, "loss": 1.0325519561767578, "memory(GiB)": 89.65, "step": 13110, "token_acc": 0.7144298688193743, "train_speed(iter/s)": 0.511592 }, { "epoch": 0.17017604754289273, "grad_norm": 0.8231368660926819, "learning_rate": 9.966381099471158e-05, "loss": 0.9646810531616211, "memory(GiB)": 89.65, "step": 13115, "token_acc": 0.734669592056822, "train_speed(iter/s)": 0.510997 }, { "epoch": 0.17024092594454843, "grad_norm": 0.7913344502449036, "learning_rate": 9.966318975526132e-05, "loss": 0.9615319252014161, "memory(GiB)": 89.65, "step": 13120, "token_acc": 0.7315054508331207, "train_speed(iter/s)": 0.510391 }, { "epoch": 0.17030580434620413, "grad_norm": 0.937162458896637, "learning_rate": 9.966256794429027e-05, "loss": 1.0461393356323243, "memory(GiB)": 89.65, "step": 13125, "token_acc": 0.7371027504631609, "train_speed(iter/s)": 0.50974 }, { "epoch": 0.17037068274785983, "grad_norm": 0.8967177271842957, "learning_rate": 9.966194556180556e-05, "loss": 1.015693473815918, "memory(GiB)": 89.65, "step": 13130, "token_acc": 0.7659665666523789, "train_speed(iter/s)": 0.509143 }, { "epoch": 0.17043556114951552, "grad_norm": 0.753980278968811, "learning_rate": 9.96613226078144e-05, "loss": 0.9982595443725586, "memory(GiB)": 89.65, "step": 13135, "token_acc": 0.7560565506828633, "train_speed(iter/s)": 0.508511 }, { "epoch": 0.17050043955117122, "grad_norm": 1.5545495748519897, "learning_rate": 9.966069908232392e-05, "loss": 0.9958757400512696, "memory(GiB)": 89.65, "step": 13140, "token_acc": 0.729807342784442, "train_speed(iter/s)": 0.507894 }, { "epoch": 0.17056531795282692, "grad_norm": 0.9282245635986328, "learning_rate": 9.966007498534132e-05, "loss": 0.969586181640625, "memory(GiB)": 89.65, "step": 13145, "token_acc": 0.752096143096702, "train_speed(iter/s)": 0.507291 }, { "epoch": 0.17063019635448262, "grad_norm": 0.9438607096672058, "learning_rate": 9.965945031687379e-05, "loss": 1.0362393379211425, "memory(GiB)": 89.65, "step": 13150, "token_acc": 0.7279760516289557, "train_speed(iter/s)": 0.50668 }, { "epoch": 0.17069507475613832, "grad_norm": 0.8032426834106445, "learning_rate": 9.965882507692848e-05, "loss": 0.9499552726745606, "memory(GiB)": 89.65, "step": 13155, "token_acc": 0.7412253310017487, "train_speed(iter/s)": 0.506071 }, { "epoch": 0.170759953157794, "grad_norm": 0.9380815625190735, "learning_rate": 9.965819926551259e-05, "loss": 1.0310727119445802, "memory(GiB)": 89.65, "step": 13160, "token_acc": 0.7470130919126309, "train_speed(iter/s)": 0.505475 }, { "epoch": 0.1708248315594497, "grad_norm": 0.8997186422348022, "learning_rate": 9.965757288263337e-05, "loss": 0.986961555480957, "memory(GiB)": 89.65, "step": 13165, "token_acc": 0.7410234999403554, "train_speed(iter/s)": 0.504843 }, { "epoch": 0.1708897099611054, "grad_norm": 0.7152538299560547, "learning_rate": 9.965694592829798e-05, "loss": 0.9547439575195312, "memory(GiB)": 89.65, "step": 13170, "token_acc": 0.7423670031584815, "train_speed(iter/s)": 0.504218 }, { "epoch": 0.1709545883627611, "grad_norm": 0.9267613887786865, "learning_rate": 9.965631840251366e-05, "loss": 0.9400544166564941, "memory(GiB)": 89.65, "step": 13175, "token_acc": 0.7469293163383546, "train_speed(iter/s)": 0.503608 }, { "epoch": 0.1710194667644168, "grad_norm": 0.8549709916114807, "learning_rate": 9.965569030528763e-05, "loss": 0.9525200843811035, "memory(GiB)": 89.65, "step": 13180, "token_acc": 0.7425332061348793, "train_speed(iter/s)": 0.503013 }, { "epoch": 0.1710843451660725, "grad_norm": 0.8415513634681702, "learning_rate": 9.96550616366271e-05, "loss": 1.036191749572754, "memory(GiB)": 89.65, "step": 13185, "token_acc": 0.7312911411024896, "train_speed(iter/s)": 0.50243 }, { "epoch": 0.1711492235677282, "grad_norm": 0.8543468117713928, "learning_rate": 9.965443239653931e-05, "loss": 0.9601637840270996, "memory(GiB)": 89.65, "step": 13190, "token_acc": 0.7385990813648294, "train_speed(iter/s)": 0.50191 }, { "epoch": 0.1712141019693839, "grad_norm": 0.7827450037002563, "learning_rate": 9.965380258503152e-05, "loss": 1.0154159545898438, "memory(GiB)": 89.65, "step": 13195, "token_acc": 0.7481072714425115, "train_speed(iter/s)": 0.501277 }, { "epoch": 0.17127898037103959, "grad_norm": 0.8055043816566467, "learning_rate": 9.965317220211097e-05, "loss": 1.0381175994873046, "memory(GiB)": 89.65, "step": 13200, "token_acc": 0.7479393652297489, "train_speed(iter/s)": 0.500718 }, { "epoch": 0.17134385877269528, "grad_norm": 0.839653730392456, "learning_rate": 9.96525412477849e-05, "loss": 0.9579383850097656, "memory(GiB)": 89.65, "step": 13205, "token_acc": 0.7555124223602484, "train_speed(iter/s)": 0.500123 }, { "epoch": 0.17140873717435098, "grad_norm": 0.7583733797073364, "learning_rate": 9.96519097220606e-05, "loss": 0.9816569328308106, "memory(GiB)": 89.65, "step": 13210, "token_acc": 0.7324100494119284, "train_speed(iter/s)": 0.499566 }, { "epoch": 0.17147361557600668, "grad_norm": 0.8976486921310425, "learning_rate": 9.965127762494529e-05, "loss": 1.0091777801513673, "memory(GiB)": 89.65, "step": 13215, "token_acc": 0.7282558961773636, "train_speed(iter/s)": 0.498987 }, { "epoch": 0.17153849397766235, "grad_norm": 0.8682900071144104, "learning_rate": 9.965064495644629e-05, "loss": 1.0263151168823241, "memory(GiB)": 89.65, "step": 13220, "token_acc": 0.7105815130912773, "train_speed(iter/s)": 0.498396 }, { "epoch": 0.17160337237931805, "grad_norm": 0.8457943201065063, "learning_rate": 9.965001171657085e-05, "loss": 1.0511743545532226, "memory(GiB)": 89.65, "step": 13225, "token_acc": 0.7329882454437615, "train_speed(iter/s)": 0.497835 }, { "epoch": 0.17166825078097375, "grad_norm": 0.8530846834182739, "learning_rate": 9.964937790532629e-05, "loss": 0.9963027954101562, "memory(GiB)": 89.65, "step": 13230, "token_acc": 0.7326738775384426, "train_speed(iter/s)": 0.497236 }, { "epoch": 0.17173312918262945, "grad_norm": 0.7976863980293274, "learning_rate": 9.964874352271987e-05, "loss": 1.001016902923584, "memory(GiB)": 89.65, "step": 13235, "token_acc": 0.7386167146974063, "train_speed(iter/s)": 0.496638 }, { "epoch": 0.17179800758428515, "grad_norm": 0.8058832287788391, "learning_rate": 9.96481085687589e-05, "loss": 1.0331999778747558, "memory(GiB)": 89.65, "step": 13240, "token_acc": 0.7361995356153097, "train_speed(iter/s)": 0.496042 }, { "epoch": 0.17186288598594085, "grad_norm": 0.8513492941856384, "learning_rate": 9.96474730434507e-05, "loss": 1.0068252563476563, "memory(GiB)": 89.65, "step": 13245, "token_acc": 0.7521750593197997, "train_speed(iter/s)": 0.495465 }, { "epoch": 0.17192776438759655, "grad_norm": 0.8013015389442444, "learning_rate": 9.96468369468026e-05, "loss": 0.9965646743774415, "memory(GiB)": 89.65, "step": 13250, "token_acc": 0.7507049033766591, "train_speed(iter/s)": 0.494893 }, { "epoch": 0.17199264278925225, "grad_norm": 0.9878842830657959, "learning_rate": 9.964620027882186e-05, "loss": 1.0466679573059081, "memory(GiB)": 89.65, "step": 13255, "token_acc": 0.7279229393782085, "train_speed(iter/s)": 0.494338 }, { "epoch": 0.17205752119090795, "grad_norm": 0.8082539439201355, "learning_rate": 9.964556303951585e-05, "loss": 0.9710786819458008, "memory(GiB)": 89.65, "step": 13260, "token_acc": 0.7104105317858999, "train_speed(iter/s)": 0.493783 }, { "epoch": 0.17212239959256365, "grad_norm": 0.8600430488586426, "learning_rate": 9.964492522889191e-05, "loss": 1.0296154975891114, "memory(GiB)": 89.65, "step": 13265, "token_acc": 0.7207053469852105, "train_speed(iter/s)": 0.493218 }, { "epoch": 0.17218727799421935, "grad_norm": 0.8414685726165771, "learning_rate": 9.964428684695734e-05, "loss": 0.9707745552062989, "memory(GiB)": 89.65, "step": 13270, "token_acc": 0.7402651515151515, "train_speed(iter/s)": 0.492649 }, { "epoch": 0.17225215639587504, "grad_norm": 0.7661842107772827, "learning_rate": 9.964364789371954e-05, "loss": 0.9904436111450196, "memory(GiB)": 89.65, "step": 13275, "token_acc": 0.7274819885803115, "train_speed(iter/s)": 0.492042 }, { "epoch": 0.17231703479753072, "grad_norm": 0.9711649417877197, "learning_rate": 9.964300836918582e-05, "loss": 1.0351852416992187, "memory(GiB)": 89.65, "step": 13280, "token_acc": 0.7284428595686874, "train_speed(iter/s)": 0.491491 }, { "epoch": 0.17238191319918642, "grad_norm": 0.849551796913147, "learning_rate": 9.964236827336355e-05, "loss": 1.0382675170898437, "memory(GiB)": 89.65, "step": 13285, "token_acc": 0.7220935618341825, "train_speed(iter/s)": 0.490914 }, { "epoch": 0.17244679160084211, "grad_norm": 0.9156277179718018, "learning_rate": 9.964172760626012e-05, "loss": 1.0008383750915528, "memory(GiB)": 89.65, "step": 13290, "token_acc": 0.7250420500643119, "train_speed(iter/s)": 0.490346 }, { "epoch": 0.1725116700024978, "grad_norm": 0.8366336822509766, "learning_rate": 9.964108636788286e-05, "loss": 0.9787786483764649, "memory(GiB)": 89.65, "step": 13295, "token_acc": 0.7431251790317961, "train_speed(iter/s)": 0.489785 }, { "epoch": 0.1725765484041535, "grad_norm": 0.8025233745574951, "learning_rate": 9.96404445582392e-05, "loss": 0.9869997024536132, "memory(GiB)": 89.65, "step": 13300, "token_acc": 0.7372550464122587, "train_speed(iter/s)": 0.489217 }, { "epoch": 0.1726414268058092, "grad_norm": 0.764934778213501, "learning_rate": 9.963980217733648e-05, "loss": 0.9847221374511719, "memory(GiB)": 89.65, "step": 13305, "token_acc": 0.7360850321888412, "train_speed(iter/s)": 0.488619 }, { "epoch": 0.1727063052074649, "grad_norm": 0.9169658422470093, "learning_rate": 9.963915922518211e-05, "loss": 1.018192672729492, "memory(GiB)": 89.65, "step": 13310, "token_acc": 0.7396599047228445, "train_speed(iter/s)": 0.488061 }, { "epoch": 0.1727711836091206, "grad_norm": 0.9233101010322571, "learning_rate": 9.963851570178349e-05, "loss": 1.0729073524475097, "memory(GiB)": 89.65, "step": 13315, "token_acc": 0.7171111970106945, "train_speed(iter/s)": 0.487528 }, { "epoch": 0.1728360620107763, "grad_norm": 0.8588965535163879, "learning_rate": 9.963787160714804e-05, "loss": 1.0197945594787599, "memory(GiB)": 89.65, "step": 13320, "token_acc": 0.7334973788381299, "train_speed(iter/s)": 0.48699 }, { "epoch": 0.172900940412432, "grad_norm": 0.7953397631645203, "learning_rate": 9.963722694128314e-05, "loss": 0.996192455291748, "memory(GiB)": 89.65, "step": 13325, "token_acc": 0.7421285083345206, "train_speed(iter/s)": 0.48646 }, { "epoch": 0.1729658188140877, "grad_norm": 0.9188552498817444, "learning_rate": 9.963658170419624e-05, "loss": 0.9978156089782715, "memory(GiB)": 89.65, "step": 13330, "token_acc": 0.7410115979381443, "train_speed(iter/s)": 0.485916 }, { "epoch": 0.1730306972157434, "grad_norm": 0.9276458621025085, "learning_rate": 9.963593589589475e-05, "loss": 1.0090798377990722, "memory(GiB)": 89.65, "step": 13335, "token_acc": 0.7331454938952182, "train_speed(iter/s)": 0.485396 }, { "epoch": 0.17309557561739908, "grad_norm": 0.8569705486297607, "learning_rate": 9.96352895163861e-05, "loss": 0.9690299034118652, "memory(GiB)": 89.65, "step": 13340, "token_acc": 0.7344916474172274, "train_speed(iter/s)": 0.484846 }, { "epoch": 0.17316045401905478, "grad_norm": 1.1096928119659424, "learning_rate": 9.963464256567774e-05, "loss": 1.0197714805603026, "memory(GiB)": 89.65, "step": 13345, "token_acc": 0.722268316438801, "train_speed(iter/s)": 0.484296 }, { "epoch": 0.17322533242071048, "grad_norm": 0.792212188243866, "learning_rate": 9.963399504377711e-05, "loss": 0.972115421295166, "memory(GiB)": 89.65, "step": 13350, "token_acc": 0.7524955505687534, "train_speed(iter/s)": 0.483738 }, { "epoch": 0.17329021082236618, "grad_norm": 0.8678473830223083, "learning_rate": 9.963334695069165e-05, "loss": 0.9742166519165039, "memory(GiB)": 89.65, "step": 13355, "token_acc": 0.738788990825688, "train_speed(iter/s)": 0.483184 }, { "epoch": 0.17335508922402187, "grad_norm": 0.8408044576644897, "learning_rate": 9.963269828642883e-05, "loss": 0.95047607421875, "memory(GiB)": 89.65, "step": 13360, "token_acc": 0.7406399939551929, "train_speed(iter/s)": 0.482647 }, { "epoch": 0.17341996762567757, "grad_norm": 0.9260180592536926, "learning_rate": 9.963204905099612e-05, "loss": 0.9642832756042481, "memory(GiB)": 89.65, "step": 13365, "token_acc": 0.7692250731237735, "train_speed(iter/s)": 0.482076 }, { "epoch": 0.17348484602733327, "grad_norm": 0.846198320388794, "learning_rate": 9.963139924440098e-05, "loss": 0.9890029907226563, "memory(GiB)": 89.65, "step": 13370, "token_acc": 0.7377160585019944, "train_speed(iter/s)": 0.481572 }, { "epoch": 0.17354972442898897, "grad_norm": 0.836971640586853, "learning_rate": 9.963074886665088e-05, "loss": 1.0045570373535155, "memory(GiB)": 89.65, "step": 13375, "token_acc": 0.754094918101638, "train_speed(iter/s)": 0.481068 }, { "epoch": 0.17361460283064467, "grad_norm": 0.8119128346443176, "learning_rate": 9.963009791775333e-05, "loss": 0.9565927505493164, "memory(GiB)": 89.65, "step": 13380, "token_acc": 0.7661201741182214, "train_speed(iter/s)": 0.480525 }, { "epoch": 0.17367948123230037, "grad_norm": 0.8445014357566833, "learning_rate": 9.96294463977158e-05, "loss": 1.0067377090454102, "memory(GiB)": 89.65, "step": 13385, "token_acc": 0.7322238548521051, "train_speed(iter/s)": 0.479983 }, { "epoch": 0.17374435963395607, "grad_norm": 0.9427288174629211, "learning_rate": 9.962879430654579e-05, "loss": 0.9370558738708497, "memory(GiB)": 89.65, "step": 13390, "token_acc": 0.7609843721932819, "train_speed(iter/s)": 0.479428 }, { "epoch": 0.17380923803561177, "grad_norm": 0.8863264322280884, "learning_rate": 9.962814164425083e-05, "loss": 0.9948306083679199, "memory(GiB)": 89.65, "step": 13395, "token_acc": 0.740992207391405, "train_speed(iter/s)": 0.478915 }, { "epoch": 0.17387411643726744, "grad_norm": 0.8902182579040527, "learning_rate": 9.962748841083838e-05, "loss": 0.9735855102539063, "memory(GiB)": 89.65, "step": 13400, "token_acc": 0.7276560499962352, "train_speed(iter/s)": 0.478381 }, { "epoch": 0.17393899483892314, "grad_norm": 0.8899716138839722, "learning_rate": 9.962683460631601e-05, "loss": 0.9951318740844727, "memory(GiB)": 89.65, "step": 13405, "token_acc": 0.740131804764193, "train_speed(iter/s)": 0.477865 }, { "epoch": 0.17400387324057884, "grad_norm": 0.8836468458175659, "learning_rate": 9.962618023069121e-05, "loss": 1.0476716041564942, "memory(GiB)": 89.65, "step": 13410, "token_acc": 0.7294490336033362, "train_speed(iter/s)": 0.477371 }, { "epoch": 0.17406875164223454, "grad_norm": 0.8828542828559875, "learning_rate": 9.962552528397153e-05, "loss": 1.0291128158569336, "memory(GiB)": 89.65, "step": 13415, "token_acc": 0.7241736301116616, "train_speed(iter/s)": 0.476854 }, { "epoch": 0.17413363004389024, "grad_norm": 0.9572950601577759, "learning_rate": 9.96248697661645e-05, "loss": 0.9615035057067871, "memory(GiB)": 89.65, "step": 13420, "token_acc": 0.7535300195155551, "train_speed(iter/s)": 0.476354 }, { "epoch": 0.17419850844554594, "grad_norm": 0.7567702531814575, "learning_rate": 9.962421367727766e-05, "loss": 0.9698683738708496, "memory(GiB)": 89.65, "step": 13425, "token_acc": 0.7570821276747758, "train_speed(iter/s)": 0.475839 }, { "epoch": 0.17426338684720163, "grad_norm": 0.940318763256073, "learning_rate": 9.962355701731856e-05, "loss": 0.9704832077026367, "memory(GiB)": 89.65, "step": 13430, "token_acc": 0.7369856420951312, "train_speed(iter/s)": 0.47533 }, { "epoch": 0.17432826524885733, "grad_norm": 0.8074429035186768, "learning_rate": 9.962289978629476e-05, "loss": 0.9536735534667968, "memory(GiB)": 89.65, "step": 13435, "token_acc": 0.7317225715132164, "train_speed(iter/s)": 0.474831 }, { "epoch": 0.17439314365051303, "grad_norm": 0.9856299161911011, "learning_rate": 9.962224198421382e-05, "loss": 0.9747236251831055, "memory(GiB)": 89.65, "step": 13440, "token_acc": 0.7448014398958374, "train_speed(iter/s)": 0.474339 }, { "epoch": 0.17445802205216873, "grad_norm": 0.839838445186615, "learning_rate": 9.962158361108333e-05, "loss": 0.9742183685302734, "memory(GiB)": 89.65, "step": 13445, "token_acc": 0.7546617787021044, "train_speed(iter/s)": 0.473845 }, { "epoch": 0.17452290045382443, "grad_norm": 0.8233851194381714, "learning_rate": 9.962092466691083e-05, "loss": 0.9891227722167969, "memory(GiB)": 89.65, "step": 13450, "token_acc": 0.7333093525179856, "train_speed(iter/s)": 0.473347 }, { "epoch": 0.17458777885548013, "grad_norm": 0.890866756439209, "learning_rate": 9.962026515170392e-05, "loss": 1.045567512512207, "memory(GiB)": 89.65, "step": 13455, "token_acc": 0.7158295754937024, "train_speed(iter/s)": 0.472865 }, { "epoch": 0.1746526572571358, "grad_norm": 0.8338186740875244, "learning_rate": 9.961960506547022e-05, "loss": 0.9882492065429688, "memory(GiB)": 89.65, "step": 13460, "token_acc": 0.7332424006235386, "train_speed(iter/s)": 0.472357 }, { "epoch": 0.1747175356587915, "grad_norm": 0.8888368010520935, "learning_rate": 9.961894440821726e-05, "loss": 1.0090462684631347, "memory(GiB)": 89.65, "step": 13465, "token_acc": 0.7130651945320715, "train_speed(iter/s)": 0.471849 }, { "epoch": 0.1747824140604472, "grad_norm": 0.9960221648216248, "learning_rate": 9.961828317995271e-05, "loss": 1.0359312057495118, "memory(GiB)": 89.65, "step": 13470, "token_acc": 0.7470246841022627, "train_speed(iter/s)": 0.471353 }, { "epoch": 0.1748472924621029, "grad_norm": 0.8619325757026672, "learning_rate": 9.961762138068415e-05, "loss": 1.015797233581543, "memory(GiB)": 89.65, "step": 13475, "token_acc": 0.7319445876731635, "train_speed(iter/s)": 0.470862 }, { "epoch": 0.1749121708637586, "grad_norm": 0.7748475670814514, "learning_rate": 9.961695901041919e-05, "loss": 0.9973673820495605, "memory(GiB)": 89.65, "step": 13480, "token_acc": 0.7498655913978495, "train_speed(iter/s)": 0.470354 }, { "epoch": 0.1749770492654143, "grad_norm": 0.855748176574707, "learning_rate": 9.961629606916544e-05, "loss": 0.9563323974609375, "memory(GiB)": 89.65, "step": 13485, "token_acc": 0.7767897304591537, "train_speed(iter/s)": 0.469849 }, { "epoch": 0.17504192766707, "grad_norm": 0.902885377407074, "learning_rate": 9.961563255693057e-05, "loss": 1.0370067596435546, "memory(GiB)": 89.65, "step": 13490, "token_acc": 0.7416392438968393, "train_speed(iter/s)": 0.469385 }, { "epoch": 0.1751068060687257, "grad_norm": 0.8770858645439148, "learning_rate": 9.96149684737222e-05, "loss": 1.001909065246582, "memory(GiB)": 89.65, "step": 13495, "token_acc": 0.7502472325900406, "train_speed(iter/s)": 0.468885 }, { "epoch": 0.1751716844703814, "grad_norm": 0.8207348585128784, "learning_rate": 9.961430381954796e-05, "loss": 1.0398662567138672, "memory(GiB)": 89.65, "step": 13500, "token_acc": 0.7355894384529564, "train_speed(iter/s)": 0.468398 }, { "epoch": 0.1752365628720371, "grad_norm": 0.8549158573150635, "learning_rate": 9.961363859441548e-05, "loss": 0.9737783432006836, "memory(GiB)": 89.65, "step": 13505, "token_acc": 0.7239558329332694, "train_speed(iter/s)": 0.467882 }, { "epoch": 0.1753014412736928, "grad_norm": 0.8101903200149536, "learning_rate": 9.961297279833246e-05, "loss": 1.0427995681762696, "memory(GiB)": 89.65, "step": 13510, "token_acc": 0.7173154681677634, "train_speed(iter/s)": 0.467385 }, { "epoch": 0.17536631967534846, "grad_norm": 0.8037312030792236, "learning_rate": 9.961230643130655e-05, "loss": 0.9434599876403809, "memory(GiB)": 89.65, "step": 13515, "token_acc": 0.7429325669985966, "train_speed(iter/s)": 0.466883 }, { "epoch": 0.17543119807700416, "grad_norm": 0.7939199209213257, "learning_rate": 9.96116394933454e-05, "loss": 1.028295135498047, "memory(GiB)": 89.65, "step": 13520, "token_acc": 0.737490325402968, "train_speed(iter/s)": 0.466401 }, { "epoch": 0.17549607647865986, "grad_norm": 0.859557032585144, "learning_rate": 9.96109719844567e-05, "loss": 1.0068183898925782, "memory(GiB)": 89.65, "step": 13525, "token_acc": 0.7372257288848812, "train_speed(iter/s)": 0.465935 }, { "epoch": 0.17556095488031556, "grad_norm": 0.8042744994163513, "learning_rate": 9.961030390464811e-05, "loss": 0.992489242553711, "memory(GiB)": 89.65, "step": 13530, "token_acc": 0.7535342039055948, "train_speed(iter/s)": 0.465446 }, { "epoch": 0.17562583328197126, "grad_norm": 0.9624685049057007, "learning_rate": 9.960963525392737e-05, "loss": 0.9932510375976562, "memory(GiB)": 89.65, "step": 13535, "token_acc": 0.7531222770839384, "train_speed(iter/s)": 0.464967 }, { "epoch": 0.17569071168362696, "grad_norm": 0.8574230670928955, "learning_rate": 9.960896603230212e-05, "loss": 0.9956007957458496, "memory(GiB)": 89.65, "step": 13540, "token_acc": 0.748989623156745, "train_speed(iter/s)": 0.464468 }, { "epoch": 0.17575559008528266, "grad_norm": 0.8807932734489441, "learning_rate": 9.960829623978007e-05, "loss": 0.9916855812072753, "memory(GiB)": 89.65, "step": 13545, "token_acc": 0.7532251493753395, "train_speed(iter/s)": 0.463973 }, { "epoch": 0.17582046848693836, "grad_norm": 0.859843909740448, "learning_rate": 9.960762587636896e-05, "loss": 0.9538573265075684, "memory(GiB)": 89.65, "step": 13550, "token_acc": 0.7299900997639175, "train_speed(iter/s)": 0.463466 }, { "epoch": 0.17588534688859406, "grad_norm": 0.8030350804328918, "learning_rate": 9.960695494207649e-05, "loss": 0.978999137878418, "memory(GiB)": 89.65, "step": 13555, "token_acc": 0.7524952146568226, "train_speed(iter/s)": 0.462977 }, { "epoch": 0.17595022529024976, "grad_norm": 0.851102352142334, "learning_rate": 9.960628343691036e-05, "loss": 0.9153537750244141, "memory(GiB)": 89.65, "step": 13560, "token_acc": 0.774598127539304, "train_speed(iter/s)": 0.462503 }, { "epoch": 0.17601510369190546, "grad_norm": 0.8471717238426208, "learning_rate": 9.960561136087831e-05, "loss": 0.9894713401794434, "memory(GiB)": 89.65, "step": 13565, "token_acc": 0.7360642208717834, "train_speed(iter/s)": 0.462024 }, { "epoch": 0.17607998209356115, "grad_norm": 0.7842838764190674, "learning_rate": 9.960493871398808e-05, "loss": 0.9950716018676757, "memory(GiB)": 89.65, "step": 13570, "token_acc": 0.7449466734382936, "train_speed(iter/s)": 0.461555 }, { "epoch": 0.17614486049521683, "grad_norm": 0.8874218463897705, "learning_rate": 9.96042654962474e-05, "loss": 1.0073680877685547, "memory(GiB)": 89.65, "step": 13575, "token_acc": 0.7378104493728715, "train_speed(iter/s)": 0.461076 }, { "epoch": 0.17620973889687253, "grad_norm": 0.8076656460762024, "learning_rate": 9.960359170766404e-05, "loss": 0.9743228912353515, "memory(GiB)": 89.65, "step": 13580, "token_acc": 0.7437490698020539, "train_speed(iter/s)": 0.460574 }, { "epoch": 0.17627461729852822, "grad_norm": 0.9407855868339539, "learning_rate": 9.960291734824573e-05, "loss": 0.9775432586669922, "memory(GiB)": 89.65, "step": 13585, "token_acc": 0.7481645944345767, "train_speed(iter/s)": 0.460116 }, { "epoch": 0.17633949570018392, "grad_norm": 1.0344775915145874, "learning_rate": 9.960224241800025e-05, "loss": 0.9717880249023437, "memory(GiB)": 89.65, "step": 13590, "token_acc": 0.748271488769791, "train_speed(iter/s)": 0.459658 }, { "epoch": 0.17640437410183962, "grad_norm": 0.8496866822242737, "learning_rate": 9.960156691693535e-05, "loss": 1.0146077156066895, "memory(GiB)": 89.65, "step": 13595, "token_acc": 0.7452028939918214, "train_speed(iter/s)": 0.459177 }, { "epoch": 0.17646925250349532, "grad_norm": 0.883752167224884, "learning_rate": 9.960089084505881e-05, "loss": 0.9634881973266601, "memory(GiB)": 89.65, "step": 13600, "token_acc": 0.7376213825593693, "train_speed(iter/s)": 0.458691 }, { "epoch": 0.17653413090515102, "grad_norm": 0.8503162264823914, "learning_rate": 9.960021420237842e-05, "loss": 0.972197437286377, "memory(GiB)": 89.65, "step": 13605, "token_acc": 0.7498398006407975, "train_speed(iter/s)": 0.458228 }, { "epoch": 0.17659900930680672, "grad_norm": 0.8291974663734436, "learning_rate": 9.959953698890195e-05, "loss": 0.9454692840576172, "memory(GiB)": 89.65, "step": 13610, "token_acc": 0.7409459726894914, "train_speed(iter/s)": 0.457785 }, { "epoch": 0.17666388770846242, "grad_norm": 1.0285571813583374, "learning_rate": 9.959885920463718e-05, "loss": 1.0469247817993164, "memory(GiB)": 89.65, "step": 13615, "token_acc": 0.7380770988877038, "train_speed(iter/s)": 0.457356 }, { "epoch": 0.17672876611011812, "grad_norm": 0.8377014398574829, "learning_rate": 9.959818084959197e-05, "loss": 1.023348045349121, "memory(GiB)": 89.65, "step": 13620, "token_acc": 0.7283601927230436, "train_speed(iter/s)": 0.45687 }, { "epoch": 0.17679364451177382, "grad_norm": 0.9314685463905334, "learning_rate": 9.959750192377406e-05, "loss": 1.0326486587524415, "memory(GiB)": 89.65, "step": 13625, "token_acc": 0.7299078980570275, "train_speed(iter/s)": 0.456412 }, { "epoch": 0.17685852291342952, "grad_norm": 0.7932472825050354, "learning_rate": 9.959682242719128e-05, "loss": 0.9848188400268555, "memory(GiB)": 89.65, "step": 13630, "token_acc": 0.7405426687227271, "train_speed(iter/s)": 0.455937 }, { "epoch": 0.1769234013150852, "grad_norm": 0.8595666885375977, "learning_rate": 9.959614235985149e-05, "loss": 1.0046148300170898, "memory(GiB)": 89.65, "step": 13635, "token_acc": 0.7530823606772974, "train_speed(iter/s)": 0.455507 }, { "epoch": 0.1769882797167409, "grad_norm": 0.8358298540115356, "learning_rate": 9.959546172176247e-05, "loss": 1.0097844123840332, "memory(GiB)": 89.65, "step": 13640, "token_acc": 0.7338433040614709, "train_speed(iter/s)": 0.455025 }, { "epoch": 0.1770531581183966, "grad_norm": 0.9655298590660095, "learning_rate": 9.959478051293207e-05, "loss": 0.9705826759338378, "memory(GiB)": 89.65, "step": 13645, "token_acc": 0.7567920862038509, "train_speed(iter/s)": 0.454594 }, { "epoch": 0.17711803652005229, "grad_norm": 0.9315623641014099, "learning_rate": 9.959409873336813e-05, "loss": 1.007102870941162, "memory(GiB)": 89.65, "step": 13650, "token_acc": 0.7392792430038252, "train_speed(iter/s)": 0.454142 }, { "epoch": 0.17718291492170798, "grad_norm": 0.8015475869178772, "learning_rate": 9.959341638307848e-05, "loss": 1.0277652740478516, "memory(GiB)": 89.65, "step": 13655, "token_acc": 0.7353506730088047, "train_speed(iter/s)": 0.453676 }, { "epoch": 0.17724779332336368, "grad_norm": 0.8332260847091675, "learning_rate": 9.9592733462071e-05, "loss": 0.9996649742126464, "memory(GiB)": 89.65, "step": 13660, "token_acc": 0.7523949648520517, "train_speed(iter/s)": 0.45325 }, { "epoch": 0.17731267172501938, "grad_norm": 0.8225462436676025, "learning_rate": 9.959204997035354e-05, "loss": 0.99197998046875, "memory(GiB)": 89.65, "step": 13665, "token_acc": 0.7202163740179454, "train_speed(iter/s)": 0.452802 }, { "epoch": 0.17737755012667508, "grad_norm": 1.0061509609222412, "learning_rate": 9.959136590793395e-05, "loss": 1.02642879486084, "memory(GiB)": 89.65, "step": 13670, "token_acc": 0.7484170694241504, "train_speed(iter/s)": 0.452357 }, { "epoch": 0.17744242852833078, "grad_norm": 0.78265780210495, "learning_rate": 9.959068127482011e-05, "loss": 0.969517707824707, "memory(GiB)": 89.65, "step": 13675, "token_acc": 0.7368592111094696, "train_speed(iter/s)": 0.451907 }, { "epoch": 0.17750730692998648, "grad_norm": 0.8235393166542053, "learning_rate": 9.958999607101991e-05, "loss": 1.0216730117797852, "memory(GiB)": 89.65, "step": 13680, "token_acc": 0.7423704500627577, "train_speed(iter/s)": 0.451487 }, { "epoch": 0.17757218533164218, "grad_norm": 0.7739363312721252, "learning_rate": 9.958931029654121e-05, "loss": 0.9924886703491211, "memory(GiB)": 89.65, "step": 13685, "token_acc": 0.748695261466146, "train_speed(iter/s)": 0.451012 }, { "epoch": 0.17763706373329788, "grad_norm": 0.8855289220809937, "learning_rate": 9.958862395139195e-05, "loss": 0.993474292755127, "memory(GiB)": 89.65, "step": 13690, "token_acc": 0.7494639253279516, "train_speed(iter/s)": 0.450607 }, { "epoch": 0.17770194213495355, "grad_norm": 0.8357194066047668, "learning_rate": 9.958793703557998e-05, "loss": 1.021781063079834, "memory(GiB)": 89.65, "step": 13695, "token_acc": 0.7349978659837815, "train_speed(iter/s)": 0.450188 }, { "epoch": 0.17776682053660925, "grad_norm": 0.7833422422409058, "learning_rate": 9.958724954911322e-05, "loss": 0.9489372253417969, "memory(GiB)": 89.65, "step": 13700, "token_acc": 0.7446343473431509, "train_speed(iter/s)": 0.449715 }, { "epoch": 0.17783169893826495, "grad_norm": 0.8291795253753662, "learning_rate": 9.958656149199959e-05, "loss": 0.9845722198486329, "memory(GiB)": 89.65, "step": 13705, "token_acc": 0.74626244275387, "train_speed(iter/s)": 0.449275 }, { "epoch": 0.17789657733992065, "grad_norm": 0.8040551543235779, "learning_rate": 9.958587286424699e-05, "loss": 0.9974653244018554, "memory(GiB)": 89.65, "step": 13710, "token_acc": 0.7242490886632724, "train_speed(iter/s)": 0.44882 }, { "epoch": 0.17796145574157635, "grad_norm": 0.8653684854507446, "learning_rate": 9.958518366586336e-05, "loss": 0.989010238647461, "memory(GiB)": 89.65, "step": 13715, "token_acc": 0.7293866096485366, "train_speed(iter/s)": 0.448397 }, { "epoch": 0.17802633414323205, "grad_norm": 0.7165513634681702, "learning_rate": 9.958449389685664e-05, "loss": 0.9992738723754883, "memory(GiB)": 89.65, "step": 13720, "token_acc": 0.7270866084158959, "train_speed(iter/s)": 0.447964 }, { "epoch": 0.17809121254488774, "grad_norm": 0.765049397945404, "learning_rate": 9.958380355723473e-05, "loss": 1.026323127746582, "memory(GiB)": 89.65, "step": 13725, "token_acc": 0.7265216946028061, "train_speed(iter/s)": 0.44753 }, { "epoch": 0.17815609094654344, "grad_norm": 0.8091757297515869, "learning_rate": 9.958311264700563e-05, "loss": 0.9832674026489258, "memory(GiB)": 89.65, "step": 13730, "token_acc": 0.7601220417250762, "train_speed(iter/s)": 0.447081 }, { "epoch": 0.17822096934819914, "grad_norm": 0.884499192237854, "learning_rate": 9.958242116617725e-05, "loss": 0.9855929374694824, "memory(GiB)": 89.65, "step": 13735, "token_acc": 0.7542177027963948, "train_speed(iter/s)": 0.446618 }, { "epoch": 0.17828584774985484, "grad_norm": 0.8487294912338257, "learning_rate": 9.958172911475755e-05, "loss": 0.9524068832397461, "memory(GiB)": 89.65, "step": 13740, "token_acc": 0.7467698398416411, "train_speed(iter/s)": 0.446187 }, { "epoch": 0.17835072615151054, "grad_norm": 0.9429180026054382, "learning_rate": 9.95810364927545e-05, "loss": 1.0122187614440918, "memory(GiB)": 89.65, "step": 13745, "token_acc": 0.7348242811501597, "train_speed(iter/s)": 0.445757 }, { "epoch": 0.17841560455316624, "grad_norm": 0.8385087251663208, "learning_rate": 9.958034330017609e-05, "loss": 1.0220224380493164, "memory(GiB)": 89.65, "step": 13750, "token_acc": 0.7338581392503161, "train_speed(iter/s)": 0.44532 }, { "epoch": 0.1784804829548219, "grad_norm": 0.8524168133735657, "learning_rate": 9.957964953703027e-05, "loss": 0.9506772041320801, "memory(GiB)": 89.65, "step": 13755, "token_acc": 0.7397164473265941, "train_speed(iter/s)": 0.444893 }, { "epoch": 0.1785453613564776, "grad_norm": 0.7792100310325623, "learning_rate": 9.957895520332503e-05, "loss": 0.9375975608825684, "memory(GiB)": 89.65, "step": 13760, "token_acc": 0.7429434068248841, "train_speed(iter/s)": 0.444471 }, { "epoch": 0.1786102397581333, "grad_norm": 0.8958619832992554, "learning_rate": 9.957826029906838e-05, "loss": 0.9874868392944336, "memory(GiB)": 89.65, "step": 13765, "token_acc": 0.7310318519931337, "train_speed(iter/s)": 0.444042 }, { "epoch": 0.178675118159789, "grad_norm": 0.8862139582633972, "learning_rate": 9.957756482426829e-05, "loss": 1.0159671783447266, "memory(GiB)": 89.65, "step": 13770, "token_acc": 0.7406178372276009, "train_speed(iter/s)": 0.443604 }, { "epoch": 0.1787399965614447, "grad_norm": 0.8580374717712402, "learning_rate": 9.957686877893277e-05, "loss": 1.0042848587036133, "memory(GiB)": 89.65, "step": 13775, "token_acc": 0.7420444258756353, "train_speed(iter/s)": 0.443185 }, { "epoch": 0.1788048749631004, "grad_norm": 0.787126362323761, "learning_rate": 9.957617216306985e-05, "loss": 0.9527530670166016, "memory(GiB)": 89.65, "step": 13780, "token_acc": 0.7401536829244767, "train_speed(iter/s)": 0.44277 }, { "epoch": 0.1788697533647561, "grad_norm": 0.8198925852775574, "learning_rate": 9.95754749766875e-05, "loss": 0.9696979522705078, "memory(GiB)": 89.65, "step": 13785, "token_acc": 0.7412241960283966, "train_speed(iter/s)": 0.442325 }, { "epoch": 0.1789346317664118, "grad_norm": 0.7691224813461304, "learning_rate": 9.957477721979382e-05, "loss": 0.9869306564331055, "memory(GiB)": 89.65, "step": 13790, "token_acc": 0.7515590502382151, "train_speed(iter/s)": 0.441908 }, { "epoch": 0.1789995101680675, "grad_norm": 0.8114054799079895, "learning_rate": 9.957407889239676e-05, "loss": 1.0027048110961914, "memory(GiB)": 89.65, "step": 13795, "token_acc": 0.7298900656924521, "train_speed(iter/s)": 0.441504 }, { "epoch": 0.1790643885697232, "grad_norm": 0.9310644268989563, "learning_rate": 9.95733799945044e-05, "loss": 0.9778928756713867, "memory(GiB)": 89.65, "step": 13800, "token_acc": 0.7445128241613603, "train_speed(iter/s)": 0.441086 }, { "epoch": 0.1791292669713789, "grad_norm": 0.8468830585479736, "learning_rate": 9.957268052612478e-05, "loss": 0.9737459182739258, "memory(GiB)": 89.65, "step": 13805, "token_acc": 0.7361530000354823, "train_speed(iter/s)": 0.440662 }, { "epoch": 0.1791941453730346, "grad_norm": 0.8917935490608215, "learning_rate": 9.957198048726594e-05, "loss": 1.0256731986999512, "memory(GiB)": 89.65, "step": 13810, "token_acc": 0.7188850391248491, "train_speed(iter/s)": 0.440288 }, { "epoch": 0.17925902377469027, "grad_norm": 0.8227853775024414, "learning_rate": 9.957127987793593e-05, "loss": 1.0190001487731934, "memory(GiB)": 89.65, "step": 13815, "token_acc": 0.7341990463629907, "train_speed(iter/s)": 0.439886 }, { "epoch": 0.17932390217634597, "grad_norm": 0.8012522459030151, "learning_rate": 9.957057869814284e-05, "loss": 0.993875789642334, "memory(GiB)": 89.65, "step": 13820, "token_acc": 0.7404849269731635, "train_speed(iter/s)": 0.439486 }, { "epoch": 0.17938878057800167, "grad_norm": 0.9113558530807495, "learning_rate": 9.956987694789471e-05, "loss": 1.0185523986816407, "memory(GiB)": 89.65, "step": 13825, "token_acc": 0.7338431951339506, "train_speed(iter/s)": 0.439095 }, { "epoch": 0.17945365897965737, "grad_norm": 0.8954495787620544, "learning_rate": 9.956917462719962e-05, "loss": 0.9862782478332519, "memory(GiB)": 89.65, "step": 13830, "token_acc": 0.7559679562014412, "train_speed(iter/s)": 0.438704 }, { "epoch": 0.17951853738131307, "grad_norm": 0.8360642790794373, "learning_rate": 9.956847173606568e-05, "loss": 0.9750411987304688, "memory(GiB)": 89.65, "step": 13835, "token_acc": 0.7549786371207183, "train_speed(iter/s)": 0.438286 }, { "epoch": 0.17958341578296877, "grad_norm": 0.7150177955627441, "learning_rate": 9.956776827450093e-05, "loss": 0.966497802734375, "memory(GiB)": 89.65, "step": 13840, "token_acc": 0.7441953708766175, "train_speed(iter/s)": 0.437874 }, { "epoch": 0.17964829418462447, "grad_norm": 0.8336069583892822, "learning_rate": 9.95670642425135e-05, "loss": 1.0027304649353028, "memory(GiB)": 89.65, "step": 13845, "token_acc": 0.7516794791883095, "train_speed(iter/s)": 0.437456 }, { "epoch": 0.17971317258628017, "grad_norm": 0.8685814738273621, "learning_rate": 9.956635964011149e-05, "loss": 0.9921808242797852, "memory(GiB)": 89.65, "step": 13850, "token_acc": 0.7380643487285937, "train_speed(iter/s)": 0.437036 }, { "epoch": 0.17977805098793587, "grad_norm": 0.7067486643791199, "learning_rate": 9.956565446730301e-05, "loss": 0.9638375282287598, "memory(GiB)": 89.65, "step": 13855, "token_acc": 0.746054643547503, "train_speed(iter/s)": 0.436608 }, { "epoch": 0.17984292938959157, "grad_norm": 0.8501732349395752, "learning_rate": 9.956494872409615e-05, "loss": 0.9268192291259766, "memory(GiB)": 89.65, "step": 13860, "token_acc": 0.758295206298174, "train_speed(iter/s)": 0.436224 }, { "epoch": 0.17990780779124727, "grad_norm": 0.930038332939148, "learning_rate": 9.956424241049906e-05, "loss": 1.037664794921875, "memory(GiB)": 89.65, "step": 13865, "token_acc": 0.7108844841467623, "train_speed(iter/s)": 0.435839 }, { "epoch": 0.17997268619290296, "grad_norm": 0.9457349181175232, "learning_rate": 9.956353552651987e-05, "loss": 0.9737918853759766, "memory(GiB)": 89.65, "step": 13870, "token_acc": 0.74915514057764, "train_speed(iter/s)": 0.435431 }, { "epoch": 0.18003756459455864, "grad_norm": 0.8754644393920898, "learning_rate": 9.956282807216668e-05, "loss": 0.9924877166748047, "memory(GiB)": 89.65, "step": 13875, "token_acc": 0.7500741839762611, "train_speed(iter/s)": 0.435017 }, { "epoch": 0.18010244299621433, "grad_norm": 0.9177762269973755, "learning_rate": 9.956212004744767e-05, "loss": 0.9860945701599121, "memory(GiB)": 89.65, "step": 13880, "token_acc": 0.7479228719234989, "train_speed(iter/s)": 0.434638 }, { "epoch": 0.18016732139787003, "grad_norm": 0.9157535433769226, "learning_rate": 9.956141145237097e-05, "loss": 0.9639211654663086, "memory(GiB)": 89.65, "step": 13885, "token_acc": 0.7466500208073242, "train_speed(iter/s)": 0.434255 }, { "epoch": 0.18023219979952573, "grad_norm": 0.7990052700042725, "learning_rate": 9.956070228694475e-05, "loss": 0.9967486381530761, "memory(GiB)": 89.65, "step": 13890, "token_acc": 0.7366891267010173, "train_speed(iter/s)": 0.433873 }, { "epoch": 0.18029707820118143, "grad_norm": 0.8327978849411011, "learning_rate": 9.955999255117714e-05, "loss": 0.974824333190918, "memory(GiB)": 89.65, "step": 13895, "token_acc": 0.739755985267035, "train_speed(iter/s)": 0.433475 }, { "epoch": 0.18036195660283713, "grad_norm": 0.7601631879806519, "learning_rate": 9.955928224507632e-05, "loss": 0.963038158416748, "memory(GiB)": 89.65, "step": 13900, "token_acc": 0.7666452902787733, "train_speed(iter/s)": 0.433083 }, { "epoch": 0.18042683500449283, "grad_norm": 0.897587776184082, "learning_rate": 9.955857136865048e-05, "loss": 1.0161394119262694, "memory(GiB)": 89.65, "step": 13905, "token_acc": 0.7254627755492584, "train_speed(iter/s)": 0.432691 }, { "epoch": 0.18049171340614853, "grad_norm": 0.8178868889808655, "learning_rate": 9.955785992190779e-05, "loss": 1.0159525871276855, "memory(GiB)": 89.65, "step": 13910, "token_acc": 0.7407099371236568, "train_speed(iter/s)": 0.432271 }, { "epoch": 0.18055659180780423, "grad_norm": 0.8527428507804871, "learning_rate": 9.955714790485644e-05, "loss": 1.0340029716491699, "memory(GiB)": 89.65, "step": 13915, "token_acc": 0.7222685281655901, "train_speed(iter/s)": 0.431881 }, { "epoch": 0.18062147020945993, "grad_norm": 0.9642379879951477, "learning_rate": 9.955643531750461e-05, "loss": 0.9955332756042481, "memory(GiB)": 89.65, "step": 13920, "token_acc": 0.7367312552653749, "train_speed(iter/s)": 0.431499 }, { "epoch": 0.18068634861111563, "grad_norm": 0.8864215016365051, "learning_rate": 9.955572215986053e-05, "loss": 0.9974623680114746, "memory(GiB)": 89.65, "step": 13925, "token_acc": 0.729776247848537, "train_speed(iter/s)": 0.431108 }, { "epoch": 0.18075122701277133, "grad_norm": 0.8942062854766846, "learning_rate": 9.955500843193238e-05, "loss": 0.9755864143371582, "memory(GiB)": 89.65, "step": 13930, "token_acc": 0.7432228674088095, "train_speed(iter/s)": 0.430717 }, { "epoch": 0.180816105414427, "grad_norm": 0.7650218605995178, "learning_rate": 9.955429413372838e-05, "loss": 0.9469874382019043, "memory(GiB)": 89.65, "step": 13935, "token_acc": 0.735921822400889, "train_speed(iter/s)": 0.430318 }, { "epoch": 0.1808809838160827, "grad_norm": 0.8797993659973145, "learning_rate": 9.955357926525675e-05, "loss": 0.9633398056030273, "memory(GiB)": 89.65, "step": 13940, "token_acc": 0.7367924185759414, "train_speed(iter/s)": 0.429889 }, { "epoch": 0.1809458622177384, "grad_norm": 0.8285903930664062, "learning_rate": 9.955286382652572e-05, "loss": 1.038570022583008, "memory(GiB)": 89.65, "step": 13945, "token_acc": 0.7224368731218046, "train_speed(iter/s)": 0.429481 }, { "epoch": 0.1810107406193941, "grad_norm": 0.9430190324783325, "learning_rate": 9.955214781754352e-05, "loss": 1.0525594711303712, "memory(GiB)": 89.65, "step": 13950, "token_acc": 0.7283669598206587, "train_speed(iter/s)": 0.429098 }, { "epoch": 0.1810756190210498, "grad_norm": 0.8532354235649109, "learning_rate": 9.955143123831841e-05, "loss": 1.003957176208496, "memory(GiB)": 89.65, "step": 13955, "token_acc": 0.7414743640229976, "train_speed(iter/s)": 0.42866 }, { "epoch": 0.1811404974227055, "grad_norm": 0.8224155306816101, "learning_rate": 9.95507140888586e-05, "loss": 0.9588729858398437, "memory(GiB)": 89.65, "step": 13960, "token_acc": 0.7488066107745999, "train_speed(iter/s)": 0.428268 }, { "epoch": 0.1812053758243612, "grad_norm": 1.0149129629135132, "learning_rate": 9.954999636917237e-05, "loss": 0.974093246459961, "memory(GiB)": 89.65, "step": 13965, "token_acc": 0.7494683113568694, "train_speed(iter/s)": 0.427868 }, { "epoch": 0.1812702542260169, "grad_norm": 0.8799230456352234, "learning_rate": 9.954927807926796e-05, "loss": 1.0284423828125, "memory(GiB)": 89.65, "step": 13970, "token_acc": 0.7344791173578997, "train_speed(iter/s)": 0.427483 }, { "epoch": 0.1813351326276726, "grad_norm": 0.889823317527771, "learning_rate": 9.954855921915365e-05, "loss": 1.0150815963745117, "memory(GiB)": 89.65, "step": 13975, "token_acc": 0.7242476958951771, "train_speed(iter/s)": 0.427101 }, { "epoch": 0.1814000110293283, "grad_norm": 0.8112358450889587, "learning_rate": 9.954783978883773e-05, "loss": 0.9862098693847656, "memory(GiB)": 89.65, "step": 13980, "token_acc": 0.7391551902067542, "train_speed(iter/s)": 0.426723 }, { "epoch": 0.181464889430984, "grad_norm": 0.7865927815437317, "learning_rate": 9.954711978832843e-05, "loss": 1.0265029907226562, "memory(GiB)": 89.65, "step": 13985, "token_acc": 0.7184865448168881, "train_speed(iter/s)": 0.426333 }, { "epoch": 0.1815297678326397, "grad_norm": 0.7986543774604797, "learning_rate": 9.954639921763407e-05, "loss": 0.9631309509277344, "memory(GiB)": 89.65, "step": 13990, "token_acc": 0.7506594566077552, "train_speed(iter/s)": 0.425912 }, { "epoch": 0.18159464623429536, "grad_norm": 0.8127240538597107, "learning_rate": 9.954567807676295e-05, "loss": 0.9862800598144531, "memory(GiB)": 89.65, "step": 13995, "token_acc": 0.741711986888177, "train_speed(iter/s)": 0.425541 }, { "epoch": 0.18165952463595106, "grad_norm": 0.8559340834617615, "learning_rate": 9.954495636572334e-05, "loss": 0.9953765869140625, "memory(GiB)": 89.65, "step": 14000, "token_acc": 0.7239819004524887, "train_speed(iter/s)": 0.425175 }, { "epoch": 0.18172440303760676, "grad_norm": 0.8865818381309509, "learning_rate": 9.954423408452358e-05, "loss": 1.0168424606323243, "memory(GiB)": 89.65, "step": 14005, "token_acc": 0.7357446023251523, "train_speed(iter/s)": 0.424812 }, { "epoch": 0.18178928143926246, "grad_norm": 0.8340731859207153, "learning_rate": 9.954351123317194e-05, "loss": 1.0181539535522461, "memory(GiB)": 89.65, "step": 14010, "token_acc": 0.7239970555760029, "train_speed(iter/s)": 0.424423 }, { "epoch": 0.18185415984091816, "grad_norm": 0.8758397698402405, "learning_rate": 9.954278781167679e-05, "loss": 1.0033949851989745, "memory(GiB)": 89.65, "step": 14015, "token_acc": 0.7421097713645648, "train_speed(iter/s)": 0.424044 }, { "epoch": 0.18191903824257386, "grad_norm": 0.7913399934768677, "learning_rate": 9.95420638200464e-05, "loss": 0.9362159729003906, "memory(GiB)": 89.65, "step": 14020, "token_acc": 0.768598404688263, "train_speed(iter/s)": 0.423658 }, { "epoch": 0.18198391664422955, "grad_norm": 0.7848005890846252, "learning_rate": 9.954133925828915e-05, "loss": 0.974311637878418, "memory(GiB)": 89.65, "step": 14025, "token_acc": 0.739949567723343, "train_speed(iter/s)": 0.423257 }, { "epoch": 0.18204879504588525, "grad_norm": 0.9253488779067993, "learning_rate": 9.954061412641334e-05, "loss": 1.0152338981628417, "memory(GiB)": 89.65, "step": 14030, "token_acc": 0.7333722287047841, "train_speed(iter/s)": 0.422892 }, { "epoch": 0.18211367344754095, "grad_norm": 0.8206222057342529, "learning_rate": 9.953988842442732e-05, "loss": 0.9632567405700684, "memory(GiB)": 89.65, "step": 14035, "token_acc": 0.7372166027206138, "train_speed(iter/s)": 0.422527 }, { "epoch": 0.18217855184919665, "grad_norm": 0.7781499028205872, "learning_rate": 9.953916215233947e-05, "loss": 1.0361655235290528, "memory(GiB)": 89.65, "step": 14040, "token_acc": 0.7245547801548259, "train_speed(iter/s)": 0.422149 }, { "epoch": 0.18224343025085235, "grad_norm": 0.8843247294425964, "learning_rate": 9.953843531015814e-05, "loss": 0.9686384201049805, "memory(GiB)": 89.65, "step": 14045, "token_acc": 0.7379617980098364, "train_speed(iter/s)": 0.421787 }, { "epoch": 0.18230830865250805, "grad_norm": 0.9042667150497437, "learning_rate": 9.953770789789168e-05, "loss": 0.9929007530212403, "memory(GiB)": 89.65, "step": 14050, "token_acc": 0.7472467490768984, "train_speed(iter/s)": 0.421415 }, { "epoch": 0.18237318705416372, "grad_norm": 0.9067066311836243, "learning_rate": 9.953697991554845e-05, "loss": 0.9871835708618164, "memory(GiB)": 89.65, "step": 14055, "token_acc": 0.7442875481386393, "train_speed(iter/s)": 0.421049 }, { "epoch": 0.18243806545581942, "grad_norm": 0.7429545521736145, "learning_rate": 9.953625136313685e-05, "loss": 0.9825889587402343, "memory(GiB)": 89.65, "step": 14060, "token_acc": 0.717687074829932, "train_speed(iter/s)": 0.420663 }, { "epoch": 0.18250294385747512, "grad_norm": 0.8716597557067871, "learning_rate": 9.953552224066525e-05, "loss": 0.9952826499938965, "memory(GiB)": 89.65, "step": 14065, "token_acc": 0.7536968094359974, "train_speed(iter/s)": 0.420283 }, { "epoch": 0.18256782225913082, "grad_norm": 0.9516083598136902, "learning_rate": 9.953479254814206e-05, "loss": 0.9981049537658692, "memory(GiB)": 89.65, "step": 14070, "token_acc": 0.7483333947184265, "train_speed(iter/s)": 0.419923 }, { "epoch": 0.18263270066078652, "grad_norm": 0.9305484294891357, "learning_rate": 9.953406228557566e-05, "loss": 1.0028223037719726, "memory(GiB)": 89.65, "step": 14075, "token_acc": 0.7347463809852935, "train_speed(iter/s)": 0.419569 }, { "epoch": 0.18269757906244222, "grad_norm": 0.7639400362968445, "learning_rate": 9.953333145297448e-05, "loss": 0.9653168678283691, "memory(GiB)": 89.65, "step": 14080, "token_acc": 0.7492157778815991, "train_speed(iter/s)": 0.419198 }, { "epoch": 0.18276245746409792, "grad_norm": 0.8614873290061951, "learning_rate": 9.953260005034688e-05, "loss": 1.059273910522461, "memory(GiB)": 89.65, "step": 14085, "token_acc": 0.7102962427745665, "train_speed(iter/s)": 0.418854 }, { "epoch": 0.18282733586575362, "grad_norm": 0.9695978164672852, "learning_rate": 9.953186807770132e-05, "loss": 0.9771751403808594, "memory(GiB)": 89.65, "step": 14090, "token_acc": 0.7534959681535164, "train_speed(iter/s)": 0.418511 }, { "epoch": 0.18289221426740931, "grad_norm": 0.7258839011192322, "learning_rate": 9.95311355350462e-05, "loss": 1.001115894317627, "memory(GiB)": 89.65, "step": 14095, "token_acc": 0.7200390891053925, "train_speed(iter/s)": 0.418148 }, { "epoch": 0.182957092669065, "grad_norm": 0.8597836494445801, "learning_rate": 9.953040242238998e-05, "loss": 1.028127956390381, "memory(GiB)": 89.65, "step": 14100, "token_acc": 0.7495714897871733, "train_speed(iter/s)": 0.41779 }, { "epoch": 0.1830219710707207, "grad_norm": 0.7821646332740784, "learning_rate": 9.952966873974106e-05, "loss": 0.9627819061279297, "memory(GiB)": 89.65, "step": 14105, "token_acc": 0.7302535130689741, "train_speed(iter/s)": 0.417425 }, { "epoch": 0.1830868494723764, "grad_norm": 0.778769850730896, "learning_rate": 9.95289344871079e-05, "loss": 0.9724933624267578, "memory(GiB)": 89.65, "step": 14110, "token_acc": 0.7367432611577552, "train_speed(iter/s)": 0.417066 }, { "epoch": 0.18315172787403208, "grad_norm": 0.848933219909668, "learning_rate": 9.952819966449895e-05, "loss": 1.0342521667480469, "memory(GiB)": 89.65, "step": 14115, "token_acc": 0.7444764358236097, "train_speed(iter/s)": 0.416712 }, { "epoch": 0.18321660627568778, "grad_norm": 0.8631255030632019, "learning_rate": 9.952746427192268e-05, "loss": 0.9899716377258301, "memory(GiB)": 89.65, "step": 14120, "token_acc": 0.741293034427542, "train_speed(iter/s)": 0.416335 }, { "epoch": 0.18328148467734348, "grad_norm": 0.9030176997184753, "learning_rate": 9.952672830938751e-05, "loss": 0.9795166015625, "memory(GiB)": 89.65, "step": 14125, "token_acc": 0.749792005787665, "train_speed(iter/s)": 0.415992 }, { "epoch": 0.18334636307899918, "grad_norm": 0.8623008131980896, "learning_rate": 9.952599177690197e-05, "loss": 0.9879825592041016, "memory(GiB)": 89.65, "step": 14130, "token_acc": 0.7610904472466661, "train_speed(iter/s)": 0.415663 }, { "epoch": 0.18341124148065488, "grad_norm": 0.7986576557159424, "learning_rate": 9.952525467447448e-05, "loss": 0.9958074569702149, "memory(GiB)": 89.65, "step": 14135, "token_acc": 0.7456854188780123, "train_speed(iter/s)": 0.415309 }, { "epoch": 0.18347611988231058, "grad_norm": 0.9382964968681335, "learning_rate": 9.952451700211357e-05, "loss": 0.9930728912353516, "memory(GiB)": 89.65, "step": 14140, "token_acc": 0.7624620528839808, "train_speed(iter/s)": 0.414967 }, { "epoch": 0.18354099828396628, "grad_norm": 0.804015040397644, "learning_rate": 9.952377875982769e-05, "loss": 1.0660520553588868, "memory(GiB)": 89.65, "step": 14145, "token_acc": 0.702408314180074, "train_speed(iter/s)": 0.414615 }, { "epoch": 0.18360587668562198, "grad_norm": 0.7938770055770874, "learning_rate": 9.952303994762536e-05, "loss": 0.9718400001525879, "memory(GiB)": 89.65, "step": 14150, "token_acc": 0.7442547950026395, "train_speed(iter/s)": 0.414254 }, { "epoch": 0.18367075508727768, "grad_norm": 0.7668663263320923, "learning_rate": 9.952230056551506e-05, "loss": 0.995334243774414, "memory(GiB)": 89.65, "step": 14155, "token_acc": 0.7270986171525028, "train_speed(iter/s)": 0.413904 }, { "epoch": 0.18373563348893338, "grad_norm": 0.8268716335296631, "learning_rate": 9.952156061350532e-05, "loss": 1.020294761657715, "memory(GiB)": 89.65, "step": 14160, "token_acc": 0.7024771045250245, "train_speed(iter/s)": 0.413555 }, { "epoch": 0.18380051189058907, "grad_norm": 0.8791439533233643, "learning_rate": 9.952082009160466e-05, "loss": 0.992428970336914, "memory(GiB)": 89.65, "step": 14165, "token_acc": 0.7357702349869452, "train_speed(iter/s)": 0.413219 }, { "epoch": 0.18386539029224477, "grad_norm": 0.8646413683891296, "learning_rate": 9.952007899982156e-05, "loss": 0.9935840606689453, "memory(GiB)": 89.65, "step": 14170, "token_acc": 0.7502023199352577, "train_speed(iter/s)": 0.412881 }, { "epoch": 0.18393026869390044, "grad_norm": 0.9048921465873718, "learning_rate": 9.951933733816461e-05, "loss": 0.9926608085632325, "memory(GiB)": 89.65, "step": 14175, "token_acc": 0.740290042357671, "train_speed(iter/s)": 0.412516 }, { "epoch": 0.18399514709555614, "grad_norm": 0.8430342078208923, "learning_rate": 9.95185951066423e-05, "loss": 1.0030041694641114, "memory(GiB)": 89.65, "step": 14180, "token_acc": 0.7417836565771322, "train_speed(iter/s)": 0.412171 }, { "epoch": 0.18406002549721184, "grad_norm": 0.8495647311210632, "learning_rate": 9.951785230526318e-05, "loss": 1.004305648803711, "memory(GiB)": 89.65, "step": 14185, "token_acc": 0.7495267895082474, "train_speed(iter/s)": 0.411814 }, { "epoch": 0.18412490389886754, "grad_norm": 0.8705364465713501, "learning_rate": 9.951710893403581e-05, "loss": 0.9879872322082519, "memory(GiB)": 89.65, "step": 14190, "token_acc": 0.7475390851187029, "train_speed(iter/s)": 0.411457 }, { "epoch": 0.18418978230052324, "grad_norm": 1.024604320526123, "learning_rate": 9.951636499296872e-05, "loss": 0.9986550331115722, "memory(GiB)": 89.65, "step": 14195, "token_acc": 0.7101994730899511, "train_speed(iter/s)": 0.411129 }, { "epoch": 0.18425466070217894, "grad_norm": 0.79970782995224, "learning_rate": 9.951562048207051e-05, "loss": 1.0263943672180176, "memory(GiB)": 89.65, "step": 14200, "token_acc": 0.7347604949517953, "train_speed(iter/s)": 0.410799 }, { "epoch": 0.18431953910383464, "grad_norm": 0.723822832107544, "learning_rate": 9.951487540134971e-05, "loss": 0.9899527549743652, "memory(GiB)": 89.65, "step": 14205, "token_acc": 0.7458575425005379, "train_speed(iter/s)": 0.410443 }, { "epoch": 0.18438441750549034, "grad_norm": 0.7086465358734131, "learning_rate": 9.951412975081492e-05, "loss": 0.9993785858154297, "memory(GiB)": 89.65, "step": 14210, "token_acc": 0.7284273074537203, "train_speed(iter/s)": 0.410085 }, { "epoch": 0.18444929590714604, "grad_norm": 0.874809980392456, "learning_rate": 9.951338353047471e-05, "loss": 1.0047701835632323, "memory(GiB)": 89.65, "step": 14215, "token_acc": 0.7453660991361235, "train_speed(iter/s)": 0.409755 }, { "epoch": 0.18451417430880174, "grad_norm": 0.903545081615448, "learning_rate": 9.951263674033767e-05, "loss": 0.9892549514770508, "memory(GiB)": 89.65, "step": 14220, "token_acc": 0.7273939436471455, "train_speed(iter/s)": 0.409402 }, { "epoch": 0.18457905271045744, "grad_norm": 0.8081318736076355, "learning_rate": 9.951188938041239e-05, "loss": 0.9217363357543945, "memory(GiB)": 89.65, "step": 14225, "token_acc": 0.7865992919949755, "train_speed(iter/s)": 0.40903 }, { "epoch": 0.18464393111211314, "grad_norm": 0.8523758053779602, "learning_rate": 9.951114145070747e-05, "loss": 0.9804935455322266, "memory(GiB)": 89.65, "step": 14230, "token_acc": 0.7164365883227093, "train_speed(iter/s)": 0.408701 }, { "epoch": 0.1847088095137688, "grad_norm": 0.7993977665901184, "learning_rate": 9.951039295123153e-05, "loss": 0.9994077682495117, "memory(GiB)": 89.65, "step": 14235, "token_acc": 0.7513353115727003, "train_speed(iter/s)": 0.408378 }, { "epoch": 0.1847736879154245, "grad_norm": 1.1075170040130615, "learning_rate": 9.950964388199317e-05, "loss": 0.9860298156738281, "memory(GiB)": 89.65, "step": 14240, "token_acc": 0.7521703314111907, "train_speed(iter/s)": 0.408043 }, { "epoch": 0.1848385663170802, "grad_norm": 0.8479109406471252, "learning_rate": 9.950889424300103e-05, "loss": 0.9508421897888184, "memory(GiB)": 89.65, "step": 14245, "token_acc": 0.7544238239102288, "train_speed(iter/s)": 0.407708 }, { "epoch": 0.1849034447187359, "grad_norm": 0.875390350818634, "learning_rate": 9.950814403426369e-05, "loss": 0.9892765998840332, "memory(GiB)": 89.65, "step": 14250, "token_acc": 0.729876419505678, "train_speed(iter/s)": 0.407394 }, { "epoch": 0.1849683231203916, "grad_norm": 0.6714265942573547, "learning_rate": 9.950739325578984e-05, "loss": 0.9272948265075683, "memory(GiB)": 89.65, "step": 14255, "token_acc": 0.7549114220721032, "train_speed(iter/s)": 0.407066 }, { "epoch": 0.1850332015220473, "grad_norm": 0.729059100151062, "learning_rate": 9.950664190758808e-05, "loss": 1.0066457748413087, "memory(GiB)": 89.65, "step": 14260, "token_acc": 0.7416158254760584, "train_speed(iter/s)": 0.406731 }, { "epoch": 0.185098079923703, "grad_norm": 0.8019087314605713, "learning_rate": 9.950588998966708e-05, "loss": 1.020518970489502, "memory(GiB)": 89.65, "step": 14265, "token_acc": 0.7271755556374627, "train_speed(iter/s)": 0.406408 }, { "epoch": 0.1851629583253587, "grad_norm": 0.8642765879631042, "learning_rate": 9.950513750203548e-05, "loss": 1.0105684280395508, "memory(GiB)": 89.65, "step": 14270, "token_acc": 0.7371282253299193, "train_speed(iter/s)": 0.406071 }, { "epoch": 0.1852278367270144, "grad_norm": 0.7885985970497131, "learning_rate": 9.950438444470195e-05, "loss": 0.9731883049011231, "memory(GiB)": 89.65, "step": 14275, "token_acc": 0.7441014899419042, "train_speed(iter/s)": 0.405725 }, { "epoch": 0.1852927151286701, "grad_norm": 1.1460129022598267, "learning_rate": 9.950363081767514e-05, "loss": 0.9690214157104492, "memory(GiB)": 89.65, "step": 14280, "token_acc": 0.7429881240298885, "train_speed(iter/s)": 0.405389 }, { "epoch": 0.1853575935303258, "grad_norm": 0.7936517000198364, "learning_rate": 9.950287662096374e-05, "loss": 1.0058222770690919, "memory(GiB)": 89.65, "step": 14285, "token_acc": 0.7419997655608955, "train_speed(iter/s)": 0.405064 }, { "epoch": 0.1854224719319815, "grad_norm": 0.7503355145454407, "learning_rate": 9.950212185457641e-05, "loss": 0.9360671997070312, "memory(GiB)": 89.65, "step": 14290, "token_acc": 0.7408399819358723, "train_speed(iter/s)": 0.404751 }, { "epoch": 0.18548735033363717, "grad_norm": 0.8771044611930847, "learning_rate": 9.950136651852187e-05, "loss": 0.9366191864013672, "memory(GiB)": 89.65, "step": 14295, "token_acc": 0.7597459631106006, "train_speed(iter/s)": 0.404406 }, { "epoch": 0.18555222873529287, "grad_norm": 0.8544950485229492, "learning_rate": 9.950061061280878e-05, "loss": 0.9974760055541992, "memory(GiB)": 89.65, "step": 14300, "token_acc": 0.719094694681967, "train_speed(iter/s)": 0.404089 }, { "epoch": 0.18561710713694857, "grad_norm": 0.8457149863243103, "learning_rate": 9.949985413744585e-05, "loss": 0.9296605110168457, "memory(GiB)": 89.65, "step": 14305, "token_acc": 0.7568290034249436, "train_speed(iter/s)": 0.403749 }, { "epoch": 0.18568198553860427, "grad_norm": 0.731805145740509, "learning_rate": 9.949909709244176e-05, "loss": 0.982357406616211, "memory(GiB)": 89.65, "step": 14310, "token_acc": 0.7411547002220578, "train_speed(iter/s)": 0.403419 }, { "epoch": 0.18574686394025997, "grad_norm": 0.9114458560943604, "learning_rate": 9.949833947780527e-05, "loss": 1.0882102966308593, "memory(GiB)": 89.65, "step": 14315, "token_acc": 0.7045285820341499, "train_speed(iter/s)": 0.403102 }, { "epoch": 0.18581174234191566, "grad_norm": 0.8343044519424438, "learning_rate": 9.949758129354508e-05, "loss": 0.9708288192749024, "memory(GiB)": 89.65, "step": 14320, "token_acc": 0.7402752231078411, "train_speed(iter/s)": 0.402786 }, { "epoch": 0.18587662074357136, "grad_norm": 0.8449217677116394, "learning_rate": 9.94968225396699e-05, "loss": 0.9914234161376954, "memory(GiB)": 89.65, "step": 14325, "token_acc": 0.744666577217228, "train_speed(iter/s)": 0.40245 }, { "epoch": 0.18594149914522706, "grad_norm": 0.8902744650840759, "learning_rate": 9.949606321618848e-05, "loss": 1.0369964599609376, "memory(GiB)": 89.65, "step": 14330, "token_acc": 0.7314174810173993, "train_speed(iter/s)": 0.402146 }, { "epoch": 0.18600637754688276, "grad_norm": 0.9819983243942261, "learning_rate": 9.949530332310954e-05, "loss": 0.9909199714660645, "memory(GiB)": 89.65, "step": 14335, "token_acc": 0.7446969946887474, "train_speed(iter/s)": 0.401825 }, { "epoch": 0.18607125594853846, "grad_norm": 0.9776026010513306, "learning_rate": 9.949454286044184e-05, "loss": 1.0149593353271484, "memory(GiB)": 89.65, "step": 14340, "token_acc": 0.7300779442645122, "train_speed(iter/s)": 0.401506 }, { "epoch": 0.18613613435019416, "grad_norm": 0.8561179041862488, "learning_rate": 9.949378182819412e-05, "loss": 1.0014421463012695, "memory(GiB)": 89.65, "step": 14345, "token_acc": 0.7510634618710013, "train_speed(iter/s)": 0.401175 }, { "epoch": 0.18620101275184986, "grad_norm": 0.9900202751159668, "learning_rate": 9.949302022637514e-05, "loss": 0.9671016693115234, "memory(GiB)": 89.65, "step": 14350, "token_acc": 0.7521891089481003, "train_speed(iter/s)": 0.400821 }, { "epoch": 0.18626589115350553, "grad_norm": 0.8480575680732727, "learning_rate": 9.949225805499369e-05, "loss": 0.964535903930664, "memory(GiB)": 89.65, "step": 14355, "token_acc": 0.767915646770853, "train_speed(iter/s)": 0.400502 }, { "epoch": 0.18633076955516123, "grad_norm": 0.83262699842453, "learning_rate": 9.94914953140585e-05, "loss": 1.0371782302856445, "memory(GiB)": 89.65, "step": 14360, "token_acc": 0.7332031943212067, "train_speed(iter/s)": 0.400174 }, { "epoch": 0.18639564795681693, "grad_norm": 0.8213477730751038, "learning_rate": 9.949073200357838e-05, "loss": 0.9926937103271485, "memory(GiB)": 89.65, "step": 14365, "token_acc": 0.7574452003023432, "train_speed(iter/s)": 0.399858 }, { "epoch": 0.18646052635847263, "grad_norm": 0.916416347026825, "learning_rate": 9.94899681235621e-05, "loss": 0.9634407043457032, "memory(GiB)": 89.65, "step": 14370, "token_acc": 0.7372501568522004, "train_speed(iter/s)": 0.399568 }, { "epoch": 0.18652540476012833, "grad_norm": 0.8739131093025208, "learning_rate": 9.948920367401844e-05, "loss": 0.9804075241088868, "memory(GiB)": 89.65, "step": 14375, "token_acc": 0.7717984713773203, "train_speed(iter/s)": 0.399271 }, { "epoch": 0.18659028316178403, "grad_norm": 0.7644845843315125, "learning_rate": 9.948843865495622e-05, "loss": 0.9762514114379883, "memory(GiB)": 89.65, "step": 14380, "token_acc": 0.7341142792572225, "train_speed(iter/s)": 0.398945 }, { "epoch": 0.18665516156343973, "grad_norm": 0.8541745543479919, "learning_rate": 9.948767306638422e-05, "loss": 1.0108802795410157, "memory(GiB)": 89.65, "step": 14385, "token_acc": 0.7185666878959541, "train_speed(iter/s)": 0.398621 }, { "epoch": 0.18672003996509542, "grad_norm": 0.9204621911048889, "learning_rate": 9.948690690831128e-05, "loss": 1.0233467102050782, "memory(GiB)": 89.65, "step": 14390, "token_acc": 0.7367458866544789, "train_speed(iter/s)": 0.398317 }, { "epoch": 0.18678491836675112, "grad_norm": 0.7998715043067932, "learning_rate": 9.948614018074619e-05, "loss": 0.9740880012512207, "memory(GiB)": 89.65, "step": 14395, "token_acc": 0.7536466774716369, "train_speed(iter/s)": 0.397996 }, { "epoch": 0.18684979676840682, "grad_norm": 0.7326331734657288, "learning_rate": 9.94853728836978e-05, "loss": 0.9929349899291993, "memory(GiB)": 89.65, "step": 14400, "token_acc": 0.7218959596932063, "train_speed(iter/s)": 0.397688 }, { "epoch": 0.18691467517006252, "grad_norm": 0.7608689665794373, "learning_rate": 9.94846050171749e-05, "loss": 1.0030829429626464, "memory(GiB)": 89.65, "step": 14405, "token_acc": 0.7333896396396397, "train_speed(iter/s)": 0.397385 }, { "epoch": 0.1869795535717182, "grad_norm": 0.7727503776550293, "learning_rate": 9.948383658118636e-05, "loss": 0.9745906829833985, "memory(GiB)": 89.65, "step": 14410, "token_acc": 0.7325287061154114, "train_speed(iter/s)": 0.397068 }, { "epoch": 0.1870444319733739, "grad_norm": 0.8212957382202148, "learning_rate": 9.948306757574102e-05, "loss": 0.9612700462341308, "memory(GiB)": 89.65, "step": 14415, "token_acc": 0.7506077731822056, "train_speed(iter/s)": 0.396757 }, { "epoch": 0.1871093103750296, "grad_norm": 0.945887565612793, "learning_rate": 9.948229800084772e-05, "loss": 1.0080209732055665, "memory(GiB)": 89.65, "step": 14420, "token_acc": 0.7271436946527199, "train_speed(iter/s)": 0.396458 }, { "epoch": 0.1871741887766853, "grad_norm": 0.8524113297462463, "learning_rate": 9.948152785651531e-05, "loss": 0.9910593032836914, "memory(GiB)": 89.65, "step": 14425, "token_acc": 0.7202660665828314, "train_speed(iter/s)": 0.396153 }, { "epoch": 0.187239067178341, "grad_norm": 0.8475600481033325, "learning_rate": 9.948075714275267e-05, "loss": 0.9569486618041992, "memory(GiB)": 89.65, "step": 14430, "token_acc": 0.7446240545751149, "train_speed(iter/s)": 0.395844 }, { "epoch": 0.1873039455799967, "grad_norm": 0.7718340158462524, "learning_rate": 9.947998585956866e-05, "loss": 0.9747758865356445, "memory(GiB)": 89.65, "step": 14435, "token_acc": 0.7336280456461396, "train_speed(iter/s)": 0.395521 }, { "epoch": 0.1873688239816524, "grad_norm": 0.8512265086174011, "learning_rate": 9.947921400697215e-05, "loss": 0.9791158676147461, "memory(GiB)": 89.65, "step": 14440, "token_acc": 0.7442495126705653, "train_speed(iter/s)": 0.395197 }, { "epoch": 0.1874337023833081, "grad_norm": 0.8774914741516113, "learning_rate": 9.947844158497204e-05, "loss": 0.9941203117370605, "memory(GiB)": 89.65, "step": 14445, "token_acc": 0.7256529167683671, "train_speed(iter/s)": 0.394895 }, { "epoch": 0.1874985807849638, "grad_norm": 0.8207067847251892, "learning_rate": 9.947766859357721e-05, "loss": 0.9692419052124024, "memory(GiB)": 89.65, "step": 14450, "token_acc": 0.7336222015887754, "train_speed(iter/s)": 0.394579 }, { "epoch": 0.18756345918661949, "grad_norm": 0.7795988321304321, "learning_rate": 9.947689503279654e-05, "loss": 0.9878180503845215, "memory(GiB)": 89.65, "step": 14455, "token_acc": 0.7346298414773655, "train_speed(iter/s)": 0.394266 }, { "epoch": 0.18762833758827518, "grad_norm": 0.7653995752334595, "learning_rate": 9.947612090263896e-05, "loss": 1.0278564453125, "memory(GiB)": 89.65, "step": 14460, "token_acc": 0.7093457611441175, "train_speed(iter/s)": 0.393959 }, { "epoch": 0.18769321598993088, "grad_norm": 0.8335837125778198, "learning_rate": 9.947534620311336e-05, "loss": 0.974120044708252, "memory(GiB)": 89.65, "step": 14465, "token_acc": 0.739409984871407, "train_speed(iter/s)": 0.393656 }, { "epoch": 0.18775809439158656, "grad_norm": 0.8390584588050842, "learning_rate": 9.947457093422868e-05, "loss": 0.9590273857116699, "memory(GiB)": 89.65, "step": 14470, "token_acc": 0.7472726009311376, "train_speed(iter/s)": 0.393349 }, { "epoch": 0.18782297279324225, "grad_norm": 0.7705576419830322, "learning_rate": 9.947379509599379e-05, "loss": 0.9507960319519043, "memory(GiB)": 89.65, "step": 14475, "token_acc": 0.7425512459041378, "train_speed(iter/s)": 0.393022 }, { "epoch": 0.18788785119489795, "grad_norm": 0.9068230390548706, "learning_rate": 9.947301868841766e-05, "loss": 0.9494183540344239, "memory(GiB)": 89.65, "step": 14480, "token_acc": 0.7731474794164379, "train_speed(iter/s)": 0.392727 }, { "epoch": 0.18795272959655365, "grad_norm": 0.7989099621772766, "learning_rate": 9.947224171150922e-05, "loss": 0.9816526412963867, "memory(GiB)": 89.65, "step": 14485, "token_acc": 0.7410554285361182, "train_speed(iter/s)": 0.39243 }, { "epoch": 0.18801760799820935, "grad_norm": 0.8666488528251648, "learning_rate": 9.94714641652774e-05, "loss": 1.0389460563659667, "memory(GiB)": 89.65, "step": 14490, "token_acc": 0.7277535907555844, "train_speed(iter/s)": 0.392151 }, { "epoch": 0.18808248639986505, "grad_norm": 0.9362528920173645, "learning_rate": 9.947068604973115e-05, "loss": 0.9659055709838867, "memory(GiB)": 89.65, "step": 14495, "token_acc": 0.7699623120002818, "train_speed(iter/s)": 0.391829 }, { "epoch": 0.18814736480152075, "grad_norm": 0.8853940963745117, "learning_rate": 9.946990736487945e-05, "loss": 1.0062637329101562, "memory(GiB)": 89.65, "step": 14500, "token_acc": 0.7397833579517479, "train_speed(iter/s)": 0.391538 }, { "epoch": 0.18821224320317645, "grad_norm": 0.8173947930335999, "learning_rate": 9.946912811073123e-05, "loss": 0.9746129035949707, "memory(GiB)": 89.65, "step": 14505, "token_acc": 0.7603488807877073, "train_speed(iter/s)": 0.39122 }, { "epoch": 0.18827712160483215, "grad_norm": 0.863067626953125, "learning_rate": 9.946834828729545e-05, "loss": 0.9373590469360351, "memory(GiB)": 89.65, "step": 14510, "token_acc": 0.7858680425052107, "train_speed(iter/s)": 0.39092 }, { "epoch": 0.18834200000648785, "grad_norm": 0.903656542301178, "learning_rate": 9.946756789458112e-05, "loss": 0.9705121994018555, "memory(GiB)": 89.65, "step": 14515, "token_acc": 0.7337418001398555, "train_speed(iter/s)": 0.390622 }, { "epoch": 0.18840687840814355, "grad_norm": 0.908970057964325, "learning_rate": 9.946678693259721e-05, "loss": 1.0185275077819824, "memory(GiB)": 89.65, "step": 14520, "token_acc": 0.7128052155729134, "train_speed(iter/s)": 0.390313 }, { "epoch": 0.18847175680979925, "grad_norm": 0.9090825319290161, "learning_rate": 9.946600540135269e-05, "loss": 1.0075323104858398, "memory(GiB)": 89.65, "step": 14525, "token_acc": 0.7522431716668643, "train_speed(iter/s)": 0.390029 }, { "epoch": 0.18853663521145492, "grad_norm": 0.8992526531219482, "learning_rate": 9.946522330085656e-05, "loss": 0.9848674774169922, "memory(GiB)": 89.65, "step": 14530, "token_acc": 0.7365702479338843, "train_speed(iter/s)": 0.389727 }, { "epoch": 0.18860151361311062, "grad_norm": 0.8671475648880005, "learning_rate": 9.946444063111781e-05, "loss": 0.968842601776123, "memory(GiB)": 89.65, "step": 14535, "token_acc": 0.73919473012032, "train_speed(iter/s)": 0.389422 }, { "epoch": 0.18866639201476632, "grad_norm": 0.8081725239753723, "learning_rate": 9.946365739214549e-05, "loss": 1.0197504043579102, "memory(GiB)": 89.65, "step": 14540, "token_acc": 0.7209522308056124, "train_speed(iter/s)": 0.38913 }, { "epoch": 0.18873127041642201, "grad_norm": 0.872215747833252, "learning_rate": 9.946287358394856e-05, "loss": 1.0157673835754395, "memory(GiB)": 89.65, "step": 14545, "token_acc": 0.7634601136082984, "train_speed(iter/s)": 0.388823 }, { "epoch": 0.1887961488180777, "grad_norm": 0.9072367548942566, "learning_rate": 9.946208920653608e-05, "loss": 1.0130151748657226, "memory(GiB)": 89.65, "step": 14550, "token_acc": 0.7415481504550331, "train_speed(iter/s)": 0.388526 }, { "epoch": 0.1888610272197334, "grad_norm": 0.9343808889389038, "learning_rate": 9.946130425991707e-05, "loss": 1.0194817543029786, "memory(GiB)": 89.65, "step": 14555, "token_acc": 0.7329839297490541, "train_speed(iter/s)": 0.388219 }, { "epoch": 0.1889259056213891, "grad_norm": 0.8401597142219543, "learning_rate": 9.946051874410054e-05, "loss": 1.0352498054504395, "memory(GiB)": 89.65, "step": 14560, "token_acc": 0.7141534956571266, "train_speed(iter/s)": 0.38794 }, { "epoch": 0.1889907840230448, "grad_norm": 0.9426642060279846, "learning_rate": 9.945973265909554e-05, "loss": 0.9936470031738281, "memory(GiB)": 89.65, "step": 14565, "token_acc": 0.7350380205665655, "train_speed(iter/s)": 0.387667 }, { "epoch": 0.1890556624247005, "grad_norm": 0.7890317440032959, "learning_rate": 9.945894600491112e-05, "loss": 0.9600719451904297, "memory(GiB)": 89.65, "step": 14570, "token_acc": 0.7596684032399132, "train_speed(iter/s)": 0.387366 }, { "epoch": 0.1891205408263562, "grad_norm": 0.9586418271064758, "learning_rate": 9.945815878155633e-05, "loss": 1.0091131210327149, "memory(GiB)": 89.65, "step": 14575, "token_acc": 0.7467976458369396, "train_speed(iter/s)": 0.387081 }, { "epoch": 0.1891854192280119, "grad_norm": 0.8222958445549011, "learning_rate": 9.945737098904025e-05, "loss": 0.9808709144592285, "memory(GiB)": 89.65, "step": 14580, "token_acc": 0.741625015490107, "train_speed(iter/s)": 0.386806 }, { "epoch": 0.1892502976296676, "grad_norm": 0.9330892562866211, "learning_rate": 9.94565826273719e-05, "loss": 1.0664661407470704, "memory(GiB)": 89.65, "step": 14585, "token_acc": 0.7147309833024119, "train_speed(iter/s)": 0.386526 }, { "epoch": 0.18931517603132328, "grad_norm": 0.825862467288971, "learning_rate": 9.94557936965604e-05, "loss": 0.9762276649475098, "memory(GiB)": 89.65, "step": 14590, "token_acc": 0.7425062012492155, "train_speed(iter/s)": 0.386232 }, { "epoch": 0.18938005443297898, "grad_norm": 0.8659449815750122, "learning_rate": 9.945500419661479e-05, "loss": 1.0190958023071288, "memory(GiB)": 89.65, "step": 14595, "token_acc": 0.7370904957111853, "train_speed(iter/s)": 0.385957 }, { "epoch": 0.18944493283463468, "grad_norm": 0.8391568660736084, "learning_rate": 9.945421412754421e-05, "loss": 0.9576831817626953, "memory(GiB)": 89.65, "step": 14600, "token_acc": 0.7262829937688161, "train_speed(iter/s)": 0.385678 }, { "epoch": 0.18950981123629038, "grad_norm": 0.8192936778068542, "learning_rate": 9.945342348935769e-05, "loss": 0.9751107215881347, "memory(GiB)": 89.65, "step": 14605, "token_acc": 0.7403838267858608, "train_speed(iter/s)": 0.385385 }, { "epoch": 0.18957468963794608, "grad_norm": 0.8720535039901733, "learning_rate": 9.945263228206436e-05, "loss": 0.9886520385742188, "memory(GiB)": 89.65, "step": 14610, "token_acc": 0.7309120699071545, "train_speed(iter/s)": 0.385083 }, { "epoch": 0.18963956803960177, "grad_norm": 0.8501237630844116, "learning_rate": 9.945184050567333e-05, "loss": 1.0192411422729493, "memory(GiB)": 89.65, "step": 14615, "token_acc": 0.7407318986392005, "train_speed(iter/s)": 0.384791 }, { "epoch": 0.18970444644125747, "grad_norm": 0.8760257363319397, "learning_rate": 9.945104816019368e-05, "loss": 0.968073844909668, "memory(GiB)": 89.65, "step": 14620, "token_acc": 0.7365769106739761, "train_speed(iter/s)": 0.384525 }, { "epoch": 0.18976932484291317, "grad_norm": 0.8440539836883545, "learning_rate": 9.945025524563457e-05, "loss": 0.9977526664733887, "memory(GiB)": 89.65, "step": 14625, "token_acc": 0.7463579697239537, "train_speed(iter/s)": 0.384241 }, { "epoch": 0.18983420324456887, "grad_norm": 0.9226705431938171, "learning_rate": 9.944946176200509e-05, "loss": 0.9992505073547363, "memory(GiB)": 89.65, "step": 14630, "token_acc": 0.7434448462929476, "train_speed(iter/s)": 0.383954 }, { "epoch": 0.18989908164622457, "grad_norm": 0.8821514248847961, "learning_rate": 9.944866770931441e-05, "loss": 0.9715032577514648, "memory(GiB)": 89.65, "step": 14635, "token_acc": 0.7532102728731942, "train_speed(iter/s)": 0.383674 }, { "epoch": 0.18996396004788027, "grad_norm": 0.8781871199607849, "learning_rate": 9.944787308757161e-05, "loss": 0.9473139762878418, "memory(GiB)": 89.65, "step": 14640, "token_acc": 0.7713879677877783, "train_speed(iter/s)": 0.383391 }, { "epoch": 0.19002883844953597, "grad_norm": 0.8404408097267151, "learning_rate": 9.944707789678592e-05, "loss": 0.9582944869995117, "memory(GiB)": 89.65, "step": 14645, "token_acc": 0.7561099597231455, "train_speed(iter/s)": 0.383107 }, { "epoch": 0.19009371685119164, "grad_norm": 0.8405942916870117, "learning_rate": 9.94462821369664e-05, "loss": 0.9808954238891602, "memory(GiB)": 89.65, "step": 14650, "token_acc": 0.7567846712728827, "train_speed(iter/s)": 0.382826 }, { "epoch": 0.19015859525284734, "grad_norm": 0.8177167177200317, "learning_rate": 9.944548580812225e-05, "loss": 0.9975932121276856, "memory(GiB)": 89.65, "step": 14655, "token_acc": 0.7426942026051158, "train_speed(iter/s)": 0.382539 }, { "epoch": 0.19022347365450304, "grad_norm": 0.8662164211273193, "learning_rate": 9.944468891026264e-05, "loss": 0.9945512771606445, "memory(GiB)": 89.65, "step": 14660, "token_acc": 0.7220266901305177, "train_speed(iter/s)": 0.382254 }, { "epoch": 0.19028835205615874, "grad_norm": 0.8364856839179993, "learning_rate": 9.944389144339674e-05, "loss": 0.9995327949523926, "memory(GiB)": 89.65, "step": 14665, "token_acc": 0.7492550655542313, "train_speed(iter/s)": 0.381985 }, { "epoch": 0.19035323045781444, "grad_norm": 0.8706039190292358, "learning_rate": 9.94430934075337e-05, "loss": 0.9837055206298828, "memory(GiB)": 89.65, "step": 14670, "token_acc": 0.7467710961328463, "train_speed(iter/s)": 0.381696 }, { "epoch": 0.19041810885947014, "grad_norm": 0.8702878952026367, "learning_rate": 9.944229480268273e-05, "loss": 0.9695758819580078, "memory(GiB)": 89.65, "step": 14675, "token_acc": 0.7261388286334056, "train_speed(iter/s)": 0.381421 }, { "epoch": 0.19048298726112584, "grad_norm": 0.8938469886779785, "learning_rate": 9.944149562885302e-05, "loss": 1.028327178955078, "memory(GiB)": 89.65, "step": 14680, "token_acc": 0.7326315079124068, "train_speed(iter/s)": 0.38115 }, { "epoch": 0.19054786566278153, "grad_norm": 1.013069987297058, "learning_rate": 9.944069588605376e-05, "loss": 1.0383577346801758, "memory(GiB)": 89.65, "step": 14685, "token_acc": 0.7306960359557841, "train_speed(iter/s)": 0.380866 }, { "epoch": 0.19061274406443723, "grad_norm": 0.8544966578483582, "learning_rate": 9.943989557429413e-05, "loss": 1.0050540924072267, "memory(GiB)": 89.65, "step": 14690, "token_acc": 0.7294090118999866, "train_speed(iter/s)": 0.380591 }, { "epoch": 0.19067762246609293, "grad_norm": 0.8911079168319702, "learning_rate": 9.94390946935834e-05, "loss": 1.0102435111999513, "memory(GiB)": 89.65, "step": 14695, "token_acc": 0.7358308286818377, "train_speed(iter/s)": 0.380315 }, { "epoch": 0.19074250086774863, "grad_norm": 0.8038025498390198, "learning_rate": 9.943829324393073e-05, "loss": 0.9830583572387696, "memory(GiB)": 89.65, "step": 14700, "token_acc": 0.7519944397437447, "train_speed(iter/s)": 0.380056 }, { "epoch": 0.19080737926940433, "grad_norm": 0.9062402248382568, "learning_rate": 9.943749122534537e-05, "loss": 0.9347715377807617, "memory(GiB)": 89.65, "step": 14705, "token_acc": 0.7461533130024951, "train_speed(iter/s)": 0.379788 }, { "epoch": 0.19087225767106, "grad_norm": 0.8496866822242737, "learning_rate": 9.943668863783655e-05, "loss": 0.9664566993713379, "memory(GiB)": 89.65, "step": 14710, "token_acc": 0.7308744748911242, "train_speed(iter/s)": 0.379502 }, { "epoch": 0.1909371360727157, "grad_norm": 0.9023811221122742, "learning_rate": 9.943588548141348e-05, "loss": 0.9792156219482422, "memory(GiB)": 89.65, "step": 14715, "token_acc": 0.7387539936102236, "train_speed(iter/s)": 0.379218 }, { "epoch": 0.1910020144743714, "grad_norm": 0.9285873770713806, "learning_rate": 9.943508175608543e-05, "loss": 0.9989458084106445, "memory(GiB)": 89.65, "step": 14720, "token_acc": 0.74541520343372, "train_speed(iter/s)": 0.378957 }, { "epoch": 0.1910668928760271, "grad_norm": 0.888166606426239, "learning_rate": 9.943427746186164e-05, "loss": 1.0223611831665038, "memory(GiB)": 89.65, "step": 14725, "token_acc": 0.7103723922791967, "train_speed(iter/s)": 0.378681 }, { "epoch": 0.1911317712776828, "grad_norm": 0.8873077034950256, "learning_rate": 9.943347259875137e-05, "loss": 0.9651453018188476, "memory(GiB)": 89.65, "step": 14730, "token_acc": 0.7643035268970431, "train_speed(iter/s)": 0.378419 }, { "epoch": 0.1911966496793385, "grad_norm": 0.790394127368927, "learning_rate": 9.943266716676387e-05, "loss": 0.9360391616821289, "memory(GiB)": 89.65, "step": 14735, "token_acc": 0.7393857977443844, "train_speed(iter/s)": 0.378163 }, { "epoch": 0.1912615280809942, "grad_norm": 0.8316014409065247, "learning_rate": 9.943186116590843e-05, "loss": 1.010297679901123, "memory(GiB)": 89.65, "step": 14740, "token_acc": 0.727100073046019, "train_speed(iter/s)": 0.377893 }, { "epoch": 0.1913264064826499, "grad_norm": 0.9362151026725769, "learning_rate": 9.943105459619431e-05, "loss": 1.0400161743164062, "memory(GiB)": 89.65, "step": 14745, "token_acc": 0.7455632642109702, "train_speed(iter/s)": 0.377639 }, { "epoch": 0.1913912848843056, "grad_norm": 0.837733805179596, "learning_rate": 9.943024745763079e-05, "loss": 0.9631664276123046, "memory(GiB)": 89.65, "step": 14750, "token_acc": 0.7492712304193228, "train_speed(iter/s)": 0.377369 }, { "epoch": 0.1914561632859613, "grad_norm": 0.8576817512512207, "learning_rate": 9.942943975022717e-05, "loss": 0.9631793975830079, "memory(GiB)": 89.65, "step": 14755, "token_acc": 0.7328282113015153, "train_speed(iter/s)": 0.377095 }, { "epoch": 0.191521041687617, "grad_norm": 0.8785094618797302, "learning_rate": 9.942863147399272e-05, "loss": 0.9790193557739257, "memory(GiB)": 89.65, "step": 14760, "token_acc": 0.7585185637518319, "train_speed(iter/s)": 0.376824 }, { "epoch": 0.1915859200892727, "grad_norm": 0.834752082824707, "learning_rate": 9.94278226289368e-05, "loss": 0.9724266052246093, "memory(GiB)": 89.65, "step": 14765, "token_acc": 0.7549935734880328, "train_speed(iter/s)": 0.376547 }, { "epoch": 0.19165079849092836, "grad_norm": 0.8118419051170349, "learning_rate": 9.942701321506864e-05, "loss": 0.9813200950622558, "memory(GiB)": 89.65, "step": 14770, "token_acc": 0.7461178799358383, "train_speed(iter/s)": 0.37627 }, { "epoch": 0.19171567689258406, "grad_norm": 0.8188068270683289, "learning_rate": 9.942620323239762e-05, "loss": 0.9910293579101562, "memory(GiB)": 89.65, "step": 14775, "token_acc": 0.74391648400639, "train_speed(iter/s)": 0.375989 }, { "epoch": 0.19178055529423976, "grad_norm": 0.9211812019348145, "learning_rate": 9.942539268093301e-05, "loss": 0.9870401382446289, "memory(GiB)": 89.65, "step": 14780, "token_acc": 0.7321296328948761, "train_speed(iter/s)": 0.375721 }, { "epoch": 0.19184543369589546, "grad_norm": 0.7800498008728027, "learning_rate": 9.942458156068417e-05, "loss": 1.0683255195617676, "memory(GiB)": 89.65, "step": 14785, "token_acc": 0.733939436819018, "train_speed(iter/s)": 0.375467 }, { "epoch": 0.19191031209755116, "grad_norm": 0.8103358149528503, "learning_rate": 9.942376987166043e-05, "loss": 0.9803506851196289, "memory(GiB)": 89.65, "step": 14790, "token_acc": 0.7568151980847941, "train_speed(iter/s)": 0.375212 }, { "epoch": 0.19197519049920686, "grad_norm": 0.8390507102012634, "learning_rate": 9.942295761387113e-05, "loss": 0.9744730949401855, "memory(GiB)": 89.65, "step": 14795, "token_acc": 0.7593698895304208, "train_speed(iter/s)": 0.374948 }, { "epoch": 0.19204006890086256, "grad_norm": 0.8723632097244263, "learning_rate": 9.942214478732561e-05, "loss": 0.9538474082946777, "memory(GiB)": 89.65, "step": 14800, "token_acc": 0.7369289130793336, "train_speed(iter/s)": 0.374703 }, { "epoch": 0.19210494730251826, "grad_norm": 0.7632706165313721, "learning_rate": 9.942133139203322e-05, "loss": 1.0150890350341797, "memory(GiB)": 89.65, "step": 14805, "token_acc": 0.73054563283603, "train_speed(iter/s)": 0.374432 }, { "epoch": 0.19216982570417396, "grad_norm": 0.9498401880264282, "learning_rate": 9.942051742800335e-05, "loss": 0.971920108795166, "memory(GiB)": 89.65, "step": 14810, "token_acc": 0.7307803421821334, "train_speed(iter/s)": 0.374176 }, { "epoch": 0.19223470410582966, "grad_norm": 0.8192874193191528, "learning_rate": 9.941970289524534e-05, "loss": 0.976130199432373, "memory(GiB)": 89.65, "step": 14815, "token_acc": 0.7364874981284624, "train_speed(iter/s)": 0.373919 }, { "epoch": 0.19229958250748536, "grad_norm": 0.8152374029159546, "learning_rate": 9.941888779376856e-05, "loss": 1.0184341430664063, "memory(GiB)": 89.65, "step": 14820, "token_acc": 0.7177902383076334, "train_speed(iter/s)": 0.373656 }, { "epoch": 0.19236446090914106, "grad_norm": 0.8518803119659424, "learning_rate": 9.94180721235824e-05, "loss": 0.974489688873291, "memory(GiB)": 89.65, "step": 14825, "token_acc": 0.739886419406857, "train_speed(iter/s)": 0.373391 }, { "epoch": 0.19242933931079673, "grad_norm": 0.7707037925720215, "learning_rate": 9.941725588469626e-05, "loss": 0.9981203079223633, "memory(GiB)": 89.65, "step": 14830, "token_acc": 0.7318301131718232, "train_speed(iter/s)": 0.373123 }, { "epoch": 0.19249421771245243, "grad_norm": 0.7651588320732117, "learning_rate": 9.941643907711951e-05, "loss": 0.9788759231567383, "memory(GiB)": 89.65, "step": 14835, "token_acc": 0.7437002138672784, "train_speed(iter/s)": 0.372851 }, { "epoch": 0.19255909611410812, "grad_norm": 0.8680979609489441, "learning_rate": 9.941562170086156e-05, "loss": 0.951075553894043, "memory(GiB)": 89.65, "step": 14840, "token_acc": 0.7439278021243407, "train_speed(iter/s)": 0.37261 }, { "epoch": 0.19262397451576382, "grad_norm": 0.8138846755027771, "learning_rate": 9.941480375593181e-05, "loss": 1.0102128982543945, "memory(GiB)": 89.65, "step": 14845, "token_acc": 0.7342006772411335, "train_speed(iter/s)": 0.37235 }, { "epoch": 0.19268885291741952, "grad_norm": 0.9172461628913879, "learning_rate": 9.941398524233969e-05, "loss": 0.9848146438598633, "memory(GiB)": 89.65, "step": 14850, "token_acc": 0.7406634191602877, "train_speed(iter/s)": 0.372108 }, { "epoch": 0.19275373131907522, "grad_norm": 1.1826475858688354, "learning_rate": 9.94131661600946e-05, "loss": 0.9934005737304688, "memory(GiB)": 89.65, "step": 14855, "token_acc": 0.7472312475980855, "train_speed(iter/s)": 0.371848 }, { "epoch": 0.19281860972073092, "grad_norm": 0.8619216680526733, "learning_rate": 9.941234650920596e-05, "loss": 1.003225040435791, "memory(GiB)": 89.65, "step": 14860, "token_acc": 0.7543116289847397, "train_speed(iter/s)": 0.371588 }, { "epoch": 0.19288348812238662, "grad_norm": 0.8028552532196045, "learning_rate": 9.941152628968325e-05, "loss": 1.0023778915405273, "memory(GiB)": 89.65, "step": 14865, "token_acc": 0.7442435023223639, "train_speed(iter/s)": 0.371328 }, { "epoch": 0.19294836652404232, "grad_norm": 0.8156508207321167, "learning_rate": 9.941070550153587e-05, "loss": 0.9931400299072266, "memory(GiB)": 89.65, "step": 14870, "token_acc": 0.7448348313295202, "train_speed(iter/s)": 0.371073 }, { "epoch": 0.19301324492569802, "grad_norm": 0.8215388655662537, "learning_rate": 9.940988414477325e-05, "loss": 0.9894197463989258, "memory(GiB)": 89.65, "step": 14875, "token_acc": 0.7330570967062293, "train_speed(iter/s)": 0.370827 }, { "epoch": 0.19307812332735372, "grad_norm": 0.9368343949317932, "learning_rate": 9.940906221940486e-05, "loss": 0.988133430480957, "memory(GiB)": 89.65, "step": 14880, "token_acc": 0.7539930319895027, "train_speed(iter/s)": 0.370589 }, { "epoch": 0.19314300172900942, "grad_norm": 0.8806885480880737, "learning_rate": 9.940823972544018e-05, "loss": 0.9861847877502441, "memory(GiB)": 89.65, "step": 14885, "token_acc": 0.7442056006434159, "train_speed(iter/s)": 0.370335 }, { "epoch": 0.1932078801306651, "grad_norm": 0.8220598101615906, "learning_rate": 9.940741666288866e-05, "loss": 1.004782009124756, "memory(GiB)": 89.65, "step": 14890, "token_acc": 0.7458048477315102, "train_speed(iter/s)": 0.370059 }, { "epoch": 0.1932727585323208, "grad_norm": 0.8675746917724609, "learning_rate": 9.940659303175977e-05, "loss": 0.9731381416320801, "memory(GiB)": 89.65, "step": 14895, "token_acc": 0.7293157020015897, "train_speed(iter/s)": 0.369808 }, { "epoch": 0.1933376369339765, "grad_norm": 0.7973287105560303, "learning_rate": 9.940576883206296e-05, "loss": 0.9765115737915039, "memory(GiB)": 89.65, "step": 14900, "token_acc": 0.7483040371453553, "train_speed(iter/s)": 0.369546 }, { "epoch": 0.19340251533563219, "grad_norm": 0.8768794536590576, "learning_rate": 9.940494406380776e-05, "loss": 1.0012710571289063, "memory(GiB)": 89.65, "step": 14905, "token_acc": 0.7294520547945206, "train_speed(iter/s)": 0.369304 }, { "epoch": 0.19346739373728788, "grad_norm": 0.8419222831726074, "learning_rate": 9.940411872700366e-05, "loss": 1.0000911712646485, "memory(GiB)": 89.65, "step": 14910, "token_acc": 0.7301939663015393, "train_speed(iter/s)": 0.369041 }, { "epoch": 0.19353227213894358, "grad_norm": 0.8538241386413574, "learning_rate": 9.940329282166012e-05, "loss": 1.0172572135925293, "memory(GiB)": 89.65, "step": 14915, "token_acc": 0.7436292840895948, "train_speed(iter/s)": 0.368787 }, { "epoch": 0.19359715054059928, "grad_norm": 0.853335440158844, "learning_rate": 9.940246634778668e-05, "loss": 0.9884934425354004, "memory(GiB)": 89.65, "step": 14920, "token_acc": 0.7671825050857309, "train_speed(iter/s)": 0.368546 }, { "epoch": 0.19366202894225498, "grad_norm": 0.9501705765724182, "learning_rate": 9.940163930539281e-05, "loss": 0.9996623992919922, "memory(GiB)": 89.65, "step": 14925, "token_acc": 0.7173957412068849, "train_speed(iter/s)": 0.368318 }, { "epoch": 0.19372690734391068, "grad_norm": 0.8950729370117188, "learning_rate": 9.940081169448808e-05, "loss": 0.9334480285644531, "memory(GiB)": 89.65, "step": 14930, "token_acc": 0.7506188118811881, "train_speed(iter/s)": 0.368058 }, { "epoch": 0.19379178574556638, "grad_norm": 0.9153522849082947, "learning_rate": 9.939998351508197e-05, "loss": 1.007706069946289, "memory(GiB)": 89.65, "step": 14935, "token_acc": 0.7487138140213468, "train_speed(iter/s)": 0.367804 }, { "epoch": 0.19385666414722208, "grad_norm": 0.8337724208831787, "learning_rate": 9.939915476718404e-05, "loss": 1.0270034790039062, "memory(GiB)": 89.65, "step": 14940, "token_acc": 0.7416089325351449, "train_speed(iter/s)": 0.367525 }, { "epoch": 0.19392154254887778, "grad_norm": 0.838625431060791, "learning_rate": 9.939832545080382e-05, "loss": 0.9968693733215332, "memory(GiB)": 89.65, "step": 14945, "token_acc": 0.7185661195416537, "train_speed(iter/s)": 0.367273 }, { "epoch": 0.19398642095053345, "grad_norm": 0.9099295735359192, "learning_rate": 9.939749556595085e-05, "loss": 1.0129507064819336, "memory(GiB)": 89.65, "step": 14950, "token_acc": 0.7380303899082569, "train_speed(iter/s)": 0.367033 }, { "epoch": 0.19405129935218915, "grad_norm": 0.8789278268814087, "learning_rate": 9.939666511263468e-05, "loss": 1.0414228439331055, "memory(GiB)": 89.65, "step": 14955, "token_acc": 0.7117017085139152, "train_speed(iter/s)": 0.366788 }, { "epoch": 0.19411617775384485, "grad_norm": 0.893783450126648, "learning_rate": 9.939583409086486e-05, "loss": 1.046136474609375, "memory(GiB)": 89.65, "step": 14960, "token_acc": 0.7310247495783277, "train_speed(iter/s)": 0.366553 }, { "epoch": 0.19418105615550055, "grad_norm": 0.8569930195808411, "learning_rate": 9.939500250065095e-05, "loss": 0.998774528503418, "memory(GiB)": 89.65, "step": 14965, "token_acc": 0.7355744512556852, "train_speed(iter/s)": 0.366314 }, { "epoch": 0.19424593455715625, "grad_norm": 0.7934057116508484, "learning_rate": 9.939417034200255e-05, "loss": 0.9844961166381836, "memory(GiB)": 89.65, "step": 14970, "token_acc": 0.7482993197278912, "train_speed(iter/s)": 0.366074 }, { "epoch": 0.19431081295881195, "grad_norm": 0.9658803343772888, "learning_rate": 9.939333761492922e-05, "loss": 0.9838160514831543, "memory(GiB)": 89.65, "step": 14975, "token_acc": 0.75, "train_speed(iter/s)": 0.365824 }, { "epoch": 0.19437569136046765, "grad_norm": 0.8731250166893005, "learning_rate": 9.939250431944051e-05, "loss": 1.004587745666504, "memory(GiB)": 89.65, "step": 14980, "token_acc": 0.7363247409921856, "train_speed(iter/s)": 0.365573 }, { "epoch": 0.19444056976212334, "grad_norm": 0.8305149078369141, "learning_rate": 9.939167045554607e-05, "loss": 0.9832040786743164, "memory(GiB)": 89.65, "step": 14985, "token_acc": 0.7285393919341201, "train_speed(iter/s)": 0.365322 }, { "epoch": 0.19450544816377904, "grad_norm": 0.8247231245040894, "learning_rate": 9.939083602325545e-05, "loss": 0.9722402572631836, "memory(GiB)": 89.65, "step": 14990, "token_acc": 0.7275996607294317, "train_speed(iter/s)": 0.365089 }, { "epoch": 0.19457032656543474, "grad_norm": 0.9221243858337402, "learning_rate": 9.939000102257827e-05, "loss": 1.0084064483642579, "memory(GiB)": 89.65, "step": 14995, "token_acc": 0.7369162719693114, "train_speed(iter/s)": 0.364841 }, { "epoch": 0.19463520496709044, "grad_norm": 0.7589426040649414, "learning_rate": 9.938916545352415e-05, "loss": 0.9767069816589355, "memory(GiB)": 89.65, "step": 15000, "token_acc": 0.7429804087983591, "train_speed(iter/s)": 0.364606 }, { "epoch": 0.19470008336874614, "grad_norm": 0.885569155216217, "learning_rate": 9.938832931610269e-05, "loss": 0.9859104156494141, "memory(GiB)": 89.65, "step": 15005, "token_acc": 0.7413445556459313, "train_speed(iter/s)": 0.364369 }, { "epoch": 0.1947649617704018, "grad_norm": 0.8844535946846008, "learning_rate": 9.938749261032349e-05, "loss": 1.0048088073730468, "memory(GiB)": 89.65, "step": 15010, "token_acc": 0.7352813179981349, "train_speed(iter/s)": 0.364125 }, { "epoch": 0.1948298401720575, "grad_norm": 0.929596483707428, "learning_rate": 9.938665533619623e-05, "loss": 0.9735311508178711, "memory(GiB)": 89.65, "step": 15015, "token_acc": 0.7653706544923848, "train_speed(iter/s)": 0.363867 }, { "epoch": 0.1948947185737132, "grad_norm": 0.9076858758926392, "learning_rate": 9.93858174937305e-05, "loss": 1.0009756088256836, "memory(GiB)": 89.65, "step": 15020, "token_acc": 0.7456134194272881, "train_speed(iter/s)": 0.36361 }, { "epoch": 0.1949595969753689, "grad_norm": 0.7845638394355774, "learning_rate": 9.938497908293598e-05, "loss": 0.9288320541381836, "memory(GiB)": 89.65, "step": 15025, "token_acc": 0.7345073809611898, "train_speed(iter/s)": 0.363372 }, { "epoch": 0.1950244753770246, "grad_norm": 0.829730749130249, "learning_rate": 9.938414010382228e-05, "loss": 0.9581765174865723, "memory(GiB)": 89.65, "step": 15030, "token_acc": 0.7306748721360555, "train_speed(iter/s)": 0.363134 }, { "epoch": 0.1950893537786803, "grad_norm": 0.9294995069503784, "learning_rate": 9.938330055639908e-05, "loss": 0.9457247734069825, "memory(GiB)": 89.65, "step": 15035, "token_acc": 0.7418217505088689, "train_speed(iter/s)": 0.362892 }, { "epoch": 0.195154232180336, "grad_norm": 0.7996830344200134, "learning_rate": 9.938246044067606e-05, "loss": 0.9662328720092773, "memory(GiB)": 89.65, "step": 15040, "token_acc": 0.7651691714216828, "train_speed(iter/s)": 0.362673 }, { "epoch": 0.1952191105819917, "grad_norm": 0.8446518778800964, "learning_rate": 9.938161975666282e-05, "loss": 1.0179681777954102, "memory(GiB)": 89.65, "step": 15045, "token_acc": 0.7391661734727428, "train_speed(iter/s)": 0.362443 }, { "epoch": 0.1952839889836474, "grad_norm": 0.9404258131980896, "learning_rate": 9.938077850436911e-05, "loss": 1.0361249923706055, "memory(GiB)": 89.65, "step": 15050, "token_acc": 0.7325181810916647, "train_speed(iter/s)": 0.362206 }, { "epoch": 0.1953488673853031, "grad_norm": 0.8272647261619568, "learning_rate": 9.937993668380456e-05, "loss": 0.9826740264892578, "memory(GiB)": 89.65, "step": 15055, "token_acc": 0.7703527815468114, "train_speed(iter/s)": 0.361965 }, { "epoch": 0.1954137457869588, "grad_norm": 0.8663648366928101, "learning_rate": 9.937909429497889e-05, "loss": 1.0030410766601563, "memory(GiB)": 89.65, "step": 15060, "token_acc": 0.7388021803162136, "train_speed(iter/s)": 0.361709 }, { "epoch": 0.1954786241886145, "grad_norm": 0.8574697375297546, "learning_rate": 9.937825133790177e-05, "loss": 0.9583072662353516, "memory(GiB)": 89.65, "step": 15065, "token_acc": 0.7583393910023423, "train_speed(iter/s)": 0.36148 }, { "epoch": 0.19554350259027017, "grad_norm": 0.8691109418869019, "learning_rate": 9.937740781258292e-05, "loss": 1.021560287475586, "memory(GiB)": 89.65, "step": 15070, "token_acc": 0.7317926943895245, "train_speed(iter/s)": 0.361247 }, { "epoch": 0.19560838099192587, "grad_norm": 0.8971670866012573, "learning_rate": 9.937656371903204e-05, "loss": 1.0358108520507812, "memory(GiB)": 89.65, "step": 15075, "token_acc": 0.7239925760483186, "train_speed(iter/s)": 0.361018 }, { "epoch": 0.19567325939358157, "grad_norm": 0.8221350908279419, "learning_rate": 9.937571905725885e-05, "loss": 0.9676655769348145, "memory(GiB)": 89.65, "step": 15080, "token_acc": 0.7551068120050265, "train_speed(iter/s)": 0.360786 }, { "epoch": 0.19573813779523727, "grad_norm": 0.7734683752059937, "learning_rate": 9.937487382727305e-05, "loss": 0.9523819923400879, "memory(GiB)": 89.65, "step": 15085, "token_acc": 0.7255683920569008, "train_speed(iter/s)": 0.360555 }, { "epoch": 0.19580301619689297, "grad_norm": 0.9350913763046265, "learning_rate": 9.937402802908437e-05, "loss": 0.9472253799438477, "memory(GiB)": 89.65, "step": 15090, "token_acc": 0.7474721757474359, "train_speed(iter/s)": 0.360309 }, { "epoch": 0.19586789459854867, "grad_norm": 0.8520199656486511, "learning_rate": 9.937318166270258e-05, "loss": 1.0081954956054688, "memory(GiB)": 89.65, "step": 15095, "token_acc": 0.7402737873922596, "train_speed(iter/s)": 0.36008 }, { "epoch": 0.19593277300020437, "grad_norm": 0.8733436465263367, "learning_rate": 9.937233472813736e-05, "loss": 0.990542221069336, "memory(GiB)": 89.65, "step": 15100, "token_acc": 0.7390701914311759, "train_speed(iter/s)": 0.359853 }, { "epoch": 0.19599765140186007, "grad_norm": 0.8680356740951538, "learning_rate": 9.937148722539852e-05, "loss": 0.952197265625, "memory(GiB)": 89.65, "step": 15105, "token_acc": 0.7704015219785721, "train_speed(iter/s)": 0.35962 }, { "epoch": 0.19606252980351577, "grad_norm": 0.8580846190452576, "learning_rate": 9.937063915449577e-05, "loss": 1.0100385665893554, "memory(GiB)": 89.65, "step": 15110, "token_acc": 0.720252484555466, "train_speed(iter/s)": 0.35939 }, { "epoch": 0.19612740820517147, "grad_norm": 0.8187165260314941, "learning_rate": 9.936979051543889e-05, "loss": 1.0197183609008789, "memory(GiB)": 89.65, "step": 15115, "token_acc": 0.7336099300507475, "train_speed(iter/s)": 0.359155 }, { "epoch": 0.19619228660682717, "grad_norm": 0.9578506946563721, "learning_rate": 9.936894130823762e-05, "loss": 0.9827280044555664, "memory(GiB)": 89.65, "step": 15120, "token_acc": 0.732905264573388, "train_speed(iter/s)": 0.358923 }, { "epoch": 0.19625716500848286, "grad_norm": 0.8792101144790649, "learning_rate": 9.936809153290177e-05, "loss": 0.9935153007507325, "memory(GiB)": 89.65, "step": 15125, "token_acc": 0.7274788338233731, "train_speed(iter/s)": 0.358687 }, { "epoch": 0.19632204341013854, "grad_norm": 0.8929461240768433, "learning_rate": 9.936724118944109e-05, "loss": 1.0333224296569825, "memory(GiB)": 89.65, "step": 15130, "token_acc": 0.7442192728503214, "train_speed(iter/s)": 0.358465 }, { "epoch": 0.19638692181179424, "grad_norm": 0.9295535683631897, "learning_rate": 9.936639027786537e-05, "loss": 0.9996561050415039, "memory(GiB)": 89.65, "step": 15135, "token_acc": 0.7396075581395349, "train_speed(iter/s)": 0.358232 }, { "epoch": 0.19645180021344993, "grad_norm": 0.9243419170379639, "learning_rate": 9.936553879818442e-05, "loss": 0.9706248283386231, "memory(GiB)": 89.65, "step": 15140, "token_acc": 0.7432096341383598, "train_speed(iter/s)": 0.357999 }, { "epoch": 0.19651667861510563, "grad_norm": 0.8791604042053223, "learning_rate": 9.936468675040803e-05, "loss": 0.9989158630371093, "memory(GiB)": 89.65, "step": 15145, "token_acc": 0.718116178658249, "train_speed(iter/s)": 0.357776 }, { "epoch": 0.19658155701676133, "grad_norm": 0.868850827217102, "learning_rate": 9.936383413454597e-05, "loss": 1.0229604721069336, "memory(GiB)": 89.65, "step": 15150, "token_acc": 0.7269890795631825, "train_speed(iter/s)": 0.357548 }, { "epoch": 0.19664643541841703, "grad_norm": 0.8478118181228638, "learning_rate": 9.936298095060813e-05, "loss": 0.9968891143798828, "memory(GiB)": 89.65, "step": 15155, "token_acc": 0.7529009099257034, "train_speed(iter/s)": 0.357333 }, { "epoch": 0.19671131382007273, "grad_norm": 0.8551748394966125, "learning_rate": 9.936212719860424e-05, "loss": 0.9776628494262696, "memory(GiB)": 89.65, "step": 15160, "token_acc": 0.7343176089305006, "train_speed(iter/s)": 0.357095 }, { "epoch": 0.19677619222172843, "grad_norm": 0.9730837345123291, "learning_rate": 9.93612728785442e-05, "loss": 0.9526084899902344, "memory(GiB)": 89.65, "step": 15165, "token_acc": 0.7399922269724057, "train_speed(iter/s)": 0.356871 }, { "epoch": 0.19684107062338413, "grad_norm": 0.9104942083358765, "learning_rate": 9.936041799043778e-05, "loss": 0.982890510559082, "memory(GiB)": 89.65, "step": 15170, "token_acc": 0.7345209332515957, "train_speed(iter/s)": 0.356638 }, { "epoch": 0.19690594902503983, "grad_norm": 0.9084501266479492, "learning_rate": 9.935956253429487e-05, "loss": 0.9540742874145508, "memory(GiB)": 89.65, "step": 15175, "token_acc": 0.7484643195619362, "train_speed(iter/s)": 0.356407 }, { "epoch": 0.19697082742669553, "grad_norm": 0.9555622339248657, "learning_rate": 9.935870651012529e-05, "loss": 1.0373504638671875, "memory(GiB)": 89.65, "step": 15180, "token_acc": 0.72236355226642, "train_speed(iter/s)": 0.35619 }, { "epoch": 0.19703570582835123, "grad_norm": 0.8218843936920166, "learning_rate": 9.935784991793889e-05, "loss": 1.0337747573852538, "memory(GiB)": 89.65, "step": 15185, "token_acc": 0.7411088622235072, "train_speed(iter/s)": 0.355975 }, { "epoch": 0.1971005842300069, "grad_norm": 0.8684719204902649, "learning_rate": 9.935699275774552e-05, "loss": 1.0221935272216798, "memory(GiB)": 89.65, "step": 15190, "token_acc": 0.7424281720987868, "train_speed(iter/s)": 0.355754 }, { "epoch": 0.1971654626316626, "grad_norm": 0.7837380766868591, "learning_rate": 9.935613502955506e-05, "loss": 1.0037742614746095, "memory(GiB)": 89.65, "step": 15195, "token_acc": 0.7374252947827491, "train_speed(iter/s)": 0.355539 }, { "epoch": 0.1972303410333183, "grad_norm": 0.8577329516410828, "learning_rate": 9.935527673337737e-05, "loss": 0.9921413421630859, "memory(GiB)": 89.65, "step": 15200, "token_acc": 0.740931396132012, "train_speed(iter/s)": 0.355319 }, { "epoch": 0.197295219434974, "grad_norm": 0.9582401514053345, "learning_rate": 9.935441786922232e-05, "loss": 0.9913684844970703, "memory(GiB)": 89.65, "step": 15205, "token_acc": 0.7417928633594429, "train_speed(iter/s)": 0.355104 }, { "epoch": 0.1973600978366297, "grad_norm": 0.8552390336990356, "learning_rate": 9.935355843709983e-05, "loss": 0.9778367042541504, "memory(GiB)": 89.65, "step": 15210, "token_acc": 0.7411154627212069, "train_speed(iter/s)": 0.354875 }, { "epoch": 0.1974249762382854, "grad_norm": 0.9702271819114685, "learning_rate": 9.935269843701975e-05, "loss": 1.0108380317687988, "memory(GiB)": 89.65, "step": 15215, "token_acc": 0.7326496301277741, "train_speed(iter/s)": 0.354662 }, { "epoch": 0.1974898546399411, "grad_norm": 1.0493303537368774, "learning_rate": 9.9351837868992e-05, "loss": 0.973884105682373, "memory(GiB)": 89.65, "step": 15220, "token_acc": 0.7679067585609641, "train_speed(iter/s)": 0.354444 }, { "epoch": 0.1975547330415968, "grad_norm": 0.8733147382736206, "learning_rate": 9.935097673302649e-05, "loss": 0.9772382736206054, "memory(GiB)": 89.65, "step": 15225, "token_acc": 0.7327610569450332, "train_speed(iter/s)": 0.354225 }, { "epoch": 0.1976196114432525, "grad_norm": 0.9701107144355774, "learning_rate": 9.93501150291331e-05, "loss": 1.0252168655395508, "memory(GiB)": 89.65, "step": 15230, "token_acc": 0.75425385562233, "train_speed(iter/s)": 0.35402 }, { "epoch": 0.1976844898449082, "grad_norm": 0.8841954469680786, "learning_rate": 9.934925275732176e-05, "loss": 1.0238382339477539, "memory(GiB)": 89.65, "step": 15235, "token_acc": 0.735119163329575, "train_speed(iter/s)": 0.353819 }, { "epoch": 0.1977493682465639, "grad_norm": 0.9113918542861938, "learning_rate": 9.934838991760241e-05, "loss": 1.0323018074035644, "memory(GiB)": 89.65, "step": 15240, "token_acc": 0.720592895116713, "train_speed(iter/s)": 0.353612 }, { "epoch": 0.1978142466482196, "grad_norm": 0.8642629384994507, "learning_rate": 9.934752650998494e-05, "loss": 1.0237035751342773, "memory(GiB)": 89.65, "step": 15245, "token_acc": 0.7146587743732591, "train_speed(iter/s)": 0.353398 }, { "epoch": 0.19787912504987526, "grad_norm": 0.8667125701904297, "learning_rate": 9.934666253447933e-05, "loss": 1.0196365356445312, "memory(GiB)": 89.65, "step": 15250, "token_acc": 0.7174153351421968, "train_speed(iter/s)": 0.353178 }, { "epoch": 0.19794400345153096, "grad_norm": 0.8552078008651733, "learning_rate": 9.934579799109549e-05, "loss": 0.9894542694091797, "memory(GiB)": 89.65, "step": 15255, "token_acc": 0.7503337783711616, "train_speed(iter/s)": 0.352971 }, { "epoch": 0.19800888185318666, "grad_norm": 0.8546783328056335, "learning_rate": 9.934493287984339e-05, "loss": 0.9451125144958497, "memory(GiB)": 89.65, "step": 15260, "token_acc": 0.7276964047936085, "train_speed(iter/s)": 0.352757 }, { "epoch": 0.19807376025484236, "grad_norm": 0.831473708152771, "learning_rate": 9.934406720073298e-05, "loss": 0.9277220726013183, "memory(GiB)": 89.65, "step": 15265, "token_acc": 0.7573687657902124, "train_speed(iter/s)": 0.35254 }, { "epoch": 0.19813863865649806, "grad_norm": 0.8255251049995422, "learning_rate": 9.934320095377422e-05, "loss": 0.9991909027099609, "memory(GiB)": 89.65, "step": 15270, "token_acc": 0.7167650365373806, "train_speed(iter/s)": 0.352319 }, { "epoch": 0.19820351705815376, "grad_norm": 0.9858525991439819, "learning_rate": 9.934233413897709e-05, "loss": 0.9585682868957519, "memory(GiB)": 89.65, "step": 15275, "token_acc": 0.736737729300942, "train_speed(iter/s)": 0.352098 }, { "epoch": 0.19826839545980945, "grad_norm": 0.8356394171714783, "learning_rate": 9.934146675635154e-05, "loss": 0.9513029098510742, "memory(GiB)": 89.65, "step": 15280, "token_acc": 0.7571047824394335, "train_speed(iter/s)": 0.351863 }, { "epoch": 0.19833327386146515, "grad_norm": 0.8550448417663574, "learning_rate": 9.934059880590756e-05, "loss": 0.954754638671875, "memory(GiB)": 89.65, "step": 15285, "token_acc": 0.7271171941830624, "train_speed(iter/s)": 0.351649 }, { "epoch": 0.19839815226312085, "grad_norm": 0.9548473954200745, "learning_rate": 9.933973028765516e-05, "loss": 0.9459774017333984, "memory(GiB)": 89.65, "step": 15290, "token_acc": 0.7630817956485817, "train_speed(iter/s)": 0.351445 }, { "epoch": 0.19846303066477655, "grad_norm": 0.8000460863113403, "learning_rate": 9.933886120160432e-05, "loss": 1.0218620300292969, "memory(GiB)": 89.65, "step": 15295, "token_acc": 0.7320320669222725, "train_speed(iter/s)": 0.351237 }, { "epoch": 0.19852790906643225, "grad_norm": 0.8617013692855835, "learning_rate": 9.933799154776502e-05, "loss": 0.992673397064209, "memory(GiB)": 89.65, "step": 15300, "token_acc": 0.7583748845798707, "train_speed(iter/s)": 0.35102 }, { "epoch": 0.19859278746808792, "grad_norm": 0.9199671745300293, "learning_rate": 9.93371213261473e-05, "loss": 0.9698594093322754, "memory(GiB)": 89.65, "step": 15305, "token_acc": 0.7127349843643291, "train_speed(iter/s)": 0.35081 }, { "epoch": 0.19865766586974362, "grad_norm": 0.9396126866340637, "learning_rate": 9.933625053676116e-05, "loss": 0.992650032043457, "memory(GiB)": 89.65, "step": 15310, "token_acc": 0.7461185663325972, "train_speed(iter/s)": 0.350586 }, { "epoch": 0.19872254427139932, "grad_norm": 0.8021483421325684, "learning_rate": 9.933537917961664e-05, "loss": 1.0131550788879395, "memory(GiB)": 89.65, "step": 15315, "token_acc": 0.7371324766197941, "train_speed(iter/s)": 0.35036 }, { "epoch": 0.19878742267305502, "grad_norm": 1.0396400690078735, "learning_rate": 9.933450725472375e-05, "loss": 1.0092758178710937, "memory(GiB)": 89.65, "step": 15320, "token_acc": 0.7432150313152401, "train_speed(iter/s)": 0.350134 }, { "epoch": 0.19885230107471072, "grad_norm": 0.8656120300292969, "learning_rate": 9.93336347620925e-05, "loss": 0.958957290649414, "memory(GiB)": 89.65, "step": 15325, "token_acc": 0.7554409156859584, "train_speed(iter/s)": 0.349924 }, { "epoch": 0.19891717947636642, "grad_norm": 0.9360485672950745, "learning_rate": 9.933276170173298e-05, "loss": 0.9594289779663085, "memory(GiB)": 89.65, "step": 15330, "token_acc": 0.7587647662095948, "train_speed(iter/s)": 0.349713 }, { "epoch": 0.19898205787802212, "grad_norm": 0.8318716287612915, "learning_rate": 9.933188807365521e-05, "loss": 0.9930180549621582, "memory(GiB)": 89.65, "step": 15335, "token_acc": 0.7401622013948058, "train_speed(iter/s)": 0.349503 }, { "epoch": 0.19904693627967782, "grad_norm": 0.8380689024925232, "learning_rate": 9.933101387786925e-05, "loss": 0.9631025314331054, "memory(GiB)": 89.65, "step": 15340, "token_acc": 0.7467499638884877, "train_speed(iter/s)": 0.3493 }, { "epoch": 0.19911181468133352, "grad_norm": 0.8421921133995056, "learning_rate": 9.933013911438514e-05, "loss": 0.993710708618164, "memory(GiB)": 89.65, "step": 15345, "token_acc": 0.7489093398719867, "train_speed(iter/s)": 0.349097 }, { "epoch": 0.19917669308298921, "grad_norm": 0.7244139909744263, "learning_rate": 9.932926378321298e-05, "loss": 0.9119966506958008, "memory(GiB)": 89.65, "step": 15350, "token_acc": 0.7415910917231087, "train_speed(iter/s)": 0.348878 }, { "epoch": 0.1992415714846449, "grad_norm": 0.8767353892326355, "learning_rate": 9.932838788436282e-05, "loss": 1.0347758293151856, "memory(GiB)": 89.65, "step": 15355, "token_acc": 0.732088511285823, "train_speed(iter/s)": 0.348661 }, { "epoch": 0.1993064498863006, "grad_norm": 0.8942996859550476, "learning_rate": 9.932751141784475e-05, "loss": 0.9667251586914063, "memory(GiB)": 89.65, "step": 15360, "token_acc": 0.7272401667100807, "train_speed(iter/s)": 0.348449 }, { "epoch": 0.19937132828795628, "grad_norm": 0.8654780387878418, "learning_rate": 9.932663438366885e-05, "loss": 0.9966829299926758, "memory(GiB)": 89.65, "step": 15365, "token_acc": 0.727075785306092, "train_speed(iter/s)": 0.348238 }, { "epoch": 0.19943620668961198, "grad_norm": 0.7531718611717224, "learning_rate": 9.932575678184521e-05, "loss": 1.010403823852539, "memory(GiB)": 89.65, "step": 15370, "token_acc": 0.7472090952674474, "train_speed(iter/s)": 0.348022 }, { "epoch": 0.19950108509126768, "grad_norm": 0.8898470997810364, "learning_rate": 9.932487861238395e-05, "loss": 0.9572219848632812, "memory(GiB)": 89.65, "step": 15375, "token_acc": 0.7575814418808229, "train_speed(iter/s)": 0.347817 }, { "epoch": 0.19956596349292338, "grad_norm": 0.9345524907112122, "learning_rate": 9.932399987529516e-05, "loss": 0.9726449012756347, "memory(GiB)": 89.65, "step": 15380, "token_acc": 0.7144506890445762, "train_speed(iter/s)": 0.347613 }, { "epoch": 0.19963084189457908, "grad_norm": 0.8439540863037109, "learning_rate": 9.932312057058895e-05, "loss": 0.9711490631103515, "memory(GiB)": 89.65, "step": 15385, "token_acc": 0.7431917768950408, "train_speed(iter/s)": 0.347406 }, { "epoch": 0.19969572029623478, "grad_norm": 0.8177687525749207, "learning_rate": 9.932224069827545e-05, "loss": 0.9656764984130859, "memory(GiB)": 89.65, "step": 15390, "token_acc": 0.7560182708530513, "train_speed(iter/s)": 0.347192 }, { "epoch": 0.19976059869789048, "grad_norm": 0.799961507320404, "learning_rate": 9.932136025836476e-05, "loss": 0.9720089912414551, "memory(GiB)": 89.65, "step": 15395, "token_acc": 0.7501761059453367, "train_speed(iter/s)": 0.346989 }, { "epoch": 0.19982547709954618, "grad_norm": 0.7509666681289673, "learning_rate": 9.932047925086705e-05, "loss": 0.9513437271118164, "memory(GiB)": 89.65, "step": 15400, "token_acc": 0.7604490686116772, "train_speed(iter/s)": 0.346769 }, { "epoch": 0.19989035550120188, "grad_norm": 0.8811435699462891, "learning_rate": 9.931959767579242e-05, "loss": 1.0261813163757325, "memory(GiB)": 89.65, "step": 15405, "token_acc": 0.7273692232322683, "train_speed(iter/s)": 0.346551 }, { "epoch": 0.19995523390285758, "grad_norm": 0.7738818526268005, "learning_rate": 9.931871553315104e-05, "loss": 0.9613842010498047, "memory(GiB)": 89.65, "step": 15410, "token_acc": 0.7456282258685771, "train_speed(iter/s)": 0.346326 }, { "epoch": 0.20002011230451328, "grad_norm": 0.7422395944595337, "learning_rate": 9.931783282295305e-05, "loss": 0.9409988403320313, "memory(GiB)": 89.65, "step": 15415, "token_acc": 0.7585500053943252, "train_speed(iter/s)": 0.346108 }, { "epoch": 0.20008499070616897, "grad_norm": 0.8046788573265076, "learning_rate": 9.931694954520865e-05, "loss": 0.9372488021850586, "memory(GiB)": 89.65, "step": 15420, "token_acc": 0.7221633085896076, "train_speed(iter/s)": 0.3459 }, { "epoch": 0.20014986910782465, "grad_norm": 0.8425417542457581, "learning_rate": 9.931606569992793e-05, "loss": 0.9623407363891602, "memory(GiB)": 89.65, "step": 15425, "token_acc": 0.7179162833486661, "train_speed(iter/s)": 0.345694 }, { "epoch": 0.20021474750948035, "grad_norm": 0.8529877662658691, "learning_rate": 9.931518128712112e-05, "loss": 0.987038516998291, "memory(GiB)": 89.65, "step": 15430, "token_acc": 0.7312563494751101, "train_speed(iter/s)": 0.345489 }, { "epoch": 0.20027962591113604, "grad_norm": 0.7852601408958435, "learning_rate": 9.931429630679836e-05, "loss": 0.9530572891235352, "memory(GiB)": 89.65, "step": 15435, "token_acc": 0.7339947780678852, "train_speed(iter/s)": 0.345286 }, { "epoch": 0.20034450431279174, "grad_norm": 0.8127527832984924, "learning_rate": 9.931341075896988e-05, "loss": 0.9766312599182129, "memory(GiB)": 89.65, "step": 15440, "token_acc": 0.7621998956158664, "train_speed(iter/s)": 0.345085 }, { "epoch": 0.20040938271444744, "grad_norm": 0.8567566275596619, "learning_rate": 9.931252464364582e-05, "loss": 0.9814334869384765, "memory(GiB)": 89.65, "step": 15445, "token_acc": 0.7654331864904552, "train_speed(iter/s)": 0.344877 }, { "epoch": 0.20047426111610314, "grad_norm": 0.8579979538917542, "learning_rate": 9.93116379608364e-05, "loss": 0.9661848068237304, "memory(GiB)": 89.65, "step": 15450, "token_acc": 0.7630906327714838, "train_speed(iter/s)": 0.344677 }, { "epoch": 0.20053913951775884, "grad_norm": 0.8531342148780823, "learning_rate": 9.931075071055182e-05, "loss": 1.013036060333252, "memory(GiB)": 89.65, "step": 15455, "token_acc": 0.7647755213550679, "train_speed(iter/s)": 0.344476 }, { "epoch": 0.20060401791941454, "grad_norm": 0.8682916760444641, "learning_rate": 9.93098628928023e-05, "loss": 0.9802642822265625, "memory(GiB)": 89.65, "step": 15460, "token_acc": 0.7405522987870073, "train_speed(iter/s)": 0.34427 }, { "epoch": 0.20066889632107024, "grad_norm": 0.8597568273544312, "learning_rate": 9.930897450759806e-05, "loss": 1.014964771270752, "memory(GiB)": 89.65, "step": 15465, "token_acc": 0.7469735663825412, "train_speed(iter/s)": 0.344069 }, { "epoch": 0.20073377472272594, "grad_norm": 0.7785951495170593, "learning_rate": 9.930808555494932e-05, "loss": 0.9752105712890625, "memory(GiB)": 89.65, "step": 15470, "token_acc": 0.760733984642561, "train_speed(iter/s)": 0.343862 }, { "epoch": 0.20079865312438164, "grad_norm": 0.9601557850837708, "learning_rate": 9.93071960348663e-05, "loss": 1.0187337875366211, "memory(GiB)": 89.65, "step": 15475, "token_acc": 0.7441384180790961, "train_speed(iter/s)": 0.343653 }, { "epoch": 0.20086353152603734, "grad_norm": 0.7907692193984985, "learning_rate": 9.930630594735922e-05, "loss": 1.032429027557373, "memory(GiB)": 89.65, "step": 15480, "token_acc": 0.7388697346724629, "train_speed(iter/s)": 0.343456 }, { "epoch": 0.200928409927693, "grad_norm": 0.8284798860549927, "learning_rate": 9.930541529243836e-05, "loss": 0.9724512100219727, "memory(GiB)": 89.65, "step": 15485, "token_acc": 0.7399173785184148, "train_speed(iter/s)": 0.343251 }, { "epoch": 0.2009932883293487, "grad_norm": 0.7882564663887024, "learning_rate": 9.930452407011396e-05, "loss": 0.9869583129882813, "memory(GiB)": 89.65, "step": 15490, "token_acc": 0.7308109726432394, "train_speed(iter/s)": 0.343057 }, { "epoch": 0.2010581667310044, "grad_norm": 0.8406911492347717, "learning_rate": 9.930363228039626e-05, "loss": 1.0133254051208496, "memory(GiB)": 89.65, "step": 15495, "token_acc": 0.7408498583569405, "train_speed(iter/s)": 0.342857 }, { "epoch": 0.2011230451326601, "grad_norm": 0.9938189387321472, "learning_rate": 9.930273992329555e-05, "loss": 1.0144073486328125, "memory(GiB)": 89.65, "step": 15500, "token_acc": 0.7371297782939911, "train_speed(iter/s)": 0.342671 }, { "epoch": 0.2011879235343158, "grad_norm": 0.8592751622200012, "learning_rate": 9.930184699882206e-05, "loss": 1.0018210411071777, "memory(GiB)": 89.65, "step": 15505, "token_acc": 0.7350981186259711, "train_speed(iter/s)": 0.342469 }, { "epoch": 0.2012528019359715, "grad_norm": 1.0199915170669556, "learning_rate": 9.93009535069861e-05, "loss": 1.0019583702087402, "memory(GiB)": 89.65, "step": 15510, "token_acc": 0.7276521220702761, "train_speed(iter/s)": 0.342271 }, { "epoch": 0.2013176803376272, "grad_norm": 0.9702481627464294, "learning_rate": 9.930005944779794e-05, "loss": 0.9762019157409668, "memory(GiB)": 89.65, "step": 15515, "token_acc": 0.7351005688366732, "train_speed(iter/s)": 0.342071 }, { "epoch": 0.2013825587392829, "grad_norm": 0.7631314396858215, "learning_rate": 9.929916482126787e-05, "loss": 0.93032808303833, "memory(GiB)": 89.65, "step": 15520, "token_acc": 0.770795555702087, "train_speed(iter/s)": 0.34187 }, { "epoch": 0.2014474371409386, "grad_norm": 0.98768150806427, "learning_rate": 9.929826962740617e-05, "loss": 0.9443740844726562, "memory(GiB)": 89.65, "step": 15525, "token_acc": 0.7213360453912857, "train_speed(iter/s)": 0.341675 }, { "epoch": 0.2015123155425943, "grad_norm": 0.8878443837165833, "learning_rate": 9.929737386622316e-05, "loss": 1.024911117553711, "memory(GiB)": 89.65, "step": 15530, "token_acc": 0.7354813664596274, "train_speed(iter/s)": 0.341472 }, { "epoch": 0.20157719394425, "grad_norm": 0.8036848902702332, "learning_rate": 9.929647753772915e-05, "loss": 0.9389215469360351, "memory(GiB)": 89.65, "step": 15535, "token_acc": 0.7514767413241448, "train_speed(iter/s)": 0.341248 }, { "epoch": 0.2016420723459057, "grad_norm": 0.8719359636306763, "learning_rate": 9.929558064193445e-05, "loss": 0.9868710517883301, "memory(GiB)": 89.65, "step": 15540, "token_acc": 0.7170415860089446, "train_speed(iter/s)": 0.341043 }, { "epoch": 0.20170695074756137, "grad_norm": 0.7047494053840637, "learning_rate": 9.929468317884938e-05, "loss": 0.9607490539550781, "memory(GiB)": 89.65, "step": 15545, "token_acc": 0.7606411881905452, "train_speed(iter/s)": 0.340847 }, { "epoch": 0.20177182914921707, "grad_norm": 0.8514368534088135, "learning_rate": 9.929378514848427e-05, "loss": 0.991510009765625, "memory(GiB)": 89.65, "step": 15550, "token_acc": 0.7505746182892793, "train_speed(iter/s)": 0.340655 }, { "epoch": 0.20183670755087277, "grad_norm": 0.8835910558700562, "learning_rate": 9.929288655084945e-05, "loss": 0.954927635192871, "memory(GiB)": 89.65, "step": 15555, "token_acc": 0.7384916748285995, "train_speed(iter/s)": 0.340453 }, { "epoch": 0.20190158595252847, "grad_norm": 0.8386980891227722, "learning_rate": 9.929198738595527e-05, "loss": 1.0262458801269532, "memory(GiB)": 89.65, "step": 15560, "token_acc": 0.7229570574379417, "train_speed(iter/s)": 0.340271 }, { "epoch": 0.20196646435418417, "grad_norm": 0.8617469072341919, "learning_rate": 9.929108765381207e-05, "loss": 0.9882963180541993, "memory(GiB)": 89.65, "step": 15565, "token_acc": 0.7460860725177906, "train_speed(iter/s)": 0.340085 }, { "epoch": 0.20203134275583987, "grad_norm": 0.8043317198753357, "learning_rate": 9.929018735443021e-05, "loss": 1.0015737533569335, "memory(GiB)": 89.65, "step": 15570, "token_acc": 0.7268390660306302, "train_speed(iter/s)": 0.33989 }, { "epoch": 0.20209622115749556, "grad_norm": 0.8778215050697327, "learning_rate": 9.928928648782003e-05, "loss": 1.0253873825073243, "memory(GiB)": 89.65, "step": 15575, "token_acc": 0.7298490566037736, "train_speed(iter/s)": 0.339706 }, { "epoch": 0.20216109955915126, "grad_norm": 1.1059165000915527, "learning_rate": 9.928838505399192e-05, "loss": 1.0478663444519043, "memory(GiB)": 89.65, "step": 15580, "token_acc": 0.7074688009710579, "train_speed(iter/s)": 0.339506 }, { "epoch": 0.20222597796080696, "grad_norm": 0.7986911535263062, "learning_rate": 9.928748305295625e-05, "loss": 1.0087225914001465, "memory(GiB)": 89.65, "step": 15585, "token_acc": 0.7221700063912178, "train_speed(iter/s)": 0.339315 }, { "epoch": 0.20229085636246266, "grad_norm": 0.8503946661949158, "learning_rate": 9.92865804847234e-05, "loss": 0.9962936401367187, "memory(GiB)": 89.65, "step": 15590, "token_acc": 0.7358378645575877, "train_speed(iter/s)": 0.339116 }, { "epoch": 0.20235573476411836, "grad_norm": 0.8754649758338928, "learning_rate": 9.928567734930376e-05, "loss": 0.9940039634704589, "memory(GiB)": 89.65, "step": 15595, "token_acc": 0.7339060582394288, "train_speed(iter/s)": 0.338924 }, { "epoch": 0.20242061316577406, "grad_norm": 0.911769688129425, "learning_rate": 9.92847736467077e-05, "loss": 0.9867048263549805, "memory(GiB)": 89.65, "step": 15600, "token_acc": 0.7333647604888132, "train_speed(iter/s)": 0.338732 }, { "epoch": 0.20248549156742973, "grad_norm": 0.9294474720954895, "learning_rate": 9.928386937694564e-05, "loss": 0.958774185180664, "memory(GiB)": 89.65, "step": 15605, "token_acc": 0.731943502127922, "train_speed(iter/s)": 0.338538 }, { "epoch": 0.20255036996908543, "grad_norm": 0.9208536148071289, "learning_rate": 9.928296454002798e-05, "loss": 0.981169605255127, "memory(GiB)": 89.65, "step": 15610, "token_acc": 0.7540003232584451, "train_speed(iter/s)": 0.338349 }, { "epoch": 0.20261524837074113, "grad_norm": 0.8865651488304138, "learning_rate": 9.928205913596516e-05, "loss": 0.9774700164794922, "memory(GiB)": 89.65, "step": 15615, "token_acc": 0.7468893528183717, "train_speed(iter/s)": 0.338158 }, { "epoch": 0.20268012677239683, "grad_norm": 0.8382910490036011, "learning_rate": 9.928115316476756e-05, "loss": 0.9937410354614258, "memory(GiB)": 89.65, "step": 15620, "token_acc": 0.742249394059979, "train_speed(iter/s)": 0.337974 }, { "epoch": 0.20274500517405253, "grad_norm": 0.959044337272644, "learning_rate": 9.928024662644562e-05, "loss": 1.011500930786133, "memory(GiB)": 89.65, "step": 15625, "token_acc": 0.7193558572781813, "train_speed(iter/s)": 0.337776 }, { "epoch": 0.20280988357570823, "grad_norm": 0.997776448726654, "learning_rate": 9.927933952100976e-05, "loss": 0.9958784103393554, "memory(GiB)": 89.65, "step": 15630, "token_acc": 0.7585943190094683, "train_speed(iter/s)": 0.337583 }, { "epoch": 0.20287476197736393, "grad_norm": 0.9437141418457031, "learning_rate": 9.927843184847046e-05, "loss": 1.0185001373291016, "memory(GiB)": 89.65, "step": 15635, "token_acc": 0.724246814517627, "train_speed(iter/s)": 0.337397 }, { "epoch": 0.20293964037901963, "grad_norm": 0.835651695728302, "learning_rate": 9.927752360883812e-05, "loss": 0.9971983909606934, "memory(GiB)": 89.65, "step": 15640, "token_acc": 0.7580677557579004, "train_speed(iter/s)": 0.337212 }, { "epoch": 0.20300451878067532, "grad_norm": 0.9645047783851624, "learning_rate": 9.927661480212321e-05, "loss": 0.9616676330566406, "memory(GiB)": 89.65, "step": 15645, "token_acc": 0.7797178802897445, "train_speed(iter/s)": 0.337035 }, { "epoch": 0.20306939718233102, "grad_norm": 0.7998273968696594, "learning_rate": 9.92757054283362e-05, "loss": 0.9386563301086426, "memory(GiB)": 89.65, "step": 15650, "token_acc": 0.7607666007967128, "train_speed(iter/s)": 0.336834 }, { "epoch": 0.20313427558398672, "grad_norm": 0.7915163636207581, "learning_rate": 9.927479548748753e-05, "loss": 0.9876181602478027, "memory(GiB)": 89.65, "step": 15655, "token_acc": 0.7279233430082303, "train_speed(iter/s)": 0.336649 }, { "epoch": 0.20319915398564242, "grad_norm": 0.8723470568656921, "learning_rate": 9.92738849795877e-05, "loss": 0.9882942199707031, "memory(GiB)": 89.65, "step": 15660, "token_acc": 0.7458497885576486, "train_speed(iter/s)": 0.336463 }, { "epoch": 0.2032640323872981, "grad_norm": 0.9079492688179016, "learning_rate": 9.927297390464715e-05, "loss": 0.9580598831176758, "memory(GiB)": 89.65, "step": 15665, "token_acc": 0.7353799663473095, "train_speed(iter/s)": 0.336261 }, { "epoch": 0.2033289107889538, "grad_norm": 0.8114516139030457, "learning_rate": 9.92720622626764e-05, "loss": 0.9795614242553711, "memory(GiB)": 89.65, "step": 15670, "token_acc": 0.7585583742498636, "train_speed(iter/s)": 0.336081 }, { "epoch": 0.2033937891906095, "grad_norm": 0.9337282776832581, "learning_rate": 9.927115005368593e-05, "loss": 0.9732565879821777, "memory(GiB)": 89.65, "step": 15675, "token_acc": 0.7657018383444998, "train_speed(iter/s)": 0.335894 }, { "epoch": 0.2034586675922652, "grad_norm": 0.9532607793807983, "learning_rate": 9.927023727768623e-05, "loss": 1.0087130546569825, "memory(GiB)": 89.65, "step": 15680, "token_acc": 0.7173608407940834, "train_speed(iter/s)": 0.335701 }, { "epoch": 0.2035235459939209, "grad_norm": 0.8782395124435425, "learning_rate": 9.926932393468782e-05, "loss": 0.9824441909790039, "memory(GiB)": 89.65, "step": 15685, "token_acc": 0.7582385723029347, "train_speed(iter/s)": 0.335515 }, { "epoch": 0.2035884243955766, "grad_norm": 0.8591576218605042, "learning_rate": 9.92684100247012e-05, "loss": 1.0091341972351073, "memory(GiB)": 89.65, "step": 15690, "token_acc": 0.7143960131957605, "train_speed(iter/s)": 0.33533 }, { "epoch": 0.2036533027972323, "grad_norm": 0.8952431082725525, "learning_rate": 9.926749554773689e-05, "loss": 1.0046213150024415, "memory(GiB)": 89.65, "step": 15695, "token_acc": 0.7348434377081945, "train_speed(iter/s)": 0.335151 }, { "epoch": 0.203718181198888, "grad_norm": 0.85976642370224, "learning_rate": 9.926658050380539e-05, "loss": 0.9663272857666015, "memory(GiB)": 89.65, "step": 15700, "token_acc": 0.7533307463458803, "train_speed(iter/s)": 0.334975 }, { "epoch": 0.2037830596005437, "grad_norm": 0.8346255421638489, "learning_rate": 9.926566489291727e-05, "loss": 0.9760221481323242, "memory(GiB)": 89.65, "step": 15705, "token_acc": 0.7614995490372927, "train_speed(iter/s)": 0.334775 }, { "epoch": 0.20384793800219939, "grad_norm": 0.8181597590446472, "learning_rate": 9.926474871508306e-05, "loss": 0.9841273307800293, "memory(GiB)": 89.65, "step": 15710, "token_acc": 0.7319904282295796, "train_speed(iter/s)": 0.33458 }, { "epoch": 0.20391281640385509, "grad_norm": 0.8455023169517517, "learning_rate": 9.926383197031328e-05, "loss": 0.9250776290893554, "memory(GiB)": 89.65, "step": 15715, "token_acc": 0.7464929170678036, "train_speed(iter/s)": 0.334379 }, { "epoch": 0.20397769480551078, "grad_norm": 0.9034731984138489, "learning_rate": 9.926291465861848e-05, "loss": 1.0429040908813476, "memory(GiB)": 89.65, "step": 15720, "token_acc": 0.7230036866029564, "train_speed(iter/s)": 0.334194 }, { "epoch": 0.20404257320716646, "grad_norm": 0.8272038698196411, "learning_rate": 9.926199678000925e-05, "loss": 0.9874340057373047, "memory(GiB)": 89.65, "step": 15725, "token_acc": 0.7374967219870378, "train_speed(iter/s)": 0.334021 }, { "epoch": 0.20410745160882215, "grad_norm": 0.8179622888565063, "learning_rate": 9.926107833449611e-05, "loss": 0.9835335731506347, "memory(GiB)": 89.65, "step": 15730, "token_acc": 0.7464401772525849, "train_speed(iter/s)": 0.333827 }, { "epoch": 0.20417233001047785, "grad_norm": 0.8527510762214661, "learning_rate": 9.926015932208967e-05, "loss": 1.0066734313964845, "memory(GiB)": 89.65, "step": 15735, "token_acc": 0.7499430208706411, "train_speed(iter/s)": 0.333637 }, { "epoch": 0.20423720841213355, "grad_norm": 0.8649172186851501, "learning_rate": 9.925923974280049e-05, "loss": 0.9936538696289062, "memory(GiB)": 89.65, "step": 15740, "token_acc": 0.7146146053964454, "train_speed(iter/s)": 0.333443 }, { "epoch": 0.20430208681378925, "grad_norm": 0.996807336807251, "learning_rate": 9.925831959663914e-05, "loss": 1.0194358825683594, "memory(GiB)": 89.65, "step": 15745, "token_acc": 0.7189075761846597, "train_speed(iter/s)": 0.333255 }, { "epoch": 0.20436696521544495, "grad_norm": 0.8881897926330566, "learning_rate": 9.925739888361623e-05, "loss": 0.9906426429748535, "memory(GiB)": 89.65, "step": 15750, "token_acc": 0.7457025705724649, "train_speed(iter/s)": 0.333079 }, { "epoch": 0.20443184361710065, "grad_norm": 0.8283020257949829, "learning_rate": 9.925647760374232e-05, "loss": 0.9916280746459961, "memory(GiB)": 89.65, "step": 15755, "token_acc": 0.7669810702823082, "train_speed(iter/s)": 0.332897 }, { "epoch": 0.20449672201875635, "grad_norm": 0.8176338076591492, "learning_rate": 9.925555575702806e-05, "loss": 0.9671258926391602, "memory(GiB)": 89.65, "step": 15760, "token_acc": 0.7186960352422908, "train_speed(iter/s)": 0.332727 }, { "epoch": 0.20456160042041205, "grad_norm": 0.8722872138023376, "learning_rate": 9.925463334348402e-05, "loss": 1.0005168914794922, "memory(GiB)": 89.65, "step": 15765, "token_acc": 0.7268818564066897, "train_speed(iter/s)": 0.332544 }, { "epoch": 0.20462647882206775, "grad_norm": 1.0090792179107666, "learning_rate": 9.925371036312082e-05, "loss": 0.9633132934570312, "memory(GiB)": 89.65, "step": 15770, "token_acc": 0.7513196905054595, "train_speed(iter/s)": 0.332375 }, { "epoch": 0.20469135722372345, "grad_norm": 0.8813987970352173, "learning_rate": 9.925278681594911e-05, "loss": 0.930479907989502, "memory(GiB)": 89.65, "step": 15775, "token_acc": 0.7868825556588808, "train_speed(iter/s)": 0.3322 }, { "epoch": 0.20475623562537915, "grad_norm": 0.9799169301986694, "learning_rate": 9.925186270197949e-05, "loss": 0.9568308830261231, "memory(GiB)": 89.65, "step": 15780, "token_acc": 0.7458342661592093, "train_speed(iter/s)": 0.332019 }, { "epoch": 0.20482111402703482, "grad_norm": 0.7594404220581055, "learning_rate": 9.92509380212226e-05, "loss": 0.9061190605163574, "memory(GiB)": 89.65, "step": 15785, "token_acc": 0.7627261099945185, "train_speed(iter/s)": 0.331836 }, { "epoch": 0.20488599242869052, "grad_norm": 0.8849910497665405, "learning_rate": 9.92500127736891e-05, "loss": 0.9828107833862305, "memory(GiB)": 89.65, "step": 15790, "token_acc": 0.7593995609997876, "train_speed(iter/s)": 0.331662 }, { "epoch": 0.20495087083034622, "grad_norm": 0.941827118396759, "learning_rate": 9.924908695938961e-05, "loss": 1.0073246002197265, "memory(GiB)": 89.65, "step": 15795, "token_acc": 0.7433656957928803, "train_speed(iter/s)": 0.331484 }, { "epoch": 0.20501574923200191, "grad_norm": 0.8364708423614502, "learning_rate": 9.92481605783348e-05, "loss": 1.0013648986816406, "memory(GiB)": 89.65, "step": 15800, "token_acc": 0.7305145899744281, "train_speed(iter/s)": 0.331296 }, { "epoch": 0.20508062763365761, "grad_norm": 0.7786981463432312, "learning_rate": 9.924723363053534e-05, "loss": 0.9817331314086915, "memory(GiB)": 89.65, "step": 15805, "token_acc": 0.747223487369933, "train_speed(iter/s)": 0.331112 }, { "epoch": 0.2051455060353133, "grad_norm": 0.9115062355995178, "learning_rate": 9.924630611600187e-05, "loss": 0.999117374420166, "memory(GiB)": 89.65, "step": 15810, "token_acc": 0.7410420841683367, "train_speed(iter/s)": 0.330932 }, { "epoch": 0.205210384436969, "grad_norm": 0.7845136523246765, "learning_rate": 9.924537803474507e-05, "loss": 0.9646769523620605, "memory(GiB)": 89.65, "step": 15815, "token_acc": 0.7321577289350762, "train_speed(iter/s)": 0.330757 }, { "epoch": 0.2052752628386247, "grad_norm": 0.8904278874397278, "learning_rate": 9.924444938677565e-05, "loss": 0.982550048828125, "memory(GiB)": 89.65, "step": 15820, "token_acc": 0.7518849534738313, "train_speed(iter/s)": 0.330587 }, { "epoch": 0.2053401412402804, "grad_norm": 0.9284570217132568, "learning_rate": 9.924352017210427e-05, "loss": 0.9935836791992188, "memory(GiB)": 89.65, "step": 15825, "token_acc": 0.719972728305746, "train_speed(iter/s)": 0.330416 }, { "epoch": 0.2054050196419361, "grad_norm": 0.8404231071472168, "learning_rate": 9.924259039074162e-05, "loss": 0.9512187957763671, "memory(GiB)": 89.65, "step": 15830, "token_acc": 0.7531420363660105, "train_speed(iter/s)": 0.330237 }, { "epoch": 0.2054698980435918, "grad_norm": 0.7474850416183472, "learning_rate": 9.924166004269842e-05, "loss": 1.0128034591674804, "memory(GiB)": 89.65, "step": 15835, "token_acc": 0.7389033942558747, "train_speed(iter/s)": 0.330066 }, { "epoch": 0.2055347764452475, "grad_norm": 0.7973255515098572, "learning_rate": 9.924072912798537e-05, "loss": 0.9516165733337403, "memory(GiB)": 89.65, "step": 15840, "token_acc": 0.7561047753639859, "train_speed(iter/s)": 0.329889 }, { "epoch": 0.20559965484690318, "grad_norm": 0.8589317202568054, "learning_rate": 9.923979764661315e-05, "loss": 0.9863450050354003, "memory(GiB)": 89.65, "step": 15845, "token_acc": 0.733497281905255, "train_speed(iter/s)": 0.329721 }, { "epoch": 0.20566453324855888, "grad_norm": 0.9412388205528259, "learning_rate": 9.923886559859253e-05, "loss": 0.9968592643737793, "memory(GiB)": 89.65, "step": 15850, "token_acc": 0.7420434073946984, "train_speed(iter/s)": 0.329553 }, { "epoch": 0.20572941165021458, "grad_norm": 0.7901058793067932, "learning_rate": 9.923793298393422e-05, "loss": 0.9828474044799804, "memory(GiB)": 89.65, "step": 15855, "token_acc": 0.7415666094911378, "train_speed(iter/s)": 0.329368 }, { "epoch": 0.20579429005187028, "grad_norm": 0.8565243482589722, "learning_rate": 9.923699980264894e-05, "loss": 0.9656425476074219, "memory(GiB)": 89.65, "step": 15860, "token_acc": 0.7489439184268026, "train_speed(iter/s)": 0.3292 }, { "epoch": 0.20585916845352598, "grad_norm": 0.8193528056144714, "learning_rate": 9.923606605474743e-05, "loss": 0.9350666999816895, "memory(GiB)": 89.65, "step": 15865, "token_acc": 0.7348776871756857, "train_speed(iter/s)": 0.329017 }, { "epoch": 0.20592404685518167, "grad_norm": 0.7973108291625977, "learning_rate": 9.923513174024044e-05, "loss": 0.9946647644042969, "memory(GiB)": 89.65, "step": 15870, "token_acc": 0.7386870815769898, "train_speed(iter/s)": 0.328838 }, { "epoch": 0.20598892525683737, "grad_norm": 0.8037042021751404, "learning_rate": 9.923419685913872e-05, "loss": 0.9805288314819336, "memory(GiB)": 89.65, "step": 15875, "token_acc": 0.7394533459000943, "train_speed(iter/s)": 0.32867 }, { "epoch": 0.20605380365849307, "grad_norm": 0.7685597538948059, "learning_rate": 9.923326141145303e-05, "loss": 0.9762002944946289, "memory(GiB)": 89.65, "step": 15880, "token_acc": 0.7315855687675014, "train_speed(iter/s)": 0.328499 }, { "epoch": 0.20611868206014877, "grad_norm": 0.9114903211593628, "learning_rate": 9.923232539719415e-05, "loss": 1.0007570266723633, "memory(GiB)": 89.65, "step": 15885, "token_acc": 0.7513939315352697, "train_speed(iter/s)": 0.328332 }, { "epoch": 0.20618356046180447, "grad_norm": 0.9041333794593811, "learning_rate": 9.923138881637283e-05, "loss": 1.0138870239257813, "memory(GiB)": 89.65, "step": 15890, "token_acc": 0.7428284368070953, "train_speed(iter/s)": 0.328155 }, { "epoch": 0.20624843886346017, "grad_norm": 0.8597579598426819, "learning_rate": 9.923045166899986e-05, "loss": 1.036326026916504, "memory(GiB)": 89.65, "step": 15895, "token_acc": 0.7224871071324317, "train_speed(iter/s)": 0.327977 }, { "epoch": 0.20631331726511587, "grad_norm": 0.748046338558197, "learning_rate": 9.922951395508602e-05, "loss": 0.9133877754211426, "memory(GiB)": 89.65, "step": 15900, "token_acc": 0.7605430387059503, "train_speed(iter/s)": 0.327799 }, { "epoch": 0.20637819566677154, "grad_norm": 0.9113165736198425, "learning_rate": 9.92285756746421e-05, "loss": 1.0069875717163086, "memory(GiB)": 89.65, "step": 15905, "token_acc": 0.73629932068565, "train_speed(iter/s)": 0.327633 }, { "epoch": 0.20644307406842724, "grad_norm": 0.9339202642440796, "learning_rate": 9.922763682767889e-05, "loss": 1.0138799667358398, "memory(GiB)": 89.65, "step": 15910, "token_acc": 0.7390677371580014, "train_speed(iter/s)": 0.327465 }, { "epoch": 0.20650795247008294, "grad_norm": 0.7339625358581543, "learning_rate": 9.922669741420721e-05, "loss": 0.9621002197265625, "memory(GiB)": 89.65, "step": 15915, "token_acc": 0.7378003283092076, "train_speed(iter/s)": 0.327283 }, { "epoch": 0.20657283087173864, "grad_norm": 0.8605341911315918, "learning_rate": 9.922575743423787e-05, "loss": 0.9689338684082032, "memory(GiB)": 89.65, "step": 15920, "token_acc": 0.7550761421319797, "train_speed(iter/s)": 0.327109 }, { "epoch": 0.20663770927339434, "grad_norm": 0.8149296045303345, "learning_rate": 9.922481688778169e-05, "loss": 0.9646430969238281, "memory(GiB)": 89.65, "step": 15925, "token_acc": 0.7403893342057909, "train_speed(iter/s)": 0.32693 }, { "epoch": 0.20670258767505004, "grad_norm": 0.7936162948608398, "learning_rate": 9.922387577484946e-05, "loss": 0.9635124206542969, "memory(GiB)": 89.65, "step": 15930, "token_acc": 0.7429795158286778, "train_speed(iter/s)": 0.32675 }, { "epoch": 0.20676746607670574, "grad_norm": 0.8685179948806763, "learning_rate": 9.922293409545205e-05, "loss": 0.9871305465698242, "memory(GiB)": 89.65, "step": 15935, "token_acc": 0.7422492622682065, "train_speed(iter/s)": 0.326579 }, { "epoch": 0.20683234447836144, "grad_norm": 0.803824782371521, "learning_rate": 9.922199184960029e-05, "loss": 0.9713207244873047, "memory(GiB)": 89.65, "step": 15940, "token_acc": 0.7440286624203821, "train_speed(iter/s)": 0.326404 }, { "epoch": 0.20689722288001713, "grad_norm": 0.7903825044631958, "learning_rate": 9.9221049037305e-05, "loss": 0.9558258056640625, "memory(GiB)": 89.65, "step": 15945, "token_acc": 0.7525402201524132, "train_speed(iter/s)": 0.326242 }, { "epoch": 0.20696210128167283, "grad_norm": 0.7993687391281128, "learning_rate": 9.922010565857705e-05, "loss": 0.9603094100952149, "memory(GiB)": 89.65, "step": 15950, "token_acc": 0.746031746031746, "train_speed(iter/s)": 0.326069 }, { "epoch": 0.20702697968332853, "grad_norm": 0.8878947496414185, "learning_rate": 9.921916171342729e-05, "loss": 0.9780506134033203, "memory(GiB)": 89.65, "step": 15955, "token_acc": 0.7291077651390624, "train_speed(iter/s)": 0.325905 }, { "epoch": 0.20709185808498423, "grad_norm": 0.7611897587776184, "learning_rate": 9.92182172018666e-05, "loss": 1.0129476547241212, "memory(GiB)": 89.65, "step": 15960, "token_acc": 0.7107999588180789, "train_speed(iter/s)": 0.325735 }, { "epoch": 0.2071567364866399, "grad_norm": 0.826846182346344, "learning_rate": 9.921727212390582e-05, "loss": 0.9560791015625, "memory(GiB)": 89.65, "step": 15965, "token_acc": 0.7389204448828272, "train_speed(iter/s)": 0.325551 }, { "epoch": 0.2072216148882956, "grad_norm": 0.8440036177635193, "learning_rate": 9.921632647955584e-05, "loss": 1.002410316467285, "memory(GiB)": 89.65, "step": 15970, "token_acc": 0.7175255352299613, "train_speed(iter/s)": 0.325371 }, { "epoch": 0.2072864932899513, "grad_norm": 0.9599837064743042, "learning_rate": 9.921538026882755e-05, "loss": 0.9742895126342773, "memory(GiB)": 89.65, "step": 15975, "token_acc": 0.7359119026578751, "train_speed(iter/s)": 0.325201 }, { "epoch": 0.207351371691607, "grad_norm": 0.8830959796905518, "learning_rate": 9.921443349173183e-05, "loss": 0.958800220489502, "memory(GiB)": 89.65, "step": 15980, "token_acc": 0.740058651026393, "train_speed(iter/s)": 0.325031 }, { "epoch": 0.2074162500932627, "grad_norm": 0.8710813522338867, "learning_rate": 9.921348614827956e-05, "loss": 0.9802392959594727, "memory(GiB)": 89.65, "step": 15985, "token_acc": 0.7425290300546448, "train_speed(iter/s)": 0.324857 }, { "epoch": 0.2074811284949184, "grad_norm": 0.8487351536750793, "learning_rate": 9.921253823848168e-05, "loss": 0.9846914291381836, "memory(GiB)": 89.65, "step": 15990, "token_acc": 0.7318180266885089, "train_speed(iter/s)": 0.324693 }, { "epoch": 0.2075460068965741, "grad_norm": 0.8726041913032532, "learning_rate": 9.921158976234906e-05, "loss": 1.0257086753845215, "memory(GiB)": 89.65, "step": 15995, "token_acc": 0.741096262126043, "train_speed(iter/s)": 0.324512 }, { "epoch": 0.2076108852982298, "grad_norm": 1.0071706771850586, "learning_rate": 9.921064071989265e-05, "loss": 0.9998886108398437, "memory(GiB)": 89.65, "step": 16000, "token_acc": 0.7269079132638694, "train_speed(iter/s)": 0.324333 }, { "epoch": 0.2076757636998855, "grad_norm": 0.9827010035514832, "learning_rate": 9.920969111112335e-05, "loss": 1.0133976936340332, "memory(GiB)": 89.65, "step": 16005, "token_acc": 0.7393546737138229, "train_speed(iter/s)": 0.32418 }, { "epoch": 0.2077406421015412, "grad_norm": 0.8633816242218018, "learning_rate": 9.920874093605209e-05, "loss": 0.9875153541564942, "memory(GiB)": 89.65, "step": 16010, "token_acc": 0.7428211664260925, "train_speed(iter/s)": 0.324002 }, { "epoch": 0.2078055205031969, "grad_norm": 0.7265738844871521, "learning_rate": 9.920779019468979e-05, "loss": 0.9748924255371094, "memory(GiB)": 89.65, "step": 16015, "token_acc": 0.7333030281837891, "train_speed(iter/s)": 0.323831 }, { "epoch": 0.2078703989048526, "grad_norm": 0.8329987525939941, "learning_rate": 9.920683888704743e-05, "loss": 1.0122377395629882, "memory(GiB)": 89.65, "step": 16020, "token_acc": 0.7337312992851353, "train_speed(iter/s)": 0.323665 }, { "epoch": 0.20793527730650826, "grad_norm": 0.7867085933685303, "learning_rate": 9.920588701313594e-05, "loss": 0.9983351707458497, "memory(GiB)": 89.65, "step": 16025, "token_acc": 0.7218575810547841, "train_speed(iter/s)": 0.323508 }, { "epoch": 0.20800015570816396, "grad_norm": 0.8102641701698303, "learning_rate": 9.920493457296625e-05, "loss": 0.9902725219726562, "memory(GiB)": 89.65, "step": 16030, "token_acc": 0.7319845857418111, "train_speed(iter/s)": 0.323336 }, { "epoch": 0.20806503410981966, "grad_norm": 0.8071226477622986, "learning_rate": 9.920398156654933e-05, "loss": 1.012824535369873, "memory(GiB)": 89.65, "step": 16035, "token_acc": 0.7137428531585553, "train_speed(iter/s)": 0.323177 }, { "epoch": 0.20812991251147536, "grad_norm": 0.8607335686683655, "learning_rate": 9.920302799389617e-05, "loss": 1.0248493194580077, "memory(GiB)": 89.65, "step": 16040, "token_acc": 0.74387078165561, "train_speed(iter/s)": 0.323009 }, { "epoch": 0.20819479091313106, "grad_norm": 0.89060378074646, "learning_rate": 9.920207385501774e-05, "loss": 1.0717679977416992, "memory(GiB)": 89.65, "step": 16045, "token_acc": 0.7362073464181138, "train_speed(iter/s)": 0.322844 }, { "epoch": 0.20825966931478676, "grad_norm": 0.839154064655304, "learning_rate": 9.920111914992501e-05, "loss": 1.004679298400879, "memory(GiB)": 89.65, "step": 16050, "token_acc": 0.720740540174643, "train_speed(iter/s)": 0.322684 }, { "epoch": 0.20832454771644246, "grad_norm": 0.8574193120002747, "learning_rate": 9.920016387862895e-05, "loss": 0.9817071914672851, "memory(GiB)": 89.65, "step": 16055, "token_acc": 0.7310513447432763, "train_speed(iter/s)": 0.322516 }, { "epoch": 0.20838942611809816, "grad_norm": 0.7530657649040222, "learning_rate": 9.919920804114059e-05, "loss": 0.9616043090820312, "memory(GiB)": 89.65, "step": 16060, "token_acc": 0.7650845649855249, "train_speed(iter/s)": 0.322352 }, { "epoch": 0.20845430451975386, "grad_norm": 0.8016478419303894, "learning_rate": 9.919825163747089e-05, "loss": 0.937152099609375, "memory(GiB)": 89.65, "step": 16065, "token_acc": 0.7578874420979684, "train_speed(iter/s)": 0.322181 }, { "epoch": 0.20851918292140956, "grad_norm": 0.8795000314712524, "learning_rate": 9.91972946676309e-05, "loss": 0.9346677780151367, "memory(GiB)": 89.65, "step": 16070, "token_acc": 0.7734870137939605, "train_speed(iter/s)": 0.322004 }, { "epoch": 0.20858406132306526, "grad_norm": 1.055545687675476, "learning_rate": 9.919633713163159e-05, "loss": 0.9668727874755859, "memory(GiB)": 89.65, "step": 16075, "token_acc": 0.7644913916246898, "train_speed(iter/s)": 0.321839 }, { "epoch": 0.20864893972472096, "grad_norm": 0.851004958152771, "learning_rate": 9.919537902948402e-05, "loss": 0.9990901947021484, "memory(GiB)": 89.65, "step": 16080, "token_acc": 0.7206138587320734, "train_speed(iter/s)": 0.321678 }, { "epoch": 0.20871381812637663, "grad_norm": 0.7789195775985718, "learning_rate": 9.919442036119916e-05, "loss": 0.9535748481750488, "memory(GiB)": 89.65, "step": 16085, "token_acc": 0.738032962531119, "train_speed(iter/s)": 0.321514 }, { "epoch": 0.20877869652803233, "grad_norm": 0.8728150725364685, "learning_rate": 9.919346112678812e-05, "loss": 1.0137518882751464, "memory(GiB)": 89.65, "step": 16090, "token_acc": 0.7255245958032336, "train_speed(iter/s)": 0.321353 }, { "epoch": 0.20884357492968803, "grad_norm": 0.8598151206970215, "learning_rate": 9.919250132626186e-05, "loss": 0.9409168243408204, "memory(GiB)": 89.65, "step": 16095, "token_acc": 0.7354960762331838, "train_speed(iter/s)": 0.321182 }, { "epoch": 0.20890845333134372, "grad_norm": 0.7918723821640015, "learning_rate": 9.919154095963148e-05, "loss": 1.015459156036377, "memory(GiB)": 89.65, "step": 16100, "token_acc": 0.7478895664974314, "train_speed(iter/s)": 0.321018 }, { "epoch": 0.20897333173299942, "grad_norm": 0.9226036667823792, "learning_rate": 9.919058002690799e-05, "loss": 0.9721007347106934, "memory(GiB)": 89.65, "step": 16105, "token_acc": 0.7450117063284062, "train_speed(iter/s)": 0.320848 }, { "epoch": 0.20903821013465512, "grad_norm": 0.7674175500869751, "learning_rate": 9.918961852810249e-05, "loss": 0.9555426597595215, "memory(GiB)": 89.65, "step": 16110, "token_acc": 0.7474400151143019, "train_speed(iter/s)": 0.320692 }, { "epoch": 0.20910308853631082, "grad_norm": 0.795595645904541, "learning_rate": 9.918865646322602e-05, "loss": 0.9786827087402343, "memory(GiB)": 89.65, "step": 16115, "token_acc": 0.7376998874833782, "train_speed(iter/s)": 0.320534 }, { "epoch": 0.20916796693796652, "grad_norm": 0.8301074504852295, "learning_rate": 9.918769383228967e-05, "loss": 0.9963440895080566, "memory(GiB)": 89.65, "step": 16120, "token_acc": 0.7566159549987216, "train_speed(iter/s)": 0.32037 }, { "epoch": 0.20923284533962222, "grad_norm": 0.7933509945869446, "learning_rate": 9.918673063530449e-05, "loss": 0.9709458351135254, "memory(GiB)": 89.65, "step": 16125, "token_acc": 0.7356525512602611, "train_speed(iter/s)": 0.320197 }, { "epoch": 0.20929772374127792, "grad_norm": 0.8138552904129028, "learning_rate": 9.918576687228159e-05, "loss": 0.956486701965332, "memory(GiB)": 89.65, "step": 16130, "token_acc": 0.7532678342588638, "train_speed(iter/s)": 0.320032 }, { "epoch": 0.20936260214293362, "grad_norm": 0.7684318423271179, "learning_rate": 9.918480254323203e-05, "loss": 0.9809523582458496, "memory(GiB)": 89.65, "step": 16135, "token_acc": 0.7287194419990519, "train_speed(iter/s)": 0.319869 }, { "epoch": 0.20942748054458932, "grad_norm": 0.8367980122566223, "learning_rate": 9.918383764816695e-05, "loss": 0.9878980636596679, "memory(GiB)": 89.65, "step": 16140, "token_acc": 0.7473287723883465, "train_speed(iter/s)": 0.319702 }, { "epoch": 0.209492358946245, "grad_norm": 0.7800894975662231, "learning_rate": 9.918287218709743e-05, "loss": 0.9728158950805664, "memory(GiB)": 89.65, "step": 16145, "token_acc": 0.7511362376676644, "train_speed(iter/s)": 0.319548 }, { "epoch": 0.2095572373479007, "grad_norm": 0.7463024854660034, "learning_rate": 9.918190616003458e-05, "loss": 0.9637968063354492, "memory(GiB)": 89.65, "step": 16150, "token_acc": 0.7434803841771122, "train_speed(iter/s)": 0.319389 }, { "epoch": 0.2096221157495564, "grad_norm": 0.7764238715171814, "learning_rate": 9.91809395669895e-05, "loss": 1.012062168121338, "memory(GiB)": 89.65, "step": 16155, "token_acc": 0.7249609008445418, "train_speed(iter/s)": 0.319221 }, { "epoch": 0.20968699415121209, "grad_norm": 0.9029353260993958, "learning_rate": 9.917997240797336e-05, "loss": 0.9978830337524414, "memory(GiB)": 89.65, "step": 16160, "token_acc": 0.7365445956312836, "train_speed(iter/s)": 0.319061 }, { "epoch": 0.20975187255286779, "grad_norm": 0.8033533096313477, "learning_rate": 9.917900468299726e-05, "loss": 1.0615399360656739, "memory(GiB)": 89.65, "step": 16165, "token_acc": 0.734215446569826, "train_speed(iter/s)": 0.318898 }, { "epoch": 0.20981675095452348, "grad_norm": 0.865185558795929, "learning_rate": 9.917803639207233e-05, "loss": 1.0144722938537598, "memory(GiB)": 89.65, "step": 16170, "token_acc": 0.7153627311522048, "train_speed(iter/s)": 0.318741 }, { "epoch": 0.20988162935617918, "grad_norm": 0.933725118637085, "learning_rate": 9.917706753520971e-05, "loss": 1.0001209259033204, "memory(GiB)": 89.65, "step": 16175, "token_acc": 0.7324757247147211, "train_speed(iter/s)": 0.318591 }, { "epoch": 0.20994650775783488, "grad_norm": 0.8601433038711548, "learning_rate": 9.917609811242057e-05, "loss": 1.0309837341308594, "memory(GiB)": 89.65, "step": 16180, "token_acc": 0.7423490151391828, "train_speed(iter/s)": 0.318442 }, { "epoch": 0.21001138615949058, "grad_norm": 0.7819585204124451, "learning_rate": 9.917512812371606e-05, "loss": 0.9686826705932617, "memory(GiB)": 89.65, "step": 16185, "token_acc": 0.7619827159553813, "train_speed(iter/s)": 0.31829 }, { "epoch": 0.21007626456114628, "grad_norm": 0.8968464136123657, "learning_rate": 9.917415756910735e-05, "loss": 0.953865909576416, "memory(GiB)": 89.65, "step": 16190, "token_acc": 0.7561625582944703, "train_speed(iter/s)": 0.318143 }, { "epoch": 0.21014114296280198, "grad_norm": 0.7089178562164307, "learning_rate": 9.91731864486056e-05, "loss": 0.9616935729980469, "memory(GiB)": 89.65, "step": 16195, "token_acc": 0.7292328263413004, "train_speed(iter/s)": 0.317984 }, { "epoch": 0.21020602136445765, "grad_norm": 1.0854442119598389, "learning_rate": 9.917221476222197e-05, "loss": 0.9568533897399902, "memory(GiB)": 89.65, "step": 16200, "token_acc": 0.7392409831434221, "train_speed(iter/s)": 0.317821 }, { "epoch": 0.21027089976611335, "grad_norm": 0.9382250308990479, "learning_rate": 9.917124250996766e-05, "loss": 0.9851888656616211, "memory(GiB)": 89.65, "step": 16205, "token_acc": 0.7290476404696568, "train_speed(iter/s)": 0.317665 }, { "epoch": 0.21033577816776905, "grad_norm": 0.8216039538383484, "learning_rate": 9.917026969185387e-05, "loss": 0.9826550483703613, "memory(GiB)": 89.65, "step": 16210, "token_acc": 0.7324739855255104, "train_speed(iter/s)": 0.317508 }, { "epoch": 0.21040065656942475, "grad_norm": 0.8318911790847778, "learning_rate": 9.916929630789177e-05, "loss": 1.021925926208496, "memory(GiB)": 89.65, "step": 16215, "token_acc": 0.736822218023805, "train_speed(iter/s)": 0.31736 }, { "epoch": 0.21046553497108045, "grad_norm": 0.8987971544265747, "learning_rate": 9.916832235809257e-05, "loss": 1.022885513305664, "memory(GiB)": 89.65, "step": 16220, "token_acc": 0.7488272771826647, "train_speed(iter/s)": 0.317202 }, { "epoch": 0.21053041337273615, "grad_norm": 0.8124363422393799, "learning_rate": 9.916734784246748e-05, "loss": 0.9883916854858399, "memory(GiB)": 89.65, "step": 16225, "token_acc": 0.7440514241068258, "train_speed(iter/s)": 0.317043 }, { "epoch": 0.21059529177439185, "grad_norm": 0.9359393119812012, "learning_rate": 9.916637276102773e-05, "loss": 0.9777406692504883, "memory(GiB)": 89.65, "step": 16230, "token_acc": 0.7309471213585531, "train_speed(iter/s)": 0.316904 }, { "epoch": 0.21066017017604755, "grad_norm": 0.8199836611747742, "learning_rate": 9.916539711378451e-05, "loss": 0.9673976898193359, "memory(GiB)": 89.65, "step": 16235, "token_acc": 0.7574803604436229, "train_speed(iter/s)": 0.316737 }, { "epoch": 0.21072504857770324, "grad_norm": 0.8921099901199341, "learning_rate": 9.916442090074907e-05, "loss": 1.0303989410400392, "memory(GiB)": 89.65, "step": 16240, "token_acc": 0.7394445391037655, "train_speed(iter/s)": 0.31657 }, { "epoch": 0.21078992697935894, "grad_norm": 0.8672165274620056, "learning_rate": 9.916344412193263e-05, "loss": 1.0197086334228516, "memory(GiB)": 89.65, "step": 16245, "token_acc": 0.7407158128117824, "train_speed(iter/s)": 0.316425 }, { "epoch": 0.21085480538101464, "grad_norm": 0.884097158908844, "learning_rate": 9.916246677734647e-05, "loss": 0.9708377838134765, "memory(GiB)": 89.65, "step": 16250, "token_acc": 0.7357681075819302, "train_speed(iter/s)": 0.316274 }, { "epoch": 0.21091968378267034, "grad_norm": 0.8270410895347595, "learning_rate": 9.916148886700178e-05, "loss": 0.996759033203125, "memory(GiB)": 89.65, "step": 16255, "token_acc": 0.7456670697299476, "train_speed(iter/s)": 0.316107 }, { "epoch": 0.210984562184326, "grad_norm": 0.8920193314552307, "learning_rate": 9.916051039090984e-05, "loss": 0.9697940826416016, "memory(GiB)": 89.65, "step": 16260, "token_acc": 0.748561110484144, "train_speed(iter/s)": 0.315958 }, { "epoch": 0.2110494405859817, "grad_norm": 0.7788894176483154, "learning_rate": 9.915953134908191e-05, "loss": 0.9980021476745605, "memory(GiB)": 89.65, "step": 16265, "token_acc": 0.7505649226490526, "train_speed(iter/s)": 0.315801 }, { "epoch": 0.2111143189876374, "grad_norm": 0.8262596130371094, "learning_rate": 9.915855174152927e-05, "loss": 0.975406551361084, "memory(GiB)": 89.65, "step": 16270, "token_acc": 0.7354852973690029, "train_speed(iter/s)": 0.315652 }, { "epoch": 0.2111791973892931, "grad_norm": 0.8812094330787659, "learning_rate": 9.915757156826318e-05, "loss": 0.979881763458252, "memory(GiB)": 89.65, "step": 16275, "token_acc": 0.7369267900241352, "train_speed(iter/s)": 0.315501 }, { "epoch": 0.2112440757909488, "grad_norm": 0.763163685798645, "learning_rate": 9.915659082929491e-05, "loss": 0.9915812492370606, "memory(GiB)": 89.65, "step": 16280, "token_acc": 0.7199549116467195, "train_speed(iter/s)": 0.315343 }, { "epoch": 0.2113089541926045, "grad_norm": 0.8576927185058594, "learning_rate": 9.915560952463576e-05, "loss": 0.945819091796875, "memory(GiB)": 89.65, "step": 16285, "token_acc": 0.7394277220364177, "train_speed(iter/s)": 0.315198 }, { "epoch": 0.2113738325942602, "grad_norm": 0.7417463660240173, "learning_rate": 9.915462765429702e-05, "loss": 0.9884578704833984, "memory(GiB)": 89.65, "step": 16290, "token_acc": 0.7542525672179979, "train_speed(iter/s)": 0.31505 }, { "epoch": 0.2114387109959159, "grad_norm": 0.8308852910995483, "learning_rate": 9.915364521828999e-05, "loss": 1.0207143783569337, "memory(GiB)": 89.65, "step": 16295, "token_acc": 0.735770887449446, "train_speed(iter/s)": 0.314897 }, { "epoch": 0.2115035893975716, "grad_norm": 0.7875194549560547, "learning_rate": 9.915266221662595e-05, "loss": 0.983714485168457, "memory(GiB)": 89.65, "step": 16300, "token_acc": 0.7371062168943406, "train_speed(iter/s)": 0.314749 }, { "epoch": 0.2115684677992273, "grad_norm": 0.8169892430305481, "learning_rate": 9.915167864931627e-05, "loss": 0.9611991882324219, "memory(GiB)": 89.65, "step": 16305, "token_acc": 0.7289230463444529, "train_speed(iter/s)": 0.314586 }, { "epoch": 0.211633346200883, "grad_norm": 0.8890318870544434, "learning_rate": 9.915069451637223e-05, "loss": 0.9511870384216309, "memory(GiB)": 89.65, "step": 16310, "token_acc": 0.7506718881978038, "train_speed(iter/s)": 0.314432 }, { "epoch": 0.2116982246025387, "grad_norm": 0.8611366152763367, "learning_rate": 9.914970981780516e-05, "loss": 0.9886322021484375, "memory(GiB)": 89.65, "step": 16315, "token_acc": 0.7411132460670109, "train_speed(iter/s)": 0.314273 }, { "epoch": 0.21176310300419438, "grad_norm": 0.8686774373054504, "learning_rate": 9.914872455362639e-05, "loss": 1.053838062286377, "memory(GiB)": 89.65, "step": 16320, "token_acc": 0.7302426134998798, "train_speed(iter/s)": 0.314118 }, { "epoch": 0.21182798140585007, "grad_norm": 0.8965868353843689, "learning_rate": 9.914773872384723e-05, "loss": 0.9975987434387207, "memory(GiB)": 89.65, "step": 16325, "token_acc": 0.7101849473413819, "train_speed(iter/s)": 0.313977 }, { "epoch": 0.21189285980750577, "grad_norm": 0.9302377700805664, "learning_rate": 9.91467523284791e-05, "loss": 0.9905570983886719, "memory(GiB)": 89.65, "step": 16330, "token_acc": 0.7362182502351834, "train_speed(iter/s)": 0.313834 }, { "epoch": 0.21195773820916147, "grad_norm": 0.7684674859046936, "learning_rate": 9.91457653675333e-05, "loss": 0.9858423233032226, "memory(GiB)": 89.65, "step": 16335, "token_acc": 0.7241847353217554, "train_speed(iter/s)": 0.313678 }, { "epoch": 0.21202261661081717, "grad_norm": 0.7165451049804688, "learning_rate": 9.914477784102119e-05, "loss": 0.9334287643432617, "memory(GiB)": 89.65, "step": 16340, "token_acc": 0.7456698480028279, "train_speed(iter/s)": 0.313528 }, { "epoch": 0.21208749501247287, "grad_norm": 0.8667672276496887, "learning_rate": 9.914378974895414e-05, "loss": 0.9849003791809082, "memory(GiB)": 89.65, "step": 16345, "token_acc": 0.7514677103718199, "train_speed(iter/s)": 0.31337 }, { "epoch": 0.21215237341412857, "grad_norm": 0.7740128636360168, "learning_rate": 9.914280109134351e-05, "loss": 0.9665106773376465, "memory(GiB)": 89.65, "step": 16350, "token_acc": 0.7184940976284164, "train_speed(iter/s)": 0.313219 }, { "epoch": 0.21221725181578427, "grad_norm": 0.8160079717636108, "learning_rate": 9.914181186820068e-05, "loss": 0.9185642242431641, "memory(GiB)": 89.65, "step": 16355, "token_acc": 0.7771477549943957, "train_speed(iter/s)": 0.313058 }, { "epoch": 0.21228213021743997, "grad_norm": 0.8062708377838135, "learning_rate": 9.914082207953706e-05, "loss": 0.9786247253417969, "memory(GiB)": 89.65, "step": 16360, "token_acc": 0.7443388009870808, "train_speed(iter/s)": 0.31291 }, { "epoch": 0.21234700861909567, "grad_norm": 0.7676191926002502, "learning_rate": 9.913983172536402e-05, "loss": 0.9680014610290527, "memory(GiB)": 89.65, "step": 16365, "token_acc": 0.7518035466789864, "train_speed(iter/s)": 0.312756 }, { "epoch": 0.21241188702075137, "grad_norm": 0.8249332308769226, "learning_rate": 9.913884080569296e-05, "loss": 0.9665069580078125, "memory(GiB)": 89.65, "step": 16370, "token_acc": 0.7293767464463613, "train_speed(iter/s)": 0.312607 }, { "epoch": 0.21247676542240707, "grad_norm": 0.8994984030723572, "learning_rate": 9.913784932053528e-05, "loss": 0.9414220809936523, "memory(GiB)": 89.65, "step": 16375, "token_acc": 0.7625484035946518, "train_speed(iter/s)": 0.312454 }, { "epoch": 0.21254164382406274, "grad_norm": 0.8932491540908813, "learning_rate": 9.913685726990239e-05, "loss": 0.9879932403564453, "memory(GiB)": 89.65, "step": 16380, "token_acc": 0.7510619901507463, "train_speed(iter/s)": 0.312311 }, { "epoch": 0.21260652222571844, "grad_norm": 0.8323279023170471, "learning_rate": 9.913586465380571e-05, "loss": 0.9679712295532227, "memory(GiB)": 89.65, "step": 16385, "token_acc": 0.7416792865556117, "train_speed(iter/s)": 0.312163 }, { "epoch": 0.21267140062737414, "grad_norm": 0.7777948379516602, "learning_rate": 9.913487147225667e-05, "loss": 0.956072998046875, "memory(GiB)": 89.65, "step": 16390, "token_acc": 0.7272418093186713, "train_speed(iter/s)": 0.312001 }, { "epoch": 0.21273627902902983, "grad_norm": 0.8640491366386414, "learning_rate": 9.913387772526668e-05, "loss": 0.9657190322875977, "memory(GiB)": 89.65, "step": 16395, "token_acc": 0.749525157992886, "train_speed(iter/s)": 0.311851 }, { "epoch": 0.21280115743068553, "grad_norm": 0.7774252891540527, "learning_rate": 9.913288341284719e-05, "loss": 0.9569988250732422, "memory(GiB)": 89.65, "step": 16400, "token_acc": 0.769112915581431, "train_speed(iter/s)": 0.311688 }, { "epoch": 0.21286603583234123, "grad_norm": 0.8453464508056641, "learning_rate": 9.913188853500966e-05, "loss": 0.9500871658325195, "memory(GiB)": 89.65, "step": 16405, "token_acc": 0.7479683021412915, "train_speed(iter/s)": 0.311538 }, { "epoch": 0.21293091423399693, "grad_norm": 0.93779456615448, "learning_rate": 9.913089309176548e-05, "loss": 1.041395664215088, "memory(GiB)": 89.65, "step": 16410, "token_acc": 0.7122365988909427, "train_speed(iter/s)": 0.311402 }, { "epoch": 0.21299579263565263, "grad_norm": 0.7586132287979126, "learning_rate": 9.912989708312617e-05, "loss": 0.9550201416015625, "memory(GiB)": 89.65, "step": 16415, "token_acc": 0.7482285208148804, "train_speed(iter/s)": 0.311258 }, { "epoch": 0.21306067103730833, "grad_norm": 0.7485277652740479, "learning_rate": 9.912890050910316e-05, "loss": 0.953059196472168, "memory(GiB)": 89.65, "step": 16420, "token_acc": 0.7289250918383645, "train_speed(iter/s)": 0.311102 }, { "epoch": 0.21312554943896403, "grad_norm": 0.8102803230285645, "learning_rate": 9.912790336970792e-05, "loss": 1.0040390014648437, "memory(GiB)": 89.65, "step": 16425, "token_acc": 0.7371410905437261, "train_speed(iter/s)": 0.31096 }, { "epoch": 0.21319042784061973, "grad_norm": 0.9062525033950806, "learning_rate": 9.912690566495195e-05, "loss": 0.9908709526062012, "memory(GiB)": 89.65, "step": 16430, "token_acc": 0.7306556182146008, "train_speed(iter/s)": 0.310811 }, { "epoch": 0.21325530624227543, "grad_norm": 0.860710620880127, "learning_rate": 9.912590739484669e-05, "loss": 1.0061075210571289, "memory(GiB)": 89.65, "step": 16435, "token_acc": 0.7277220397170677, "train_speed(iter/s)": 0.310665 }, { "epoch": 0.2133201846439311, "grad_norm": 0.8974315524101257, "learning_rate": 9.912490855940366e-05, "loss": 1.0082490921020508, "memory(GiB)": 89.65, "step": 16440, "token_acc": 0.7127848370838361, "train_speed(iter/s)": 0.310516 }, { "epoch": 0.2133850630455868, "grad_norm": 0.806363582611084, "learning_rate": 9.912390915863434e-05, "loss": 0.962136173248291, "memory(GiB)": 89.65, "step": 16445, "token_acc": 0.7623796130571892, "train_speed(iter/s)": 0.310363 }, { "epoch": 0.2134499414472425, "grad_norm": 0.79999178647995, "learning_rate": 9.912290919255024e-05, "loss": 0.9678106307983398, "memory(GiB)": 89.65, "step": 16450, "token_acc": 0.7498330110740025, "train_speed(iter/s)": 0.31021 }, { "epoch": 0.2135148198488982, "grad_norm": 0.9000002145767212, "learning_rate": 9.912190866116286e-05, "loss": 1.0085926055908203, "memory(GiB)": 89.65, "step": 16455, "token_acc": 0.7342069558886793, "train_speed(iter/s)": 0.310073 }, { "epoch": 0.2135796982505539, "grad_norm": 0.7852510213851929, "learning_rate": 9.912090756448372e-05, "loss": 0.9980619430541993, "memory(GiB)": 89.65, "step": 16460, "token_acc": 0.7384163999390336, "train_speed(iter/s)": 0.30993 }, { "epoch": 0.2136445766522096, "grad_norm": 0.7807852029800415, "learning_rate": 9.911990590252433e-05, "loss": 0.9649677276611328, "memory(GiB)": 89.65, "step": 16465, "token_acc": 0.7384697414022927, "train_speed(iter/s)": 0.309787 }, { "epoch": 0.2137094550538653, "grad_norm": 0.8019145131111145, "learning_rate": 9.911890367529621e-05, "loss": 0.966917610168457, "memory(GiB)": 89.65, "step": 16470, "token_acc": 0.756084388185654, "train_speed(iter/s)": 0.309649 }, { "epoch": 0.213774333455521, "grad_norm": 0.7731825709342957, "learning_rate": 9.911790088281094e-05, "loss": 0.9734806060791016, "memory(GiB)": 89.65, "step": 16475, "token_acc": 0.7334594975694831, "train_speed(iter/s)": 0.309506 }, { "epoch": 0.2138392118571767, "grad_norm": 0.8143060207366943, "learning_rate": 9.911689752508e-05, "loss": 0.9858640670776367, "memory(GiB)": 89.65, "step": 16480, "token_acc": 0.74497204675815, "train_speed(iter/s)": 0.309369 }, { "epoch": 0.2139040902588324, "grad_norm": 0.791163980960846, "learning_rate": 9.911589360211498e-05, "loss": 0.9946230888366699, "memory(GiB)": 89.65, "step": 16485, "token_acc": 0.7334817524051841, "train_speed(iter/s)": 0.309227 }, { "epoch": 0.2139689686604881, "grad_norm": 0.9144330620765686, "learning_rate": 9.911488911392741e-05, "loss": 0.9856603622436524, "memory(GiB)": 89.65, "step": 16490, "token_acc": 0.736587211502081, "train_speed(iter/s)": 0.309071 }, { "epoch": 0.2140338470621438, "grad_norm": 1.0234521627426147, "learning_rate": 9.911388406052887e-05, "loss": 0.996976089477539, "memory(GiB)": 89.65, "step": 16495, "token_acc": 0.7400611620795107, "train_speed(iter/s)": 0.308921 }, { "epoch": 0.21409872546379946, "grad_norm": 0.8320348858833313, "learning_rate": 9.911287844193091e-05, "loss": 0.9992586135864258, "memory(GiB)": 89.65, "step": 16500, "token_acc": 0.7344199075926802, "train_speed(iter/s)": 0.308775 }, { "epoch": 0.21416360386545516, "grad_norm": 0.8681120276451111, "learning_rate": 9.911187225814509e-05, "loss": 0.9156489372253418, "memory(GiB)": 89.65, "step": 16505, "token_acc": 0.7622607552002215, "train_speed(iter/s)": 0.308632 }, { "epoch": 0.21422848226711086, "grad_norm": 1.0023705959320068, "learning_rate": 9.911086550918303e-05, "loss": 1.0233186721801757, "memory(GiB)": 89.65, "step": 16510, "token_acc": 0.7310330912025828, "train_speed(iter/s)": 0.308491 }, { "epoch": 0.21429336066876656, "grad_norm": 0.7959322333335876, "learning_rate": 9.910985819505627e-05, "loss": 0.9964740753173829, "memory(GiB)": 89.65, "step": 16515, "token_acc": 0.7425051688490696, "train_speed(iter/s)": 0.308345 }, { "epoch": 0.21435823907042226, "grad_norm": 0.8536934852600098, "learning_rate": 9.910885031577642e-05, "loss": 0.9777542114257812, "memory(GiB)": 89.65, "step": 16520, "token_acc": 0.7434585709493459, "train_speed(iter/s)": 0.308214 }, { "epoch": 0.21442311747207796, "grad_norm": 0.9046229124069214, "learning_rate": 9.910784187135508e-05, "loss": 0.9193485260009766, "memory(GiB)": 89.65, "step": 16525, "token_acc": 0.7493600094521681, "train_speed(iter/s)": 0.308065 }, { "epoch": 0.21448799587373366, "grad_norm": 0.8348016738891602, "learning_rate": 9.910683286180388e-05, "loss": 0.9636377334594727, "memory(GiB)": 89.65, "step": 16530, "token_acc": 0.7471896383186706, "train_speed(iter/s)": 0.30792 }, { "epoch": 0.21455287427538935, "grad_norm": 0.8162512183189392, "learning_rate": 9.91058232871344e-05, "loss": 0.9764731407165528, "memory(GiB)": 89.65, "step": 16535, "token_acc": 0.7345845508353938, "train_speed(iter/s)": 0.307787 }, { "epoch": 0.21461775267704505, "grad_norm": 0.7352502942085266, "learning_rate": 9.910481314735825e-05, "loss": 0.9902411460876465, "memory(GiB)": 89.65, "step": 16540, "token_acc": 0.7411845045939905, "train_speed(iter/s)": 0.307654 }, { "epoch": 0.21468263107870075, "grad_norm": 0.792322039604187, "learning_rate": 9.910380244248708e-05, "loss": 0.9709924697875977, "memory(GiB)": 89.65, "step": 16545, "token_acc": 0.7217953129181481, "train_speed(iter/s)": 0.307515 }, { "epoch": 0.21474750948035645, "grad_norm": 0.9324902892112732, "learning_rate": 9.910279117253252e-05, "loss": 0.9671077728271484, "memory(GiB)": 89.65, "step": 16550, "token_acc": 0.7187171932676352, "train_speed(iter/s)": 0.307379 }, { "epoch": 0.21481238788201215, "grad_norm": 0.7979962825775146, "learning_rate": 9.910177933750619e-05, "loss": 0.9989227294921875, "memory(GiB)": 89.65, "step": 16555, "token_acc": 0.7520251877603227, "train_speed(iter/s)": 0.30724 }, { "epoch": 0.21487726628366782, "grad_norm": 0.8604037165641785, "learning_rate": 9.910076693741974e-05, "loss": 0.9818105697631836, "memory(GiB)": 89.65, "step": 16560, "token_acc": 0.7612889696630785, "train_speed(iter/s)": 0.307097 }, { "epoch": 0.21494214468532352, "grad_norm": 0.8334984183311462, "learning_rate": 9.909975397228484e-05, "loss": 0.9248238563537597, "memory(GiB)": 89.65, "step": 16565, "token_acc": 0.7470755402815412, "train_speed(iter/s)": 0.306951 }, { "epoch": 0.21500702308697922, "grad_norm": 0.8497083187103271, "learning_rate": 9.909874044211312e-05, "loss": 0.9516575813293457, "memory(GiB)": 89.65, "step": 16570, "token_acc": 0.7522048469576817, "train_speed(iter/s)": 0.3068 }, { "epoch": 0.21507190148863492, "grad_norm": 0.8974537253379822, "learning_rate": 9.909772634691625e-05, "loss": 1.0214977264404297, "memory(GiB)": 89.65, "step": 16575, "token_acc": 0.7352749331419999, "train_speed(iter/s)": 0.306663 }, { "epoch": 0.21513677989029062, "grad_norm": 0.8701181411743164, "learning_rate": 9.90967116867059e-05, "loss": 1.0212112426757813, "memory(GiB)": 89.65, "step": 16580, "token_acc": 0.7361917310637086, "train_speed(iter/s)": 0.306525 }, { "epoch": 0.21520165829194632, "grad_norm": 0.8653997778892517, "learning_rate": 9.909569646149377e-05, "loss": 0.9942615509033204, "memory(GiB)": 89.65, "step": 16585, "token_acc": 0.7379668543335184, "train_speed(iter/s)": 0.306385 }, { "epoch": 0.21526653669360202, "grad_norm": 0.9101306796073914, "learning_rate": 9.90946806712915e-05, "loss": 1.0193456649780273, "memory(GiB)": 89.65, "step": 16590, "token_acc": 0.7441919755250327, "train_speed(iter/s)": 0.306247 }, { "epoch": 0.21533141509525772, "grad_norm": 0.814784049987793, "learning_rate": 9.909366431611084e-05, "loss": 0.9899440765380859, "memory(GiB)": 89.65, "step": 16595, "token_acc": 0.7477200567452543, "train_speed(iter/s)": 0.306102 }, { "epoch": 0.21539629349691342, "grad_norm": 0.8457547426223755, "learning_rate": 9.909264739596342e-05, "loss": 0.9659971237182617, "memory(GiB)": 89.65, "step": 16600, "token_acc": 0.7403641522818634, "train_speed(iter/s)": 0.305959 }, { "epoch": 0.21546117189856911, "grad_norm": 0.8908969759941101, "learning_rate": 9.9091629910861e-05, "loss": 1.0124704360961914, "memory(GiB)": 89.65, "step": 16605, "token_acc": 0.7477285296673777, "train_speed(iter/s)": 0.305821 }, { "epoch": 0.21552605030022481, "grad_norm": 0.8131412863731384, "learning_rate": 9.909061186081524e-05, "loss": 1.0032888412475587, "memory(GiB)": 89.65, "step": 16610, "token_acc": 0.7162345941351467, "train_speed(iter/s)": 0.305684 }, { "epoch": 0.2155909287018805, "grad_norm": 0.8081653118133545, "learning_rate": 9.908959324583788e-05, "loss": 0.9967777252197265, "memory(GiB)": 89.65, "step": 16615, "token_acc": 0.7410883380329111, "train_speed(iter/s)": 0.305558 }, { "epoch": 0.21565580710353618, "grad_norm": 0.8436856865882874, "learning_rate": 9.908857406594066e-05, "loss": 0.9244979858398438, "memory(GiB)": 89.65, "step": 16620, "token_acc": 0.7561865628162276, "train_speed(iter/s)": 0.305414 }, { "epoch": 0.21572068550519188, "grad_norm": 0.8775990605354309, "learning_rate": 9.908755432113528e-05, "loss": 0.98953857421875, "memory(GiB)": 89.65, "step": 16625, "token_acc": 0.7532627610208816, "train_speed(iter/s)": 0.305284 }, { "epoch": 0.21578556390684758, "grad_norm": 0.839905858039856, "learning_rate": 9.908653401143347e-05, "loss": 0.9627765655517578, "memory(GiB)": 89.65, "step": 16630, "token_acc": 0.7374083129584352, "train_speed(iter/s)": 0.305142 }, { "epoch": 0.21585044230850328, "grad_norm": 0.8770694732666016, "learning_rate": 9.9085513136847e-05, "loss": 0.957496452331543, "memory(GiB)": 89.65, "step": 16635, "token_acc": 0.7533113673805601, "train_speed(iter/s)": 0.304997 }, { "epoch": 0.21591532071015898, "grad_norm": 0.7859492897987366, "learning_rate": 9.90844916973876e-05, "loss": 0.9879940032958985, "memory(GiB)": 89.65, "step": 16640, "token_acc": 0.7406403304931578, "train_speed(iter/s)": 0.304864 }, { "epoch": 0.21598019911181468, "grad_norm": 0.8870040774345398, "learning_rate": 9.908346969306703e-05, "loss": 0.9413699150085449, "memory(GiB)": 89.65, "step": 16645, "token_acc": 0.7508778471769056, "train_speed(iter/s)": 0.304728 }, { "epoch": 0.21604507751347038, "grad_norm": 0.9590166807174683, "learning_rate": 9.908244712389705e-05, "loss": 1.0011123657226562, "memory(GiB)": 89.65, "step": 16650, "token_acc": 0.7359805434731771, "train_speed(iter/s)": 0.304608 }, { "epoch": 0.21610995591512608, "grad_norm": 0.8771262764930725, "learning_rate": 9.908142398988942e-05, "loss": 1.0227081298828125, "memory(GiB)": 89.65, "step": 16655, "token_acc": 0.721114656724906, "train_speed(iter/s)": 0.304473 }, { "epoch": 0.21617483431678178, "grad_norm": 0.8759707808494568, "learning_rate": 9.908040029105592e-05, "loss": 0.9948584556579589, "memory(GiB)": 89.65, "step": 16660, "token_acc": 0.7291242362525459, "train_speed(iter/s)": 0.304333 }, { "epoch": 0.21623971271843748, "grad_norm": 0.9193868637084961, "learning_rate": 9.907937602740834e-05, "loss": 1.0026572227478028, "memory(GiB)": 89.65, "step": 16665, "token_acc": 0.7501645705574611, "train_speed(iter/s)": 0.304203 }, { "epoch": 0.21630459112009318, "grad_norm": 0.8335990905761719, "learning_rate": 9.907835119895847e-05, "loss": 0.9574136734008789, "memory(GiB)": 89.65, "step": 16670, "token_acc": 0.7388816555882958, "train_speed(iter/s)": 0.30407 }, { "epoch": 0.21636946952174888, "grad_norm": 0.8688567876815796, "learning_rate": 9.907732580571807e-05, "loss": 0.9732705116271972, "memory(GiB)": 89.65, "step": 16675, "token_acc": 0.7396762347298209, "train_speed(iter/s)": 0.303928 }, { "epoch": 0.21643434792340455, "grad_norm": 0.8375464081764221, "learning_rate": 9.907629984769897e-05, "loss": 0.9985902786254883, "memory(GiB)": 89.65, "step": 16680, "token_acc": 0.7564362561440896, "train_speed(iter/s)": 0.303801 }, { "epoch": 0.21649922632506025, "grad_norm": 0.8929132223129272, "learning_rate": 9.907527332491298e-05, "loss": 0.9925485610961914, "memory(GiB)": 89.65, "step": 16685, "token_acc": 0.7375247307100462, "train_speed(iter/s)": 0.303673 }, { "epoch": 0.21656410472671594, "grad_norm": 0.8134740591049194, "learning_rate": 9.90742462373719e-05, "loss": 0.9317522048950195, "memory(GiB)": 89.65, "step": 16690, "token_acc": 0.7506687174000267, "train_speed(iter/s)": 0.303524 }, { "epoch": 0.21662898312837164, "grad_norm": 0.826597273349762, "learning_rate": 9.907321858508756e-05, "loss": 1.0438605308532716, "memory(GiB)": 89.65, "step": 16695, "token_acc": 0.7075319611592247, "train_speed(iter/s)": 0.303395 }, { "epoch": 0.21669386153002734, "grad_norm": 0.8127396702766418, "learning_rate": 9.907219036807176e-05, "loss": 0.9788223266601562, "memory(GiB)": 89.65, "step": 16700, "token_acc": 0.7507587754024808, "train_speed(iter/s)": 0.303252 }, { "epoch": 0.21675873993168304, "grad_norm": 0.7667953372001648, "learning_rate": 9.907116158633635e-05, "loss": 0.9724338531494141, "memory(GiB)": 89.65, "step": 16705, "token_acc": 0.7682675593896865, "train_speed(iter/s)": 0.303112 }, { "epoch": 0.21682361833333874, "grad_norm": 0.8241933584213257, "learning_rate": 9.907013223989319e-05, "loss": 0.9733107566833497, "memory(GiB)": 89.65, "step": 16710, "token_acc": 0.7580760661998727, "train_speed(iter/s)": 0.302975 }, { "epoch": 0.21688849673499444, "grad_norm": 0.7737424373626709, "learning_rate": 9.906910232875411e-05, "loss": 0.9758397102355957, "memory(GiB)": 89.65, "step": 16715, "token_acc": 0.721302153671192, "train_speed(iter/s)": 0.302842 }, { "epoch": 0.21695337513665014, "grad_norm": 0.940118670463562, "learning_rate": 9.906807185293095e-05, "loss": 1.0250308990478516, "memory(GiB)": 89.65, "step": 16720, "token_acc": 0.7435433988030686, "train_speed(iter/s)": 0.302713 }, { "epoch": 0.21701825353830584, "grad_norm": 0.7968839406967163, "learning_rate": 9.906704081243556e-05, "loss": 0.9718474388122559, "memory(GiB)": 89.65, "step": 16725, "token_acc": 0.7477552197901266, "train_speed(iter/s)": 0.302589 }, { "epoch": 0.21708313193996154, "grad_norm": 0.8768232464790344, "learning_rate": 9.906600920727984e-05, "loss": 0.9748435974121094, "memory(GiB)": 89.65, "step": 16730, "token_acc": 0.7518220611004116, "train_speed(iter/s)": 0.302453 }, { "epoch": 0.21714801034161724, "grad_norm": 0.8631677627563477, "learning_rate": 9.906497703747564e-05, "loss": 0.9833002090454102, "memory(GiB)": 89.65, "step": 16735, "token_acc": 0.7492772186642269, "train_speed(iter/s)": 0.302324 }, { "epoch": 0.2172128887432729, "grad_norm": 0.8335025906562805, "learning_rate": 9.906394430303484e-05, "loss": 0.9753167152404785, "memory(GiB)": 89.65, "step": 16740, "token_acc": 0.748121304596605, "train_speed(iter/s)": 0.302195 }, { "epoch": 0.2172777671449286, "grad_norm": 0.837019681930542, "learning_rate": 9.906291100396933e-05, "loss": 0.9796312332153321, "memory(GiB)": 89.65, "step": 16745, "token_acc": 0.7360971379011275, "train_speed(iter/s)": 0.302056 }, { "epoch": 0.2173426455465843, "grad_norm": 1.008934736251831, "learning_rate": 9.906187714029099e-05, "loss": 1.033253288269043, "memory(GiB)": 89.65, "step": 16750, "token_acc": 0.7105140483959825, "train_speed(iter/s)": 0.301922 }, { "epoch": 0.21740752394824, "grad_norm": 0.8826538324356079, "learning_rate": 9.906084271201172e-05, "loss": 0.9980390548706055, "memory(GiB)": 89.65, "step": 16755, "token_acc": 0.7367742885542834, "train_speed(iter/s)": 0.301786 }, { "epoch": 0.2174724023498957, "grad_norm": 0.898729145526886, "learning_rate": 9.905980771914344e-05, "loss": 0.9982851982116699, "memory(GiB)": 89.65, "step": 16760, "token_acc": 0.7221744161894306, "train_speed(iter/s)": 0.301658 }, { "epoch": 0.2175372807515514, "grad_norm": 0.7528257966041565, "learning_rate": 9.905877216169807e-05, "loss": 1.0067829132080077, "memory(GiB)": 89.65, "step": 16765, "token_acc": 0.7586662163406921, "train_speed(iter/s)": 0.30153 }, { "epoch": 0.2176021591532071, "grad_norm": 0.8128000497817993, "learning_rate": 9.905773603968749e-05, "loss": 0.9551799774169922, "memory(GiB)": 89.65, "step": 16770, "token_acc": 0.7478157402685705, "train_speed(iter/s)": 0.301403 }, { "epoch": 0.2176670375548628, "grad_norm": 0.7979691624641418, "learning_rate": 9.905669935312362e-05, "loss": 0.9903261184692382, "memory(GiB)": 89.65, "step": 16775, "token_acc": 0.72945503823444, "train_speed(iter/s)": 0.301269 }, { "epoch": 0.2177319159565185, "grad_norm": 0.9292251467704773, "learning_rate": 9.905566210201844e-05, "loss": 0.9999879837036133, "memory(GiB)": 89.65, "step": 16780, "token_acc": 0.7446815649956379, "train_speed(iter/s)": 0.301142 }, { "epoch": 0.2177967943581742, "grad_norm": 0.7483833432197571, "learning_rate": 9.905462428638386e-05, "loss": 0.9200697898864746, "memory(GiB)": 89.65, "step": 16785, "token_acc": 0.7489366189756389, "train_speed(iter/s)": 0.301001 }, { "epoch": 0.2178616727598299, "grad_norm": 0.8103446960449219, "learning_rate": 9.90535859062318e-05, "loss": 0.9597026824951171, "memory(GiB)": 89.65, "step": 16790, "token_acc": 0.7394975430825663, "train_speed(iter/s)": 0.300866 }, { "epoch": 0.2179265511614856, "grad_norm": 0.8563724756240845, "learning_rate": 9.905254696157424e-05, "loss": 0.9781513214111328, "memory(GiB)": 89.65, "step": 16795, "token_acc": 0.7421307934709486, "train_speed(iter/s)": 0.300729 }, { "epoch": 0.21799142956314127, "grad_norm": 0.79590904712677, "learning_rate": 9.905150745242314e-05, "loss": 0.9953762054443359, "memory(GiB)": 89.65, "step": 16800, "token_acc": 0.7368706009745534, "train_speed(iter/s)": 0.300599 }, { "epoch": 0.21805630796479697, "grad_norm": 0.8475101590156555, "learning_rate": 9.905046737879042e-05, "loss": 0.9498834609985352, "memory(GiB)": 89.65, "step": 16805, "token_acc": 0.7437670663659028, "train_speed(iter/s)": 0.300461 }, { "epoch": 0.21812118636645267, "grad_norm": 0.8678985834121704, "learning_rate": 9.90494267406881e-05, "loss": 0.9410888671875, "memory(GiB)": 89.65, "step": 16810, "token_acc": 0.7494073350997071, "train_speed(iter/s)": 0.300317 }, { "epoch": 0.21818606476810837, "grad_norm": 0.8524393439292908, "learning_rate": 9.904838553812812e-05, "loss": 0.9688761711120606, "memory(GiB)": 89.65, "step": 16815, "token_acc": 0.7334382788928243, "train_speed(iter/s)": 0.300192 }, { "epoch": 0.21825094316976407, "grad_norm": 0.8397647738456726, "learning_rate": 9.904734377112249e-05, "loss": 0.9946990013122559, "memory(GiB)": 89.65, "step": 16820, "token_acc": 0.7379718479387475, "train_speed(iter/s)": 0.300065 }, { "epoch": 0.21831582157141977, "grad_norm": 0.8626759648323059, "learning_rate": 9.904630143968318e-05, "loss": 0.9886379241943359, "memory(GiB)": 89.65, "step": 16825, "token_acc": 0.7433825746156165, "train_speed(iter/s)": 0.299928 }, { "epoch": 0.21838069997307546, "grad_norm": 0.8298487663269043, "learning_rate": 9.904525854382219e-05, "loss": 0.9372337341308594, "memory(GiB)": 89.65, "step": 16830, "token_acc": 0.7468877951941768, "train_speed(iter/s)": 0.299805 }, { "epoch": 0.21844557837473116, "grad_norm": 0.8661293387413025, "learning_rate": 9.904421508355152e-05, "loss": 0.9632997512817383, "memory(GiB)": 89.65, "step": 16835, "token_acc": 0.7514335775724753, "train_speed(iter/s)": 0.29967 }, { "epoch": 0.21851045677638686, "grad_norm": 0.8331230878829956, "learning_rate": 9.904317105888317e-05, "loss": 0.9852310180664062, "memory(GiB)": 89.65, "step": 16840, "token_acc": 0.7477691850089233, "train_speed(iter/s)": 0.299534 }, { "epoch": 0.21857533517804256, "grad_norm": 0.8975943326950073, "learning_rate": 9.904212646982918e-05, "loss": 0.9020242691040039, "memory(GiB)": 89.65, "step": 16845, "token_acc": 0.7449696774970882, "train_speed(iter/s)": 0.299406 }, { "epoch": 0.21864021357969826, "grad_norm": 0.8267907500267029, "learning_rate": 9.904108131640155e-05, "loss": 0.908936595916748, "memory(GiB)": 89.65, "step": 16850, "token_acc": 0.7815070962666754, "train_speed(iter/s)": 0.299275 }, { "epoch": 0.21870509198135396, "grad_norm": 0.7774602770805359, "learning_rate": 9.90400355986123e-05, "loss": 1.0187579154968263, "memory(GiB)": 89.65, "step": 16855, "token_acc": 0.7264659479358401, "train_speed(iter/s)": 0.299152 }, { "epoch": 0.21876997038300963, "grad_norm": 0.835635781288147, "learning_rate": 9.903898931647348e-05, "loss": 0.9660415649414062, "memory(GiB)": 89.65, "step": 16860, "token_acc": 0.7570418006430868, "train_speed(iter/s)": 0.299028 }, { "epoch": 0.21883484878466533, "grad_norm": 0.9010069370269775, "learning_rate": 9.903794246999714e-05, "loss": 0.9765010833740234, "memory(GiB)": 89.65, "step": 16865, "token_acc": 0.7381443298969073, "train_speed(iter/s)": 0.298895 }, { "epoch": 0.21889972718632103, "grad_norm": 0.9331185817718506, "learning_rate": 9.903689505919532e-05, "loss": 0.9737130165100097, "memory(GiB)": 89.65, "step": 16870, "token_acc": 0.7411437425170787, "train_speed(iter/s)": 0.298778 }, { "epoch": 0.21896460558797673, "grad_norm": 0.8392065167427063, "learning_rate": 9.903584708408005e-05, "loss": 0.9338726997375488, "memory(GiB)": 89.65, "step": 16875, "token_acc": 0.736803287324834, "train_speed(iter/s)": 0.29866 }, { "epoch": 0.21902948398963243, "grad_norm": 0.9476485848426819, "learning_rate": 9.903479854466339e-05, "loss": 1.027733039855957, "memory(GiB)": 89.65, "step": 16880, "token_acc": 0.7221539002108222, "train_speed(iter/s)": 0.298518 }, { "epoch": 0.21909436239128813, "grad_norm": 0.891740083694458, "learning_rate": 9.903374944095747e-05, "loss": 1.0317788124084473, "memory(GiB)": 89.65, "step": 16885, "token_acc": 0.7309523809523809, "train_speed(iter/s)": 0.298396 }, { "epoch": 0.21915924079294383, "grad_norm": 0.8190433979034424, "learning_rate": 9.903269977297429e-05, "loss": 0.9805360794067383, "memory(GiB)": 89.65, "step": 16890, "token_acc": 0.7357094166910118, "train_speed(iter/s)": 0.298275 }, { "epoch": 0.21922411919459953, "grad_norm": 0.8331804275512695, "learning_rate": 9.903164954072596e-05, "loss": 0.9595382690429688, "memory(GiB)": 89.65, "step": 16895, "token_acc": 0.7395754465662769, "train_speed(iter/s)": 0.298138 }, { "epoch": 0.21928899759625523, "grad_norm": 0.8268221020698547, "learning_rate": 9.903059874422456e-05, "loss": 1.0490101814270019, "memory(GiB)": 89.65, "step": 16900, "token_acc": 0.7345480418239189, "train_speed(iter/s)": 0.298002 }, { "epoch": 0.21935387599791092, "grad_norm": 0.9296405911445618, "learning_rate": 9.902954738348219e-05, "loss": 0.9916807174682617, "memory(GiB)": 89.65, "step": 16905, "token_acc": 0.760493580788931, "train_speed(iter/s)": 0.29787 }, { "epoch": 0.21941875439956662, "grad_norm": 0.904980480670929, "learning_rate": 9.902849545851094e-05, "loss": 0.9904025077819825, "memory(GiB)": 89.65, "step": 16910, "token_acc": 0.7382591170095448, "train_speed(iter/s)": 0.297746 }, { "epoch": 0.21948363280122232, "grad_norm": 0.8467010855674744, "learning_rate": 9.902744296932292e-05, "loss": 0.9651617050170899, "memory(GiB)": 89.65, "step": 16915, "token_acc": 0.7630187416331995, "train_speed(iter/s)": 0.297617 }, { "epoch": 0.219548511202878, "grad_norm": 0.8843411803245544, "learning_rate": 9.902638991593026e-05, "loss": 0.9213762283325195, "memory(GiB)": 89.65, "step": 16920, "token_acc": 0.7513609751087068, "train_speed(iter/s)": 0.297484 }, { "epoch": 0.2196133896045337, "grad_norm": 0.829371452331543, "learning_rate": 9.902533629834503e-05, "loss": 0.9869338989257812, "memory(GiB)": 89.65, "step": 16925, "token_acc": 0.7384361973914213, "train_speed(iter/s)": 0.297347 }, { "epoch": 0.2196782680061894, "grad_norm": 0.8045165538787842, "learning_rate": 9.90242821165794e-05, "loss": 0.9679193496704102, "memory(GiB)": 89.65, "step": 16930, "token_acc": 0.7519107964133276, "train_speed(iter/s)": 0.29721 }, { "epoch": 0.2197431464078451, "grad_norm": 0.85552579164505, "learning_rate": 9.90232273706455e-05, "loss": 1.0112863540649415, "memory(GiB)": 89.65, "step": 16935, "token_acc": 0.7239964633068081, "train_speed(iter/s)": 0.297082 }, { "epoch": 0.2198080248095008, "grad_norm": 0.9222652912139893, "learning_rate": 9.902217206055543e-05, "loss": 0.9658004760742187, "memory(GiB)": 89.65, "step": 16940, "token_acc": 0.7570985237390876, "train_speed(iter/s)": 0.296951 }, { "epoch": 0.2198729032111565, "grad_norm": 0.9203222393989563, "learning_rate": 9.902111618632137e-05, "loss": 0.9669721603393555, "memory(GiB)": 89.65, "step": 16945, "token_acc": 0.7689747513389441, "train_speed(iter/s)": 0.296825 }, { "epoch": 0.2199377816128122, "grad_norm": 0.9690904021263123, "learning_rate": 9.902005974795547e-05, "loss": 0.9448247909545898, "memory(GiB)": 89.65, "step": 16950, "token_acc": 0.7545441273085627, "train_speed(iter/s)": 0.296706 }, { "epoch": 0.2200026600144679, "grad_norm": 0.7934426069259644, "learning_rate": 9.901900274546986e-05, "loss": 0.9500224113464355, "memory(GiB)": 89.65, "step": 16955, "token_acc": 0.7331764552080269, "train_speed(iter/s)": 0.296575 }, { "epoch": 0.2200675384161236, "grad_norm": 0.868156909942627, "learning_rate": 9.901794517887673e-05, "loss": 0.9303023338317871, "memory(GiB)": 89.65, "step": 16960, "token_acc": 0.7613411531938948, "train_speed(iter/s)": 0.29645 }, { "epoch": 0.2201324168177793, "grad_norm": 0.8085169196128845, "learning_rate": 9.901688704818823e-05, "loss": 0.9509717941284179, "memory(GiB)": 89.65, "step": 16965, "token_acc": 0.7811408304221445, "train_speed(iter/s)": 0.296309 }, { "epoch": 0.22019729521943499, "grad_norm": 0.8201172947883606, "learning_rate": 9.901582835341657e-05, "loss": 0.9344218254089356, "memory(GiB)": 89.65, "step": 16970, "token_acc": 0.7761958198905877, "train_speed(iter/s)": 0.296173 }, { "epoch": 0.22026217362109068, "grad_norm": 1.0948143005371094, "learning_rate": 9.90147690945739e-05, "loss": 0.9478342056274414, "memory(GiB)": 89.65, "step": 16975, "token_acc": 0.7480815347721822, "train_speed(iter/s)": 0.29605 }, { "epoch": 0.22032705202274636, "grad_norm": 0.7724615931510925, "learning_rate": 9.901370927167242e-05, "loss": 1.0654441833496093, "memory(GiB)": 89.65, "step": 16980, "token_acc": 0.7304310265033477, "train_speed(iter/s)": 0.29593 }, { "epoch": 0.22039193042440205, "grad_norm": 0.9109805822372437, "learning_rate": 9.901264888472432e-05, "loss": 1.0001140594482423, "memory(GiB)": 89.65, "step": 16985, "token_acc": 0.7319837479506736, "train_speed(iter/s)": 0.295806 }, { "epoch": 0.22045680882605775, "grad_norm": 0.8952043056488037, "learning_rate": 9.901158793374183e-05, "loss": 0.9933550834655762, "memory(GiB)": 89.65, "step": 16990, "token_acc": 0.7568039285291681, "train_speed(iter/s)": 0.29567 }, { "epoch": 0.22052168722771345, "grad_norm": 0.8334885239601135, "learning_rate": 9.901052641873714e-05, "loss": 0.9437032699584961, "memory(GiB)": 89.65, "step": 16995, "token_acc": 0.7385817738581774, "train_speed(iter/s)": 0.295549 }, { "epoch": 0.22058656562936915, "grad_norm": 0.8340597152709961, "learning_rate": 9.900946433972245e-05, "loss": 0.98597412109375, "memory(GiB)": 89.65, "step": 17000, "token_acc": 0.7373025704552493, "train_speed(iter/s)": 0.295423 }, { "epoch": 0.22065144403102485, "grad_norm": 0.8676717877388, "learning_rate": 9.900840169671e-05, "loss": 1.0064556121826171, "memory(GiB)": 89.65, "step": 17005, "token_acc": 0.7330187682664883, "train_speed(iter/s)": 0.295301 }, { "epoch": 0.22071632243268055, "grad_norm": 0.8324628472328186, "learning_rate": 9.900733848971203e-05, "loss": 1.037412166595459, "memory(GiB)": 89.65, "step": 17010, "token_acc": 0.7146993697836825, "train_speed(iter/s)": 0.295185 }, { "epoch": 0.22078120083433625, "grad_norm": 0.8385588526725769, "learning_rate": 9.900627471874075e-05, "loss": 0.9244489669799805, "memory(GiB)": 89.65, "step": 17015, "token_acc": 0.78, "train_speed(iter/s)": 0.295059 }, { "epoch": 0.22084607923599195, "grad_norm": 0.7574717402458191, "learning_rate": 9.900521038380843e-05, "loss": 0.9328505516052246, "memory(GiB)": 89.65, "step": 17020, "token_acc": 0.7453678342409693, "train_speed(iter/s)": 0.294929 }, { "epoch": 0.22091095763764765, "grad_norm": 0.7803806066513062, "learning_rate": 9.900414548492729e-05, "loss": 1.0235593795776368, "memory(GiB)": 89.65, "step": 17025, "token_acc": 0.7426611414841383, "train_speed(iter/s)": 0.294812 }, { "epoch": 0.22097583603930335, "grad_norm": 0.8413056135177612, "learning_rate": 9.900308002210961e-05, "loss": 0.9329937934875489, "memory(GiB)": 89.65, "step": 17030, "token_acc": 0.7385191570311803, "train_speed(iter/s)": 0.294681 }, { "epoch": 0.22104071444095905, "grad_norm": 0.8425288796424866, "learning_rate": 9.900201399536762e-05, "loss": 0.9859163284301757, "memory(GiB)": 89.65, "step": 17035, "token_acc": 0.7365679264555669, "train_speed(iter/s)": 0.294552 }, { "epoch": 0.22110559284261472, "grad_norm": 0.8102934956550598, "learning_rate": 9.900094740471362e-05, "loss": 0.9827215194702148, "memory(GiB)": 89.65, "step": 17040, "token_acc": 0.7438770086273868, "train_speed(iter/s)": 0.294434 }, { "epoch": 0.22117047124427042, "grad_norm": 0.7768281698226929, "learning_rate": 9.899988025015988e-05, "loss": 0.9918432235717773, "memory(GiB)": 89.65, "step": 17045, "token_acc": 0.7405305473150179, "train_speed(iter/s)": 0.294319 }, { "epoch": 0.22123534964592612, "grad_norm": 0.8119745254516602, "learning_rate": 9.899881253171866e-05, "loss": 0.9643823623657226, "memory(GiB)": 89.65, "step": 17050, "token_acc": 0.7551543816407421, "train_speed(iter/s)": 0.294191 }, { "epoch": 0.22130022804758182, "grad_norm": 0.7999879121780396, "learning_rate": 9.899774424940225e-05, "loss": 0.9787260055541992, "memory(GiB)": 89.65, "step": 17055, "token_acc": 0.7175688431074815, "train_speed(iter/s)": 0.294062 }, { "epoch": 0.22136510644923751, "grad_norm": 0.8250886797904968, "learning_rate": 9.899667540322296e-05, "loss": 0.9604772567749024, "memory(GiB)": 89.65, "step": 17060, "token_acc": 0.7451038575667656, "train_speed(iter/s)": 0.293935 }, { "epoch": 0.2214299848508932, "grad_norm": 0.783955454826355, "learning_rate": 9.899560599319308e-05, "loss": 0.9521082878112793, "memory(GiB)": 89.65, "step": 17065, "token_acc": 0.7506455944633819, "train_speed(iter/s)": 0.293812 }, { "epoch": 0.2214948632525489, "grad_norm": 0.9027988314628601, "learning_rate": 9.899453601932494e-05, "loss": 0.9840729713439942, "memory(GiB)": 89.65, "step": 17070, "token_acc": 0.7451792095672788, "train_speed(iter/s)": 0.293687 }, { "epoch": 0.2215597416542046, "grad_norm": 0.787187933921814, "learning_rate": 9.899346548163081e-05, "loss": 0.988679027557373, "memory(GiB)": 89.65, "step": 17075, "token_acc": 0.7044009976667471, "train_speed(iter/s)": 0.293573 }, { "epoch": 0.2216246200558603, "grad_norm": 0.8186782002449036, "learning_rate": 9.899239438012304e-05, "loss": 0.9489560127258301, "memory(GiB)": 89.65, "step": 17080, "token_acc": 0.748536490178223, "train_speed(iter/s)": 0.293441 }, { "epoch": 0.221689498457516, "grad_norm": 0.7690673470497131, "learning_rate": 9.899132271481395e-05, "loss": 0.9589876174926758, "memory(GiB)": 89.65, "step": 17085, "token_acc": 0.7552832476000131, "train_speed(iter/s)": 0.293314 }, { "epoch": 0.2217543768591717, "grad_norm": 0.8175057768821716, "learning_rate": 9.899025048571587e-05, "loss": 0.9568123817443848, "memory(GiB)": 89.65, "step": 17090, "token_acc": 0.7252678745373076, "train_speed(iter/s)": 0.2932 }, { "epoch": 0.22181925526082738, "grad_norm": 0.792021632194519, "learning_rate": 9.898917769284112e-05, "loss": 0.9664779663085937, "memory(GiB)": 89.65, "step": 17095, "token_acc": 0.7364281370362863, "train_speed(iter/s)": 0.293078 }, { "epoch": 0.22188413366248308, "grad_norm": 0.8107345700263977, "learning_rate": 9.89881043362021e-05, "loss": 1.0155464172363282, "memory(GiB)": 89.65, "step": 17100, "token_acc": 0.7425084001920044, "train_speed(iter/s)": 0.292955 }, { "epoch": 0.22194901206413878, "grad_norm": 0.826039731502533, "learning_rate": 9.898703041581112e-05, "loss": 0.993625259399414, "memory(GiB)": 89.65, "step": 17105, "token_acc": 0.7665210553905294, "train_speed(iter/s)": 0.292839 }, { "epoch": 0.22201389046579448, "grad_norm": 0.9490756392478943, "learning_rate": 9.898595593168054e-05, "loss": 0.9846755981445312, "memory(GiB)": 89.65, "step": 17110, "token_acc": 0.7657785002473673, "train_speed(iter/s)": 0.29272 }, { "epoch": 0.22207876886745018, "grad_norm": 0.8603165149688721, "learning_rate": 9.898488088382273e-05, "loss": 0.9723499298095704, "memory(GiB)": 89.65, "step": 17115, "token_acc": 0.7267143933685004, "train_speed(iter/s)": 0.292595 }, { "epoch": 0.22214364726910588, "grad_norm": 0.866544783115387, "learning_rate": 9.898380527225006e-05, "loss": 0.9896469116210938, "memory(GiB)": 89.65, "step": 17120, "token_acc": 0.7304093326188219, "train_speed(iter/s)": 0.292486 }, { "epoch": 0.22220852567076158, "grad_norm": 0.9135472774505615, "learning_rate": 9.898272909697493e-05, "loss": 0.9794010162353516, "memory(GiB)": 89.65, "step": 17125, "token_acc": 0.7427586465320193, "train_speed(iter/s)": 0.292362 }, { "epoch": 0.22227340407241727, "grad_norm": 0.8601201176643372, "learning_rate": 9.898165235800969e-05, "loss": 0.9652246475219727, "memory(GiB)": 89.65, "step": 17130, "token_acc": 0.7266509698396681, "train_speed(iter/s)": 0.292242 }, { "epoch": 0.22233828247407297, "grad_norm": 0.9344911575317383, "learning_rate": 9.898057505536675e-05, "loss": 1.04111967086792, "memory(GiB)": 89.65, "step": 17135, "token_acc": 0.7359055735834255, "train_speed(iter/s)": 0.292117 }, { "epoch": 0.22240316087572867, "grad_norm": 0.8159346580505371, "learning_rate": 9.897949718905851e-05, "loss": 0.9774627685546875, "memory(GiB)": 89.65, "step": 17140, "token_acc": 0.7478215223097113, "train_speed(iter/s)": 0.292001 }, { "epoch": 0.22246803927738437, "grad_norm": 0.7615069150924683, "learning_rate": 9.897841875909734e-05, "loss": 0.9979592323303222, "memory(GiB)": 89.65, "step": 17145, "token_acc": 0.7353246416293702, "train_speed(iter/s)": 0.291884 }, { "epoch": 0.22253291767904007, "grad_norm": 0.8066954612731934, "learning_rate": 9.897733976549571e-05, "loss": 1.0021343231201172, "memory(GiB)": 89.65, "step": 17150, "token_acc": 0.7324649298597194, "train_speed(iter/s)": 0.291772 }, { "epoch": 0.22259779608069574, "grad_norm": 0.9310891032218933, "learning_rate": 9.897626020826601e-05, "loss": 0.9719936370849609, "memory(GiB)": 89.65, "step": 17155, "token_acc": 0.7330143227032874, "train_speed(iter/s)": 0.291646 }, { "epoch": 0.22266267448235144, "grad_norm": 0.9107502698898315, "learning_rate": 9.897518008742064e-05, "loss": 0.9970306396484375, "memory(GiB)": 89.65, "step": 17160, "token_acc": 0.7184701295964598, "train_speed(iter/s)": 0.291516 }, { "epoch": 0.22272755288400714, "grad_norm": 0.9870148301124573, "learning_rate": 9.897409940297206e-05, "loss": 0.9914274215698242, "memory(GiB)": 89.65, "step": 17165, "token_acc": 0.74947501882008, "train_speed(iter/s)": 0.291397 }, { "epoch": 0.22279243128566284, "grad_norm": 0.8063662052154541, "learning_rate": 9.897301815493269e-05, "loss": 0.9814144134521484, "memory(GiB)": 89.65, "step": 17170, "token_acc": 0.7560063860441572, "train_speed(iter/s)": 0.291283 }, { "epoch": 0.22285730968731854, "grad_norm": 0.8445936441421509, "learning_rate": 9.897193634331499e-05, "loss": 1.0130651473999024, "memory(GiB)": 89.65, "step": 17175, "token_acc": 0.7373621801539422, "train_speed(iter/s)": 0.291169 }, { "epoch": 0.22292218808897424, "grad_norm": 0.9336414933204651, "learning_rate": 9.89708539681314e-05, "loss": 1.001682662963867, "memory(GiB)": 89.65, "step": 17180, "token_acc": 0.759468364740291, "train_speed(iter/s)": 0.291052 }, { "epoch": 0.22298706649062994, "grad_norm": 0.9420745372772217, "learning_rate": 9.896977102939435e-05, "loss": 1.0214558601379395, "memory(GiB)": 89.65, "step": 17185, "token_acc": 0.7487549643825254, "train_speed(iter/s)": 0.290929 }, { "epoch": 0.22305194489228564, "grad_norm": 0.8523697853088379, "learning_rate": 9.896868752711634e-05, "loss": 0.9651752471923828, "memory(GiB)": 89.65, "step": 17190, "token_acc": 0.7493705488813754, "train_speed(iter/s)": 0.290823 }, { "epoch": 0.22311682329394134, "grad_norm": 0.9484432935714722, "learning_rate": 9.896760346130984e-05, "loss": 0.9776748657226563, "memory(GiB)": 89.65, "step": 17195, "token_acc": 0.7515087824650953, "train_speed(iter/s)": 0.290701 }, { "epoch": 0.22318170169559703, "grad_norm": 0.757086992263794, "learning_rate": 9.89665188319873e-05, "loss": 0.9496509552001953, "memory(GiB)": 89.65, "step": 17200, "token_acc": 0.7435805479845072, "train_speed(iter/s)": 0.290577 }, { "epoch": 0.22324658009725273, "grad_norm": 0.8167540431022644, "learning_rate": 9.896543363916121e-05, "loss": 0.9387133598327637, "memory(GiB)": 89.65, "step": 17205, "token_acc": 0.7606762978250516, "train_speed(iter/s)": 0.290463 }, { "epoch": 0.22331145849890843, "grad_norm": 0.7921172380447388, "learning_rate": 9.896434788284406e-05, "loss": 0.9822622299194336, "memory(GiB)": 89.65, "step": 17210, "token_acc": 0.7475241617945353, "train_speed(iter/s)": 0.29034 }, { "epoch": 0.2233763369005641, "grad_norm": 0.9981180429458618, "learning_rate": 9.896326156304835e-05, "loss": 1.003938865661621, "memory(GiB)": 89.65, "step": 17215, "token_acc": 0.7522291350096046, "train_speed(iter/s)": 0.290211 }, { "epoch": 0.2234412153022198, "grad_norm": 0.8950685858726501, "learning_rate": 9.896217467978656e-05, "loss": 0.9817968368530273, "memory(GiB)": 89.65, "step": 17220, "token_acc": 0.745298728727509, "train_speed(iter/s)": 0.290093 }, { "epoch": 0.2235060937038755, "grad_norm": 0.8179036378860474, "learning_rate": 9.896108723307125e-05, "loss": 0.9523914337158204, "memory(GiB)": 89.65, "step": 17225, "token_acc": 0.7330947564960324, "train_speed(iter/s)": 0.289971 }, { "epoch": 0.2235709721055312, "grad_norm": 0.8293091058731079, "learning_rate": 9.895999922291487e-05, "loss": 0.9768095016479492, "memory(GiB)": 89.65, "step": 17230, "token_acc": 0.7345526268603192, "train_speed(iter/s)": 0.289849 }, { "epoch": 0.2236358505071869, "grad_norm": 0.8283094763755798, "learning_rate": 9.895891064932999e-05, "loss": 1.017164134979248, "memory(GiB)": 89.65, "step": 17235, "token_acc": 0.728970783113523, "train_speed(iter/s)": 0.289727 }, { "epoch": 0.2237007289088426, "grad_norm": 0.8309333324432373, "learning_rate": 9.89578215123291e-05, "loss": 0.912103271484375, "memory(GiB)": 89.65, "step": 17240, "token_acc": 0.7526374236535258, "train_speed(iter/s)": 0.289617 }, { "epoch": 0.2237656073104983, "grad_norm": 0.7821773886680603, "learning_rate": 9.895673181192476e-05, "loss": 0.9549776077270508, "memory(GiB)": 89.65, "step": 17245, "token_acc": 0.7660453666256374, "train_speed(iter/s)": 0.289508 }, { "epoch": 0.223830485712154, "grad_norm": 0.8911106586456299, "learning_rate": 9.895564154812951e-05, "loss": 1.034892463684082, "memory(GiB)": 89.65, "step": 17250, "token_acc": 0.7427171341669483, "train_speed(iter/s)": 0.289397 }, { "epoch": 0.2238953641138097, "grad_norm": 0.8384466171264648, "learning_rate": 9.895455072095587e-05, "loss": 1.0011549949645997, "memory(GiB)": 89.65, "step": 17255, "token_acc": 0.739185135004355, "train_speed(iter/s)": 0.289283 }, { "epoch": 0.2239602425154654, "grad_norm": 0.9069672226905823, "learning_rate": 9.895345933041642e-05, "loss": 1.008791732788086, "memory(GiB)": 89.65, "step": 17260, "token_acc": 0.7146830798366574, "train_speed(iter/s)": 0.289162 }, { "epoch": 0.2240251209171211, "grad_norm": 0.943501889705658, "learning_rate": 9.895236737652372e-05, "loss": 0.9983652114868165, "memory(GiB)": 89.65, "step": 17265, "token_acc": 0.7349889135254989, "train_speed(iter/s)": 0.289055 }, { "epoch": 0.2240899993187768, "grad_norm": 0.857123076915741, "learning_rate": 9.89512748592903e-05, "loss": 0.995146656036377, "memory(GiB)": 89.65, "step": 17270, "token_acc": 0.7438467611822457, "train_speed(iter/s)": 0.288946 }, { "epoch": 0.22415487772043247, "grad_norm": 0.8381252288818359, "learning_rate": 9.895018177872877e-05, "loss": 0.9954146385192871, "memory(GiB)": 89.65, "step": 17275, "token_acc": 0.7441140636833613, "train_speed(iter/s)": 0.288832 }, { "epoch": 0.22421975612208817, "grad_norm": 0.8705167174339294, "learning_rate": 9.89490881348517e-05, "loss": 1.0061253547668456, "memory(GiB)": 89.65, "step": 17280, "token_acc": 0.7167417968340497, "train_speed(iter/s)": 0.288716 }, { "epoch": 0.22428463452374386, "grad_norm": 0.8756353855133057, "learning_rate": 9.894799392767169e-05, "loss": 1.0123425483703614, "memory(GiB)": 89.65, "step": 17285, "token_acc": 0.7376084066594923, "train_speed(iter/s)": 0.288592 }, { "epoch": 0.22434951292539956, "grad_norm": 0.957970917224884, "learning_rate": 9.89468991572013e-05, "loss": 1.0191009521484375, "memory(GiB)": 89.65, "step": 17290, "token_acc": 0.7404923271412017, "train_speed(iter/s)": 0.288472 }, { "epoch": 0.22441439132705526, "grad_norm": 0.9061269164085388, "learning_rate": 9.894580382345315e-05, "loss": 0.9615097045898438, "memory(GiB)": 89.65, "step": 17295, "token_acc": 0.7445973229641816, "train_speed(iter/s)": 0.288359 }, { "epoch": 0.22447926972871096, "grad_norm": 0.759789764881134, "learning_rate": 9.894470792643983e-05, "loss": 1.0099374771118164, "memory(GiB)": 89.65, "step": 17300, "token_acc": 0.7411363716223339, "train_speed(iter/s)": 0.288246 }, { "epoch": 0.22454414813036666, "grad_norm": 0.8149442672729492, "learning_rate": 9.894361146617396e-05, "loss": 0.9592534065246582, "memory(GiB)": 89.65, "step": 17305, "token_acc": 0.731214177713703, "train_speed(iter/s)": 0.288136 }, { "epoch": 0.22460902653202236, "grad_norm": 0.8524155616760254, "learning_rate": 9.894251444266818e-05, "loss": 0.9737499237060547, "memory(GiB)": 89.65, "step": 17310, "token_acc": 0.7343542660402396, "train_speed(iter/s)": 0.288027 }, { "epoch": 0.22467390493367806, "grad_norm": 0.7885525822639465, "learning_rate": 9.894141685593509e-05, "loss": 0.9982420921325683, "memory(GiB)": 89.65, "step": 17315, "token_acc": 0.7263316196271815, "train_speed(iter/s)": 0.287913 }, { "epoch": 0.22473878333533376, "grad_norm": 0.856496274471283, "learning_rate": 9.894031870598731e-05, "loss": 1.0220525741577149, "memory(GiB)": 89.65, "step": 17320, "token_acc": 0.7398246712586098, "train_speed(iter/s)": 0.287802 }, { "epoch": 0.22480366173698946, "grad_norm": 0.8164582848548889, "learning_rate": 9.89392199928375e-05, "loss": 0.9523216247558594, "memory(GiB)": 89.65, "step": 17325, "token_acc": 0.7463027263374485, "train_speed(iter/s)": 0.287677 }, { "epoch": 0.22486854013864516, "grad_norm": 0.8693197965621948, "learning_rate": 9.89381207164983e-05, "loss": 0.9807113647460938, "memory(GiB)": 89.65, "step": 17330, "token_acc": 0.7316240642333501, "train_speed(iter/s)": 0.287563 }, { "epoch": 0.22493341854030083, "grad_norm": 0.7717934846878052, "learning_rate": 9.893702087698236e-05, "loss": 0.9777944564819336, "memory(GiB)": 89.65, "step": 17335, "token_acc": 0.7310504301696875, "train_speed(iter/s)": 0.287449 }, { "epoch": 0.22499829694195653, "grad_norm": 0.8614188432693481, "learning_rate": 9.893592047430232e-05, "loss": 1.0015474319458009, "memory(GiB)": 89.65, "step": 17340, "token_acc": 0.7271325796505652, "train_speed(iter/s)": 0.287338 }, { "epoch": 0.22506317534361223, "grad_norm": 0.9063681960105896, "learning_rate": 9.893481950847086e-05, "loss": 1.0055204391479493, "memory(GiB)": 89.65, "step": 17345, "token_acc": 0.7462812339708361, "train_speed(iter/s)": 0.28722 }, { "epoch": 0.22512805374526793, "grad_norm": 0.8347246050834656, "learning_rate": 9.893371797950064e-05, "loss": 1.0002464294433593, "memory(GiB)": 89.65, "step": 17350, "token_acc": 0.755075036307891, "train_speed(iter/s)": 0.287115 }, { "epoch": 0.22519293214692362, "grad_norm": 0.7969226837158203, "learning_rate": 9.893261588740436e-05, "loss": 1.0293831825256348, "memory(GiB)": 89.65, "step": 17355, "token_acc": 0.72504115710254, "train_speed(iter/s)": 0.286998 }, { "epoch": 0.22525781054857932, "grad_norm": 0.9335231184959412, "learning_rate": 9.893151323219468e-05, "loss": 0.9489224433898926, "memory(GiB)": 89.65, "step": 17360, "token_acc": 0.7544862123206317, "train_speed(iter/s)": 0.286878 }, { "epoch": 0.22532268895023502, "grad_norm": 0.8473792672157288, "learning_rate": 9.893041001388428e-05, "loss": 0.974092674255371, "memory(GiB)": 89.65, "step": 17365, "token_acc": 0.718815755472701, "train_speed(iter/s)": 0.28677 }, { "epoch": 0.22538756735189072, "grad_norm": 0.9054101705551147, "learning_rate": 9.892930623248589e-05, "loss": 0.9927421569824219, "memory(GiB)": 89.65, "step": 17370, "token_acc": 0.7314563585750027, "train_speed(iter/s)": 0.286662 }, { "epoch": 0.22545244575354642, "grad_norm": 0.8899745345115662, "learning_rate": 9.892820188801218e-05, "loss": 1.0196393966674804, "memory(GiB)": 89.65, "step": 17375, "token_acc": 0.7354567489774276, "train_speed(iter/s)": 0.28655 }, { "epoch": 0.22551732415520212, "grad_norm": 0.8355426788330078, "learning_rate": 9.892709698047588e-05, "loss": 0.9684829711914062, "memory(GiB)": 89.65, "step": 17380, "token_acc": 0.7736336941753955, "train_speed(iter/s)": 0.286443 }, { "epoch": 0.22558220255685782, "grad_norm": 0.9029908776283264, "learning_rate": 9.892599150988969e-05, "loss": 0.9773782730102539, "memory(GiB)": 89.65, "step": 17385, "token_acc": 0.7232319391634981, "train_speed(iter/s)": 0.286324 }, { "epoch": 0.22564708095851352, "grad_norm": 0.7821699380874634, "learning_rate": 9.892488547626634e-05, "loss": 1.0218776702880858, "memory(GiB)": 89.65, "step": 17390, "token_acc": 0.7363225980326101, "train_speed(iter/s)": 0.286216 }, { "epoch": 0.2257119593601692, "grad_norm": 0.8709577322006226, "learning_rate": 9.892377887961857e-05, "loss": 0.9764727592468262, "memory(GiB)": 89.65, "step": 17395, "token_acc": 0.7752444044250064, "train_speed(iter/s)": 0.286098 }, { "epoch": 0.2257768377618249, "grad_norm": 0.7508866190910339, "learning_rate": 9.892267171995909e-05, "loss": 1.0048341751098633, "memory(GiB)": 89.65, "step": 17400, "token_acc": 0.7390102098695406, "train_speed(iter/s)": 0.285989 }, { "epoch": 0.2258417161634806, "grad_norm": 0.7835751175880432, "learning_rate": 9.892156399730064e-05, "loss": 0.9363296508789063, "memory(GiB)": 89.65, "step": 17405, "token_acc": 0.7583991250555384, "train_speed(iter/s)": 0.285885 }, { "epoch": 0.2259065945651363, "grad_norm": 0.7960089445114136, "learning_rate": 9.8920455711656e-05, "loss": 0.9536567687988281, "memory(GiB)": 89.65, "step": 17410, "token_acc": 0.7407572851249442, "train_speed(iter/s)": 0.285772 }, { "epoch": 0.225971472966792, "grad_norm": 0.7833794355392456, "learning_rate": 9.89193468630379e-05, "loss": 0.9720420837402344, "memory(GiB)": 89.65, "step": 17415, "token_acc": 0.7433306255077173, "train_speed(iter/s)": 0.285664 }, { "epoch": 0.22603635136844769, "grad_norm": 0.8269431591033936, "learning_rate": 9.89182374514591e-05, "loss": 0.9753644943237305, "memory(GiB)": 89.65, "step": 17420, "token_acc": 0.753219536927497, "train_speed(iter/s)": 0.285551 }, { "epoch": 0.22610122977010338, "grad_norm": 0.7755223512649536, "learning_rate": 9.891712747693237e-05, "loss": 0.9836019515991211, "memory(GiB)": 89.65, "step": 17425, "token_acc": 0.7348848707444554, "train_speed(iter/s)": 0.285446 }, { "epoch": 0.22616610817175908, "grad_norm": 0.8650115728378296, "learning_rate": 9.89160169394705e-05, "loss": 0.9620616912841797, "memory(GiB)": 89.65, "step": 17430, "token_acc": 0.7576594584771943, "train_speed(iter/s)": 0.285338 }, { "epoch": 0.22623098657341478, "grad_norm": 0.8353161215782166, "learning_rate": 9.891490583908623e-05, "loss": 0.930241584777832, "memory(GiB)": 89.65, "step": 17435, "token_acc": 0.7429848222553243, "train_speed(iter/s)": 0.285217 }, { "epoch": 0.22629586497507048, "grad_norm": 0.8359803557395935, "learning_rate": 9.891379417579238e-05, "loss": 0.9907550811767578, "memory(GiB)": 89.65, "step": 17440, "token_acc": 0.7541986126323476, "train_speed(iter/s)": 0.285115 }, { "epoch": 0.22636074337672618, "grad_norm": 0.662946343421936, "learning_rate": 9.891268194960175e-05, "loss": 0.9396881103515625, "memory(GiB)": 89.65, "step": 17445, "token_acc": 0.7493414167659053, "train_speed(iter/s)": 0.285005 }, { "epoch": 0.22642562177838188, "grad_norm": 0.882146954536438, "learning_rate": 9.89115691605271e-05, "loss": 1.0283717155456542, "memory(GiB)": 89.65, "step": 17450, "token_acc": 0.7335247250088707, "train_speed(iter/s)": 0.284893 }, { "epoch": 0.22649050018003755, "grad_norm": 0.8722025752067566, "learning_rate": 9.89104558085813e-05, "loss": 0.9630584716796875, "memory(GiB)": 89.65, "step": 17455, "token_acc": 0.7243154772053056, "train_speed(iter/s)": 0.284783 }, { "epoch": 0.22655537858169325, "grad_norm": 0.86752849817276, "learning_rate": 9.890934189377708e-05, "loss": 0.9560373306274415, "memory(GiB)": 89.65, "step": 17460, "token_acc": 0.7488093309573951, "train_speed(iter/s)": 0.284675 }, { "epoch": 0.22662025698334895, "grad_norm": 0.8555724620819092, "learning_rate": 9.890822741612733e-05, "loss": 0.9505088806152344, "memory(GiB)": 89.65, "step": 17465, "token_acc": 0.7663627906976744, "train_speed(iter/s)": 0.284556 }, { "epoch": 0.22668513538500465, "grad_norm": 0.8869437575340271, "learning_rate": 9.890711237564483e-05, "loss": 1.011770534515381, "memory(GiB)": 89.65, "step": 17470, "token_acc": 0.7236102564102564, "train_speed(iter/s)": 0.284456 }, { "epoch": 0.22675001378666035, "grad_norm": 0.8538012504577637, "learning_rate": 9.890599677234243e-05, "loss": 0.9471236228942871, "memory(GiB)": 89.65, "step": 17475, "token_acc": 0.7451923076923077, "train_speed(iter/s)": 0.284354 }, { "epoch": 0.22681489218831605, "grad_norm": 0.8064212203025818, "learning_rate": 9.890488060623298e-05, "loss": 0.9623922348022461, "memory(GiB)": 89.65, "step": 17480, "token_acc": 0.7162697125056785, "train_speed(iter/s)": 0.284246 }, { "epoch": 0.22687977058997175, "grad_norm": 0.8148370385169983, "learning_rate": 9.89037638773293e-05, "loss": 0.9886935234069825, "memory(GiB)": 89.65, "step": 17485, "token_acc": 0.7292729707234105, "train_speed(iter/s)": 0.284141 }, { "epoch": 0.22694464899162745, "grad_norm": 0.8040368556976318, "learning_rate": 9.890264658564425e-05, "loss": 0.998694896697998, "memory(GiB)": 89.65, "step": 17490, "token_acc": 0.7296880704173935, "train_speed(iter/s)": 0.284034 }, { "epoch": 0.22700952739328314, "grad_norm": 0.8916625380516052, "learning_rate": 9.89015287311907e-05, "loss": 1.0070348739624024, "memory(GiB)": 89.65, "step": 17495, "token_acc": 0.7267371211842251, "train_speed(iter/s)": 0.283927 }, { "epoch": 0.22707440579493884, "grad_norm": 0.8605049848556519, "learning_rate": 9.89004103139815e-05, "loss": 0.9759889602661133, "memory(GiB)": 89.65, "step": 17500, "token_acc": 0.7434667868177137, "train_speed(iter/s)": 0.28382 }, { "epoch": 0.22713928419659454, "grad_norm": 0.7791354656219482, "learning_rate": 9.889929133402951e-05, "loss": 0.9691853523254395, "memory(GiB)": 89.65, "step": 17505, "token_acc": 0.7446179129005752, "train_speed(iter/s)": 0.283716 }, { "epoch": 0.22720416259825024, "grad_norm": 0.7812994718551636, "learning_rate": 9.889817179134764e-05, "loss": 0.9687994003295899, "memory(GiB)": 89.65, "step": 17510, "token_acc": 0.7784266233119878, "train_speed(iter/s)": 0.283615 }, { "epoch": 0.2272690409999059, "grad_norm": 0.8488151431083679, "learning_rate": 9.889705168594876e-05, "loss": 1.0202356338500977, "memory(GiB)": 89.65, "step": 17515, "token_acc": 0.7390995597216304, "train_speed(iter/s)": 0.283506 }, { "epoch": 0.2273339194015616, "grad_norm": 0.8630655407905579, "learning_rate": 9.889593101784574e-05, "loss": 1.0065043449401856, "memory(GiB)": 89.65, "step": 17520, "token_acc": 0.7422575731934338, "train_speed(iter/s)": 0.283392 }, { "epoch": 0.2273987978032173, "grad_norm": 0.8152391910552979, "learning_rate": 9.889480978705151e-05, "loss": 0.9823558807373047, "memory(GiB)": 89.65, "step": 17525, "token_acc": 0.7308579668348955, "train_speed(iter/s)": 0.283275 }, { "epoch": 0.227463676204873, "grad_norm": 0.7232562303543091, "learning_rate": 9.889368799357894e-05, "loss": 0.9853357315063477, "memory(GiB)": 89.65, "step": 17530, "token_acc": 0.7431110498759989, "train_speed(iter/s)": 0.283172 }, { "epoch": 0.2275285546065287, "grad_norm": 0.794258713722229, "learning_rate": 9.889256563744097e-05, "loss": 0.9616291046142578, "memory(GiB)": 89.65, "step": 17535, "token_acc": 0.7454792922418821, "train_speed(iter/s)": 0.283059 }, { "epoch": 0.2275934330081844, "grad_norm": 0.8729557991027832, "learning_rate": 9.889144271865049e-05, "loss": 0.9879283905029297, "memory(GiB)": 89.65, "step": 17540, "token_acc": 0.7297553017944535, "train_speed(iter/s)": 0.28295 }, { "epoch": 0.2276583114098401, "grad_norm": 0.7992429733276367, "learning_rate": 9.889031923722044e-05, "loss": 0.9644233703613281, "memory(GiB)": 89.65, "step": 17545, "token_acc": 0.7346205657835654, "train_speed(iter/s)": 0.282841 }, { "epoch": 0.2277231898114958, "grad_norm": 0.7260344624519348, "learning_rate": 9.888919519316374e-05, "loss": 1.030397319793701, "memory(GiB)": 89.65, "step": 17550, "token_acc": 0.7135664898972948, "train_speed(iter/s)": 0.282736 }, { "epoch": 0.2277880682131515, "grad_norm": 0.8712726831436157, "learning_rate": 9.888807058649333e-05, "loss": 0.9926657676696777, "memory(GiB)": 89.65, "step": 17555, "token_acc": 0.7501980269316627, "train_speed(iter/s)": 0.282629 }, { "epoch": 0.2278529466148072, "grad_norm": 0.8217464685440063, "learning_rate": 9.888694541722216e-05, "loss": 0.9660636901855468, "memory(GiB)": 89.65, "step": 17560, "token_acc": 0.7398011928429423, "train_speed(iter/s)": 0.282529 }, { "epoch": 0.2279178250164629, "grad_norm": 0.79106205701828, "learning_rate": 9.888581968536315e-05, "loss": 0.9584945678710938, "memory(GiB)": 89.65, "step": 17565, "token_acc": 0.7549489873610477, "train_speed(iter/s)": 0.282427 }, { "epoch": 0.2279827034181186, "grad_norm": 0.7707583904266357, "learning_rate": 9.888469339092928e-05, "loss": 0.9429773330688477, "memory(GiB)": 89.65, "step": 17570, "token_acc": 0.7646782379848361, "train_speed(iter/s)": 0.282314 }, { "epoch": 0.22804758181977428, "grad_norm": 0.7486750483512878, "learning_rate": 9.888356653393352e-05, "loss": 0.9591537475585937, "memory(GiB)": 89.65, "step": 17575, "token_acc": 0.7470448307410796, "train_speed(iter/s)": 0.282211 }, { "epoch": 0.22811246022142997, "grad_norm": 0.8629991412162781, "learning_rate": 9.888243911438879e-05, "loss": 0.988223934173584, "memory(GiB)": 89.65, "step": 17580, "token_acc": 0.7574322773186409, "train_speed(iter/s)": 0.282097 }, { "epoch": 0.22817733862308567, "grad_norm": 0.8600365519523621, "learning_rate": 9.888131113230811e-05, "loss": 0.9938793182373047, "memory(GiB)": 89.65, "step": 17585, "token_acc": 0.730129786549394, "train_speed(iter/s)": 0.28199 }, { "epoch": 0.22824221702474137, "grad_norm": 0.8309784531593323, "learning_rate": 9.888018258770447e-05, "loss": 0.9446208953857422, "memory(GiB)": 89.65, "step": 17590, "token_acc": 0.7504470762897729, "train_speed(iter/s)": 0.281887 }, { "epoch": 0.22830709542639707, "grad_norm": 0.7863656282424927, "learning_rate": 9.88790534805908e-05, "loss": 0.9959630966186523, "memory(GiB)": 89.65, "step": 17595, "token_acc": 0.7451413427561837, "train_speed(iter/s)": 0.281785 }, { "epoch": 0.22837197382805277, "grad_norm": 0.9133409857749939, "learning_rate": 9.887792381098016e-05, "loss": 0.9747843742370605, "memory(GiB)": 89.65, "step": 17600, "token_acc": 0.7457783781862135, "train_speed(iter/s)": 0.281679 }, { "epoch": 0.22843685222970847, "grad_norm": 0.8205558657646179, "learning_rate": 9.88767935788855e-05, "loss": 0.9836832046508789, "memory(GiB)": 89.65, "step": 17605, "token_acc": 0.7413892445582586, "train_speed(iter/s)": 0.281576 }, { "epoch": 0.22850173063136417, "grad_norm": 0.9813835620880127, "learning_rate": 9.887566278431985e-05, "loss": 1.0288915634155273, "memory(GiB)": 89.65, "step": 17610, "token_acc": 0.7511206123715606, "train_speed(iter/s)": 0.281479 }, { "epoch": 0.22856660903301987, "grad_norm": 0.864966630935669, "learning_rate": 9.887453142729621e-05, "loss": 0.9895265579223633, "memory(GiB)": 89.65, "step": 17615, "token_acc": 0.7583667317507622, "train_speed(iter/s)": 0.281374 }, { "epoch": 0.22863148743467557, "grad_norm": 0.8147640824317932, "learning_rate": 9.887339950782763e-05, "loss": 0.9681478500366211, "memory(GiB)": 89.65, "step": 17620, "token_acc": 0.7514732510288066, "train_speed(iter/s)": 0.281264 }, { "epoch": 0.22869636583633127, "grad_norm": 0.8474066257476807, "learning_rate": 9.887226702592708e-05, "loss": 0.9541415214538574, "memory(GiB)": 89.65, "step": 17625, "token_acc": 0.7397940945163366, "train_speed(iter/s)": 0.281154 }, { "epoch": 0.22876124423798697, "grad_norm": 0.8248730897903442, "learning_rate": 9.887113398160765e-05, "loss": 0.9909526824951171, "memory(GiB)": 89.65, "step": 17630, "token_acc": 0.7518530665792063, "train_speed(iter/s)": 0.28106 }, { "epoch": 0.22882612263964264, "grad_norm": 0.8876137137413025, "learning_rate": 9.887000037488237e-05, "loss": 1.0151216506958007, "memory(GiB)": 89.65, "step": 17635, "token_acc": 0.7502605976372481, "train_speed(iter/s)": 0.280959 }, { "epoch": 0.22889100104129834, "grad_norm": 0.7622548341751099, "learning_rate": 9.886886620576424e-05, "loss": 1.0050028800964355, "memory(GiB)": 89.65, "step": 17640, "token_acc": 0.7347274256441624, "train_speed(iter/s)": 0.280859 }, { "epoch": 0.22895587944295404, "grad_norm": 0.8220796585083008, "learning_rate": 9.886773147426636e-05, "loss": 0.9980592727661133, "memory(GiB)": 89.65, "step": 17645, "token_acc": 0.7373881007012895, "train_speed(iter/s)": 0.280752 }, { "epoch": 0.22902075784460973, "grad_norm": 0.7421501874923706, "learning_rate": 9.886659618040177e-05, "loss": 0.9350472450256347, "memory(GiB)": 89.65, "step": 17650, "token_acc": 0.7469842037952369, "train_speed(iter/s)": 0.280647 }, { "epoch": 0.22908563624626543, "grad_norm": 0.8896437287330627, "learning_rate": 9.886546032418354e-05, "loss": 0.9882371902465821, "memory(GiB)": 89.65, "step": 17655, "token_acc": 0.7348822292698571, "train_speed(iter/s)": 0.28054 }, { "epoch": 0.22915051464792113, "grad_norm": 0.7861928343772888, "learning_rate": 9.886432390562473e-05, "loss": 0.9237601280212402, "memory(GiB)": 89.65, "step": 17660, "token_acc": 0.7616851506167417, "train_speed(iter/s)": 0.280429 }, { "epoch": 0.22921539304957683, "grad_norm": 0.7787418365478516, "learning_rate": 9.886318692473842e-05, "loss": 0.9803020477294921, "memory(GiB)": 89.65, "step": 17665, "token_acc": 0.7408256880733946, "train_speed(iter/s)": 0.280312 }, { "epoch": 0.22928027145123253, "grad_norm": 0.8004326820373535, "learning_rate": 9.886204938153771e-05, "loss": 0.9894633293151855, "memory(GiB)": 89.65, "step": 17670, "token_acc": 0.7315160042208935, "train_speed(iter/s)": 0.280214 }, { "epoch": 0.22934514985288823, "grad_norm": 0.8968176245689392, "learning_rate": 9.886091127603567e-05, "loss": 0.9698382377624511, "memory(GiB)": 89.65, "step": 17675, "token_acc": 0.7355515343993501, "train_speed(iter/s)": 0.28011 }, { "epoch": 0.22941002825454393, "grad_norm": 0.831779420375824, "learning_rate": 9.885977260824542e-05, "loss": 1.02900390625, "memory(GiB)": 89.65, "step": 17680, "token_acc": 0.7339241279958092, "train_speed(iter/s)": 0.280008 }, { "epoch": 0.22947490665619963, "grad_norm": 0.8567668795585632, "learning_rate": 9.885863337818003e-05, "loss": 0.9816278457641602, "memory(GiB)": 89.65, "step": 17685, "token_acc": 0.7469143296507628, "train_speed(iter/s)": 0.279899 }, { "epoch": 0.22953978505785533, "grad_norm": 0.7312150001525879, "learning_rate": 9.885749358585266e-05, "loss": 0.9255436897277832, "memory(GiB)": 89.65, "step": 17690, "token_acc": 0.7587636363636364, "train_speed(iter/s)": 0.279797 }, { "epoch": 0.229604663459511, "grad_norm": 0.8181172609329224, "learning_rate": 9.885635323127637e-05, "loss": 1.0141196250915527, "memory(GiB)": 89.65, "step": 17695, "token_acc": 0.7528676397247066, "train_speed(iter/s)": 0.279687 }, { "epoch": 0.2296695418611667, "grad_norm": 0.812454342842102, "learning_rate": 9.885521231446432e-05, "loss": 0.9997463226318359, "memory(GiB)": 89.65, "step": 17700, "token_acc": 0.7422954489803057, "train_speed(iter/s)": 0.279584 }, { "epoch": 0.2297344202628224, "grad_norm": 0.9861627817153931, "learning_rate": 9.885407083542966e-05, "loss": 0.9963162422180176, "memory(GiB)": 89.65, "step": 17705, "token_acc": 0.7354366970994587, "train_speed(iter/s)": 0.279484 }, { "epoch": 0.2297992986644781, "grad_norm": 0.864838719367981, "learning_rate": 9.885292879418546e-05, "loss": 0.9791587829589844, "memory(GiB)": 89.65, "step": 17710, "token_acc": 0.7359244460588449, "train_speed(iter/s)": 0.279382 }, { "epoch": 0.2298641770661338, "grad_norm": 0.9578641057014465, "learning_rate": 9.88517861907449e-05, "loss": 0.9573233604431153, "memory(GiB)": 89.65, "step": 17715, "token_acc": 0.735773831098224, "train_speed(iter/s)": 0.279272 }, { "epoch": 0.2299290554677895, "grad_norm": 0.877549409866333, "learning_rate": 9.885064302512116e-05, "loss": 0.9516688346862793, "memory(GiB)": 89.65, "step": 17720, "token_acc": 0.7552427924134889, "train_speed(iter/s)": 0.279177 }, { "epoch": 0.2299939338694452, "grad_norm": 0.7440200448036194, "learning_rate": 9.884949929732733e-05, "loss": 0.9737276077270508, "memory(GiB)": 89.65, "step": 17725, "token_acc": 0.7611108916367084, "train_speed(iter/s)": 0.279077 }, { "epoch": 0.2300588122711009, "grad_norm": 0.811749279499054, "learning_rate": 9.884835500737663e-05, "loss": 1.000279998779297, "memory(GiB)": 89.65, "step": 17730, "token_acc": 0.722288106034182, "train_speed(iter/s)": 0.278968 }, { "epoch": 0.2301236906727566, "grad_norm": 0.7350977063179016, "learning_rate": 9.88472101552822e-05, "loss": 0.9790416717529297, "memory(GiB)": 89.65, "step": 17735, "token_acc": 0.755511797179929, "train_speed(iter/s)": 0.278872 }, { "epoch": 0.2301885690744123, "grad_norm": 0.8437801599502563, "learning_rate": 9.884606474105722e-05, "loss": 0.937005615234375, "memory(GiB)": 89.65, "step": 17740, "token_acc": 0.7702333403405508, "train_speed(iter/s)": 0.27878 }, { "epoch": 0.230253447476068, "grad_norm": 0.7811281681060791, "learning_rate": 9.884491876471487e-05, "loss": 1.0010136604309081, "memory(GiB)": 89.65, "step": 17745, "token_acc": 0.7259106838568626, "train_speed(iter/s)": 0.278683 }, { "epoch": 0.2303183258777237, "grad_norm": 0.8971163630485535, "learning_rate": 9.884377222626833e-05, "loss": 0.9930477142333984, "memory(GiB)": 89.65, "step": 17750, "token_acc": 0.7300572330767713, "train_speed(iter/s)": 0.278585 }, { "epoch": 0.23038320427937936, "grad_norm": 0.8641830086708069, "learning_rate": 9.884262512573082e-05, "loss": 0.9673701286315918, "memory(GiB)": 89.65, "step": 17755, "token_acc": 0.7307665383269163, "train_speed(iter/s)": 0.27848 }, { "epoch": 0.23044808268103506, "grad_norm": 0.885386049747467, "learning_rate": 9.884147746311551e-05, "loss": 1.0046198844909668, "memory(GiB)": 89.65, "step": 17760, "token_acc": 0.7237855242895143, "train_speed(iter/s)": 0.278381 }, { "epoch": 0.23051296108269076, "grad_norm": 0.8834132552146912, "learning_rate": 9.884032923843561e-05, "loss": 0.9729728698730469, "memory(GiB)": 89.65, "step": 17765, "token_acc": 0.7423009578741374, "train_speed(iter/s)": 0.278277 }, { "epoch": 0.23057783948434646, "grad_norm": 0.8679227828979492, "learning_rate": 9.883918045170437e-05, "loss": 0.9470075607299805, "memory(GiB)": 89.65, "step": 17770, "token_acc": 0.7555918700847717, "train_speed(iter/s)": 0.278176 }, { "epoch": 0.23064271788600216, "grad_norm": 0.8531380891799927, "learning_rate": 9.883803110293497e-05, "loss": 0.955046272277832, "memory(GiB)": 89.65, "step": 17775, "token_acc": 0.7510125218295991, "train_speed(iter/s)": 0.278076 }, { "epoch": 0.23070759628765786, "grad_norm": 0.8116587400436401, "learning_rate": 9.883688119214065e-05, "loss": 0.9527061462402344, "memory(GiB)": 89.65, "step": 17780, "token_acc": 0.765134061237437, "train_speed(iter/s)": 0.277971 }, { "epoch": 0.23077247468931356, "grad_norm": 0.8005638718605042, "learning_rate": 9.883573071933465e-05, "loss": 0.9848451614379883, "memory(GiB)": 89.65, "step": 17785, "token_acc": 0.742282571330354, "train_speed(iter/s)": 0.277871 }, { "epoch": 0.23083735309096926, "grad_norm": 0.8570212721824646, "learning_rate": 9.883457968453019e-05, "loss": 0.9926306724548339, "memory(GiB)": 89.65, "step": 17790, "token_acc": 0.7467603991188285, "train_speed(iter/s)": 0.277769 }, { "epoch": 0.23090223149262495, "grad_norm": 0.777396559715271, "learning_rate": 9.883342808774054e-05, "loss": 0.9722638130187988, "memory(GiB)": 89.65, "step": 17795, "token_acc": 0.7533748338812338, "train_speed(iter/s)": 0.277672 }, { "epoch": 0.23096710989428065, "grad_norm": 0.7398306727409363, "learning_rate": 9.883227592897894e-05, "loss": 0.9783205032348633, "memory(GiB)": 89.65, "step": 17800, "token_acc": 0.7497515184980673, "train_speed(iter/s)": 0.277569 }, { "epoch": 0.23103198829593635, "grad_norm": 0.8471439480781555, "learning_rate": 9.883112320825868e-05, "loss": 1.0007457733154297, "memory(GiB)": 89.65, "step": 17805, "token_acc": 0.7410573491576065, "train_speed(iter/s)": 0.277469 }, { "epoch": 0.23109686669759205, "grad_norm": 0.8057240843772888, "learning_rate": 9.882996992559295e-05, "loss": 0.9668535232543946, "memory(GiB)": 89.65, "step": 17810, "token_acc": 0.7406678118061083, "train_speed(iter/s)": 0.277375 }, { "epoch": 0.23116174509924772, "grad_norm": 0.8818402886390686, "learning_rate": 9.88288160809951e-05, "loss": 0.9555568695068359, "memory(GiB)": 89.65, "step": 17815, "token_acc": 0.7377609732040111, "train_speed(iter/s)": 0.277276 }, { "epoch": 0.23122662350090342, "grad_norm": 0.8001708984375, "learning_rate": 9.882766167447837e-05, "loss": 0.9281889915466308, "memory(GiB)": 89.65, "step": 17820, "token_acc": 0.7413327156204154, "train_speed(iter/s)": 0.277172 }, { "epoch": 0.23129150190255912, "grad_norm": 0.8072583675384521, "learning_rate": 9.882650670605604e-05, "loss": 0.9637676239013672, "memory(GiB)": 89.65, "step": 17825, "token_acc": 0.7416080940523844, "train_speed(iter/s)": 0.277068 }, { "epoch": 0.23135638030421482, "grad_norm": 0.8129255771636963, "learning_rate": 9.882535117574143e-05, "loss": 0.9813686370849609, "memory(GiB)": 89.65, "step": 17830, "token_acc": 0.7275706403611436, "train_speed(iter/s)": 0.276967 }, { "epoch": 0.23142125870587052, "grad_norm": 0.9495411515235901, "learning_rate": 9.882419508354781e-05, "loss": 1.0122894287109374, "memory(GiB)": 89.65, "step": 17835, "token_acc": 0.7390523404102454, "train_speed(iter/s)": 0.276874 }, { "epoch": 0.23148613710752622, "grad_norm": 0.8419917821884155, "learning_rate": 9.88230384294885e-05, "loss": 1.035440158843994, "memory(GiB)": 89.65, "step": 17840, "token_acc": 0.726389638424177, "train_speed(iter/s)": 0.276782 }, { "epoch": 0.23155101550918192, "grad_norm": 0.7936493754386902, "learning_rate": 9.882188121357681e-05, "loss": 0.9867071151733399, "memory(GiB)": 89.65, "step": 17845, "token_acc": 0.7329043740160175, "train_speed(iter/s)": 0.276678 }, { "epoch": 0.23161589391083762, "grad_norm": 1.0654449462890625, "learning_rate": 9.882072343582605e-05, "loss": 0.9807212829589844, "memory(GiB)": 89.65, "step": 17850, "token_acc": 0.7653209611522748, "train_speed(iter/s)": 0.276588 }, { "epoch": 0.23168077231249332, "grad_norm": 0.8232943415641785, "learning_rate": 9.881956509624954e-05, "loss": 0.9941825866699219, "memory(GiB)": 89.65, "step": 17855, "token_acc": 0.7326683739946983, "train_speed(iter/s)": 0.276487 }, { "epoch": 0.23174565071414902, "grad_norm": 0.9076293706893921, "learning_rate": 9.881840619486063e-05, "loss": 1.010754108428955, "memory(GiB)": 89.65, "step": 17860, "token_acc": 0.7592931139549055, "train_speed(iter/s)": 0.276392 }, { "epoch": 0.23181052911580471, "grad_norm": 0.8621106147766113, "learning_rate": 9.881724673167264e-05, "loss": 0.9775266647338867, "memory(GiB)": 89.65, "step": 17865, "token_acc": 0.7561397440332065, "train_speed(iter/s)": 0.276299 }, { "epoch": 0.2318754075174604, "grad_norm": 0.8981738686561584, "learning_rate": 9.88160867066989e-05, "loss": 0.9357199668884277, "memory(GiB)": 89.65, "step": 17870, "token_acc": 0.7600392459177238, "train_speed(iter/s)": 0.276202 }, { "epoch": 0.23194028591911608, "grad_norm": 0.8949713110923767, "learning_rate": 9.881492611995279e-05, "loss": 1.0133855819702149, "memory(GiB)": 89.65, "step": 17875, "token_acc": 0.7298288508557457, "train_speed(iter/s)": 0.276107 }, { "epoch": 0.23200516432077178, "grad_norm": 0.7117490768432617, "learning_rate": 9.881376497144765e-05, "loss": 0.9368446350097657, "memory(GiB)": 89.65, "step": 17880, "token_acc": 0.7226876090750436, "train_speed(iter/s)": 0.275995 }, { "epoch": 0.23207004272242748, "grad_norm": 0.7685368657112122, "learning_rate": 9.881260326119683e-05, "loss": 0.9790252685546875, "memory(GiB)": 89.65, "step": 17885, "token_acc": 0.7401063022019742, "train_speed(iter/s)": 0.275894 }, { "epoch": 0.23213492112408318, "grad_norm": 0.8802623152732849, "learning_rate": 9.881144098921373e-05, "loss": 0.9393339157104492, "memory(GiB)": 89.65, "step": 17890, "token_acc": 0.7368735281435537, "train_speed(iter/s)": 0.275801 }, { "epoch": 0.23219979952573888, "grad_norm": 0.9692672491073608, "learning_rate": 9.881027815551171e-05, "loss": 1.0023906707763672, "memory(GiB)": 89.65, "step": 17895, "token_acc": 0.7171221178479932, "train_speed(iter/s)": 0.275712 }, { "epoch": 0.23226467792739458, "grad_norm": 0.8356239795684814, "learning_rate": 9.880911476010415e-05, "loss": 0.983006763458252, "memory(GiB)": 89.65, "step": 17900, "token_acc": 0.7389777922926192, "train_speed(iter/s)": 0.275618 }, { "epoch": 0.23232955632905028, "grad_norm": 0.9194226861000061, "learning_rate": 9.880795080300444e-05, "loss": 0.9577692031860352, "memory(GiB)": 89.65, "step": 17905, "token_acc": 0.7540792172952501, "train_speed(iter/s)": 0.275517 }, { "epoch": 0.23239443473070598, "grad_norm": 0.8797876834869385, "learning_rate": 9.880678628422598e-05, "loss": 0.959744930267334, "memory(GiB)": 89.65, "step": 17910, "token_acc": 0.7549414416078619, "train_speed(iter/s)": 0.27542 }, { "epoch": 0.23245931313236168, "grad_norm": 0.8910137414932251, "learning_rate": 9.880562120378214e-05, "loss": 0.9953501701354981, "memory(GiB)": 89.65, "step": 17915, "token_acc": 0.7486489696537799, "train_speed(iter/s)": 0.275324 }, { "epoch": 0.23252419153401738, "grad_norm": 0.7872313261032104, "learning_rate": 9.880445556168635e-05, "loss": 0.9321403503417969, "memory(GiB)": 89.65, "step": 17920, "token_acc": 0.773312706864297, "train_speed(iter/s)": 0.275229 }, { "epoch": 0.23258906993567308, "grad_norm": 0.7061395049095154, "learning_rate": 9.880328935795205e-05, "loss": 0.9191070556640625, "memory(GiB)": 89.65, "step": 17925, "token_acc": 0.7608384945212006, "train_speed(iter/s)": 0.275133 }, { "epoch": 0.23265394833732878, "grad_norm": 0.8388098478317261, "learning_rate": 9.880212259259263e-05, "loss": 0.9779850006103515, "memory(GiB)": 89.65, "step": 17930, "token_acc": 0.7207809993348202, "train_speed(iter/s)": 0.275039 }, { "epoch": 0.23271882673898445, "grad_norm": 0.8286349177360535, "learning_rate": 9.880095526562152e-05, "loss": 0.9514440536499024, "memory(GiB)": 89.65, "step": 17935, "token_acc": 0.750970420967111, "train_speed(iter/s)": 0.274944 }, { "epoch": 0.23278370514064015, "grad_norm": 1.0176939964294434, "learning_rate": 9.879978737705216e-05, "loss": 0.992467212677002, "memory(GiB)": 89.65, "step": 17940, "token_acc": 0.7377860235003092, "train_speed(iter/s)": 0.274849 }, { "epoch": 0.23284858354229584, "grad_norm": 1.014822244644165, "learning_rate": 9.879861892689799e-05, "loss": 0.9944012641906739, "memory(GiB)": 89.65, "step": 17945, "token_acc": 0.7464652454820687, "train_speed(iter/s)": 0.274752 }, { "epoch": 0.23291346194395154, "grad_norm": 0.8616572618484497, "learning_rate": 9.879744991517246e-05, "loss": 0.9689811706542969, "memory(GiB)": 89.65, "step": 17950, "token_acc": 0.7479399888381866, "train_speed(iter/s)": 0.274655 }, { "epoch": 0.23297834034560724, "grad_norm": 0.8290077447891235, "learning_rate": 9.879628034188902e-05, "loss": 0.9507726669311524, "memory(GiB)": 89.65, "step": 17955, "token_acc": 0.7443664301731426, "train_speed(iter/s)": 0.274562 }, { "epoch": 0.23304321874726294, "grad_norm": 0.8931608200073242, "learning_rate": 9.87951102070611e-05, "loss": 0.9798184394836426, "memory(GiB)": 89.65, "step": 17960, "token_acc": 0.7327898550724637, "train_speed(iter/s)": 0.274462 }, { "epoch": 0.23310809714891864, "grad_norm": 0.7043183445930481, "learning_rate": 9.879393951070221e-05, "loss": 0.9605695724487304, "memory(GiB)": 89.65, "step": 17965, "token_acc": 0.7312387187566922, "train_speed(iter/s)": 0.27437 }, { "epoch": 0.23317297555057434, "grad_norm": 0.9115808010101318, "learning_rate": 9.87927682528258e-05, "loss": 1.0709598541259766, "memory(GiB)": 89.65, "step": 17970, "token_acc": 0.7119695528068506, "train_speed(iter/s)": 0.274281 }, { "epoch": 0.23323785395223004, "grad_norm": 0.6981563568115234, "learning_rate": 9.879159643344535e-05, "loss": 0.9446475982666016, "memory(GiB)": 89.65, "step": 17975, "token_acc": 0.7569124855015528, "train_speed(iter/s)": 0.274191 }, { "epoch": 0.23330273235388574, "grad_norm": 0.8431258797645569, "learning_rate": 9.879042405257435e-05, "loss": 1.0033555984497071, "memory(GiB)": 89.65, "step": 17980, "token_acc": 0.7230688386757751, "train_speed(iter/s)": 0.274099 }, { "epoch": 0.23336761075554144, "grad_norm": 0.8339694142341614, "learning_rate": 9.87892511102263e-05, "loss": 0.9160335540771485, "memory(GiB)": 89.65, "step": 17985, "token_acc": 0.7655603140613847, "train_speed(iter/s)": 0.274004 }, { "epoch": 0.2334324891571971, "grad_norm": 0.8318307995796204, "learning_rate": 9.878807760641469e-05, "loss": 0.8980727195739746, "memory(GiB)": 89.65, "step": 17990, "token_acc": 0.7607640519805856, "train_speed(iter/s)": 0.273911 }, { "epoch": 0.2334973675588528, "grad_norm": 0.8787282705307007, "learning_rate": 9.878690354115302e-05, "loss": 0.9524376869201661, "memory(GiB)": 89.65, "step": 17995, "token_acc": 0.7416657656683605, "train_speed(iter/s)": 0.273807 }, { "epoch": 0.2335622459605085, "grad_norm": 0.8916409611701965, "learning_rate": 9.878572891445479e-05, "loss": 1.0301288604736327, "memory(GiB)": 89.65, "step": 18000, "token_acc": 0.7390307914094555, "train_speed(iter/s)": 0.273709 }, { "epoch": 0.2336271243621642, "grad_norm": 0.8564149737358093, "learning_rate": 9.878455372633354e-05, "loss": 0.9742926597595215, "memory(GiB)": 89.65, "step": 18005, "token_acc": 0.7446833463643472, "train_speed(iter/s)": 0.273616 }, { "epoch": 0.2336920027638199, "grad_norm": 0.7648012638092041, "learning_rate": 9.878337797680279e-05, "loss": 0.9718362808227539, "memory(GiB)": 89.65, "step": 18010, "token_acc": 0.7181650324716669, "train_speed(iter/s)": 0.273524 }, { "epoch": 0.2337568811654756, "grad_norm": 0.727179229259491, "learning_rate": 9.878220166587606e-05, "loss": 0.9386873245239258, "memory(GiB)": 89.65, "step": 18015, "token_acc": 0.7674794511125632, "train_speed(iter/s)": 0.273421 }, { "epoch": 0.2338217595671313, "grad_norm": 0.89437335729599, "learning_rate": 9.87810247935669e-05, "loss": 1.0069419860839843, "memory(GiB)": 89.65, "step": 18020, "token_acc": 0.732356192717806, "train_speed(iter/s)": 0.273333 }, { "epoch": 0.233886637968787, "grad_norm": 0.862628698348999, "learning_rate": 9.877984735988884e-05, "loss": 0.9829089164733886, "memory(GiB)": 89.65, "step": 18025, "token_acc": 0.7585459396364849, "train_speed(iter/s)": 0.273244 }, { "epoch": 0.2339515163704427, "grad_norm": 0.766778290271759, "learning_rate": 9.877866936485542e-05, "loss": 0.9322196960449218, "memory(GiB)": 89.65, "step": 18030, "token_acc": 0.7411910231489662, "train_speed(iter/s)": 0.273146 }, { "epoch": 0.2340163947720984, "grad_norm": 0.9850755333900452, "learning_rate": 9.877749080848025e-05, "loss": 1.0182756423950194, "memory(GiB)": 89.65, "step": 18035, "token_acc": 0.7352063986985697, "train_speed(iter/s)": 0.27306 }, { "epoch": 0.2340812731737541, "grad_norm": 0.8711355328559875, "learning_rate": 9.877631169077682e-05, "loss": 0.9286043167114257, "memory(GiB)": 89.65, "step": 18040, "token_acc": 0.7488496779098147, "train_speed(iter/s)": 0.272973 }, { "epoch": 0.2341461515754098, "grad_norm": 0.8213779330253601, "learning_rate": 9.877513201175876e-05, "loss": 0.9252409934997559, "memory(GiB)": 89.65, "step": 18045, "token_acc": 0.7283480907581626, "train_speed(iter/s)": 0.272878 }, { "epoch": 0.23421102997706547, "grad_norm": 0.9919857382774353, "learning_rate": 9.87739517714396e-05, "loss": 0.9600215911865234, "memory(GiB)": 89.65, "step": 18050, "token_acc": 0.7487128501935163, "train_speed(iter/s)": 0.272783 }, { "epoch": 0.23427590837872117, "grad_norm": 0.9036800265312195, "learning_rate": 9.877277096983295e-05, "loss": 0.9443897247314453, "memory(GiB)": 89.65, "step": 18055, "token_acc": 0.7327406394255428, "train_speed(iter/s)": 0.272687 }, { "epoch": 0.23434078678037687, "grad_norm": 0.8456308245658875, "learning_rate": 9.877158960695239e-05, "loss": 1.0042993545532226, "memory(GiB)": 89.65, "step": 18060, "token_acc": 0.736409637435588, "train_speed(iter/s)": 0.272603 }, { "epoch": 0.23440566518203257, "grad_norm": 0.9586524963378906, "learning_rate": 9.877040768281152e-05, "loss": 1.0068504333496093, "memory(GiB)": 89.65, "step": 18065, "token_acc": 0.7444789922666029, "train_speed(iter/s)": 0.272512 }, { "epoch": 0.23447054358368827, "grad_norm": 0.8330002427101135, "learning_rate": 9.876922519742393e-05, "loss": 0.9675239562988281, "memory(GiB)": 89.65, "step": 18070, "token_acc": 0.724429945960195, "train_speed(iter/s)": 0.272421 }, { "epoch": 0.23453542198534397, "grad_norm": 0.7109394073486328, "learning_rate": 9.876804215080325e-05, "loss": 1.0152023315429688, "memory(GiB)": 89.65, "step": 18075, "token_acc": 0.7388741526672561, "train_speed(iter/s)": 0.272333 }, { "epoch": 0.23460030038699967, "grad_norm": 0.8030576705932617, "learning_rate": 9.876685854296306e-05, "loss": 0.9578128814697265, "memory(GiB)": 89.65, "step": 18080, "token_acc": 0.754048757005811, "train_speed(iter/s)": 0.27224 }, { "epoch": 0.23466517878865537, "grad_norm": 0.9375611543655396, "learning_rate": 9.876567437391701e-05, "loss": 0.9841373443603516, "memory(GiB)": 89.65, "step": 18085, "token_acc": 0.7422381508476624, "train_speed(iter/s)": 0.272153 }, { "epoch": 0.23473005719031106, "grad_norm": 0.7721485495567322, "learning_rate": 9.876448964367873e-05, "loss": 0.9954802513122558, "memory(GiB)": 89.65, "step": 18090, "token_acc": 0.7311557026198354, "train_speed(iter/s)": 0.272061 }, { "epoch": 0.23479493559196676, "grad_norm": 0.820061445236206, "learning_rate": 9.876330435226182e-05, "loss": 1.0011557579040526, "memory(GiB)": 89.65, "step": 18095, "token_acc": 0.7047662219919699, "train_speed(iter/s)": 0.271966 }, { "epoch": 0.23485981399362246, "grad_norm": 0.8538479804992676, "learning_rate": 9.876211849967996e-05, "loss": 1.0010589599609374, "memory(GiB)": 89.65, "step": 18100, "token_acc": 0.739023735572683, "train_speed(iter/s)": 0.271877 }, { "epoch": 0.23492469239527816, "grad_norm": 0.88443523645401, "learning_rate": 9.876093208594677e-05, "loss": 0.9933340072631835, "memory(GiB)": 89.65, "step": 18105, "token_acc": 0.7430621417673497, "train_speed(iter/s)": 0.271786 }, { "epoch": 0.23498957079693383, "grad_norm": 0.8187074661254883, "learning_rate": 9.87597451110759e-05, "loss": 0.9725074768066406, "memory(GiB)": 89.65, "step": 18110, "token_acc": 0.7472828042899302, "train_speed(iter/s)": 0.271693 }, { "epoch": 0.23505444919858953, "grad_norm": 0.7338123917579651, "learning_rate": 9.875855757508103e-05, "loss": 0.9523103713989258, "memory(GiB)": 89.65, "step": 18115, "token_acc": 0.733670871084266, "train_speed(iter/s)": 0.271599 }, { "epoch": 0.23511932760024523, "grad_norm": 0.9782770872116089, "learning_rate": 9.875736947797582e-05, "loss": 1.0070694923400878, "memory(GiB)": 89.65, "step": 18120, "token_acc": 0.7261101375787629, "train_speed(iter/s)": 0.271514 }, { "epoch": 0.23518420600190093, "grad_norm": 0.8755218386650085, "learning_rate": 9.875618081977395e-05, "loss": 1.003959560394287, "memory(GiB)": 89.65, "step": 18125, "token_acc": 0.7263834028572483, "train_speed(iter/s)": 0.271422 }, { "epoch": 0.23524908440355663, "grad_norm": 0.990858256816864, "learning_rate": 9.875499160048907e-05, "loss": 1.0267624855041504, "memory(GiB)": 89.65, "step": 18130, "token_acc": 0.7364296376713259, "train_speed(iter/s)": 0.271336 }, { "epoch": 0.23531396280521233, "grad_norm": 0.9888952970504761, "learning_rate": 9.87538018201349e-05, "loss": 0.9806462287902832, "memory(GiB)": 89.65, "step": 18135, "token_acc": 0.7490708976558034, "train_speed(iter/s)": 0.271242 }, { "epoch": 0.23537884120686803, "grad_norm": 0.9191423058509827, "learning_rate": 9.875261147872511e-05, "loss": 0.9081013679504395, "memory(GiB)": 89.65, "step": 18140, "token_acc": 0.7706781115879828, "train_speed(iter/s)": 0.271152 }, { "epoch": 0.23544371960852373, "grad_norm": 0.9549767971038818, "learning_rate": 9.87514205762734e-05, "loss": 0.9847804069519043, "memory(GiB)": 89.65, "step": 18145, "token_acc": 0.7257169441398332, "train_speed(iter/s)": 0.271072 }, { "epoch": 0.23550859801017943, "grad_norm": 0.885941743850708, "learning_rate": 9.875022911279349e-05, "loss": 1.012033748626709, "memory(GiB)": 89.65, "step": 18150, "token_acc": 0.7339015856472866, "train_speed(iter/s)": 0.270987 }, { "epoch": 0.23557347641183513, "grad_norm": 0.8985833525657654, "learning_rate": 9.874903708829908e-05, "loss": 0.9486292839050293, "memory(GiB)": 89.65, "step": 18155, "token_acc": 0.7590913976568378, "train_speed(iter/s)": 0.27089 }, { "epoch": 0.23563835481349082, "grad_norm": 0.7324755787849426, "learning_rate": 9.874784450280388e-05, "loss": 0.9667669296264648, "memory(GiB)": 89.65, "step": 18160, "token_acc": 0.7569325453718921, "train_speed(iter/s)": 0.270801 }, { "epoch": 0.23570323321514652, "grad_norm": 0.8090101480484009, "learning_rate": 9.874665135632161e-05, "loss": 0.9637800216674804, "memory(GiB)": 89.65, "step": 18165, "token_acc": 0.7559032817644351, "train_speed(iter/s)": 0.270712 }, { "epoch": 0.2357681116168022, "grad_norm": 0.8179773688316345, "learning_rate": 9.874545764886605e-05, "loss": 1.003087615966797, "memory(GiB)": 89.65, "step": 18170, "token_acc": 0.7400633640552995, "train_speed(iter/s)": 0.27062 }, { "epoch": 0.2358329900184579, "grad_norm": 0.8949105143547058, "learning_rate": 9.874426338045087e-05, "loss": 0.9897966384887695, "memory(GiB)": 89.65, "step": 18175, "token_acc": 0.7377391863643673, "train_speed(iter/s)": 0.270523 }, { "epoch": 0.2358978684201136, "grad_norm": 0.9610512256622314, "learning_rate": 9.874306855108985e-05, "loss": 0.9976032257080079, "memory(GiB)": 89.65, "step": 18180, "token_acc": 0.752190847127556, "train_speed(iter/s)": 0.270419 }, { "epoch": 0.2359627468217693, "grad_norm": 0.8027799129486084, "learning_rate": 9.874187316079675e-05, "loss": 0.967467212677002, "memory(GiB)": 89.65, "step": 18185, "token_acc": 0.7672076724455921, "train_speed(iter/s)": 0.270327 }, { "epoch": 0.236027625223425, "grad_norm": 0.804201602935791, "learning_rate": 9.87406772095853e-05, "loss": 0.9450019836425781, "memory(GiB)": 89.65, "step": 18190, "token_acc": 0.7410283449972838, "train_speed(iter/s)": 0.270242 }, { "epoch": 0.2360925036250807, "grad_norm": 0.7100619077682495, "learning_rate": 9.873948069746927e-05, "loss": 0.9755618095397949, "memory(GiB)": 89.65, "step": 18195, "token_acc": 0.7234004313443566, "train_speed(iter/s)": 0.270155 }, { "epoch": 0.2361573820267364, "grad_norm": 0.761673092842102, "learning_rate": 9.873828362446245e-05, "loss": 0.9501700401306152, "memory(GiB)": 89.65, "step": 18200, "token_acc": 0.7421074071559508, "train_speed(iter/s)": 0.27006 }, { "epoch": 0.2362222604283921, "grad_norm": 0.8667165040969849, "learning_rate": 9.873708599057858e-05, "loss": 0.9853815078735352, "memory(GiB)": 89.65, "step": 18205, "token_acc": 0.7467532467532467, "train_speed(iter/s)": 0.269973 }, { "epoch": 0.2362871388300478, "grad_norm": 0.784023642539978, "learning_rate": 9.873588779583147e-05, "loss": 0.9912500381469727, "memory(GiB)": 89.65, "step": 18210, "token_acc": 0.7363207687060708, "train_speed(iter/s)": 0.269886 }, { "epoch": 0.2363520172317035, "grad_norm": 0.8242594003677368, "learning_rate": 9.873468904023488e-05, "loss": 0.9747700691223145, "memory(GiB)": 89.65, "step": 18215, "token_acc": 0.7226037512922758, "train_speed(iter/s)": 0.269793 }, { "epoch": 0.2364168956333592, "grad_norm": 0.8317850828170776, "learning_rate": 9.873348972380265e-05, "loss": 0.9957642555236816, "memory(GiB)": 89.65, "step": 18220, "token_acc": 0.7429808786288399, "train_speed(iter/s)": 0.269706 }, { "epoch": 0.23648177403501489, "grad_norm": 0.7886954545974731, "learning_rate": 9.873228984654855e-05, "loss": 0.9078508377075195, "memory(GiB)": 89.65, "step": 18225, "token_acc": 0.7450687479040273, "train_speed(iter/s)": 0.269613 }, { "epoch": 0.23654665243667056, "grad_norm": 0.828685462474823, "learning_rate": 9.873108940848639e-05, "loss": 0.9799746513366699, "memory(GiB)": 89.65, "step": 18230, "token_acc": 0.75159085552675, "train_speed(iter/s)": 0.269518 }, { "epoch": 0.23661153083832626, "grad_norm": 0.8746681213378906, "learning_rate": 9.872988840963e-05, "loss": 1.0147876739501953, "memory(GiB)": 89.65, "step": 18235, "token_acc": 0.7207441995149673, "train_speed(iter/s)": 0.269424 }, { "epoch": 0.23667640923998196, "grad_norm": 0.8514030575752258, "learning_rate": 9.872868684999318e-05, "loss": 1.0114157676696778, "memory(GiB)": 89.65, "step": 18240, "token_acc": 0.7373618912412814, "train_speed(iter/s)": 0.269329 }, { "epoch": 0.23674128764163765, "grad_norm": 0.7544258236885071, "learning_rate": 9.872748472958976e-05, "loss": 0.9907022476196289, "memory(GiB)": 89.65, "step": 18245, "token_acc": 0.7437121411640769, "train_speed(iter/s)": 0.269239 }, { "epoch": 0.23680616604329335, "grad_norm": 0.8151013255119324, "learning_rate": 9.87262820484336e-05, "loss": 1.015070629119873, "memory(GiB)": 89.65, "step": 18250, "token_acc": 0.7287173306267445, "train_speed(iter/s)": 0.269148 }, { "epoch": 0.23687104444494905, "grad_norm": 0.7919565439224243, "learning_rate": 9.872507880653851e-05, "loss": 0.9570981025695801, "memory(GiB)": 89.65, "step": 18255, "token_acc": 0.7452071952959187, "train_speed(iter/s)": 0.269058 }, { "epoch": 0.23693592284660475, "grad_norm": 0.8803032040596008, "learning_rate": 9.872387500391837e-05, "loss": 0.9787053108215332, "memory(GiB)": 89.65, "step": 18260, "token_acc": 0.7245553553239834, "train_speed(iter/s)": 0.26897 }, { "epoch": 0.23700080124826045, "grad_norm": 0.8399314284324646, "learning_rate": 9.872267064058699e-05, "loss": 1.0076406478881836, "memory(GiB)": 89.65, "step": 18265, "token_acc": 0.7192756478254281, "train_speed(iter/s)": 0.268882 }, { "epoch": 0.23706567964991615, "grad_norm": 0.9223935008049011, "learning_rate": 9.872146571655826e-05, "loss": 1.0451221466064453, "memory(GiB)": 89.65, "step": 18270, "token_acc": 0.7390015015015015, "train_speed(iter/s)": 0.268789 }, { "epoch": 0.23713055805157185, "grad_norm": 0.8090369701385498, "learning_rate": 9.872026023184604e-05, "loss": 0.9658290863037109, "memory(GiB)": 89.65, "step": 18275, "token_acc": 0.7242284057170476, "train_speed(iter/s)": 0.268702 }, { "epoch": 0.23719543645322755, "grad_norm": 0.8217471837997437, "learning_rate": 9.871905418646418e-05, "loss": 0.9507560729980469, "memory(GiB)": 89.65, "step": 18280, "token_acc": 0.7509902054100525, "train_speed(iter/s)": 0.268614 }, { "epoch": 0.23726031485488325, "grad_norm": 0.810818612575531, "learning_rate": 9.871784758042662e-05, "loss": 0.9933138847351074, "memory(GiB)": 89.65, "step": 18285, "token_acc": 0.7511968410385436, "train_speed(iter/s)": 0.268519 }, { "epoch": 0.23732519325653892, "grad_norm": 0.8558022379875183, "learning_rate": 9.871664041374717e-05, "loss": 1.0096216201782227, "memory(GiB)": 89.65, "step": 18290, "token_acc": 0.7374925054959697, "train_speed(iter/s)": 0.268431 }, { "epoch": 0.23739007165819462, "grad_norm": 0.8233579397201538, "learning_rate": 9.871543268643977e-05, "loss": 1.0013692855834961, "memory(GiB)": 89.65, "step": 18295, "token_acc": 0.7194361836476962, "train_speed(iter/s)": 0.268349 }, { "epoch": 0.23745495005985032, "grad_norm": 0.7863311171531677, "learning_rate": 9.871422439851831e-05, "loss": 0.9906682968139648, "memory(GiB)": 89.65, "step": 18300, "token_acc": 0.7522858854062687, "train_speed(iter/s)": 0.26827 }, { "epoch": 0.23751982846150602, "grad_norm": 0.8145574927330017, "learning_rate": 9.871301554999668e-05, "loss": 0.9707754135131836, "memory(GiB)": 89.65, "step": 18305, "token_acc": 0.7515461232992644, "train_speed(iter/s)": 0.268181 }, { "epoch": 0.23758470686316172, "grad_norm": 0.861220121383667, "learning_rate": 9.871180614088882e-05, "loss": 1.0222333908081054, "memory(GiB)": 89.65, "step": 18310, "token_acc": 0.7260911160228285, "train_speed(iter/s)": 0.268099 }, { "epoch": 0.23764958526481741, "grad_norm": 0.9108986258506775, "learning_rate": 9.871059617120861e-05, "loss": 1.0093875885009767, "memory(GiB)": 89.65, "step": 18315, "token_acc": 0.7335783732511264, "train_speed(iter/s)": 0.26801 }, { "epoch": 0.2377144636664731, "grad_norm": 0.9328981041908264, "learning_rate": 9.870938564097001e-05, "loss": 0.9759645462036133, "memory(GiB)": 89.65, "step": 18320, "token_acc": 0.7284459959801122, "train_speed(iter/s)": 0.267923 }, { "epoch": 0.2377793420681288, "grad_norm": 0.9563009738922119, "learning_rate": 9.870817455018693e-05, "loss": 1.0387001037597656, "memory(GiB)": 89.65, "step": 18325, "token_acc": 0.71893470306783, "train_speed(iter/s)": 0.267839 }, { "epoch": 0.2378442204697845, "grad_norm": 0.8698214292526245, "learning_rate": 9.87069628988733e-05, "loss": 0.9798370361328125, "memory(GiB)": 89.65, "step": 18330, "token_acc": 0.7615647524618456, "train_speed(iter/s)": 0.267754 }, { "epoch": 0.2379090988714402, "grad_norm": 0.7987834811210632, "learning_rate": 9.870575068704308e-05, "loss": 0.9988308906555176, "memory(GiB)": 89.65, "step": 18335, "token_acc": 0.7364058815369677, "train_speed(iter/s)": 0.267667 }, { "epoch": 0.2379739772730959, "grad_norm": 0.9686516523361206, "learning_rate": 9.870453791471023e-05, "loss": 0.9873565673828125, "memory(GiB)": 89.65, "step": 18340, "token_acc": 0.7179370150616157, "train_speed(iter/s)": 0.267581 }, { "epoch": 0.2380388556747516, "grad_norm": 0.8055091500282288, "learning_rate": 9.870332458188869e-05, "loss": 0.9380264282226562, "memory(GiB)": 89.65, "step": 18345, "token_acc": 0.7559197445321444, "train_speed(iter/s)": 0.267485 }, { "epoch": 0.23810373407640728, "grad_norm": 0.8732351064682007, "learning_rate": 9.870211068859242e-05, "loss": 0.9980950355529785, "memory(GiB)": 89.65, "step": 18350, "token_acc": 0.7334274228038662, "train_speed(iter/s)": 0.267406 }, { "epoch": 0.23816861247806298, "grad_norm": 0.7975016832351685, "learning_rate": 9.87008962348354e-05, "loss": 0.9756482124328614, "memory(GiB)": 89.65, "step": 18355, "token_acc": 0.7385426969996703, "train_speed(iter/s)": 0.267327 }, { "epoch": 0.23823349087971868, "grad_norm": 0.7091912031173706, "learning_rate": 9.869968122063159e-05, "loss": 0.9448719024658203, "memory(GiB)": 89.65, "step": 18360, "token_acc": 0.7564417564417565, "train_speed(iter/s)": 0.267238 }, { "epoch": 0.23829836928137438, "grad_norm": 0.7958028316497803, "learning_rate": 9.869846564599498e-05, "loss": 0.9323701858520508, "memory(GiB)": 89.65, "step": 18365, "token_acc": 0.7577573494807517, "train_speed(iter/s)": 0.26715 }, { "epoch": 0.23836324768303008, "grad_norm": 0.9023668766021729, "learning_rate": 9.869724951093957e-05, "loss": 0.9329133987426758, "memory(GiB)": 89.65, "step": 18370, "token_acc": 0.7576399034018068, "train_speed(iter/s)": 0.267066 }, { "epoch": 0.23842812608468578, "grad_norm": 0.8940521478652954, "learning_rate": 9.869603281547934e-05, "loss": 1.0188028335571289, "memory(GiB)": 89.65, "step": 18375, "token_acc": 0.7265651553259466, "train_speed(iter/s)": 0.266977 }, { "epoch": 0.23849300448634148, "grad_norm": 0.836290180683136, "learning_rate": 9.869481555962832e-05, "loss": 0.9911392211914063, "memory(GiB)": 89.65, "step": 18380, "token_acc": 0.7487585146121731, "train_speed(iter/s)": 0.266894 }, { "epoch": 0.23855788288799717, "grad_norm": 0.7898989319801331, "learning_rate": 9.869359774340048e-05, "loss": 1.001165771484375, "memory(GiB)": 89.65, "step": 18385, "token_acc": 0.7525031827409421, "train_speed(iter/s)": 0.266803 }, { "epoch": 0.23862276128965287, "grad_norm": 0.9045366048812866, "learning_rate": 9.869237936680984e-05, "loss": 0.9866744041442871, "memory(GiB)": 89.65, "step": 18390, "token_acc": 0.7288756142210232, "train_speed(iter/s)": 0.266723 }, { "epoch": 0.23868763969130857, "grad_norm": 0.6959513425827026, "learning_rate": 9.869116042987046e-05, "loss": 0.9401785850524902, "memory(GiB)": 89.65, "step": 18395, "token_acc": 0.7523657884459398, "train_speed(iter/s)": 0.26663 }, { "epoch": 0.23875251809296427, "grad_norm": 0.8224229216575623, "learning_rate": 9.868994093259631e-05, "loss": 0.989957046508789, "memory(GiB)": 89.65, "step": 18400, "token_acc": 0.7384336452229093, "train_speed(iter/s)": 0.266539 }, { "epoch": 0.23881739649461997, "grad_norm": 0.8702182769775391, "learning_rate": 9.868872087500147e-05, "loss": 0.9593051910400391, "memory(GiB)": 89.65, "step": 18405, "token_acc": 0.7407395212538277, "train_speed(iter/s)": 0.266448 }, { "epoch": 0.23888227489627564, "grad_norm": 0.7656755447387695, "learning_rate": 9.868750025709996e-05, "loss": 1.0163703918457032, "memory(GiB)": 89.65, "step": 18410, "token_acc": 0.7232243517474634, "train_speed(iter/s)": 0.26636 }, { "epoch": 0.23894715329793134, "grad_norm": 0.8806934952735901, "learning_rate": 9.868627907890584e-05, "loss": 0.913983154296875, "memory(GiB)": 89.65, "step": 18415, "token_acc": 0.7587404666616325, "train_speed(iter/s)": 0.266276 }, { "epoch": 0.23901203169958704, "grad_norm": 0.8662809729576111, "learning_rate": 9.868505734043314e-05, "loss": 1.0033624649047852, "memory(GiB)": 89.65, "step": 18420, "token_acc": 0.739892498247254, "train_speed(iter/s)": 0.26619 }, { "epoch": 0.23907691010124274, "grad_norm": 0.8399474024772644, "learning_rate": 9.868383504169594e-05, "loss": 0.9779571533203125, "memory(GiB)": 89.65, "step": 18425, "token_acc": 0.726692417157959, "train_speed(iter/s)": 0.266102 }, { "epoch": 0.23914178850289844, "grad_norm": 0.884291410446167, "learning_rate": 9.86826121827083e-05, "loss": 0.9730466842651367, "memory(GiB)": 89.65, "step": 18430, "token_acc": 0.7376017293997965, "train_speed(iter/s)": 0.266012 }, { "epoch": 0.23920666690455414, "grad_norm": 0.7944501638412476, "learning_rate": 9.868138876348428e-05, "loss": 0.944025993347168, "memory(GiB)": 89.65, "step": 18435, "token_acc": 0.7375531914893617, "train_speed(iter/s)": 0.265931 }, { "epoch": 0.23927154530620984, "grad_norm": 0.8676150441169739, "learning_rate": 9.868016478403799e-05, "loss": 1.0202629089355468, "memory(GiB)": 89.65, "step": 18440, "token_acc": 0.7098740992952728, "train_speed(iter/s)": 0.26585 }, { "epoch": 0.23933642370786554, "grad_norm": 0.8357170224189758, "learning_rate": 9.867894024438348e-05, "loss": 0.9374734878540039, "memory(GiB)": 89.65, "step": 18445, "token_acc": 0.7382636222805954, "train_speed(iter/s)": 0.265766 }, { "epoch": 0.23940130210952124, "grad_norm": 0.7988686561584473, "learning_rate": 9.867771514453488e-05, "loss": 0.9686020851135254, "memory(GiB)": 89.65, "step": 18450, "token_acc": 0.7307769524191929, "train_speed(iter/s)": 0.265678 }, { "epoch": 0.23946618051117693, "grad_norm": 0.7506223320960999, "learning_rate": 9.867648948450625e-05, "loss": 1.0208728790283204, "memory(GiB)": 89.65, "step": 18455, "token_acc": 0.7350430578637256, "train_speed(iter/s)": 0.265584 }, { "epoch": 0.23953105891283263, "grad_norm": 0.8696061372756958, "learning_rate": 9.867526326431172e-05, "loss": 0.9686273574829102, "memory(GiB)": 89.65, "step": 18460, "token_acc": 0.734103044171694, "train_speed(iter/s)": 0.265501 }, { "epoch": 0.23959593731448833, "grad_norm": 0.9495109915733337, "learning_rate": 9.867403648396538e-05, "loss": 1.0317914962768555, "memory(GiB)": 89.65, "step": 18465, "token_acc": 0.7337909222435315, "train_speed(iter/s)": 0.265418 }, { "epoch": 0.239660815716144, "grad_norm": 0.8883520364761353, "learning_rate": 9.867280914348138e-05, "loss": 0.953788948059082, "memory(GiB)": 89.65, "step": 18470, "token_acc": 0.7618825387708213, "train_speed(iter/s)": 0.265335 }, { "epoch": 0.2397256941177997, "grad_norm": 0.8122742772102356, "learning_rate": 9.86715812428738e-05, "loss": 0.9842626571655273, "memory(GiB)": 89.65, "step": 18475, "token_acc": 0.7461191974090757, "train_speed(iter/s)": 0.265253 }, { "epoch": 0.2397905725194554, "grad_norm": 0.8549482822418213, "learning_rate": 9.867035278215682e-05, "loss": 1.0090733528137208, "memory(GiB)": 89.65, "step": 18480, "token_acc": 0.7182023034551828, "train_speed(iter/s)": 0.265171 }, { "epoch": 0.2398554509211111, "grad_norm": 0.7918896675109863, "learning_rate": 9.866912376134454e-05, "loss": 0.9726371765136719, "memory(GiB)": 89.65, "step": 18485, "token_acc": 0.7480335691953266, "train_speed(iter/s)": 0.265082 }, { "epoch": 0.2399203293227668, "grad_norm": 0.8359068036079407, "learning_rate": 9.866789418045111e-05, "loss": 0.9294398307800293, "memory(GiB)": 89.65, "step": 18490, "token_acc": 0.7576621274499454, "train_speed(iter/s)": 0.264998 }, { "epoch": 0.2399852077244225, "grad_norm": 0.842466413974762, "learning_rate": 9.866666403949069e-05, "loss": 0.931053352355957, "memory(GiB)": 89.65, "step": 18495, "token_acc": 0.7495981047465945, "train_speed(iter/s)": 0.264916 }, { "epoch": 0.2400500861260782, "grad_norm": 0.8268043398857117, "learning_rate": 9.866543333847744e-05, "loss": 0.975762939453125, "memory(GiB)": 89.65, "step": 18500, "token_acc": 0.7333333333333333, "train_speed(iter/s)": 0.264833 }, { "epoch": 0.2401149645277339, "grad_norm": 0.81349778175354, "learning_rate": 9.866420207742552e-05, "loss": 1.028448486328125, "memory(GiB)": 89.65, "step": 18505, "token_acc": 0.7384047786366831, "train_speed(iter/s)": 0.264757 }, { "epoch": 0.2401798429293896, "grad_norm": 0.8231828808784485, "learning_rate": 9.866297025634909e-05, "loss": 1.024376106262207, "memory(GiB)": 89.65, "step": 18510, "token_acc": 0.7312329044676324, "train_speed(iter/s)": 0.26467 }, { "epoch": 0.2402447213310453, "grad_norm": 0.798740029335022, "learning_rate": 9.866173787526231e-05, "loss": 0.9829652786254883, "memory(GiB)": 89.65, "step": 18515, "token_acc": 0.7382843120548038, "train_speed(iter/s)": 0.264588 }, { "epoch": 0.240309599732701, "grad_norm": 0.8134321570396423, "learning_rate": 9.866050493417941e-05, "loss": 1.0093151092529298, "memory(GiB)": 89.65, "step": 18520, "token_acc": 0.7500397519478454, "train_speed(iter/s)": 0.264503 }, { "epoch": 0.2403744781343567, "grad_norm": 0.9366515278816223, "learning_rate": 9.865927143311454e-05, "loss": 0.9699275970458985, "memory(GiB)": 89.65, "step": 18525, "token_acc": 0.7618624714539457, "train_speed(iter/s)": 0.264419 }, { "epoch": 0.24043935653601237, "grad_norm": 0.7739557027816772, "learning_rate": 9.86580373720819e-05, "loss": 0.9352416038513184, "memory(GiB)": 91.52, "step": 18530, "token_acc": 0.734752084247477, "train_speed(iter/s)": 0.264329 }, { "epoch": 0.24050423493766807, "grad_norm": 0.7596821188926697, "learning_rate": 9.865680275109571e-05, "loss": 0.92788724899292, "memory(GiB)": 91.52, "step": 18535, "token_acc": 0.742213768977346, "train_speed(iter/s)": 0.264234 }, { "epoch": 0.24056911333932376, "grad_norm": 0.8763746619224548, "learning_rate": 9.865556757017015e-05, "loss": 1.0291788101196289, "memory(GiB)": 91.52, "step": 18540, "token_acc": 0.7355983772819472, "train_speed(iter/s)": 0.264154 }, { "epoch": 0.24063399174097946, "grad_norm": 0.8408603072166443, "learning_rate": 9.865433182931947e-05, "loss": 0.9535184860229492, "memory(GiB)": 91.52, "step": 18545, "token_acc": 0.7377856245543748, "train_speed(iter/s)": 0.264068 }, { "epoch": 0.24069887014263516, "grad_norm": 0.8234400153160095, "learning_rate": 9.865309552855786e-05, "loss": 0.9809211730957031, "memory(GiB)": 91.52, "step": 18550, "token_acc": 0.7334865669392215, "train_speed(iter/s)": 0.263984 }, { "epoch": 0.24076374854429086, "grad_norm": 0.9996107816696167, "learning_rate": 9.865185866789956e-05, "loss": 0.9985074996948242, "memory(GiB)": 91.52, "step": 18555, "token_acc": 0.7471632574295639, "train_speed(iter/s)": 0.263902 }, { "epoch": 0.24082862694594656, "grad_norm": 0.8357856273651123, "learning_rate": 9.86506212473588e-05, "loss": 1.0140938758850098, "memory(GiB)": 91.52, "step": 18560, "token_acc": 0.7286265459921393, "train_speed(iter/s)": 0.263821 }, { "epoch": 0.24089350534760226, "grad_norm": 0.7881413698196411, "learning_rate": 9.864938326694982e-05, "loss": 0.9486724853515625, "memory(GiB)": 91.52, "step": 18565, "token_acc": 0.726463418942311, "train_speed(iter/s)": 0.26374 }, { "epoch": 0.24095838374925796, "grad_norm": 0.8424237966537476, "learning_rate": 9.864814472668686e-05, "loss": 0.9663345336914062, "memory(GiB)": 91.52, "step": 18570, "token_acc": 0.7799005096591083, "train_speed(iter/s)": 0.26366 }, { "epoch": 0.24102326215091366, "grad_norm": 0.7732756733894348, "learning_rate": 9.86469056265842e-05, "loss": 0.9545061111450195, "memory(GiB)": 91.52, "step": 18575, "token_acc": 0.7414171190279326, "train_speed(iter/s)": 0.26358 }, { "epoch": 0.24108814055256936, "grad_norm": 0.8768664002418518, "learning_rate": 9.864566596665608e-05, "loss": 0.9849550247192382, "memory(GiB)": 91.52, "step": 18580, "token_acc": 0.7376822185468672, "train_speed(iter/s)": 0.263497 }, { "epoch": 0.24115301895422506, "grad_norm": 0.8601141571998596, "learning_rate": 9.864442574691675e-05, "loss": 0.9648020744323731, "memory(GiB)": 91.52, "step": 18585, "token_acc": 0.7409116000315432, "train_speed(iter/s)": 0.263419 }, { "epoch": 0.24121789735588073, "grad_norm": 0.8191564679145813, "learning_rate": 9.86431849673805e-05, "loss": 0.9736324310302734, "memory(GiB)": 91.52, "step": 18590, "token_acc": 0.7236728496144341, "train_speed(iter/s)": 0.263335 }, { "epoch": 0.24128277575753643, "grad_norm": 0.7919169068336487, "learning_rate": 9.864194362806163e-05, "loss": 0.9754151344299317, "memory(GiB)": 91.52, "step": 18595, "token_acc": 0.7398559702648289, "train_speed(iter/s)": 0.263249 }, { "epoch": 0.24134765415919213, "grad_norm": 0.8517628908157349, "learning_rate": 9.864070172897438e-05, "loss": 0.9605453491210938, "memory(GiB)": 91.52, "step": 18600, "token_acc": 0.7537545410284259, "train_speed(iter/s)": 0.263159 }, { "epoch": 0.24141253256084783, "grad_norm": 0.7301941514015198, "learning_rate": 9.863945927013308e-05, "loss": 0.9342527389526367, "memory(GiB)": 91.52, "step": 18605, "token_acc": 0.7698483763132761, "train_speed(iter/s)": 0.263076 }, { "epoch": 0.24147741096250352, "grad_norm": 0.8228328227996826, "learning_rate": 9.863821625155201e-05, "loss": 0.9618989944458007, "memory(GiB)": 91.52, "step": 18610, "token_acc": 0.7390682468328565, "train_speed(iter/s)": 0.262986 }, { "epoch": 0.24154228936415922, "grad_norm": 0.7739117741584778, "learning_rate": 9.863697267324548e-05, "loss": 0.9963807106018067, "memory(GiB)": 91.52, "step": 18615, "token_acc": 0.7337820893552431, "train_speed(iter/s)": 0.262902 }, { "epoch": 0.24160716776581492, "grad_norm": 0.8503333926200867, "learning_rate": 9.863572853522778e-05, "loss": 0.9336765289306641, "memory(GiB)": 91.52, "step": 18620, "token_acc": 0.7349160743989113, "train_speed(iter/s)": 0.262817 }, { "epoch": 0.24167204616747062, "grad_norm": 0.9349395632743835, "learning_rate": 9.863448383751326e-05, "loss": 0.9671969413757324, "memory(GiB)": 91.52, "step": 18625, "token_acc": 0.7624820062125919, "train_speed(iter/s)": 0.262736 }, { "epoch": 0.24173692456912632, "grad_norm": 0.8779591917991638, "learning_rate": 9.863323858011624e-05, "loss": 1.0109676361083983, "memory(GiB)": 91.52, "step": 18630, "token_acc": 0.7353549545330367, "train_speed(iter/s)": 0.26266 }, { "epoch": 0.24180180297078202, "grad_norm": 0.8529767990112305, "learning_rate": 9.863199276305102e-05, "loss": 1.0087150573730468, "memory(GiB)": 91.52, "step": 18635, "token_acc": 0.7164746499088406, "train_speed(iter/s)": 0.262575 }, { "epoch": 0.24186668137243772, "grad_norm": 0.9300409555435181, "learning_rate": 9.863074638633197e-05, "loss": 0.9880202293395997, "memory(GiB)": 91.52, "step": 18640, "token_acc": 0.7426616310810306, "train_speed(iter/s)": 0.262496 }, { "epoch": 0.24193155977409342, "grad_norm": 0.8357877731323242, "learning_rate": 9.862949944997341e-05, "loss": 1.0121443748474122, "memory(GiB)": 91.52, "step": 18645, "token_acc": 0.7345098913701379, "train_speed(iter/s)": 0.262409 }, { "epoch": 0.2419964381757491, "grad_norm": 0.8258823752403259, "learning_rate": 9.862825195398973e-05, "loss": 0.9874334335327148, "memory(GiB)": 91.52, "step": 18650, "token_acc": 0.7353573907009021, "train_speed(iter/s)": 0.262331 }, { "epoch": 0.2420613165774048, "grad_norm": 0.8479883670806885, "learning_rate": 9.862700389839523e-05, "loss": 0.9945347785949707, "memory(GiB)": 91.52, "step": 18655, "token_acc": 0.7441370126507361, "train_speed(iter/s)": 0.262244 }, { "epoch": 0.2421261949790605, "grad_norm": 0.7912545204162598, "learning_rate": 9.86257552832043e-05, "loss": 0.9496955871582031, "memory(GiB)": 91.52, "step": 18660, "token_acc": 0.7581632653061224, "train_speed(iter/s)": 0.262156 }, { "epoch": 0.2421910733807162, "grad_norm": 0.7467861175537109, "learning_rate": 9.86245061084313e-05, "loss": 0.9364703178405762, "memory(GiB)": 91.52, "step": 18665, "token_acc": 0.7510755220599781, "train_speed(iter/s)": 0.262074 }, { "epoch": 0.2422559517823719, "grad_norm": 0.7935744524002075, "learning_rate": 9.862325637409064e-05, "loss": 0.9420627593994141, "memory(GiB)": 91.52, "step": 18670, "token_acc": 0.7279235409550112, "train_speed(iter/s)": 0.261997 }, { "epoch": 0.24232083018402759, "grad_norm": 0.8530329465866089, "learning_rate": 9.862200608019665e-05, "loss": 0.993359375, "memory(GiB)": 91.52, "step": 18675, "token_acc": 0.7368996960486323, "train_speed(iter/s)": 0.261917 }, { "epoch": 0.24238570858568328, "grad_norm": 0.9504438042640686, "learning_rate": 9.862075522676375e-05, "loss": 1.0112954139709474, "memory(GiB)": 91.52, "step": 18680, "token_acc": 0.7318387583530933, "train_speed(iter/s)": 0.261836 }, { "epoch": 0.24245058698733898, "grad_norm": 0.8483529090881348, "learning_rate": 9.861950381380633e-05, "loss": 0.9975545883178711, "memory(GiB)": 91.52, "step": 18685, "token_acc": 0.7367536791014592, "train_speed(iter/s)": 0.261748 }, { "epoch": 0.24251546538899468, "grad_norm": 0.9186041355133057, "learning_rate": 9.86182518413388e-05, "loss": 1.0125215530395508, "memory(GiB)": 91.52, "step": 18690, "token_acc": 0.7263309641947163, "train_speed(iter/s)": 0.261667 }, { "epoch": 0.24258034379065038, "grad_norm": 0.7762837409973145, "learning_rate": 9.861699930937553e-05, "loss": 0.9689929008483886, "memory(GiB)": 91.52, "step": 18695, "token_acc": 0.7379250520471895, "train_speed(iter/s)": 0.261573 }, { "epoch": 0.24264522219230608, "grad_norm": 0.806169331073761, "learning_rate": 9.8615746217931e-05, "loss": 1.0041091918945313, "memory(GiB)": 91.52, "step": 18700, "token_acc": 0.7362850800697638, "train_speed(iter/s)": 0.26149 }, { "epoch": 0.24271010059396178, "grad_norm": 0.800456702709198, "learning_rate": 9.861449256701956e-05, "loss": 0.9385133743286133, "memory(GiB)": 91.52, "step": 18705, "token_acc": 0.7164110630488923, "train_speed(iter/s)": 0.261417 }, { "epoch": 0.24277497899561745, "grad_norm": 0.8488313555717468, "learning_rate": 9.861323835665567e-05, "loss": 0.978923225402832, "memory(GiB)": 91.52, "step": 18710, "token_acc": 0.7267101460016305, "train_speed(iter/s)": 0.261336 }, { "epoch": 0.24283985739727315, "grad_norm": 0.8537747859954834, "learning_rate": 9.861198358685378e-05, "loss": 0.9983505249023438, "memory(GiB)": 91.52, "step": 18715, "token_acc": 0.7562153576869469, "train_speed(iter/s)": 0.261261 }, { "epoch": 0.24290473579892885, "grad_norm": 0.7921149134635925, "learning_rate": 9.86107282576283e-05, "loss": 0.9776409149169922, "memory(GiB)": 91.52, "step": 18720, "token_acc": 0.7427660617949976, "train_speed(iter/s)": 0.261178 }, { "epoch": 0.24296961420058455, "grad_norm": 0.8030125498771667, "learning_rate": 9.860947236899368e-05, "loss": 0.9817221641540528, "memory(GiB)": 91.52, "step": 18725, "token_acc": 0.751393899639226, "train_speed(iter/s)": 0.261096 }, { "epoch": 0.24303449260224025, "grad_norm": 0.802825927734375, "learning_rate": 9.860821592096439e-05, "loss": 0.9871999740600585, "memory(GiB)": 91.52, "step": 18730, "token_acc": 0.7453459665043037, "train_speed(iter/s)": 0.261018 }, { "epoch": 0.24309937100389595, "grad_norm": 0.9074850678443909, "learning_rate": 9.860695891355488e-05, "loss": 0.9605493545532227, "memory(GiB)": 91.52, "step": 18735, "token_acc": 0.7381099991143388, "train_speed(iter/s)": 0.260941 }, { "epoch": 0.24316424940555165, "grad_norm": 0.8013137578964233, "learning_rate": 9.860570134677963e-05, "loss": 0.9811981201171875, "memory(GiB)": 91.52, "step": 18740, "token_acc": 0.7402849080338626, "train_speed(iter/s)": 0.260856 }, { "epoch": 0.24322912780720735, "grad_norm": 0.8639530539512634, "learning_rate": 9.860444322065307e-05, "loss": 0.9578632354736328, "memory(GiB)": 91.52, "step": 18745, "token_acc": 0.7636765518511697, "train_speed(iter/s)": 0.26078 }, { "epoch": 0.24329400620886305, "grad_norm": 0.7152777314186096, "learning_rate": 9.86031845351897e-05, "loss": 0.9094611167907715, "memory(GiB)": 91.52, "step": 18750, "token_acc": 0.777217041506674, "train_speed(iter/s)": 0.260692 }, { "epoch": 0.24335888461051874, "grad_norm": 0.85276198387146, "learning_rate": 9.860192529040404e-05, "loss": 0.9595471382141113, "memory(GiB)": 91.52, "step": 18755, "token_acc": 0.7312557416437001, "train_speed(iter/s)": 0.260619 }, { "epoch": 0.24342376301217444, "grad_norm": 0.772159218788147, "learning_rate": 9.860066548631053e-05, "loss": 0.9798797607421875, "memory(GiB)": 91.52, "step": 18760, "token_acc": 0.7396025144453616, "train_speed(iter/s)": 0.260544 }, { "epoch": 0.24348864141383014, "grad_norm": 0.9055817127227783, "learning_rate": 9.85994051229237e-05, "loss": 0.9758199691772461, "memory(GiB)": 91.52, "step": 18765, "token_acc": 0.7270924533243537, "train_speed(iter/s)": 0.26047 }, { "epoch": 0.2435535198154858, "grad_norm": 0.8989699482917786, "learning_rate": 9.859814420025805e-05, "loss": 1.0444473266601562, "memory(GiB)": 91.52, "step": 18770, "token_acc": 0.7405500087581013, "train_speed(iter/s)": 0.260391 }, { "epoch": 0.2436183982171415, "grad_norm": 0.8841001987457275, "learning_rate": 9.859688271832808e-05, "loss": 0.9656167984008789, "memory(GiB)": 91.52, "step": 18775, "token_acc": 0.748438470248749, "train_speed(iter/s)": 0.260316 }, { "epoch": 0.2436832766187972, "grad_norm": 0.8758032917976379, "learning_rate": 9.85956206771483e-05, "loss": 0.9724338531494141, "memory(GiB)": 91.52, "step": 18780, "token_acc": 0.7442415156058334, "train_speed(iter/s)": 0.260235 }, { "epoch": 0.2437481550204529, "grad_norm": 0.8293058276176453, "learning_rate": 9.859435807673325e-05, "loss": 0.9885763168334961, "memory(GiB)": 91.52, "step": 18785, "token_acc": 0.7263349120871991, "train_speed(iter/s)": 0.260158 }, { "epoch": 0.2438130334221086, "grad_norm": 0.8364304304122925, "learning_rate": 9.859309491709747e-05, "loss": 0.9838298797607422, "memory(GiB)": 91.52, "step": 18790, "token_acc": 0.7324987311791575, "train_speed(iter/s)": 0.260076 }, { "epoch": 0.2438779118237643, "grad_norm": 0.8356867432594299, "learning_rate": 9.859183119825545e-05, "loss": 0.9850551605224609, "memory(GiB)": 91.52, "step": 18795, "token_acc": 0.741766000065675, "train_speed(iter/s)": 0.259994 }, { "epoch": 0.24394279022542, "grad_norm": 0.7935364246368408, "learning_rate": 9.859056692022179e-05, "loss": 1.0100942611694337, "memory(GiB)": 91.52, "step": 18800, "token_acc": 0.7308244086344377, "train_speed(iter/s)": 0.259917 }, { "epoch": 0.2440076686270757, "grad_norm": 0.917024552822113, "learning_rate": 9.8589302083011e-05, "loss": 0.9667928695678711, "memory(GiB)": 91.52, "step": 18805, "token_acc": 0.7416666666666667, "train_speed(iter/s)": 0.259845 }, { "epoch": 0.2440725470287314, "grad_norm": 0.8364055156707764, "learning_rate": 9.858803668663765e-05, "loss": 0.9572595596313477, "memory(GiB)": 91.52, "step": 18810, "token_acc": 0.7486077033921408, "train_speed(iter/s)": 0.259772 }, { "epoch": 0.2441374254303871, "grad_norm": 0.8091124296188354, "learning_rate": 9.85867707311163e-05, "loss": 0.9712382316589355, "memory(GiB)": 91.52, "step": 18815, "token_acc": 0.7348151614412728, "train_speed(iter/s)": 0.259699 }, { "epoch": 0.2442023038320428, "grad_norm": 0.7865084409713745, "learning_rate": 9.858550421646153e-05, "loss": 1.0014291763305665, "memory(GiB)": 91.52, "step": 18820, "token_acc": 0.7301640865567132, "train_speed(iter/s)": 0.259619 }, { "epoch": 0.2442671822336985, "grad_norm": 0.8503217697143555, "learning_rate": 9.858423714268791e-05, "loss": 1.03233642578125, "memory(GiB)": 91.52, "step": 18825, "token_acc": 0.7392757660167131, "train_speed(iter/s)": 0.259546 }, { "epoch": 0.24433206063535418, "grad_norm": 0.8130428194999695, "learning_rate": 9.858296950981001e-05, "loss": 0.9284585952758789, "memory(GiB)": 91.52, "step": 18830, "token_acc": 0.7456770817670586, "train_speed(iter/s)": 0.259466 }, { "epoch": 0.24439693903700987, "grad_norm": 0.8383827209472656, "learning_rate": 9.85817013178424e-05, "loss": 0.9784183502197266, "memory(GiB)": 91.52, "step": 18835, "token_acc": 0.7577431052263881, "train_speed(iter/s)": 0.259389 }, { "epoch": 0.24446181743866557, "grad_norm": 0.8032699823379517, "learning_rate": 9.858043256679972e-05, "loss": 0.9359640121459961, "memory(GiB)": 91.52, "step": 18840, "token_acc": 0.7417289220917823, "train_speed(iter/s)": 0.259307 }, { "epoch": 0.24452669584032127, "grad_norm": 0.9251989722251892, "learning_rate": 9.857916325669655e-05, "loss": 0.9586177825927734, "memory(GiB)": 91.52, "step": 18845, "token_acc": 0.7628283445326818, "train_speed(iter/s)": 0.259231 }, { "epoch": 0.24459157424197697, "grad_norm": 0.8808385133743286, "learning_rate": 9.85778933875475e-05, "loss": 1.034927749633789, "memory(GiB)": 91.52, "step": 18850, "token_acc": 0.7491173489006188, "train_speed(iter/s)": 0.259159 }, { "epoch": 0.24465645264363267, "grad_norm": 0.7738673686981201, "learning_rate": 9.857662295936716e-05, "loss": 0.9585190773010254, "memory(GiB)": 91.52, "step": 18855, "token_acc": 0.7518256460706043, "train_speed(iter/s)": 0.259077 }, { "epoch": 0.24472133104528837, "grad_norm": 0.8030029535293579, "learning_rate": 9.857535197217018e-05, "loss": 0.9536224365234375, "memory(GiB)": 91.52, "step": 18860, "token_acc": 0.738637590413473, "train_speed(iter/s)": 0.259002 }, { "epoch": 0.24478620944694407, "grad_norm": 0.8757067322731018, "learning_rate": 9.857408042597118e-05, "loss": 0.9872812271118164, "memory(GiB)": 91.52, "step": 18865, "token_acc": 0.746985864745011, "train_speed(iter/s)": 0.258915 }, { "epoch": 0.24485108784859977, "grad_norm": 0.6990556120872498, "learning_rate": 9.857280832078477e-05, "loss": 0.9652992248535156, "memory(GiB)": 91.52, "step": 18870, "token_acc": 0.7421208427651054, "train_speed(iter/s)": 0.258837 }, { "epoch": 0.24491596625025547, "grad_norm": 0.820277988910675, "learning_rate": 9.857153565662562e-05, "loss": 0.9957365036010742, "memory(GiB)": 91.52, "step": 18875, "token_acc": 0.7449259853754235, "train_speed(iter/s)": 0.258762 }, { "epoch": 0.24498084465191117, "grad_norm": 0.8797391653060913, "learning_rate": 9.857026243350838e-05, "loss": 1.0003297805786133, "memory(GiB)": 91.52, "step": 18880, "token_acc": 0.7301352585569522, "train_speed(iter/s)": 0.258687 }, { "epoch": 0.24504572305356684, "grad_norm": 0.7896788716316223, "learning_rate": 9.856898865144766e-05, "loss": 0.9980606079101563, "memory(GiB)": 91.52, "step": 18885, "token_acc": 0.7296873435883746, "train_speed(iter/s)": 0.258603 }, { "epoch": 0.24511060145522254, "grad_norm": 0.906220555305481, "learning_rate": 9.856771431045817e-05, "loss": 0.9431648254394531, "memory(GiB)": 91.52, "step": 18890, "token_acc": 0.7495685787257541, "train_speed(iter/s)": 0.258521 }, { "epoch": 0.24517547985687824, "grad_norm": 0.8826572895050049, "learning_rate": 9.856643941055452e-05, "loss": 0.9972901344299316, "memory(GiB)": 91.52, "step": 18895, "token_acc": 0.7415608965703484, "train_speed(iter/s)": 0.258445 }, { "epoch": 0.24524035825853394, "grad_norm": 0.7360451221466064, "learning_rate": 9.856516395175144e-05, "loss": 0.9351289749145508, "memory(GiB)": 91.52, "step": 18900, "token_acc": 0.7427733948673173, "train_speed(iter/s)": 0.25837 }, { "epoch": 0.24530523666018964, "grad_norm": 0.8637291193008423, "learning_rate": 9.856388793406355e-05, "loss": 0.9962314605712891, "memory(GiB)": 91.52, "step": 18905, "token_acc": 0.7569832402234636, "train_speed(iter/s)": 0.258295 }, { "epoch": 0.24537011506184533, "grad_norm": 0.778425931930542, "learning_rate": 9.856261135750559e-05, "loss": 0.9740610122680664, "memory(GiB)": 91.52, "step": 18910, "token_acc": 0.7655451939236169, "train_speed(iter/s)": 0.258214 }, { "epoch": 0.24543499346350103, "grad_norm": 0.8496865630149841, "learning_rate": 9.856133422209222e-05, "loss": 0.9628773689270019, "memory(GiB)": 91.52, "step": 18915, "token_acc": 0.7400332225913622, "train_speed(iter/s)": 0.258143 }, { "epoch": 0.24549987186515673, "grad_norm": 0.8730642199516296, "learning_rate": 9.856005652783813e-05, "loss": 1.0150303840637207, "memory(GiB)": 91.52, "step": 18920, "token_acc": 0.7418580733630442, "train_speed(iter/s)": 0.258071 }, { "epoch": 0.24556475026681243, "grad_norm": 0.7721337080001831, "learning_rate": 9.855877827475804e-05, "loss": 0.9531678199768067, "memory(GiB)": 91.52, "step": 18925, "token_acc": 0.7562634551184051, "train_speed(iter/s)": 0.257993 }, { "epoch": 0.24562962866846813, "grad_norm": 0.7887148261070251, "learning_rate": 9.855749946286664e-05, "loss": 0.9909093856811524, "memory(GiB)": 91.52, "step": 18930, "token_acc": 0.7280920906992189, "train_speed(iter/s)": 0.257915 }, { "epoch": 0.24569450707012383, "grad_norm": 0.7813959121704102, "learning_rate": 9.855622009217869e-05, "loss": 0.9611472129821778, "memory(GiB)": 91.52, "step": 18935, "token_acc": 0.7470850776735979, "train_speed(iter/s)": 0.257829 }, { "epoch": 0.24575938547177953, "grad_norm": 1.0031553506851196, "learning_rate": 9.855494016270885e-05, "loss": 0.9718607902526856, "memory(GiB)": 91.52, "step": 18940, "token_acc": 0.7324274511221134, "train_speed(iter/s)": 0.257757 }, { "epoch": 0.2458242638734352, "grad_norm": 0.8881062865257263, "learning_rate": 9.855365967447191e-05, "loss": 0.950715160369873, "memory(GiB)": 91.52, "step": 18945, "token_acc": 0.7563827075062826, "train_speed(iter/s)": 0.257678 }, { "epoch": 0.2458891422750909, "grad_norm": 0.9704944491386414, "learning_rate": 9.855237862748257e-05, "loss": 1.0325806617736817, "memory(GiB)": 91.52, "step": 18950, "token_acc": 0.7202853143123792, "train_speed(iter/s)": 0.257603 }, { "epoch": 0.2459540206767466, "grad_norm": 0.8761231303215027, "learning_rate": 9.855109702175558e-05, "loss": 0.9684654235839844, "memory(GiB)": 91.52, "step": 18955, "token_acc": 0.7472980714149322, "train_speed(iter/s)": 0.257521 }, { "epoch": 0.2460188990784023, "grad_norm": 0.8557912707328796, "learning_rate": 9.85498148573057e-05, "loss": 0.981406307220459, "memory(GiB)": 91.52, "step": 18960, "token_acc": 0.7601598401598402, "train_speed(iter/s)": 0.257443 }, { "epoch": 0.246083777480058, "grad_norm": 0.8732818961143494, "learning_rate": 9.854853213414766e-05, "loss": 0.970220947265625, "memory(GiB)": 91.52, "step": 18965, "token_acc": 0.7359142486944913, "train_speed(iter/s)": 0.257368 }, { "epoch": 0.2461486558817137, "grad_norm": 0.7731190919876099, "learning_rate": 9.854724885229625e-05, "loss": 0.9521804809570312, "memory(GiB)": 91.52, "step": 18970, "token_acc": 0.7614737288734884, "train_speed(iter/s)": 0.257295 }, { "epoch": 0.2462135342833694, "grad_norm": 0.8368595242500305, "learning_rate": 9.854596501176623e-05, "loss": 0.9595155715942383, "memory(GiB)": 91.52, "step": 18975, "token_acc": 0.7337246963562754, "train_speed(iter/s)": 0.257218 }, { "epoch": 0.2462784126850251, "grad_norm": 0.7883833646774292, "learning_rate": 9.854468061257235e-05, "loss": 0.9835950851440429, "memory(GiB)": 91.52, "step": 18980, "token_acc": 0.7410925626012209, "train_speed(iter/s)": 0.25714 }, { "epoch": 0.2463432910866808, "grad_norm": 0.9015264511108398, "learning_rate": 9.854339565472942e-05, "loss": 1.0102005004882812, "memory(GiB)": 91.52, "step": 18985, "token_acc": 0.7481627296587926, "train_speed(iter/s)": 0.257073 }, { "epoch": 0.2464081694883365, "grad_norm": 0.8502921462059021, "learning_rate": 9.85421101382522e-05, "loss": 0.9335282325744629, "memory(GiB)": 91.52, "step": 18990, "token_acc": 0.7466373550085331, "train_speed(iter/s)": 0.256994 }, { "epoch": 0.2464730478899922, "grad_norm": 0.9127014875411987, "learning_rate": 9.854082406315554e-05, "loss": 1.0229724884033202, "memory(GiB)": 91.52, "step": 18995, "token_acc": 0.7576951672862453, "train_speed(iter/s)": 0.256925 }, { "epoch": 0.2465379262916479, "grad_norm": 0.8906004428863525, "learning_rate": 9.853953742945417e-05, "loss": 0.9034536361694336, "memory(GiB)": 91.52, "step": 19000, "token_acc": 0.7366845346715328, "train_speed(iter/s)": 0.25685 }, { "epoch": 0.24660280469330356, "grad_norm": 0.7337106466293335, "learning_rate": 9.853825023716294e-05, "loss": 0.9847560882568359, "memory(GiB)": 91.52, "step": 19005, "token_acc": 0.7364935746131654, "train_speed(iter/s)": 0.256782 }, { "epoch": 0.24666768309495926, "grad_norm": 0.7869563102722168, "learning_rate": 9.853696248629664e-05, "loss": 0.9792670249938965, "memory(GiB)": 91.52, "step": 19010, "token_acc": 0.7357063879873025, "train_speed(iter/s)": 0.25671 }, { "epoch": 0.24673256149661496, "grad_norm": 0.8830661177635193, "learning_rate": 9.853567417687011e-05, "loss": 0.97476167678833, "memory(GiB)": 91.52, "step": 19015, "token_acc": 0.7189271784148941, "train_speed(iter/s)": 0.256649 }, { "epoch": 0.24679743989827066, "grad_norm": 0.7864252924919128, "learning_rate": 9.853438530889815e-05, "loss": 0.9338842391967773, "memory(GiB)": 91.52, "step": 19020, "token_acc": 0.7552230635890886, "train_speed(iter/s)": 0.256575 }, { "epoch": 0.24686231829992636, "grad_norm": 0.7865648865699768, "learning_rate": 9.853309588239562e-05, "loss": 0.9635520935058594, "memory(GiB)": 91.52, "step": 19025, "token_acc": 0.7515176042088223, "train_speed(iter/s)": 0.25649 }, { "epoch": 0.24692719670158206, "grad_norm": 0.8777707815170288, "learning_rate": 9.853180589737734e-05, "loss": 0.9826797485351563, "memory(GiB)": 91.52, "step": 19030, "token_acc": 0.7658152750889902, "train_speed(iter/s)": 0.256413 }, { "epoch": 0.24699207510323776, "grad_norm": 0.776792585849762, "learning_rate": 9.853051535385816e-05, "loss": 0.9811770439147949, "memory(GiB)": 91.52, "step": 19035, "token_acc": 0.7568425662360412, "train_speed(iter/s)": 0.256335 }, { "epoch": 0.24705695350489346, "grad_norm": 0.7814787030220032, "learning_rate": 9.852922425185293e-05, "loss": 0.9469558715820312, "memory(GiB)": 91.52, "step": 19040, "token_acc": 0.7530998084083432, "train_speed(iter/s)": 0.256261 }, { "epoch": 0.24712183190654916, "grad_norm": 0.7649329900741577, "learning_rate": 9.852793259137653e-05, "loss": 0.9949789047241211, "memory(GiB)": 91.52, "step": 19045, "token_acc": 0.7350249143018694, "train_speed(iter/s)": 0.256188 }, { "epoch": 0.24718671030820485, "grad_norm": 0.8588359355926514, "learning_rate": 9.852664037244379e-05, "loss": 0.9378047943115234, "memory(GiB)": 91.52, "step": 19050, "token_acc": 0.7618373557930498, "train_speed(iter/s)": 0.256119 }, { "epoch": 0.24725158870986055, "grad_norm": 0.7439168095588684, "learning_rate": 9.852534759506961e-05, "loss": 0.9639114379882813, "memory(GiB)": 91.52, "step": 19055, "token_acc": 0.7429399498109073, "train_speed(iter/s)": 0.25605 }, { "epoch": 0.24731646711151625, "grad_norm": 0.8817676901817322, "learning_rate": 9.852405425926882e-05, "loss": 0.9717471122741699, "memory(GiB)": 91.52, "step": 19060, "token_acc": 0.7662217278457546, "train_speed(iter/s)": 0.255974 }, { "epoch": 0.24738134551317192, "grad_norm": 0.8138179779052734, "learning_rate": 9.852276036505636e-05, "loss": 0.9854951858520508, "memory(GiB)": 91.52, "step": 19065, "token_acc": 0.7445599044136283, "train_speed(iter/s)": 0.255901 }, { "epoch": 0.24744622391482762, "grad_norm": 0.8484119176864624, "learning_rate": 9.85214659124471e-05, "loss": 0.976534652709961, "memory(GiB)": 91.52, "step": 19070, "token_acc": 0.7592926625008053, "train_speed(iter/s)": 0.255827 }, { "epoch": 0.24751110231648332, "grad_norm": 0.816222608089447, "learning_rate": 9.852017090145594e-05, "loss": 1.0002431869506836, "memory(GiB)": 91.52, "step": 19075, "token_acc": 0.7354662234687165, "train_speed(iter/s)": 0.255755 }, { "epoch": 0.24757598071813902, "grad_norm": 0.8288376927375793, "learning_rate": 9.851887533209776e-05, "loss": 0.9530854225158691, "memory(GiB)": 91.52, "step": 19080, "token_acc": 0.74943741209564, "train_speed(iter/s)": 0.255682 }, { "epoch": 0.24764085911979472, "grad_norm": 0.7619239091873169, "learning_rate": 9.851757920438749e-05, "loss": 0.9827866554260254, "memory(GiB)": 91.52, "step": 19085, "token_acc": 0.7370415155179363, "train_speed(iter/s)": 0.255614 }, { "epoch": 0.24770573752145042, "grad_norm": 0.8778370022773743, "learning_rate": 9.851628251834004e-05, "loss": 1.0117389678955078, "memory(GiB)": 91.52, "step": 19090, "token_acc": 0.7370356037151703, "train_speed(iter/s)": 0.255541 }, { "epoch": 0.24777061592310612, "grad_norm": 0.8296106457710266, "learning_rate": 9.851498527397033e-05, "loss": 0.9859077453613281, "memory(GiB)": 91.52, "step": 19095, "token_acc": 0.7536247924737134, "train_speed(iter/s)": 0.255473 }, { "epoch": 0.24783549432476182, "grad_norm": 0.9402297139167786, "learning_rate": 9.851368747129331e-05, "loss": 0.9693698883056641, "memory(GiB)": 91.52, "step": 19100, "token_acc": 0.7270648756787653, "train_speed(iter/s)": 0.255404 }, { "epoch": 0.24790037272641752, "grad_norm": 0.7812699675559998, "learning_rate": 9.851238911032387e-05, "loss": 0.9778465270996094, "memory(GiB)": 91.52, "step": 19105, "token_acc": 0.7538622777734648, "train_speed(iter/s)": 0.255328 }, { "epoch": 0.24796525112807322, "grad_norm": 0.7909954786300659, "learning_rate": 9.851109019107698e-05, "loss": 0.9610834121704102, "memory(GiB)": 91.52, "step": 19110, "token_acc": 0.7684300117847322, "train_speed(iter/s)": 0.255253 }, { "epoch": 0.24803012952972892, "grad_norm": 0.9128926396369934, "learning_rate": 9.850979071356761e-05, "loss": 0.9681051254272461, "memory(GiB)": 91.52, "step": 19115, "token_acc": 0.7616259042369962, "train_speed(iter/s)": 0.255185 }, { "epoch": 0.24809500793138461, "grad_norm": 0.7777133584022522, "learning_rate": 9.850849067781067e-05, "loss": 0.9728170394897461, "memory(GiB)": 91.52, "step": 19120, "token_acc": 0.7446425715656485, "train_speed(iter/s)": 0.255119 }, { "epoch": 0.24815988633304029, "grad_norm": 0.7953392863273621, "learning_rate": 9.850719008382114e-05, "loss": 1.0001127243041992, "memory(GiB)": 91.52, "step": 19125, "token_acc": 0.7518418939208771, "train_speed(iter/s)": 0.255043 }, { "epoch": 0.24822476473469599, "grad_norm": 0.7956090569496155, "learning_rate": 9.850588893161401e-05, "loss": 0.971750545501709, "memory(GiB)": 91.52, "step": 19130, "token_acc": 0.7565315739868049, "train_speed(iter/s)": 0.254962 }, { "epoch": 0.24828964313635168, "grad_norm": 0.7791692614555359, "learning_rate": 9.850458722120422e-05, "loss": 0.9803807258605957, "memory(GiB)": 91.52, "step": 19135, "token_acc": 0.7383695688474768, "train_speed(iter/s)": 0.254885 }, { "epoch": 0.24835452153800738, "grad_norm": 0.8942127823829651, "learning_rate": 9.850328495260676e-05, "loss": 0.9776883125305176, "memory(GiB)": 91.52, "step": 19140, "token_acc": 0.7233837590980449, "train_speed(iter/s)": 0.254813 }, { "epoch": 0.24841939993966308, "grad_norm": 0.7649713158607483, "learning_rate": 9.850198212583661e-05, "loss": 0.9773937225341797, "memory(GiB)": 91.52, "step": 19145, "token_acc": 0.7609754412628233, "train_speed(iter/s)": 0.254746 }, { "epoch": 0.24848427834131878, "grad_norm": 0.8005204796791077, "learning_rate": 9.850067874090878e-05, "loss": 0.9601383209228516, "memory(GiB)": 91.52, "step": 19150, "token_acc": 0.7104449938195303, "train_speed(iter/s)": 0.254674 }, { "epoch": 0.24854915674297448, "grad_norm": 0.8195960521697998, "learning_rate": 9.849937479783827e-05, "loss": 0.9717693328857422, "memory(GiB)": 91.52, "step": 19155, "token_acc": 0.7519065510955694, "train_speed(iter/s)": 0.254604 }, { "epoch": 0.24861403514463018, "grad_norm": 0.7683772444725037, "learning_rate": 9.849807029664007e-05, "loss": 0.9670141220092774, "memory(GiB)": 91.52, "step": 19160, "token_acc": 0.7483956075299487, "train_speed(iter/s)": 0.254533 }, { "epoch": 0.24867891354628588, "grad_norm": 0.9171163439750671, "learning_rate": 9.849676523732919e-05, "loss": 0.9817670822143555, "memory(GiB)": 91.52, "step": 19165, "token_acc": 0.7205550737207286, "train_speed(iter/s)": 0.254458 }, { "epoch": 0.24874379194794158, "grad_norm": 0.8597093224525452, "learning_rate": 9.849545961992067e-05, "loss": 0.9452398300170899, "memory(GiB)": 91.52, "step": 19170, "token_acc": 0.7592938041305796, "train_speed(iter/s)": 0.254387 }, { "epoch": 0.24880867034959728, "grad_norm": 0.8048121333122253, "learning_rate": 9.849415344442952e-05, "loss": 0.9995682716369629, "memory(GiB)": 91.52, "step": 19175, "token_acc": 0.7627141508161338, "train_speed(iter/s)": 0.254313 }, { "epoch": 0.24887354875125298, "grad_norm": 0.8653323650360107, "learning_rate": 9.849284671087076e-05, "loss": 0.9753574371337891, "memory(GiB)": 91.52, "step": 19180, "token_acc": 0.7437263531773608, "train_speed(iter/s)": 0.254236 }, { "epoch": 0.24893842715290865, "grad_norm": 0.8381220698356628, "learning_rate": 9.849153941925944e-05, "loss": 0.9942594528198242, "memory(GiB)": 91.52, "step": 19185, "token_acc": 0.7468473302924604, "train_speed(iter/s)": 0.254169 }, { "epoch": 0.24900330555456435, "grad_norm": 0.812444269657135, "learning_rate": 9.849023156961061e-05, "loss": 0.9347949028015137, "memory(GiB)": 91.52, "step": 19190, "token_acc": 0.7439246880245202, "train_speed(iter/s)": 0.254096 }, { "epoch": 0.24906818395622005, "grad_norm": 0.8132603168487549, "learning_rate": 9.848892316193933e-05, "loss": 0.9770585060119629, "memory(GiB)": 91.52, "step": 19195, "token_acc": 0.7469020539806485, "train_speed(iter/s)": 0.254026 }, { "epoch": 0.24913306235787575, "grad_norm": 0.7938859462738037, "learning_rate": 9.848761419626063e-05, "loss": 0.9634374618530274, "memory(GiB)": 91.52, "step": 19200, "token_acc": 0.7597617471872932, "train_speed(iter/s)": 0.253943 }, { "epoch": 0.24919794075953144, "grad_norm": 0.8247929811477661, "learning_rate": 9.848630467258959e-05, "loss": 0.9846911430358887, "memory(GiB)": 91.52, "step": 19205, "token_acc": 0.7417563211247913, "train_speed(iter/s)": 0.253877 }, { "epoch": 0.24926281916118714, "grad_norm": 0.8373366594314575, "learning_rate": 9.848499459094128e-05, "loss": 0.9227493286132813, "memory(GiB)": 91.52, "step": 19210, "token_acc": 0.7634688784497945, "train_speed(iter/s)": 0.253803 }, { "epoch": 0.24932769756284284, "grad_norm": 0.922519326210022, "learning_rate": 9.848368395133076e-05, "loss": 0.9786969184875488, "memory(GiB)": 91.52, "step": 19215, "token_acc": 0.7441297329028471, "train_speed(iter/s)": 0.253735 }, { "epoch": 0.24939257596449854, "grad_norm": 0.8892744779586792, "learning_rate": 9.848237275377314e-05, "loss": 0.947629165649414, "memory(GiB)": 91.52, "step": 19220, "token_acc": 0.746244635193133, "train_speed(iter/s)": 0.253663 }, { "epoch": 0.24945745436615424, "grad_norm": 0.7946031093597412, "learning_rate": 9.848106099828349e-05, "loss": 0.9388744354248046, "memory(GiB)": 91.52, "step": 19225, "token_acc": 0.7650943052059392, "train_speed(iter/s)": 0.25359 }, { "epoch": 0.24952233276780994, "grad_norm": 0.8710043430328369, "learning_rate": 9.84797486848769e-05, "loss": 0.9345524787902832, "memory(GiB)": 91.52, "step": 19230, "token_acc": 0.7327877319137103, "train_speed(iter/s)": 0.253521 }, { "epoch": 0.24958721116946564, "grad_norm": 0.892522394657135, "learning_rate": 9.84784358135685e-05, "loss": 0.9563603401184082, "memory(GiB)": 91.52, "step": 19235, "token_acc": 0.7393622546134404, "train_speed(iter/s)": 0.253451 }, { "epoch": 0.24965208957112134, "grad_norm": 0.8155663013458252, "learning_rate": 9.847712238437336e-05, "loss": 1.0064095497131347, "memory(GiB)": 91.52, "step": 19240, "token_acc": 0.7564443498918555, "train_speed(iter/s)": 0.253378 }, { "epoch": 0.249716967972777, "grad_norm": 0.8286650776863098, "learning_rate": 9.847580839730664e-05, "loss": 0.9814332008361817, "memory(GiB)": 91.52, "step": 19245, "token_acc": 0.7479972198851373, "train_speed(iter/s)": 0.253313 }, { "epoch": 0.2497818463744327, "grad_norm": 0.9050952792167664, "learning_rate": 9.847449385238342e-05, "loss": 0.9776090621948242, "memory(GiB)": 91.52, "step": 19250, "token_acc": 0.7369171885873347, "train_speed(iter/s)": 0.253241 }, { "epoch": 0.2498467247760884, "grad_norm": 0.8090375065803528, "learning_rate": 9.847317874961886e-05, "loss": 0.9557350158691407, "memory(GiB)": 91.52, "step": 19255, "token_acc": 0.7487049263585577, "train_speed(iter/s)": 0.253169 }, { "epoch": 0.2499116031777441, "grad_norm": 0.8875361084938049, "learning_rate": 9.847186308902805e-05, "loss": 1.00674467086792, "memory(GiB)": 91.52, "step": 19260, "token_acc": 0.7379342091285719, "train_speed(iter/s)": 0.253104 }, { "epoch": 0.2499764815793998, "grad_norm": 0.8710760474205017, "learning_rate": 9.847054687062617e-05, "loss": 0.9865741729736328, "memory(GiB)": 91.52, "step": 19265, "token_acc": 0.7506157868808974, "train_speed(iter/s)": 0.253034 }, { "epoch": 0.25004135998105553, "grad_norm": 0.8704342842102051, "learning_rate": 9.846923009442837e-05, "loss": 0.9728250503540039, "memory(GiB)": 91.52, "step": 19270, "token_acc": 0.7446705112421435, "train_speed(iter/s)": 0.252968 }, { "epoch": 0.2501062383827112, "grad_norm": 0.8388590812683105, "learning_rate": 9.846791276044979e-05, "loss": 0.9827726364135743, "memory(GiB)": 91.52, "step": 19275, "token_acc": 0.7362013542165379, "train_speed(iter/s)": 0.252896 }, { "epoch": 0.2501711167843669, "grad_norm": 0.9239264726638794, "learning_rate": 9.846659486870558e-05, "loss": 0.9967517852783203, "memory(GiB)": 91.52, "step": 19280, "token_acc": 0.7341747697855625, "train_speed(iter/s)": 0.252826 }, { "epoch": 0.2502359951860226, "grad_norm": 0.8795484900474548, "learning_rate": 9.846527641921091e-05, "loss": 0.98294095993042, "memory(GiB)": 91.52, "step": 19285, "token_acc": 0.7437214611872146, "train_speed(iter/s)": 0.252758 }, { "epoch": 0.2503008735876783, "grad_norm": 0.7561033964157104, "learning_rate": 9.846395741198098e-05, "loss": 0.9509098052978515, "memory(GiB)": 91.52, "step": 19290, "token_acc": 0.7550195203569436, "train_speed(iter/s)": 0.252681 }, { "epoch": 0.250365751989334, "grad_norm": 0.8186119794845581, "learning_rate": 9.846263784703092e-05, "loss": 1.0290590286254884, "memory(GiB)": 91.52, "step": 19295, "token_acc": 0.7321573633515145, "train_speed(iter/s)": 0.252616 }, { "epoch": 0.2504306303909897, "grad_norm": 0.9024289846420288, "learning_rate": 9.846131772437596e-05, "loss": 0.956886100769043, "memory(GiB)": 91.52, "step": 19300, "token_acc": 0.7607491968008749, "train_speed(iter/s)": 0.252555 }, { "epoch": 0.25049550879264537, "grad_norm": 0.7889317870140076, "learning_rate": 9.845999704403125e-05, "loss": 0.9815816879272461, "memory(GiB)": 91.52, "step": 19305, "token_acc": 0.7144792607517543, "train_speed(iter/s)": 0.252486 }, { "epoch": 0.25056038719430107, "grad_norm": 0.8377766013145447, "learning_rate": 9.845867580601205e-05, "loss": 0.9472404479980469, "memory(GiB)": 91.52, "step": 19310, "token_acc": 0.7351685044261531, "train_speed(iter/s)": 0.252417 }, { "epoch": 0.25062526559595677, "grad_norm": 0.8578782081604004, "learning_rate": 9.845735401033351e-05, "loss": 1.0328039169311523, "memory(GiB)": 91.52, "step": 19315, "token_acc": 0.7414826106994717, "train_speed(iter/s)": 0.252357 }, { "epoch": 0.25069014399761247, "grad_norm": 0.8457196950912476, "learning_rate": 9.845603165701086e-05, "loss": 0.9538898468017578, "memory(GiB)": 91.52, "step": 19320, "token_acc": 0.7359914586554746, "train_speed(iter/s)": 0.252289 }, { "epoch": 0.25075502239926817, "grad_norm": 0.7565163969993591, "learning_rate": 9.845470874605931e-05, "loss": 0.9781501770019532, "memory(GiB)": 91.52, "step": 19325, "token_acc": 0.7434317000103302, "train_speed(iter/s)": 0.252223 }, { "epoch": 0.25081990080092387, "grad_norm": 0.8440299034118652, "learning_rate": 9.84533852774941e-05, "loss": 0.9527228355407715, "memory(GiB)": 91.52, "step": 19330, "token_acc": 0.759968303137269, "train_speed(iter/s)": 0.252151 }, { "epoch": 0.25088477920257957, "grad_norm": 0.9359654188156128, "learning_rate": 9.845206125133044e-05, "loss": 0.9761508941650391, "memory(GiB)": 91.52, "step": 19335, "token_acc": 0.731671808381573, "train_speed(iter/s)": 0.252082 }, { "epoch": 0.25094965760423527, "grad_norm": 0.8878812193870544, "learning_rate": 9.845073666758359e-05, "loss": 0.987394905090332, "memory(GiB)": 91.52, "step": 19340, "token_acc": 0.7438524590163934, "train_speed(iter/s)": 0.252012 }, { "epoch": 0.25101453600589096, "grad_norm": 0.732444703578949, "learning_rate": 9.844941152626877e-05, "loss": 0.912411880493164, "memory(GiB)": 91.52, "step": 19345, "token_acc": 0.7724610040187998, "train_speed(iter/s)": 0.251937 }, { "epoch": 0.25107941440754666, "grad_norm": 0.8219122290611267, "learning_rate": 9.844808582740125e-05, "loss": 0.9938063621520996, "memory(GiB)": 91.52, "step": 19350, "token_acc": 0.7202427259996889, "train_speed(iter/s)": 0.25187 }, { "epoch": 0.25114429280920236, "grad_norm": 0.7877597808837891, "learning_rate": 9.844675957099627e-05, "loss": 0.9861539840698242, "memory(GiB)": 91.52, "step": 19355, "token_acc": 0.7592223871411505, "train_speed(iter/s)": 0.251798 }, { "epoch": 0.25120917121085806, "grad_norm": 0.8753249049186707, "learning_rate": 9.84454327570691e-05, "loss": 0.9473814964294434, "memory(GiB)": 91.52, "step": 19360, "token_acc": 0.7569791360564208, "train_speed(iter/s)": 0.251728 }, { "epoch": 0.25127404961251376, "grad_norm": 0.8159354329109192, "learning_rate": 9.8444105385635e-05, "loss": 1.0012750625610352, "memory(GiB)": 91.52, "step": 19365, "token_acc": 0.7347360330499111, "train_speed(iter/s)": 0.251659 }, { "epoch": 0.25133892801416946, "grad_norm": 0.8477802872657776, "learning_rate": 9.844277745670927e-05, "loss": 0.9991573333740235, "memory(GiB)": 91.52, "step": 19370, "token_acc": 0.7239568941696601, "train_speed(iter/s)": 0.251594 }, { "epoch": 0.25140380641582516, "grad_norm": 0.8593881726264954, "learning_rate": 9.844144897030715e-05, "loss": 0.9914951324462891, "memory(GiB)": 91.52, "step": 19375, "token_acc": 0.7423886211355354, "train_speed(iter/s)": 0.251524 }, { "epoch": 0.25146868481748086, "grad_norm": 0.7285523414611816, "learning_rate": 9.844011992644398e-05, "loss": 0.9797137260437012, "memory(GiB)": 91.52, "step": 19380, "token_acc": 0.7258132508958055, "train_speed(iter/s)": 0.25145 }, { "epoch": 0.25153356321913656, "grad_norm": 0.8679062128067017, "learning_rate": 9.843879032513502e-05, "loss": 0.9466489791870117, "memory(GiB)": 91.52, "step": 19385, "token_acc": 0.7468554924065964, "train_speed(iter/s)": 0.251374 }, { "epoch": 0.25159844162079226, "grad_norm": 0.7944739460945129, "learning_rate": 9.843746016639557e-05, "loss": 0.9619850158691406, "memory(GiB)": 91.52, "step": 19390, "token_acc": 0.7339897911160989, "train_speed(iter/s)": 0.251309 }, { "epoch": 0.2516633200224479, "grad_norm": 0.8580064177513123, "learning_rate": 9.843612945024095e-05, "loss": 0.932366943359375, "memory(GiB)": 91.52, "step": 19395, "token_acc": 0.7551526584224132, "train_speed(iter/s)": 0.25124 }, { "epoch": 0.2517281984241036, "grad_norm": 0.7961704134941101, "learning_rate": 9.843479817668647e-05, "loss": 0.9050445556640625, "memory(GiB)": 91.52, "step": 19400, "token_acc": 0.7652540882714964, "train_speed(iter/s)": 0.251171 }, { "epoch": 0.2517930768257593, "grad_norm": 0.7926412224769592, "learning_rate": 9.843346634574744e-05, "loss": 0.9416187286376954, "memory(GiB)": 91.52, "step": 19405, "token_acc": 0.7505884083553986, "train_speed(iter/s)": 0.251101 }, { "epoch": 0.251857955227415, "grad_norm": 0.760705828666687, "learning_rate": 9.843213395743923e-05, "loss": 0.9969057083129883, "memory(GiB)": 91.52, "step": 19410, "token_acc": 0.7256449165402125, "train_speed(iter/s)": 0.251026 }, { "epoch": 0.2519228336290707, "grad_norm": 0.7446479201316833, "learning_rate": 9.843080101177711e-05, "loss": 0.9314289093017578, "memory(GiB)": 91.52, "step": 19415, "token_acc": 0.7558548861024641, "train_speed(iter/s)": 0.250956 }, { "epoch": 0.2519877120307264, "grad_norm": 0.7818141579627991, "learning_rate": 9.842946750877644e-05, "loss": 1.0186881065368651, "memory(GiB)": 91.52, "step": 19420, "token_acc": 0.7187327912927991, "train_speed(iter/s)": 0.250887 }, { "epoch": 0.2520525904323821, "grad_norm": 0.897996723651886, "learning_rate": 9.842813344845259e-05, "loss": 0.9962574005126953, "memory(GiB)": 91.52, "step": 19425, "token_acc": 0.7142766878336967, "train_speed(iter/s)": 0.250815 }, { "epoch": 0.2521174688340378, "grad_norm": 0.8824489712715149, "learning_rate": 9.84267988308209e-05, "loss": 0.9533540725708007, "memory(GiB)": 91.52, "step": 19430, "token_acc": 0.7336250202560363, "train_speed(iter/s)": 0.250751 }, { "epoch": 0.2521823472356935, "grad_norm": 0.993045449256897, "learning_rate": 9.842546365589674e-05, "loss": 0.98538818359375, "memory(GiB)": 91.52, "step": 19435, "token_acc": 0.7221797400088542, "train_speed(iter/s)": 0.250683 }, { "epoch": 0.2522472256373492, "grad_norm": 0.7919338941574097, "learning_rate": 9.842412792369543e-05, "loss": 0.9787320137023926, "memory(GiB)": 91.52, "step": 19440, "token_acc": 0.7433997380574895, "train_speed(iter/s)": 0.250616 }, { "epoch": 0.2523121040390049, "grad_norm": 0.8607485890388489, "learning_rate": 9.842279163423238e-05, "loss": 0.995635986328125, "memory(GiB)": 91.52, "step": 19445, "token_acc": 0.740054672325732, "train_speed(iter/s)": 0.250555 }, { "epoch": 0.2523769824406606, "grad_norm": 0.8237221837043762, "learning_rate": 9.842145478752297e-05, "loss": 0.9468307495117188, "memory(GiB)": 91.52, "step": 19450, "token_acc": 0.7533171684515391, "train_speed(iter/s)": 0.250485 }, { "epoch": 0.2524418608423163, "grad_norm": 0.8636910319328308, "learning_rate": 9.842011738358257e-05, "loss": 0.9522436141967774, "memory(GiB)": 91.52, "step": 19455, "token_acc": 0.7470872113064461, "train_speed(iter/s)": 0.250418 }, { "epoch": 0.252506739243972, "grad_norm": 0.7621403932571411, "learning_rate": 9.841877942242658e-05, "loss": 0.9433738708496093, "memory(GiB)": 91.52, "step": 19460, "token_acc": 0.745633631194151, "train_speed(iter/s)": 0.250345 }, { "epoch": 0.2525716176456277, "grad_norm": 0.8427668809890747, "learning_rate": 9.841744090407039e-05, "loss": 0.955130672454834, "memory(GiB)": 91.52, "step": 19465, "token_acc": 0.7337928657590901, "train_speed(iter/s)": 0.250278 }, { "epoch": 0.2526364960472834, "grad_norm": 0.7456766963005066, "learning_rate": 9.84161018285294e-05, "loss": 0.925164794921875, "memory(GiB)": 91.52, "step": 19470, "token_acc": 0.7420182825235286, "train_speed(iter/s)": 0.250209 }, { "epoch": 0.2527013744489391, "grad_norm": 0.8355446457862854, "learning_rate": 9.841476219581902e-05, "loss": 0.9150419235229492, "memory(GiB)": 91.52, "step": 19475, "token_acc": 0.760261748958953, "train_speed(iter/s)": 0.250144 }, { "epoch": 0.2527662528505948, "grad_norm": 0.8449410200119019, "learning_rate": 9.84134220059547e-05, "loss": 0.9968713760375977, "memory(GiB)": 91.52, "step": 19480, "token_acc": 0.7308116179032316, "train_speed(iter/s)": 0.250076 }, { "epoch": 0.2528311312522505, "grad_norm": 0.7913740873336792, "learning_rate": 9.84120812589518e-05, "loss": 0.9660428047180176, "memory(GiB)": 91.52, "step": 19485, "token_acc": 0.7202915917715199, "train_speed(iter/s)": 0.250007 }, { "epoch": 0.2528960096539062, "grad_norm": 0.8575067520141602, "learning_rate": 9.841073995482582e-05, "loss": 0.995481014251709, "memory(GiB)": 91.52, "step": 19490, "token_acc": 0.7292093831450912, "train_speed(iter/s)": 0.24994 }, { "epoch": 0.2529608880555619, "grad_norm": 0.8864401578903198, "learning_rate": 9.840939809359213e-05, "loss": 0.9200820922851562, "memory(GiB)": 91.52, "step": 19495, "token_acc": 0.7704415701706286, "train_speed(iter/s)": 0.249876 }, { "epoch": 0.2530257664572176, "grad_norm": 0.7981665134429932, "learning_rate": 9.840805567526623e-05, "loss": 0.9118657112121582, "memory(GiB)": 91.52, "step": 19500, "token_acc": 0.7631741669529372, "train_speed(iter/s)": 0.249807 }, { "epoch": 0.2530906448588733, "grad_norm": 0.7706780433654785, "learning_rate": 9.840671269986353e-05, "loss": 0.9723356246948243, "memory(GiB)": 91.52, "step": 19505, "token_acc": 0.7715827338129496, "train_speed(iter/s)": 0.249733 }, { "epoch": 0.253155523260529, "grad_norm": 0.9210135340690613, "learning_rate": 9.840536916739948e-05, "loss": 0.8964217185974122, "memory(GiB)": 91.52, "step": 19510, "token_acc": 0.7528641571194763, "train_speed(iter/s)": 0.249659 }, { "epoch": 0.2532204016621846, "grad_norm": 0.8102902770042419, "learning_rate": 9.840402507788957e-05, "loss": 0.9539652824401855, "memory(GiB)": 91.52, "step": 19515, "token_acc": 0.742646516771102, "train_speed(iter/s)": 0.249593 }, { "epoch": 0.2532852800638403, "grad_norm": 0.8404375910758972, "learning_rate": 9.840268043134926e-05, "loss": 0.9826994895935058, "memory(GiB)": 91.52, "step": 19520, "token_acc": 0.7379847706746945, "train_speed(iter/s)": 0.249536 }, { "epoch": 0.253350158465496, "grad_norm": 0.9240986704826355, "learning_rate": 9.8401335227794e-05, "loss": 0.9357822418212891, "memory(GiB)": 91.52, "step": 19525, "token_acc": 0.7562768417575156, "train_speed(iter/s)": 0.249473 }, { "epoch": 0.2534150368671517, "grad_norm": 0.8384693264961243, "learning_rate": 9.839998946723932e-05, "loss": 0.9703956604003906, "memory(GiB)": 91.52, "step": 19530, "token_acc": 0.7603005248893615, "train_speed(iter/s)": 0.249404 }, { "epoch": 0.2534799152688074, "grad_norm": 0.7600223422050476, "learning_rate": 9.839864314970065e-05, "loss": 0.9629981040954589, "memory(GiB)": 91.52, "step": 19535, "token_acc": 0.7385591300407793, "train_speed(iter/s)": 0.249339 }, { "epoch": 0.2535447936704631, "grad_norm": 0.7901366949081421, "learning_rate": 9.839729627519352e-05, "loss": 0.9541144371032715, "memory(GiB)": 91.52, "step": 19540, "token_acc": 0.7189176261573679, "train_speed(iter/s)": 0.249269 }, { "epoch": 0.2536096720721188, "grad_norm": 0.7712010741233826, "learning_rate": 9.839594884373342e-05, "loss": 0.9967941284179688, "memory(GiB)": 91.52, "step": 19545, "token_acc": 0.7457502095557418, "train_speed(iter/s)": 0.24921 }, { "epoch": 0.2536745504737745, "grad_norm": 0.790576159954071, "learning_rate": 9.839460085533584e-05, "loss": 0.9746267318725585, "memory(GiB)": 91.52, "step": 19550, "token_acc": 0.7542872189855502, "train_speed(iter/s)": 0.249146 }, { "epoch": 0.2537394288754302, "grad_norm": 0.8447348475456238, "learning_rate": 9.839325231001632e-05, "loss": 0.9740488052368164, "memory(GiB)": 91.52, "step": 19555, "token_acc": 0.7524266075036646, "train_speed(iter/s)": 0.249082 }, { "epoch": 0.2538043072770859, "grad_norm": 0.8281823396682739, "learning_rate": 9.839190320779038e-05, "loss": 0.9567392349243165, "memory(GiB)": 91.52, "step": 19560, "token_acc": 0.7499427873979708, "train_speed(iter/s)": 0.24902 }, { "epoch": 0.2538691856787416, "grad_norm": 0.7390719056129456, "learning_rate": 9.839055354867353e-05, "loss": 0.9742780685424804, "memory(GiB)": 91.52, "step": 19565, "token_acc": 0.7391850404688808, "train_speed(iter/s)": 0.248953 }, { "epoch": 0.2539340640803973, "grad_norm": 0.9018622040748596, "learning_rate": 9.83892033326813e-05, "loss": 0.951017189025879, "memory(GiB)": 91.52, "step": 19570, "token_acc": 0.7788284917341316, "train_speed(iter/s)": 0.24889 }, { "epoch": 0.253998942482053, "grad_norm": 0.9257366061210632, "learning_rate": 9.83878525598292e-05, "loss": 1.0039093017578125, "memory(GiB)": 91.52, "step": 19575, "token_acc": 0.728626540009126, "train_speed(iter/s)": 0.248834 }, { "epoch": 0.2540638208837087, "grad_norm": 0.7868173718452454, "learning_rate": 9.838650123013284e-05, "loss": 0.9739165306091309, "memory(GiB)": 91.52, "step": 19580, "token_acc": 0.7505276116778051, "train_speed(iter/s)": 0.248766 }, { "epoch": 0.2541286992853644, "grad_norm": 0.7562344670295715, "learning_rate": 9.838514934360774e-05, "loss": 0.9860769271850586, "memory(GiB)": 91.52, "step": 19585, "token_acc": 0.7371545847687458, "train_speed(iter/s)": 0.2487 }, { "epoch": 0.2541935776870201, "grad_norm": 0.8685703277587891, "learning_rate": 9.838379690026944e-05, "loss": 0.9693199157714844, "memory(GiB)": 91.52, "step": 19590, "token_acc": 0.743889591234724, "train_speed(iter/s)": 0.248629 }, { "epoch": 0.2542584560886758, "grad_norm": 0.7606721520423889, "learning_rate": 9.838244390013354e-05, "loss": 0.9890514373779297, "memory(GiB)": 91.52, "step": 19595, "token_acc": 0.7311257410500802, "train_speed(iter/s)": 0.248559 }, { "epoch": 0.2543233344903315, "grad_norm": 0.767708420753479, "learning_rate": 9.838109034321557e-05, "loss": 0.928415584564209, "memory(GiB)": 91.52, "step": 19600, "token_acc": 0.7561860136718123, "train_speed(iter/s)": 0.248489 }, { "epoch": 0.2543882128919872, "grad_norm": 0.90973299741745, "learning_rate": 9.837973622953113e-05, "loss": 1.015909194946289, "memory(GiB)": 91.52, "step": 19605, "token_acc": 0.7435628300587136, "train_speed(iter/s)": 0.248426 }, { "epoch": 0.2544530912936429, "grad_norm": 0.8718367218971252, "learning_rate": 9.83783815590958e-05, "loss": 0.9764108657836914, "memory(GiB)": 91.52, "step": 19610, "token_acc": 0.7427233188357311, "train_speed(iter/s)": 0.248359 }, { "epoch": 0.2545179696952986, "grad_norm": 0.7711452841758728, "learning_rate": 9.837702633192516e-05, "loss": 0.9672216415405274, "memory(GiB)": 91.52, "step": 19615, "token_acc": 0.7599917593737124, "train_speed(iter/s)": 0.248289 }, { "epoch": 0.2545828480969543, "grad_norm": 0.8734551668167114, "learning_rate": 9.837567054803483e-05, "loss": 0.9843461036682128, "memory(GiB)": 91.52, "step": 19620, "token_acc": 0.7243523726530936, "train_speed(iter/s)": 0.248229 }, { "epoch": 0.25464772649861, "grad_norm": 0.747886598110199, "learning_rate": 9.837431420744038e-05, "loss": 0.9817228317260742, "memory(GiB)": 91.52, "step": 19625, "token_acc": 0.7330553349218957, "train_speed(iter/s)": 0.248162 }, { "epoch": 0.2547126049002657, "grad_norm": 0.6492181420326233, "learning_rate": 9.837295731015746e-05, "loss": 0.9254844665527344, "memory(GiB)": 91.52, "step": 19630, "token_acc": 0.7567376839841388, "train_speed(iter/s)": 0.248095 }, { "epoch": 0.25477748330192135, "grad_norm": 0.8983922600746155, "learning_rate": 9.837159985620164e-05, "loss": 0.9914315223693848, "memory(GiB)": 91.52, "step": 19635, "token_acc": 0.7707528069206699, "train_speed(iter/s)": 0.248018 }, { "epoch": 0.25484236170357705, "grad_norm": 0.9520987868309021, "learning_rate": 9.837024184558857e-05, "loss": 0.9668484687805176, "memory(GiB)": 91.52, "step": 19640, "token_acc": 0.7547136439209028, "train_speed(iter/s)": 0.247957 }, { "epoch": 0.25490724010523275, "grad_norm": 0.7431703805923462, "learning_rate": 9.836888327833387e-05, "loss": 0.95455322265625, "memory(GiB)": 91.52, "step": 19645, "token_acc": 0.7348878979313762, "train_speed(iter/s)": 0.247893 }, { "epoch": 0.25497211850688845, "grad_norm": 0.7261223793029785, "learning_rate": 9.836752415445319e-05, "loss": 0.969570541381836, "memory(GiB)": 91.52, "step": 19650, "token_acc": 0.7431945350314099, "train_speed(iter/s)": 0.247819 }, { "epoch": 0.25503699690854414, "grad_norm": 0.8537670373916626, "learning_rate": 9.836616447396212e-05, "loss": 0.9853603363037109, "memory(GiB)": 91.52, "step": 19655, "token_acc": 0.7636229749631811, "train_speed(iter/s)": 0.247747 }, { "epoch": 0.25510187531019984, "grad_norm": 0.7673606872558594, "learning_rate": 9.836480423687636e-05, "loss": 0.9657347679138184, "memory(GiB)": 91.52, "step": 19660, "token_acc": 0.7318509529588122, "train_speed(iter/s)": 0.247688 }, { "epoch": 0.25516675371185554, "grad_norm": 1.0390057563781738, "learning_rate": 9.836344344321156e-05, "loss": 0.978863525390625, "memory(GiB)": 91.52, "step": 19665, "token_acc": 0.7638211661287, "train_speed(iter/s)": 0.247624 }, { "epoch": 0.25523163211351124, "grad_norm": 0.7638096809387207, "learning_rate": 9.836208209298335e-05, "loss": 0.9850854873657227, "memory(GiB)": 91.52, "step": 19670, "token_acc": 0.7387615213123482, "train_speed(iter/s)": 0.247551 }, { "epoch": 0.25529651051516694, "grad_norm": 0.8983140587806702, "learning_rate": 9.836072018620741e-05, "loss": 0.9724695205688476, "memory(GiB)": 91.52, "step": 19675, "token_acc": 0.74060114351423, "train_speed(iter/s)": 0.247487 }, { "epoch": 0.25536138891682264, "grad_norm": 0.793219268321991, "learning_rate": 9.835935772289941e-05, "loss": 1.0453593254089355, "memory(GiB)": 91.52, "step": 19680, "token_acc": 0.7165033290940376, "train_speed(iter/s)": 0.247426 }, { "epoch": 0.25542626731847834, "grad_norm": 0.8164571523666382, "learning_rate": 9.835799470307504e-05, "loss": 0.9535318374633789, "memory(GiB)": 91.52, "step": 19685, "token_acc": 0.743781512605042, "train_speed(iter/s)": 0.247363 }, { "epoch": 0.25549114572013404, "grad_norm": 0.8467084169387817, "learning_rate": 9.835663112674997e-05, "loss": 0.9346012115478516, "memory(GiB)": 91.52, "step": 19690, "token_acc": 0.7375739855950938, "train_speed(iter/s)": 0.247298 }, { "epoch": 0.25555602412178974, "grad_norm": 0.8104267716407776, "learning_rate": 9.835526699393993e-05, "loss": 0.9661064147949219, "memory(GiB)": 91.52, "step": 19695, "token_acc": 0.7349443939287963, "train_speed(iter/s)": 0.24724 }, { "epoch": 0.25562090252344544, "grad_norm": 0.930913507938385, "learning_rate": 9.835390230466057e-05, "loss": 1.0026163101196288, "memory(GiB)": 91.52, "step": 19700, "token_acc": 0.7510222288305702, "train_speed(iter/s)": 0.247184 }, { "epoch": 0.25568578092510114, "grad_norm": 0.852098286151886, "learning_rate": 9.835253705892761e-05, "loss": 0.995450782775879, "memory(GiB)": 91.52, "step": 19705, "token_acc": 0.7362035999208809, "train_speed(iter/s)": 0.247122 }, { "epoch": 0.25575065932675684, "grad_norm": 0.8255922794342041, "learning_rate": 9.835117125675676e-05, "loss": 0.9246318817138672, "memory(GiB)": 91.52, "step": 19710, "token_acc": 0.747342381493615, "train_speed(iter/s)": 0.247056 }, { "epoch": 0.25581553772841253, "grad_norm": 0.7836141586303711, "learning_rate": 9.834980489816376e-05, "loss": 0.9987754821777344, "memory(GiB)": 91.52, "step": 19715, "token_acc": 0.7350265838191019, "train_speed(iter/s)": 0.246997 }, { "epoch": 0.25588041613006823, "grad_norm": 0.8458544015884399, "learning_rate": 9.83484379831643e-05, "loss": 0.9655141830444336, "memory(GiB)": 91.52, "step": 19720, "token_acc": 0.7422083550008667, "train_speed(iter/s)": 0.246932 }, { "epoch": 0.25594529453172393, "grad_norm": 0.8606575727462769, "learning_rate": 9.834707051177417e-05, "loss": 0.9845767021179199, "memory(GiB)": 91.52, "step": 19725, "token_acc": 0.7464134860535028, "train_speed(iter/s)": 0.246862 }, { "epoch": 0.25601017293337963, "grad_norm": 0.9227774739265442, "learning_rate": 9.834570248400901e-05, "loss": 0.9619680404663086, "memory(GiB)": 91.52, "step": 19730, "token_acc": 0.7643280443509846, "train_speed(iter/s)": 0.246799 }, { "epoch": 0.25607505133503533, "grad_norm": 0.7450866103172302, "learning_rate": 9.834433389988465e-05, "loss": 0.9833030700683594, "memory(GiB)": 91.52, "step": 19735, "token_acc": 0.768570960877394, "train_speed(iter/s)": 0.246734 }, { "epoch": 0.25613992973669103, "grad_norm": 0.8208307027816772, "learning_rate": 9.83429647594168e-05, "loss": 0.9756547927856445, "memory(GiB)": 91.52, "step": 19740, "token_acc": 0.7331821105731794, "train_speed(iter/s)": 0.246669 }, { "epoch": 0.25620480813834673, "grad_norm": 0.8540536761283875, "learning_rate": 9.834159506262122e-05, "loss": 0.9938111305236816, "memory(GiB)": 91.52, "step": 19745, "token_acc": 0.7386055678738606, "train_speed(iter/s)": 0.246606 }, { "epoch": 0.25626968654000243, "grad_norm": 0.9661107659339905, "learning_rate": 9.834022480951367e-05, "loss": 0.975925064086914, "memory(GiB)": 91.52, "step": 19750, "token_acc": 0.7591999683781968, "train_speed(iter/s)": 0.246539 }, { "epoch": 0.25633456494165807, "grad_norm": 0.7724992036819458, "learning_rate": 9.833885400010993e-05, "loss": 0.9789333343505859, "memory(GiB)": 91.52, "step": 19755, "token_acc": 0.7404086738949124, "train_speed(iter/s)": 0.246471 }, { "epoch": 0.25639944334331377, "grad_norm": 0.8366218209266663, "learning_rate": 9.833748263442577e-05, "loss": 0.9855788230895997, "memory(GiB)": 91.52, "step": 19760, "token_acc": 0.740233966993942, "train_speed(iter/s)": 0.246412 }, { "epoch": 0.25646432174496947, "grad_norm": 0.9521561861038208, "learning_rate": 9.833611071247698e-05, "loss": 1.0075823783874511, "memory(GiB)": 91.52, "step": 19765, "token_acc": 0.7307227742061173, "train_speed(iter/s)": 0.246351 }, { "epoch": 0.25652920014662517, "grad_norm": 0.7684465050697327, "learning_rate": 9.833473823427933e-05, "loss": 0.9648719787597656, "memory(GiB)": 91.52, "step": 19770, "token_acc": 0.7428571428571429, "train_speed(iter/s)": 0.246286 }, { "epoch": 0.25659407854828087, "grad_norm": 0.8608960509300232, "learning_rate": 9.83333651998486e-05, "loss": 0.9577957153320312, "memory(GiB)": 91.52, "step": 19775, "token_acc": 0.7552928120198054, "train_speed(iter/s)": 0.246225 }, { "epoch": 0.25665895694993657, "grad_norm": 0.8458237648010254, "learning_rate": 9.833199160920064e-05, "loss": 0.9520825386047364, "memory(GiB)": 91.52, "step": 19780, "token_acc": 0.7410405912281354, "train_speed(iter/s)": 0.246166 }, { "epoch": 0.25672383535159227, "grad_norm": 0.8572164177894592, "learning_rate": 9.833061746235123e-05, "loss": 1.0236433029174805, "memory(GiB)": 91.52, "step": 19785, "token_acc": 0.7335526315789473, "train_speed(iter/s)": 0.246108 }, { "epoch": 0.25678871375324797, "grad_norm": 0.9450822472572327, "learning_rate": 9.832924275931618e-05, "loss": 0.9745197296142578, "memory(GiB)": 91.52, "step": 19790, "token_acc": 0.7637671462027434, "train_speed(iter/s)": 0.246046 }, { "epoch": 0.25685359215490366, "grad_norm": 0.8507212996482849, "learning_rate": 9.832786750011132e-05, "loss": 0.9370136260986328, "memory(GiB)": 91.52, "step": 19795, "token_acc": 0.7474093812121981, "train_speed(iter/s)": 0.245991 }, { "epoch": 0.25691847055655936, "grad_norm": 0.8458108305931091, "learning_rate": 9.832649168475245e-05, "loss": 0.9819103240966797, "memory(GiB)": 91.52, "step": 19800, "token_acc": 0.7285132315437353, "train_speed(iter/s)": 0.245924 }, { "epoch": 0.25698334895821506, "grad_norm": 0.8066827654838562, "learning_rate": 9.832511531325545e-05, "loss": 0.9739803314208985, "memory(GiB)": 91.52, "step": 19805, "token_acc": 0.7471151828799216, "train_speed(iter/s)": 0.245862 }, { "epoch": 0.25704822735987076, "grad_norm": 0.9297577738761902, "learning_rate": 9.832373838563614e-05, "loss": 0.9930131912231446, "memory(GiB)": 91.52, "step": 19810, "token_acc": 0.7419859754570498, "train_speed(iter/s)": 0.2458 }, { "epoch": 0.25711310576152646, "grad_norm": 0.8777366876602173, "learning_rate": 9.832236090191032e-05, "loss": 1.0107108116149903, "memory(GiB)": 91.52, "step": 19815, "token_acc": 0.7395451949884115, "train_speed(iter/s)": 0.245732 }, { "epoch": 0.25717798416318216, "grad_norm": 1.1076314449310303, "learning_rate": 9.832098286209391e-05, "loss": 0.9869579315185547, "memory(GiB)": 91.52, "step": 19820, "token_acc": 0.7228120193570169, "train_speed(iter/s)": 0.24567 }, { "epoch": 0.25724286256483786, "grad_norm": 0.785891056060791, "learning_rate": 9.831960426620274e-05, "loss": 0.9774017333984375, "memory(GiB)": 91.52, "step": 19825, "token_acc": 0.7592729850794367, "train_speed(iter/s)": 0.245606 }, { "epoch": 0.25730774096649356, "grad_norm": 0.8861672878265381, "learning_rate": 9.831822511425265e-05, "loss": 0.9445336341857911, "memory(GiB)": 91.52, "step": 19830, "token_acc": 0.7749631012982319, "train_speed(iter/s)": 0.245542 }, { "epoch": 0.25737261936814926, "grad_norm": 0.8094263672828674, "learning_rate": 9.831684540625956e-05, "loss": 0.9869330406188965, "memory(GiB)": 91.52, "step": 19835, "token_acc": 0.7432729477753354, "train_speed(iter/s)": 0.245479 }, { "epoch": 0.25743749776980496, "grad_norm": 0.807216227054596, "learning_rate": 9.831546514223931e-05, "loss": 0.9846342086791993, "memory(GiB)": 91.52, "step": 19840, "token_acc": 0.7474644648041323, "train_speed(iter/s)": 0.245417 }, { "epoch": 0.25750237617146066, "grad_norm": 0.8363562226295471, "learning_rate": 9.831408432220781e-05, "loss": 0.9747697830200195, "memory(GiB)": 91.52, "step": 19845, "token_acc": 0.7342210038509236, "train_speed(iter/s)": 0.245351 }, { "epoch": 0.25756725457311636, "grad_norm": 0.7642056941986084, "learning_rate": 9.831270294618094e-05, "loss": 0.9718999862670898, "memory(GiB)": 91.52, "step": 19850, "token_acc": 0.7488677234581058, "train_speed(iter/s)": 0.245292 }, { "epoch": 0.25763213297477205, "grad_norm": 0.8189076781272888, "learning_rate": 9.831132101417457e-05, "loss": 0.9402183532714844, "memory(GiB)": 91.52, "step": 19855, "token_acc": 0.7313438132074802, "train_speed(iter/s)": 0.245231 }, { "epoch": 0.25769701137642775, "grad_norm": 0.7485895752906799, "learning_rate": 9.830993852620464e-05, "loss": 0.9580957412719726, "memory(GiB)": 91.52, "step": 19860, "token_acc": 0.7622356147556716, "train_speed(iter/s)": 0.245166 }, { "epoch": 0.25776188977808345, "grad_norm": 0.8128094673156738, "learning_rate": 9.830855548228705e-05, "loss": 0.9942525863647461, "memory(GiB)": 91.52, "step": 19865, "token_acc": 0.7377847540851646, "train_speed(iter/s)": 0.245109 }, { "epoch": 0.2578267681797391, "grad_norm": 0.7434903979301453, "learning_rate": 9.83071718824377e-05, "loss": 0.973941421508789, "memory(GiB)": 91.52, "step": 19870, "token_acc": 0.7568755676657584, "train_speed(iter/s)": 0.24505 }, { "epoch": 0.2578916465813948, "grad_norm": 0.7389636039733887, "learning_rate": 9.830578772667253e-05, "loss": 0.969854736328125, "memory(GiB)": 91.52, "step": 19875, "token_acc": 0.7391419755134543, "train_speed(iter/s)": 0.244993 }, { "epoch": 0.2579565249830505, "grad_norm": 0.8142733573913574, "learning_rate": 9.830440301500747e-05, "loss": 0.9778583526611329, "memory(GiB)": 91.52, "step": 19880, "token_acc": 0.7131279807795747, "train_speed(iter/s)": 0.244931 }, { "epoch": 0.2580214033847062, "grad_norm": 0.9090842008590698, "learning_rate": 9.830301774745845e-05, "loss": 0.9780099868774415, "memory(GiB)": 91.52, "step": 19885, "token_acc": 0.7348863006100943, "train_speed(iter/s)": 0.244868 }, { "epoch": 0.2580862817863619, "grad_norm": 0.8558185696601868, "learning_rate": 9.830163192404141e-05, "loss": 1.0092992782592773, "memory(GiB)": 91.52, "step": 19890, "token_acc": 0.7316101276535721, "train_speed(iter/s)": 0.244805 }, { "epoch": 0.2581511601880176, "grad_norm": 0.8654559850692749, "learning_rate": 9.830024554477229e-05, "loss": 0.9847373008728028, "memory(GiB)": 91.52, "step": 19895, "token_acc": 0.7430609597924773, "train_speed(iter/s)": 0.24474 }, { "epoch": 0.2582160385896733, "grad_norm": 0.8350492119789124, "learning_rate": 9.829885860966706e-05, "loss": 0.95663480758667, "memory(GiB)": 91.52, "step": 19900, "token_acc": 0.7282136748388228, "train_speed(iter/s)": 0.244672 }, { "epoch": 0.258280916991329, "grad_norm": 0.7856038808822632, "learning_rate": 9.829747111874167e-05, "loss": 0.9778614044189453, "memory(GiB)": 91.52, "step": 19905, "token_acc": 0.7362257215956182, "train_speed(iter/s)": 0.244616 }, { "epoch": 0.2583457953929847, "grad_norm": 0.8158506751060486, "learning_rate": 9.829608307201209e-05, "loss": 0.9463346481323243, "memory(GiB)": 91.52, "step": 19910, "token_acc": 0.7615543307571512, "train_speed(iter/s)": 0.244553 }, { "epoch": 0.2584106737946404, "grad_norm": 0.7874920964241028, "learning_rate": 9.829469446949431e-05, "loss": 0.9854043006896973, "memory(GiB)": 91.52, "step": 19915, "token_acc": 0.7498684441326083, "train_speed(iter/s)": 0.24449 }, { "epoch": 0.2584755521962961, "grad_norm": 0.7457108497619629, "learning_rate": 9.829330531120426e-05, "loss": 1.0429163932800294, "memory(GiB)": 91.52, "step": 19920, "token_acc": 0.7252501924557352, "train_speed(iter/s)": 0.244425 }, { "epoch": 0.2585404305979518, "grad_norm": 0.7067890763282776, "learning_rate": 9.829191559715798e-05, "loss": 0.963259220123291, "memory(GiB)": 91.52, "step": 19925, "token_acc": 0.7292631435576137, "train_speed(iter/s)": 0.244362 }, { "epoch": 0.2586053089996075, "grad_norm": 0.8753408789634705, "learning_rate": 9.829052532737147e-05, "loss": 0.9852873802185058, "memory(GiB)": 91.52, "step": 19930, "token_acc": 0.7058383233532934, "train_speed(iter/s)": 0.2443 }, { "epoch": 0.2586701874012632, "grad_norm": 0.7686488032341003, "learning_rate": 9.828913450186065e-05, "loss": 0.9556828498840332, "memory(GiB)": 91.52, "step": 19935, "token_acc": 0.7426235834824404, "train_speed(iter/s)": 0.244237 }, { "epoch": 0.2587350658029189, "grad_norm": 0.8374437689781189, "learning_rate": 9.828774312064162e-05, "loss": 0.9964566230773926, "memory(GiB)": 91.52, "step": 19940, "token_acc": 0.7433977574505753, "train_speed(iter/s)": 0.244182 }, { "epoch": 0.2587999442045746, "grad_norm": 0.8089773058891296, "learning_rate": 9.828635118373033e-05, "loss": 0.9858186721801758, "memory(GiB)": 91.52, "step": 19945, "token_acc": 0.7666410689602212, "train_speed(iter/s)": 0.244115 }, { "epoch": 0.2588648226062303, "grad_norm": 0.8072102665901184, "learning_rate": 9.828495869114283e-05, "loss": 0.9977538108825683, "memory(GiB)": 91.52, "step": 19950, "token_acc": 0.7385014279272508, "train_speed(iter/s)": 0.244057 }, { "epoch": 0.258929701007886, "grad_norm": 0.8919587731361389, "learning_rate": 9.828356564289512e-05, "loss": 1.021998977661133, "memory(GiB)": 91.52, "step": 19955, "token_acc": 0.7233859043863973, "train_speed(iter/s)": 0.243996 }, { "epoch": 0.2589945794095417, "grad_norm": 0.7973723411560059, "learning_rate": 9.828217203900325e-05, "loss": 0.932733154296875, "memory(GiB)": 91.52, "step": 19960, "token_acc": 0.7329474989665151, "train_speed(iter/s)": 0.243937 }, { "epoch": 0.2590594578111974, "grad_norm": 0.7998731732368469, "learning_rate": 9.828077787948327e-05, "loss": 0.9932252883911132, "memory(GiB)": 91.52, "step": 19965, "token_acc": 0.7149342824314099, "train_speed(iter/s)": 0.243876 }, { "epoch": 0.2591243362128531, "grad_norm": 0.8646062612533569, "learning_rate": 9.827938316435118e-05, "loss": 0.9920969009399414, "memory(GiB)": 91.52, "step": 19970, "token_acc": 0.7353002027558241, "train_speed(iter/s)": 0.243814 }, { "epoch": 0.2591892146145088, "grad_norm": 0.8809945583343506, "learning_rate": 9.827798789362306e-05, "loss": 0.9898725509643554, "memory(GiB)": 91.52, "step": 19975, "token_acc": 0.7365503013892918, "train_speed(iter/s)": 0.243753 }, { "epoch": 0.2592540930161645, "grad_norm": 0.9276567101478577, "learning_rate": 9.827659206731497e-05, "loss": 1.0431201934814454, "memory(GiB)": 91.52, "step": 19980, "token_acc": 0.7261280542179419, "train_speed(iter/s)": 0.243688 }, { "epoch": 0.2593189714178202, "grad_norm": 0.7408774495124817, "learning_rate": 9.827519568544296e-05, "loss": 0.9706040382385254, "memory(GiB)": 91.52, "step": 19985, "token_acc": 0.7552532665115447, "train_speed(iter/s)": 0.243627 }, { "epoch": 0.2593838498194758, "grad_norm": 0.7424233555793762, "learning_rate": 9.82737987480231e-05, "loss": 0.938092041015625, "memory(GiB)": 91.52, "step": 19990, "token_acc": 0.7436727486756916, "train_speed(iter/s)": 0.243564 }, { "epoch": 0.2594487282211315, "grad_norm": 0.8442904949188232, "learning_rate": 9.827240125507148e-05, "loss": 0.9526702880859375, "memory(GiB)": 91.52, "step": 19995, "token_acc": 0.7316551161842642, "train_speed(iter/s)": 0.243502 }, { "epoch": 0.2595136066227872, "grad_norm": 0.7485336661338806, "learning_rate": 9.827100320660417e-05, "loss": 0.9539190292358398, "memory(GiB)": 91.52, "step": 20000, "token_acc": 0.742422908781504, "train_speed(iter/s)": 0.243444 }, { "epoch": 0.2595136066227872, "eval_loss": 0.9675002694129944, "eval_runtime": 1664.8795, "eval_samples_per_second": 29.925, "eval_steps_per_second": 1.87, "eval_token_acc": 0.7441804923716367, "step": 20000 }, { "epoch": 0.2595784850244429, "grad_norm": 0.7938413619995117, "learning_rate": 9.826960460263726e-05, "loss": 1.0108969688415528, "memory(GiB)": 91.52, "step": 20005, "token_acc": 0.7457266824696165, "train_speed(iter/s)": 0.238214 }, { "epoch": 0.2596433634260986, "grad_norm": 0.9340024590492249, "learning_rate": 9.826820544318685e-05, "loss": 1.004442310333252, "memory(GiB)": 91.52, "step": 20010, "token_acc": 0.7277861716670264, "train_speed(iter/s)": 0.238164 }, { "epoch": 0.2597082418277543, "grad_norm": 0.8201935887336731, "learning_rate": 9.826680572826905e-05, "loss": 0.9245826721191406, "memory(GiB)": 91.52, "step": 20015, "token_acc": 0.7413994547578865, "train_speed(iter/s)": 0.238107 }, { "epoch": 0.25977312022941, "grad_norm": 0.7440066337585449, "learning_rate": 9.826540545789993e-05, "loss": 0.9630326271057129, "memory(GiB)": 91.52, "step": 20020, "token_acc": 0.7604080759949907, "train_speed(iter/s)": 0.238047 }, { "epoch": 0.2598379986310657, "grad_norm": 0.8160881400108337, "learning_rate": 9.826400463209566e-05, "loss": 0.9566374778747558, "memory(GiB)": 91.52, "step": 20025, "token_acc": 0.7270858317628919, "train_speed(iter/s)": 0.237994 }, { "epoch": 0.2599028770327214, "grad_norm": 0.822765588760376, "learning_rate": 9.826260325087231e-05, "loss": 0.9560744285583496, "memory(GiB)": 91.52, "step": 20030, "token_acc": 0.7492860202770242, "train_speed(iter/s)": 0.237927 }, { "epoch": 0.2599677554343771, "grad_norm": 0.862164318561554, "learning_rate": 9.826120131424603e-05, "loss": 1.00294828414917, "memory(GiB)": 91.52, "step": 20035, "token_acc": 0.7576049552649691, "train_speed(iter/s)": 0.237877 }, { "epoch": 0.2600326338360328, "grad_norm": 0.8323860168457031, "learning_rate": 9.825979882223295e-05, "loss": 0.9744629859924316, "memory(GiB)": 91.52, "step": 20040, "token_acc": 0.7569094414938602, "train_speed(iter/s)": 0.237824 }, { "epoch": 0.2600975122376885, "grad_norm": 0.9648064970970154, "learning_rate": 9.82583957748492e-05, "loss": 0.9731292724609375, "memory(GiB)": 91.52, "step": 20045, "token_acc": 0.7490121523894729, "train_speed(iter/s)": 0.237766 }, { "epoch": 0.2601623906393442, "grad_norm": 0.7084342241287231, "learning_rate": 9.825699217211098e-05, "loss": 0.9234586715698242, "memory(GiB)": 91.52, "step": 20050, "token_acc": 0.7726479146459748, "train_speed(iter/s)": 0.237705 }, { "epoch": 0.2602272690409999, "grad_norm": 0.8553163409233093, "learning_rate": 9.825558801403436e-05, "loss": 0.9660427093505859, "memory(GiB)": 91.52, "step": 20055, "token_acc": 0.7409411044197203, "train_speed(iter/s)": 0.237647 }, { "epoch": 0.2602921474426556, "grad_norm": 0.8358486890792847, "learning_rate": 9.825418330063556e-05, "loss": 0.9662521362304688, "memory(GiB)": 91.52, "step": 20060, "token_acc": 0.7538709028755278, "train_speed(iter/s)": 0.237586 }, { "epoch": 0.2603570258443113, "grad_norm": 0.8725128173828125, "learning_rate": 9.825277803193072e-05, "loss": 0.9571468353271484, "memory(GiB)": 91.52, "step": 20065, "token_acc": 0.7393613254974155, "train_speed(iter/s)": 0.237529 }, { "epoch": 0.260421904245967, "grad_norm": 0.8242884874343872, "learning_rate": 9.825137220793601e-05, "loss": 0.9675264358520508, "memory(GiB)": 91.52, "step": 20070, "token_acc": 0.7235891995482701, "train_speed(iter/s)": 0.237473 }, { "epoch": 0.2604867826476227, "grad_norm": 0.8642526865005493, "learning_rate": 9.824996582866761e-05, "loss": 1.0053614616394042, "memory(GiB)": 91.52, "step": 20075, "token_acc": 0.7265323316716984, "train_speed(iter/s)": 0.237416 }, { "epoch": 0.2605516610492784, "grad_norm": 0.8731544017791748, "learning_rate": 9.824855889414171e-05, "loss": 1.0236178398132325, "memory(GiB)": 91.52, "step": 20080, "token_acc": 0.7373258473695155, "train_speed(iter/s)": 0.237363 }, { "epoch": 0.2606165394509341, "grad_norm": 0.8193444609642029, "learning_rate": 9.824715140437451e-05, "loss": 0.9713802337646484, "memory(GiB)": 91.52, "step": 20085, "token_acc": 0.7478698142198631, "train_speed(iter/s)": 0.23731 }, { "epoch": 0.2606814178525898, "grad_norm": 0.8488943576812744, "learning_rate": 9.824574335938219e-05, "loss": 0.963650131225586, "memory(GiB)": 91.52, "step": 20090, "token_acc": 0.7460156945230968, "train_speed(iter/s)": 0.237249 }, { "epoch": 0.2607462962542455, "grad_norm": 0.7652410864830017, "learning_rate": 9.824433475918095e-05, "loss": 0.9294547080993653, "memory(GiB)": 91.52, "step": 20095, "token_acc": 0.7604888119339371, "train_speed(iter/s)": 0.237197 }, { "epoch": 0.2608111746559012, "grad_norm": 0.9217264652252197, "learning_rate": 9.824292560378703e-05, "loss": 0.9839907646179199, "memory(GiB)": 91.52, "step": 20100, "token_acc": 0.7349666713941285, "train_speed(iter/s)": 0.237147 }, { "epoch": 0.2608760530575569, "grad_norm": 0.7535634636878967, "learning_rate": 9.824151589321661e-05, "loss": 0.9643705368041993, "memory(GiB)": 91.52, "step": 20105, "token_acc": 0.754491392801252, "train_speed(iter/s)": 0.237089 }, { "epoch": 0.26094093145921254, "grad_norm": 0.8682676553726196, "learning_rate": 9.824010562748595e-05, "loss": 1.001143455505371, "memory(GiB)": 91.52, "step": 20110, "token_acc": 0.7394575089446805, "train_speed(iter/s)": 0.237029 }, { "epoch": 0.26100580986086824, "grad_norm": 0.8546190857887268, "learning_rate": 9.823869480661124e-05, "loss": 1.0106236457824707, "memory(GiB)": 91.52, "step": 20115, "token_acc": 0.7408809838115751, "train_speed(iter/s)": 0.236978 }, { "epoch": 0.26107068826252394, "grad_norm": 0.8331301808357239, "learning_rate": 9.823728343060874e-05, "loss": 1.012166976928711, "memory(GiB)": 91.52, "step": 20120, "token_acc": 0.755839481734749, "train_speed(iter/s)": 0.236925 }, { "epoch": 0.26113556666417964, "grad_norm": 0.6855529546737671, "learning_rate": 9.823587149949469e-05, "loss": 0.9864505767822266, "memory(GiB)": 91.52, "step": 20125, "token_acc": 0.7393258426966293, "train_speed(iter/s)": 0.236867 }, { "epoch": 0.26120044506583534, "grad_norm": 0.7672666907310486, "learning_rate": 9.823445901328534e-05, "loss": 0.9572643280029297, "memory(GiB)": 91.52, "step": 20130, "token_acc": 0.7572398384463349, "train_speed(iter/s)": 0.236812 }, { "epoch": 0.26126532346749104, "grad_norm": 0.7261452674865723, "learning_rate": 9.823304597199694e-05, "loss": 0.9527466773986817, "memory(GiB)": 91.52, "step": 20135, "token_acc": 0.754573819316409, "train_speed(iter/s)": 0.236752 }, { "epoch": 0.26133020186914674, "grad_norm": 0.8983685970306396, "learning_rate": 9.823163237564575e-05, "loss": 0.9875364303588867, "memory(GiB)": 91.52, "step": 20140, "token_acc": 0.7147564867489941, "train_speed(iter/s)": 0.236702 }, { "epoch": 0.26139508027080244, "grad_norm": 0.8049636483192444, "learning_rate": 9.823021822424802e-05, "loss": 0.8948249816894531, "memory(GiB)": 91.52, "step": 20145, "token_acc": 0.7487365648800627, "train_speed(iter/s)": 0.23665 }, { "epoch": 0.26145995867245814, "grad_norm": 0.8399835228919983, "learning_rate": 9.822880351782007e-05, "loss": 0.997528076171875, "memory(GiB)": 91.52, "step": 20150, "token_acc": 0.7149078240673538, "train_speed(iter/s)": 0.236596 }, { "epoch": 0.26152483707411384, "grad_norm": 0.8193078637123108, "learning_rate": 9.822738825637814e-05, "loss": 0.9293139457702637, "memory(GiB)": 91.52, "step": 20155, "token_acc": 0.7417437676689796, "train_speed(iter/s)": 0.236541 }, { "epoch": 0.26158971547576954, "grad_norm": 0.8161044120788574, "learning_rate": 9.822597243993851e-05, "loss": 0.9204076766967774, "memory(GiB)": 91.52, "step": 20160, "token_acc": 0.7674842894790188, "train_speed(iter/s)": 0.236486 }, { "epoch": 0.26165459387742523, "grad_norm": 0.77072674036026, "learning_rate": 9.822455606851753e-05, "loss": 0.9984979629516602, "memory(GiB)": 91.52, "step": 20165, "token_acc": 0.7330097087378641, "train_speed(iter/s)": 0.236432 }, { "epoch": 0.26171947227908093, "grad_norm": 0.7963871955871582, "learning_rate": 9.822313914213144e-05, "loss": 0.8944086074829102, "memory(GiB)": 91.52, "step": 20170, "token_acc": 0.7504913736623717, "train_speed(iter/s)": 0.236375 }, { "epoch": 0.26178435068073663, "grad_norm": 0.782799243927002, "learning_rate": 9.822172166079657e-05, "loss": 0.947902774810791, "memory(GiB)": 91.52, "step": 20175, "token_acc": 0.7473647496512169, "train_speed(iter/s)": 0.236323 }, { "epoch": 0.26184922908239233, "grad_norm": 0.8225666284561157, "learning_rate": 9.822030362452924e-05, "loss": 1.02503662109375, "memory(GiB)": 91.52, "step": 20180, "token_acc": 0.7331545741324921, "train_speed(iter/s)": 0.236267 }, { "epoch": 0.26191410748404803, "grad_norm": 0.9209067821502686, "learning_rate": 9.821888503334575e-05, "loss": 0.9807851791381836, "memory(GiB)": 91.52, "step": 20185, "token_acc": 0.7563677745422901, "train_speed(iter/s)": 0.236207 }, { "epoch": 0.26197898588570373, "grad_norm": 0.8344457149505615, "learning_rate": 9.821746588726244e-05, "loss": 0.9929694175720215, "memory(GiB)": 91.52, "step": 20190, "token_acc": 0.7541639527309947, "train_speed(iter/s)": 0.236152 }, { "epoch": 0.26204386428735943, "grad_norm": 0.9293700456619263, "learning_rate": 9.821604618629563e-05, "loss": 1.0218521118164063, "memory(GiB)": 91.52, "step": 20195, "token_acc": 0.7171534169142691, "train_speed(iter/s)": 0.2361 }, { "epoch": 0.26210874268901513, "grad_norm": 0.8957673907279968, "learning_rate": 9.821462593046167e-05, "loss": 1.0090224266052246, "memory(GiB)": 91.52, "step": 20200, "token_acc": 0.7330137907837202, "train_speed(iter/s)": 0.236046 }, { "epoch": 0.2621736210906708, "grad_norm": 0.8322402834892273, "learning_rate": 9.82132051197769e-05, "loss": 1.006631851196289, "memory(GiB)": 91.52, "step": 20205, "token_acc": 0.7460834889388233, "train_speed(iter/s)": 0.23599 }, { "epoch": 0.2622384994923265, "grad_norm": 0.831896960735321, "learning_rate": 9.821178375425766e-05, "loss": 0.9795996665954589, "memory(GiB)": 91.52, "step": 20210, "token_acc": 0.7373946300343428, "train_speed(iter/s)": 0.235937 }, { "epoch": 0.2623033778939822, "grad_norm": 0.8301029801368713, "learning_rate": 9.821036183392031e-05, "loss": 0.9354351997375489, "memory(GiB)": 91.52, "step": 20215, "token_acc": 0.7426609167864558, "train_speed(iter/s)": 0.235879 }, { "epoch": 0.2623682562956379, "grad_norm": 0.7985482811927795, "learning_rate": 9.820893935878124e-05, "loss": 0.981059455871582, "memory(GiB)": 91.52, "step": 20220, "token_acc": 0.7358668246301528, "train_speed(iter/s)": 0.23582 }, { "epoch": 0.2624331346972936, "grad_norm": 0.9860088229179382, "learning_rate": 9.820751632885679e-05, "loss": 0.9663931846618652, "memory(GiB)": 91.52, "step": 20225, "token_acc": 0.7215160167403293, "train_speed(iter/s)": 0.235766 }, { "epoch": 0.26249801309894927, "grad_norm": 0.7956962585449219, "learning_rate": 9.820609274416334e-05, "loss": 0.9570135116577149, "memory(GiB)": 91.52, "step": 20230, "token_acc": 0.7482153941651148, "train_speed(iter/s)": 0.235708 }, { "epoch": 0.26256289150060497, "grad_norm": 0.918849527835846, "learning_rate": 9.820466860471727e-05, "loss": 0.9426759719848633, "memory(GiB)": 91.52, "step": 20235, "token_acc": 0.741923294917816, "train_speed(iter/s)": 0.235655 }, { "epoch": 0.26262776990226067, "grad_norm": 0.7861966490745544, "learning_rate": 9.8203243910535e-05, "loss": 0.963377571105957, "memory(GiB)": 91.52, "step": 20240, "token_acc": 0.7394618392088577, "train_speed(iter/s)": 0.235599 }, { "epoch": 0.26269264830391637, "grad_norm": 0.7639973759651184, "learning_rate": 9.820181866163287e-05, "loss": 0.9375391006469727, "memory(GiB)": 91.52, "step": 20245, "token_acc": 0.7364128524221332, "train_speed(iter/s)": 0.235543 }, { "epoch": 0.26275752670557206, "grad_norm": 0.845356285572052, "learning_rate": 9.820039285802733e-05, "loss": 0.9616747856140136, "memory(GiB)": 91.52, "step": 20250, "token_acc": 0.7379783753023189, "train_speed(iter/s)": 0.23549 }, { "epoch": 0.26282240510722776, "grad_norm": 0.7811019420623779, "learning_rate": 9.819896649973477e-05, "loss": 0.9986933708190918, "memory(GiB)": 91.52, "step": 20255, "token_acc": 0.7253559860661248, "train_speed(iter/s)": 0.235436 }, { "epoch": 0.26288728350888346, "grad_norm": 0.8354082107543945, "learning_rate": 9.819753958677161e-05, "loss": 0.9417971611022949, "memory(GiB)": 91.52, "step": 20260, "token_acc": 0.7418939881200106, "train_speed(iter/s)": 0.235387 }, { "epoch": 0.26295216191053916, "grad_norm": 0.723780632019043, "learning_rate": 9.819611211915427e-05, "loss": 0.9287131309509278, "memory(GiB)": 91.52, "step": 20265, "token_acc": 0.7624282073713438, "train_speed(iter/s)": 0.235334 }, { "epoch": 0.26301704031219486, "grad_norm": 0.8208619356155396, "learning_rate": 9.819468409689915e-05, "loss": 0.9379802703857422, "memory(GiB)": 91.52, "step": 20270, "token_acc": 0.7595620218989051, "train_speed(iter/s)": 0.235282 }, { "epoch": 0.26308191871385056, "grad_norm": 0.8070282340049744, "learning_rate": 9.819325552002274e-05, "loss": 1.0313443183898925, "memory(GiB)": 91.52, "step": 20275, "token_acc": 0.7344392698099739, "train_speed(iter/s)": 0.235235 }, { "epoch": 0.26314679711550626, "grad_norm": 0.9327945113182068, "learning_rate": 9.819182638854143e-05, "loss": 0.9857901573181153, "memory(GiB)": 91.52, "step": 20280, "token_acc": 0.7404460131557837, "train_speed(iter/s)": 0.235179 }, { "epoch": 0.26321167551716196, "grad_norm": 0.8308930993080139, "learning_rate": 9.819039670247167e-05, "loss": 0.9441509246826172, "memory(GiB)": 91.52, "step": 20285, "token_acc": 0.7454662966201574, "train_speed(iter/s)": 0.235124 }, { "epoch": 0.26327655391881766, "grad_norm": 0.8071615695953369, "learning_rate": 9.818896646182996e-05, "loss": 0.9751079559326172, "memory(GiB)": 91.52, "step": 20290, "token_acc": 0.743472430385738, "train_speed(iter/s)": 0.235074 }, { "epoch": 0.26334143232047336, "grad_norm": 0.8680942058563232, "learning_rate": 9.81875356666327e-05, "loss": 0.9688607215881347, "memory(GiB)": 91.52, "step": 20295, "token_acc": 0.7353119935170178, "train_speed(iter/s)": 0.235019 }, { "epoch": 0.26340631072212906, "grad_norm": 0.8259257674217224, "learning_rate": 9.818610431689639e-05, "loss": 0.9721879005432129, "memory(GiB)": 91.52, "step": 20300, "token_acc": 0.7600856309747905, "train_speed(iter/s)": 0.234967 }, { "epoch": 0.26347118912378475, "grad_norm": 0.7313911318778992, "learning_rate": 9.818467241263749e-05, "loss": 0.9588773727416993, "memory(GiB)": 91.52, "step": 20305, "token_acc": 0.7517525475175255, "train_speed(iter/s)": 0.234915 }, { "epoch": 0.26353606752544045, "grad_norm": 0.7846621870994568, "learning_rate": 9.818323995387248e-05, "loss": 0.9541653633117676, "memory(GiB)": 91.52, "step": 20310, "token_acc": 0.7376342430424574, "train_speed(iter/s)": 0.234868 }, { "epoch": 0.26360094592709615, "grad_norm": 0.7581576108932495, "learning_rate": 9.818180694061784e-05, "loss": 0.9493746757507324, "memory(GiB)": 91.52, "step": 20315, "token_acc": 0.7697262479871175, "train_speed(iter/s)": 0.234812 }, { "epoch": 0.26366582432875185, "grad_norm": 0.8094688057899475, "learning_rate": 9.818037337289007e-05, "loss": 1.0075715065002442, "memory(GiB)": 91.52, "step": 20320, "token_acc": 0.7461153251343987, "train_speed(iter/s)": 0.234756 }, { "epoch": 0.26373070273040755, "grad_norm": 0.9575514793395996, "learning_rate": 9.817893925070568e-05, "loss": 0.9471948623657227, "memory(GiB)": 91.52, "step": 20325, "token_acc": 0.7511174861565147, "train_speed(iter/s)": 0.234701 }, { "epoch": 0.26379558113206325, "grad_norm": 0.7253096103668213, "learning_rate": 9.817750457408113e-05, "loss": 0.9056568145751953, "memory(GiB)": 91.52, "step": 20330, "token_acc": 0.7393325344082508, "train_speed(iter/s)": 0.234647 }, { "epoch": 0.26386045953371895, "grad_norm": 0.7053079605102539, "learning_rate": 9.817606934303299e-05, "loss": 0.9972396850585937, "memory(GiB)": 91.52, "step": 20335, "token_acc": 0.7227907280870959, "train_speed(iter/s)": 0.234593 }, { "epoch": 0.26392533793537465, "grad_norm": 0.7920882701873779, "learning_rate": 9.817463355757772e-05, "loss": 1.0025032043457032, "memory(GiB)": 91.52, "step": 20340, "token_acc": 0.7359329913433793, "train_speed(iter/s)": 0.234537 }, { "epoch": 0.26399021633703035, "grad_norm": 0.9885205030441284, "learning_rate": 9.817319721773187e-05, "loss": 1.0133584976196288, "memory(GiB)": 91.52, "step": 20345, "token_acc": 0.7163518399621532, "train_speed(iter/s)": 0.234483 }, { "epoch": 0.264055094738686, "grad_norm": 0.841145932674408, "learning_rate": 9.817176032351197e-05, "loss": 0.9886816024780274, "memory(GiB)": 91.52, "step": 20350, "token_acc": 0.7428960096735188, "train_speed(iter/s)": 0.234431 }, { "epoch": 0.2641199731403417, "grad_norm": 0.7530752420425415, "learning_rate": 9.817032287493456e-05, "loss": 0.9613146781921387, "memory(GiB)": 91.52, "step": 20355, "token_acc": 0.7688548572851496, "train_speed(iter/s)": 0.234375 }, { "epoch": 0.2641848515419974, "grad_norm": 0.803911566734314, "learning_rate": 9.816888487201617e-05, "loss": 1.001744842529297, "memory(GiB)": 91.52, "step": 20360, "token_acc": 0.7432784041630529, "train_speed(iter/s)": 0.234321 }, { "epoch": 0.2642497299436531, "grad_norm": 0.7834573984146118, "learning_rate": 9.816744631477334e-05, "loss": 1.0033200263977051, "memory(GiB)": 91.52, "step": 20365, "token_acc": 0.7412, "train_speed(iter/s)": 0.234266 }, { "epoch": 0.2643146083453088, "grad_norm": 0.8339532613754272, "learning_rate": 9.816600720322266e-05, "loss": 0.9648202896118164, "memory(GiB)": 91.52, "step": 20370, "token_acc": 0.7672842597229177, "train_speed(iter/s)": 0.234211 }, { "epoch": 0.2643794867469645, "grad_norm": 0.8092498183250427, "learning_rate": 9.816456753738066e-05, "loss": 0.9648891448974609, "memory(GiB)": 91.52, "step": 20375, "token_acc": 0.7488684125444552, "train_speed(iter/s)": 0.234158 }, { "epoch": 0.2644443651486202, "grad_norm": 0.8136612772941589, "learning_rate": 9.816312731726391e-05, "loss": 0.986021614074707, "memory(GiB)": 91.52, "step": 20380, "token_acc": 0.7428350116189001, "train_speed(iter/s)": 0.2341 }, { "epoch": 0.2645092435502759, "grad_norm": 0.8813261389732361, "learning_rate": 9.8161686542889e-05, "loss": 0.952142333984375, "memory(GiB)": 91.52, "step": 20385, "token_acc": 0.7550949647493932, "train_speed(iter/s)": 0.234046 }, { "epoch": 0.2645741219519316, "grad_norm": 0.758049726486206, "learning_rate": 9.81602452142725e-05, "loss": 0.981506061553955, "memory(GiB)": 91.52, "step": 20390, "token_acc": 0.7463064188661565, "train_speed(iter/s)": 0.233991 }, { "epoch": 0.2646390003535873, "grad_norm": 0.7946995496749878, "learning_rate": 9.815880333143099e-05, "loss": 0.9245089530944824, "memory(GiB)": 91.52, "step": 20395, "token_acc": 0.7329239989423896, "train_speed(iter/s)": 0.233933 }, { "epoch": 0.264703878755243, "grad_norm": 0.7588101029396057, "learning_rate": 9.815736089438107e-05, "loss": 0.9684091567993164, "memory(GiB)": 91.52, "step": 20400, "token_acc": 0.7431479510377861, "train_speed(iter/s)": 0.233876 }, { "epoch": 0.2647687571568987, "grad_norm": 0.8447871804237366, "learning_rate": 9.815591790313935e-05, "loss": 0.9897775650024414, "memory(GiB)": 91.52, "step": 20405, "token_acc": 0.756362004074445, "train_speed(iter/s)": 0.233824 }, { "epoch": 0.2648336355585544, "grad_norm": 0.8541004657745361, "learning_rate": 9.815447435772242e-05, "loss": 0.9724319458007813, "memory(GiB)": 91.52, "step": 20410, "token_acc": 0.7565686235062166, "train_speed(iter/s)": 0.233772 }, { "epoch": 0.2648985139602101, "grad_norm": 0.7451399564743042, "learning_rate": 9.81530302581469e-05, "loss": 0.9775632858276367, "memory(GiB)": 91.52, "step": 20415, "token_acc": 0.7347837182595381, "train_speed(iter/s)": 0.233717 }, { "epoch": 0.2649633923618658, "grad_norm": 0.7563984990119934, "learning_rate": 9.815158560442941e-05, "loss": 0.942872428894043, "memory(GiB)": 91.52, "step": 20420, "token_acc": 0.7319867456288776, "train_speed(iter/s)": 0.233666 }, { "epoch": 0.2650282707635215, "grad_norm": 0.8536483645439148, "learning_rate": 9.815014039658657e-05, "loss": 0.9372012138366699, "memory(GiB)": 91.52, "step": 20425, "token_acc": 0.752470765748774, "train_speed(iter/s)": 0.233611 }, { "epoch": 0.2650931491651772, "grad_norm": 0.8292986154556274, "learning_rate": 9.8148694634635e-05, "loss": 0.9893059730529785, "memory(GiB)": 91.52, "step": 20430, "token_acc": 0.7256621947024424, "train_speed(iter/s)": 0.233563 }, { "epoch": 0.2651580275668329, "grad_norm": 0.8368973135948181, "learning_rate": 9.814724831859136e-05, "loss": 0.9591025352478028, "memory(GiB)": 91.52, "step": 20435, "token_acc": 0.7414879745001449, "train_speed(iter/s)": 0.233511 }, { "epoch": 0.2652229059684886, "grad_norm": 0.7414999604225159, "learning_rate": 9.81458014484723e-05, "loss": 0.9051456451416016, "memory(GiB)": 91.52, "step": 20440, "token_acc": 0.7573529411764706, "train_speed(iter/s)": 0.233454 }, { "epoch": 0.2652877843701443, "grad_norm": 0.9038750529289246, "learning_rate": 9.814435402429443e-05, "loss": 0.9774452209472656, "memory(GiB)": 91.52, "step": 20445, "token_acc": 0.7405679966776347, "train_speed(iter/s)": 0.233401 }, { "epoch": 0.2653526627718, "grad_norm": 0.816831111907959, "learning_rate": 9.814290604607445e-05, "loss": 0.9802054405212403, "memory(GiB)": 91.52, "step": 20450, "token_acc": 0.7440607485084072, "train_speed(iter/s)": 0.233343 }, { "epoch": 0.2654175411734557, "grad_norm": 0.7449690699577332, "learning_rate": 9.814145751382901e-05, "loss": 0.94132080078125, "memory(GiB)": 91.52, "step": 20455, "token_acc": 0.7463139142629723, "train_speed(iter/s)": 0.233292 }, { "epoch": 0.2654824195751114, "grad_norm": 0.7852256894111633, "learning_rate": 9.814000842757477e-05, "loss": 0.9372459411621094, "memory(GiB)": 91.52, "step": 20460, "token_acc": 0.748350934755118, "train_speed(iter/s)": 0.233239 }, { "epoch": 0.26554729797676707, "grad_norm": 0.7899089455604553, "learning_rate": 9.813855878732841e-05, "loss": 0.9986455917358399, "memory(GiB)": 91.52, "step": 20465, "token_acc": 0.7428206102913225, "train_speed(iter/s)": 0.233186 }, { "epoch": 0.2656121763784227, "grad_norm": 0.7845184206962585, "learning_rate": 9.81371085931066e-05, "loss": 0.9230837821960449, "memory(GiB)": 91.52, "step": 20470, "token_acc": 0.7421264003381949, "train_speed(iter/s)": 0.23313 }, { "epoch": 0.2656770547800784, "grad_norm": 0.8772034645080566, "learning_rate": 9.813565784492607e-05, "loss": 0.9964668273925781, "memory(GiB)": 91.52, "step": 20475, "token_acc": 0.760788471355087, "train_speed(iter/s)": 0.233076 }, { "epoch": 0.2657419331817341, "grad_norm": 0.7526885867118835, "learning_rate": 9.813420654280346e-05, "loss": 0.9909885406494141, "memory(GiB)": 91.52, "step": 20480, "token_acc": 0.7418566189688646, "train_speed(iter/s)": 0.233022 }, { "epoch": 0.2658068115833898, "grad_norm": 0.8327978253364563, "learning_rate": 9.813275468675551e-05, "loss": 0.8982292175292969, "memory(GiB)": 91.52, "step": 20485, "token_acc": 0.7450617283950617, "train_speed(iter/s)": 0.232971 }, { "epoch": 0.2658716899850455, "grad_norm": 0.9750200510025024, "learning_rate": 9.813130227679891e-05, "loss": 0.9319672584533691, "memory(GiB)": 91.52, "step": 20490, "token_acc": 0.7713464140730717, "train_speed(iter/s)": 0.232921 }, { "epoch": 0.2659365683867012, "grad_norm": 0.7717300057411194, "learning_rate": 9.81298493129504e-05, "loss": 0.9816614151000976, "memory(GiB)": 91.52, "step": 20495, "token_acc": 0.7640902576409025, "train_speed(iter/s)": 0.232867 }, { "epoch": 0.2660014467883569, "grad_norm": 0.8306869864463806, "learning_rate": 9.812839579522666e-05, "loss": 0.9671871185302734, "memory(GiB)": 91.52, "step": 20500, "token_acc": 0.743784693019344, "train_speed(iter/s)": 0.232816 }, { "epoch": 0.2660663251900126, "grad_norm": 1.0136783123016357, "learning_rate": 9.812694172364445e-05, "loss": 0.9900777816772461, "memory(GiB)": 91.52, "step": 20505, "token_acc": 0.7510096851350822, "train_speed(iter/s)": 0.232767 }, { "epoch": 0.2661312035916683, "grad_norm": 0.7972057461738586, "learning_rate": 9.81254870982205e-05, "loss": 0.9600808143615722, "memory(GiB)": 91.52, "step": 20510, "token_acc": 0.7401914344516677, "train_speed(iter/s)": 0.232717 }, { "epoch": 0.266196081993324, "grad_norm": 0.8470242619514465, "learning_rate": 9.812403191897153e-05, "loss": 1.016122817993164, "memory(GiB)": 91.52, "step": 20515, "token_acc": 0.7309644670050761, "train_speed(iter/s)": 0.23267 }, { "epoch": 0.2662609603949797, "grad_norm": 0.8351538777351379, "learning_rate": 9.81225761859143e-05, "loss": 0.925013542175293, "memory(GiB)": 91.52, "step": 20520, "token_acc": 0.7413049548571251, "train_speed(iter/s)": 0.232612 }, { "epoch": 0.2663258387966354, "grad_norm": 0.7404946684837341, "learning_rate": 9.812111989906556e-05, "loss": 0.9475324630737305, "memory(GiB)": 91.52, "step": 20525, "token_acc": 0.7598397904952631, "train_speed(iter/s)": 0.232562 }, { "epoch": 0.2663907171982911, "grad_norm": 0.8055899739265442, "learning_rate": 9.811966305844207e-05, "loss": 0.9786470413208008, "memory(GiB)": 91.52, "step": 20530, "token_acc": 0.7476033002638875, "train_speed(iter/s)": 0.232513 }, { "epoch": 0.2664555955999468, "grad_norm": 0.8751816749572754, "learning_rate": 9.811820566406061e-05, "loss": 0.9355323791503907, "memory(GiB)": 91.52, "step": 20535, "token_acc": 0.7503561939235242, "train_speed(iter/s)": 0.23246 }, { "epoch": 0.2665204740016025, "grad_norm": 0.7560890316963196, "learning_rate": 9.81167477159379e-05, "loss": 0.9397668838500977, "memory(GiB)": 91.52, "step": 20540, "token_acc": 0.7297488253788815, "train_speed(iter/s)": 0.232407 }, { "epoch": 0.2665853524032582, "grad_norm": 0.8576838970184326, "learning_rate": 9.811528921409079e-05, "loss": 0.9816333770751953, "memory(GiB)": 91.52, "step": 20545, "token_acc": 0.7320868364914042, "train_speed(iter/s)": 0.232355 }, { "epoch": 0.2666502308049139, "grad_norm": 0.8652957677841187, "learning_rate": 9.811383015853602e-05, "loss": 0.9251211166381836, "memory(GiB)": 91.52, "step": 20550, "token_acc": 0.7355321885165292, "train_speed(iter/s)": 0.232309 }, { "epoch": 0.2667151092065696, "grad_norm": 0.7248571515083313, "learning_rate": 9.811237054929036e-05, "loss": 0.9976523399353028, "memory(GiB)": 91.52, "step": 20555, "token_acc": 0.7439487756825218, "train_speed(iter/s)": 0.23226 }, { "epoch": 0.2667799876082253, "grad_norm": 0.8351628184318542, "learning_rate": 9.811091038637067e-05, "loss": 0.9801761627197265, "memory(GiB)": 91.52, "step": 20560, "token_acc": 0.7536855036855037, "train_speed(iter/s)": 0.232209 }, { "epoch": 0.266844866009881, "grad_norm": 0.6919762492179871, "learning_rate": 9.81094496697937e-05, "loss": 0.9585410118103027, "memory(GiB)": 91.52, "step": 20565, "token_acc": 0.7306654190808651, "train_speed(iter/s)": 0.232157 }, { "epoch": 0.2669097444115367, "grad_norm": 0.8099597096443176, "learning_rate": 9.810798839957628e-05, "loss": 1.0035730361938477, "memory(GiB)": 91.52, "step": 20570, "token_acc": 0.7348993288590604, "train_speed(iter/s)": 0.232107 }, { "epoch": 0.2669746228131924, "grad_norm": 0.814293384552002, "learning_rate": 9.810652657573524e-05, "loss": 1.0351699829101562, "memory(GiB)": 91.52, "step": 20575, "token_acc": 0.710319336399192, "train_speed(iter/s)": 0.232056 }, { "epoch": 0.2670395012148481, "grad_norm": 0.8022615313529968, "learning_rate": 9.810506419828739e-05, "loss": 1.0017637252807616, "memory(GiB)": 91.52, "step": 20580, "token_acc": 0.743299343033332, "train_speed(iter/s)": 0.232005 }, { "epoch": 0.2671043796165038, "grad_norm": 0.8929449319839478, "learning_rate": 9.810360126724955e-05, "loss": 0.9896095275878907, "memory(GiB)": 91.52, "step": 20585, "token_acc": 0.7290076335877863, "train_speed(iter/s)": 0.231958 }, { "epoch": 0.26716925801815944, "grad_norm": 0.9583204388618469, "learning_rate": 9.810213778263856e-05, "loss": 0.9802151679992676, "memory(GiB)": 91.52, "step": 20590, "token_acc": 0.7619175249837271, "train_speed(iter/s)": 0.23191 }, { "epoch": 0.26723413641981514, "grad_norm": 0.7319755554199219, "learning_rate": 9.810067374447128e-05, "loss": 0.9579665184020996, "memory(GiB)": 91.52, "step": 20595, "token_acc": 0.7628580110916964, "train_speed(iter/s)": 0.231866 }, { "epoch": 0.26729901482147084, "grad_norm": 0.8007678389549255, "learning_rate": 9.809920915276451e-05, "loss": 0.8850923538208008, "memory(GiB)": 91.52, "step": 20600, "token_acc": 0.7883651770073736, "train_speed(iter/s)": 0.231817 }, { "epoch": 0.26736389322312654, "grad_norm": 0.7470775842666626, "learning_rate": 9.809774400753515e-05, "loss": 1.0118684768676758, "memory(GiB)": 91.52, "step": 20605, "token_acc": 0.7392267019167218, "train_speed(iter/s)": 0.231768 }, { "epoch": 0.26742877162478224, "grad_norm": 0.9818906188011169, "learning_rate": 9.809627830880005e-05, "loss": 0.9682210922241211, "memory(GiB)": 91.52, "step": 20610, "token_acc": 0.7353329194474624, "train_speed(iter/s)": 0.231716 }, { "epoch": 0.26749365002643793, "grad_norm": 0.8652198910713196, "learning_rate": 9.809481205657609e-05, "loss": 0.992336654663086, "memory(GiB)": 91.52, "step": 20615, "token_acc": 0.7152502828054299, "train_speed(iter/s)": 0.231668 }, { "epoch": 0.26755852842809363, "grad_norm": 0.765020489692688, "learning_rate": 9.809334525088011e-05, "loss": 1.0358646392822266, "memory(GiB)": 91.52, "step": 20620, "token_acc": 0.753307597166523, "train_speed(iter/s)": 0.231614 }, { "epoch": 0.26762340682974933, "grad_norm": 0.8031640648841858, "learning_rate": 9.809187789172902e-05, "loss": 0.929052734375, "memory(GiB)": 91.52, "step": 20625, "token_acc": 0.7494966818283498, "train_speed(iter/s)": 0.231568 }, { "epoch": 0.26768828523140503, "grad_norm": 0.8268193602561951, "learning_rate": 9.809040997913969e-05, "loss": 0.9635147094726563, "memory(GiB)": 91.52, "step": 20630, "token_acc": 0.7375485694101024, "train_speed(iter/s)": 0.231518 }, { "epoch": 0.26775316363306073, "grad_norm": 0.778247594833374, "learning_rate": 9.808894151312902e-05, "loss": 0.9368630409240722, "memory(GiB)": 91.52, "step": 20635, "token_acc": 0.7095677675209118, "train_speed(iter/s)": 0.231461 }, { "epoch": 0.26781804203471643, "grad_norm": 0.8020827770233154, "learning_rate": 9.80874724937139e-05, "loss": 1.0136518478393555, "memory(GiB)": 91.52, "step": 20640, "token_acc": 0.7185500465090058, "train_speed(iter/s)": 0.231405 }, { "epoch": 0.26788292043637213, "grad_norm": 0.7326405644416809, "learning_rate": 9.808600292091123e-05, "loss": 1.0260280609130858, "memory(GiB)": 91.52, "step": 20645, "token_acc": 0.7417769126719673, "train_speed(iter/s)": 0.231356 }, { "epoch": 0.26794779883802783, "grad_norm": 0.8754878640174866, "learning_rate": 9.808453279473795e-05, "loss": 0.9793760299682617, "memory(GiB)": 91.52, "step": 20650, "token_acc": 0.7526546146527117, "train_speed(iter/s)": 0.231305 }, { "epoch": 0.2680126772396835, "grad_norm": 0.9759297370910645, "learning_rate": 9.808306211521095e-05, "loss": 0.9360289573669434, "memory(GiB)": 91.52, "step": 20655, "token_acc": 0.7566738660907127, "train_speed(iter/s)": 0.231253 }, { "epoch": 0.2680775556413392, "grad_norm": 0.8131648898124695, "learning_rate": 9.808159088234719e-05, "loss": 0.9433677673339844, "memory(GiB)": 91.52, "step": 20660, "token_acc": 0.7624926513815403, "train_speed(iter/s)": 0.231201 }, { "epoch": 0.2681424340429949, "grad_norm": 0.8474138975143433, "learning_rate": 9.808011909616355e-05, "loss": 0.9738435745239258, "memory(GiB)": 91.52, "step": 20665, "token_acc": 0.7555555555555555, "train_speed(iter/s)": 0.231151 }, { "epoch": 0.2682073124446506, "grad_norm": 0.7907763719558716, "learning_rate": 9.8078646756677e-05, "loss": 0.9397075653076172, "memory(GiB)": 91.52, "step": 20670, "token_acc": 0.7601811367895066, "train_speed(iter/s)": 0.231097 }, { "epoch": 0.2682721908463063, "grad_norm": 0.8277637958526611, "learning_rate": 9.807717386390448e-05, "loss": 1.0075922012329102, "memory(GiB)": 91.52, "step": 20675, "token_acc": 0.736661316211878, "train_speed(iter/s)": 0.231045 }, { "epoch": 0.268337069247962, "grad_norm": 0.7304935455322266, "learning_rate": 9.807570041786294e-05, "loss": 0.9026592254638672, "memory(GiB)": 91.52, "step": 20680, "token_acc": 0.7535865606184494, "train_speed(iter/s)": 0.230995 }, { "epoch": 0.2684019476496177, "grad_norm": 0.8029981255531311, "learning_rate": 9.807422641856934e-05, "loss": 0.9437176704406738, "memory(GiB)": 91.52, "step": 20685, "token_acc": 0.7447691251529713, "train_speed(iter/s)": 0.230945 }, { "epoch": 0.2684668260512734, "grad_norm": 0.8472526669502258, "learning_rate": 9.807275186604061e-05, "loss": 0.9541479110717773, "memory(GiB)": 91.52, "step": 20690, "token_acc": 0.7397524976359824, "train_speed(iter/s)": 0.230895 }, { "epoch": 0.2685317044529291, "grad_norm": 0.8873759508132935, "learning_rate": 9.807127676029378e-05, "loss": 0.9775560379028321, "memory(GiB)": 91.52, "step": 20695, "token_acc": 0.7424194190210903, "train_speed(iter/s)": 0.230841 }, { "epoch": 0.2685965828545848, "grad_norm": 0.8137699365615845, "learning_rate": 9.806980110134575e-05, "loss": 0.9377897262573243, "memory(GiB)": 91.52, "step": 20700, "token_acc": 0.7641024150658258, "train_speed(iter/s)": 0.230788 }, { "epoch": 0.26866146125624046, "grad_norm": 0.8001980185508728, "learning_rate": 9.806832488921358e-05, "loss": 0.9207477569580078, "memory(GiB)": 91.52, "step": 20705, "token_acc": 0.7306388080150711, "train_speed(iter/s)": 0.230737 }, { "epoch": 0.26872633965789616, "grad_norm": 0.972815752029419, "learning_rate": 9.806684812391422e-05, "loss": 1.0059642791748047, "memory(GiB)": 91.52, "step": 20710, "token_acc": 0.7474870134250826, "train_speed(iter/s)": 0.230692 }, { "epoch": 0.26879121805955186, "grad_norm": 0.7738910913467407, "learning_rate": 9.806537080546465e-05, "loss": 0.9604607582092285, "memory(GiB)": 91.52, "step": 20715, "token_acc": 0.7423848384300362, "train_speed(iter/s)": 0.230638 }, { "epoch": 0.26885609646120756, "grad_norm": 0.8341801166534424, "learning_rate": 9.806389293388189e-05, "loss": 0.9819877624511719, "memory(GiB)": 91.52, "step": 20720, "token_acc": 0.7474205216067009, "train_speed(iter/s)": 0.23059 }, { "epoch": 0.26892097486286326, "grad_norm": 0.9137006998062134, "learning_rate": 9.806241450918294e-05, "loss": 0.9708271980285644, "memory(GiB)": 91.52, "step": 20725, "token_acc": 0.7604373354930147, "train_speed(iter/s)": 0.23054 }, { "epoch": 0.26898585326451896, "grad_norm": 0.848081111907959, "learning_rate": 9.806093553138483e-05, "loss": 0.9503934860229493, "memory(GiB)": 91.52, "step": 20730, "token_acc": 0.754614549402823, "train_speed(iter/s)": 0.230489 }, { "epoch": 0.26905073166617466, "grad_norm": 0.8039512038230896, "learning_rate": 9.805945600050455e-05, "loss": 0.9411510467529297, "memory(GiB)": 91.52, "step": 20735, "token_acc": 0.766338487557381, "train_speed(iter/s)": 0.230438 }, { "epoch": 0.26911561006783036, "grad_norm": 0.8707062602043152, "learning_rate": 9.805797591655914e-05, "loss": 0.9734573364257812, "memory(GiB)": 91.52, "step": 20740, "token_acc": 0.7424046136487108, "train_speed(iter/s)": 0.230388 }, { "epoch": 0.26918048846948606, "grad_norm": 0.7787488698959351, "learning_rate": 9.805649527956565e-05, "loss": 0.9466739654541015, "memory(GiB)": 91.52, "step": 20745, "token_acc": 0.7506826160447276, "train_speed(iter/s)": 0.230339 }, { "epoch": 0.26924536687114176, "grad_norm": 0.862285315990448, "learning_rate": 9.80550140895411e-05, "loss": 0.9434949874877929, "memory(GiB)": 91.52, "step": 20750, "token_acc": 0.7537203477235892, "train_speed(iter/s)": 0.230294 }, { "epoch": 0.26931024527279745, "grad_norm": 0.782924473285675, "learning_rate": 9.805353234650254e-05, "loss": 0.9106916427612305, "memory(GiB)": 91.52, "step": 20755, "token_acc": 0.7499393477281392, "train_speed(iter/s)": 0.23025 }, { "epoch": 0.26937512367445315, "grad_norm": 0.7251623868942261, "learning_rate": 9.805205005046703e-05, "loss": 0.9701714515686035, "memory(GiB)": 91.52, "step": 20760, "token_acc": 0.730700494095385, "train_speed(iter/s)": 0.2302 }, { "epoch": 0.26944000207610885, "grad_norm": 0.8223749399185181, "learning_rate": 9.805056720145162e-05, "loss": 0.9591178894042969, "memory(GiB)": 91.52, "step": 20765, "token_acc": 0.7466766691672918, "train_speed(iter/s)": 0.230149 }, { "epoch": 0.26950488047776455, "grad_norm": 0.7819066047668457, "learning_rate": 9.804908379947338e-05, "loss": 0.9797487258911133, "memory(GiB)": 91.52, "step": 20770, "token_acc": 0.7490104235387254, "train_speed(iter/s)": 0.230098 }, { "epoch": 0.26956975887942025, "grad_norm": 0.8190474510192871, "learning_rate": 9.804759984454935e-05, "loss": 1.033797264099121, "memory(GiB)": 91.52, "step": 20775, "token_acc": 0.72264631043257, "train_speed(iter/s)": 0.230048 }, { "epoch": 0.26963463728107595, "grad_norm": 0.8288745284080505, "learning_rate": 9.804611533669666e-05, "loss": 1.0107895851135253, "memory(GiB)": 91.52, "step": 20780, "token_acc": 0.7402530171158637, "train_speed(iter/s)": 0.229999 }, { "epoch": 0.26969951568273165, "grad_norm": 0.8033790588378906, "learning_rate": 9.804463027593236e-05, "loss": 0.9847274780273437, "memory(GiB)": 91.52, "step": 20785, "token_acc": 0.7324514139199584, "train_speed(iter/s)": 0.22995 }, { "epoch": 0.26976439408438735, "grad_norm": 0.9269682168960571, "learning_rate": 9.804314466227353e-05, "loss": 0.9054056167602539, "memory(GiB)": 91.52, "step": 20790, "token_acc": 0.7623541628489402, "train_speed(iter/s)": 0.229901 }, { "epoch": 0.26982927248604305, "grad_norm": 0.7875323295593262, "learning_rate": 9.80416584957373e-05, "loss": 1.0085514068603516, "memory(GiB)": 91.52, "step": 20795, "token_acc": 0.7313391970977788, "train_speed(iter/s)": 0.229854 }, { "epoch": 0.26989415088769875, "grad_norm": 0.8817440271377563, "learning_rate": 9.804017177634075e-05, "loss": 0.9885212898254394, "memory(GiB)": 91.52, "step": 20800, "token_acc": 0.725733972438586, "train_speed(iter/s)": 0.229807 }, { "epoch": 0.26995902928935445, "grad_norm": 0.7265475392341614, "learning_rate": 9.803868450410101e-05, "loss": 0.9741020202636719, "memory(GiB)": 91.52, "step": 20805, "token_acc": 0.7238698300737416, "train_speed(iter/s)": 0.229759 }, { "epoch": 0.27002390769101015, "grad_norm": 0.7464887499809265, "learning_rate": 9.803719667903517e-05, "loss": 0.9949625968933106, "memory(GiB)": 91.52, "step": 20810, "token_acc": 0.7605146502324576, "train_speed(iter/s)": 0.22971 }, { "epoch": 0.27008878609266584, "grad_norm": 0.8445771932601929, "learning_rate": 9.803570830116035e-05, "loss": 0.9529098510742188, "memory(GiB)": 91.52, "step": 20815, "token_acc": 0.7548283627926022, "train_speed(iter/s)": 0.229661 }, { "epoch": 0.27015366449432154, "grad_norm": 0.729854166507721, "learning_rate": 9.803421937049371e-05, "loss": 0.9788396835327149, "memory(GiB)": 91.52, "step": 20820, "token_acc": 0.7333543594056631, "train_speed(iter/s)": 0.22961 }, { "epoch": 0.2702185428959772, "grad_norm": 0.8237418532371521, "learning_rate": 9.803272988705235e-05, "loss": 0.9740761756896973, "memory(GiB)": 91.52, "step": 20825, "token_acc": 0.7430947842661584, "train_speed(iter/s)": 0.229561 }, { "epoch": 0.2702834212976329, "grad_norm": 0.8047208189964294, "learning_rate": 9.803123985085342e-05, "loss": 0.9401844024658204, "memory(GiB)": 91.52, "step": 20830, "token_acc": 0.7635615701049556, "train_speed(iter/s)": 0.22951 }, { "epoch": 0.2703482996992886, "grad_norm": 0.787090539932251, "learning_rate": 9.80297492619141e-05, "loss": 0.9339275360107422, "memory(GiB)": 91.52, "step": 20835, "token_acc": 0.7530124935314556, "train_speed(iter/s)": 0.229457 }, { "epoch": 0.2704131781009443, "grad_norm": 0.7839428782463074, "learning_rate": 9.802825812025151e-05, "loss": 0.9837064743041992, "memory(GiB)": 91.52, "step": 20840, "token_acc": 0.7450762935485105, "train_speed(iter/s)": 0.229404 }, { "epoch": 0.2704780565026, "grad_norm": 1.0053364038467407, "learning_rate": 9.80267664258828e-05, "loss": 0.9976565361022949, "memory(GiB)": 91.52, "step": 20845, "token_acc": 0.7393627954779034, "train_speed(iter/s)": 0.229354 }, { "epoch": 0.2705429349042557, "grad_norm": 0.8183138370513916, "learning_rate": 9.802527417882516e-05, "loss": 0.9937220573425293, "memory(GiB)": 91.52, "step": 20850, "token_acc": 0.7429067382041976, "train_speed(iter/s)": 0.229306 }, { "epoch": 0.2706078133059114, "grad_norm": 0.7944813966751099, "learning_rate": 9.802378137909576e-05, "loss": 0.9422065734863281, "memory(GiB)": 91.52, "step": 20855, "token_acc": 0.7367085893954552, "train_speed(iter/s)": 0.229259 }, { "epoch": 0.2706726917075671, "grad_norm": 0.9048489332199097, "learning_rate": 9.802228802671178e-05, "loss": 0.981959342956543, "memory(GiB)": 91.52, "step": 20860, "token_acc": 0.7399443856969412, "train_speed(iter/s)": 0.229216 }, { "epoch": 0.2707375701092228, "grad_norm": 0.8746182322502136, "learning_rate": 9.802079412169039e-05, "loss": 1.0027040481567382, "memory(GiB)": 91.52, "step": 20865, "token_acc": 0.7513070562497682, "train_speed(iter/s)": 0.229169 }, { "epoch": 0.2708024485108785, "grad_norm": 0.8919680714607239, "learning_rate": 9.80192996640488e-05, "loss": 0.9750099182128906, "memory(GiB)": 91.52, "step": 20870, "token_acc": 0.745720276656339, "train_speed(iter/s)": 0.22912 }, { "epoch": 0.2708673269125342, "grad_norm": 0.7909883856773376, "learning_rate": 9.801780465380418e-05, "loss": 0.9849617004394531, "memory(GiB)": 91.52, "step": 20875, "token_acc": 0.736359746095143, "train_speed(iter/s)": 0.229074 }, { "epoch": 0.2709322053141899, "grad_norm": 0.7876741290092468, "learning_rate": 9.801630909097377e-05, "loss": 0.9525190353393554, "memory(GiB)": 91.52, "step": 20880, "token_acc": 0.7382315643853514, "train_speed(iter/s)": 0.229024 }, { "epoch": 0.2709970837158456, "grad_norm": 0.8965265154838562, "learning_rate": 9.801481297557476e-05, "loss": 0.955905818939209, "memory(GiB)": 91.52, "step": 20885, "token_acc": 0.7430701309980459, "train_speed(iter/s)": 0.228976 }, { "epoch": 0.2710619621175013, "grad_norm": 0.7254857420921326, "learning_rate": 9.801331630762438e-05, "loss": 0.8898698806762695, "memory(GiB)": 91.52, "step": 20890, "token_acc": 0.7284305421103582, "train_speed(iter/s)": 0.228924 }, { "epoch": 0.271126840519157, "grad_norm": 0.8188930153846741, "learning_rate": 9.801181908713984e-05, "loss": 0.9704693794250489, "memory(GiB)": 91.52, "step": 20895, "token_acc": 0.7471812360801782, "train_speed(iter/s)": 0.228875 }, { "epoch": 0.2711917189208127, "grad_norm": 0.7606538534164429, "learning_rate": 9.801032131413837e-05, "loss": 0.8973976135253906, "memory(GiB)": 91.52, "step": 20900, "token_acc": 0.759921047163931, "train_speed(iter/s)": 0.228827 }, { "epoch": 0.2712565973224684, "grad_norm": 0.843082070350647, "learning_rate": 9.800882298863723e-05, "loss": 0.9676666259765625, "memory(GiB)": 91.52, "step": 20905, "token_acc": 0.7333938953488373, "train_speed(iter/s)": 0.22878 }, { "epoch": 0.2713214757241241, "grad_norm": 0.8478294610977173, "learning_rate": 9.800732411065362e-05, "loss": 0.964653205871582, "memory(GiB)": 91.52, "step": 20910, "token_acc": 0.7342583674452203, "train_speed(iter/s)": 0.228725 }, { "epoch": 0.27138635412577977, "grad_norm": 0.7790402770042419, "learning_rate": 9.800582468020484e-05, "loss": 0.9756986618041992, "memory(GiB)": 91.52, "step": 20915, "token_acc": 0.768774465522433, "train_speed(iter/s)": 0.228678 }, { "epoch": 0.27145123252743547, "grad_norm": 0.8185872435569763, "learning_rate": 9.800432469730811e-05, "loss": 1.0055615425109863, "memory(GiB)": 91.52, "step": 20920, "token_acc": 0.7365868218789643, "train_speed(iter/s)": 0.228634 }, { "epoch": 0.27151611092909117, "grad_norm": 0.8545237183570862, "learning_rate": 9.80028241619807e-05, "loss": 0.9509521484375, "memory(GiB)": 91.52, "step": 20925, "token_acc": 0.7525065556069721, "train_speed(iter/s)": 0.228591 }, { "epoch": 0.27158098933074687, "grad_norm": 0.8314809203147888, "learning_rate": 9.800132307423989e-05, "loss": 0.9795688629150391, "memory(GiB)": 91.52, "step": 20930, "token_acc": 0.7494736842105263, "train_speed(iter/s)": 0.228548 }, { "epoch": 0.27164586773240257, "grad_norm": 0.8993959426879883, "learning_rate": 9.799982143410292e-05, "loss": 1.042734432220459, "memory(GiB)": 91.52, "step": 20935, "token_acc": 0.7201657179426724, "train_speed(iter/s)": 0.228503 }, { "epoch": 0.27171074613405827, "grad_norm": 0.8209414482116699, "learning_rate": 9.799831924158711e-05, "loss": 0.9881317138671875, "memory(GiB)": 91.52, "step": 20940, "token_acc": 0.7510868175425137, "train_speed(iter/s)": 0.228456 }, { "epoch": 0.2717756245357139, "grad_norm": 0.7745116949081421, "learning_rate": 9.799681649670972e-05, "loss": 0.9557762145996094, "memory(GiB)": 91.52, "step": 20945, "token_acc": 0.7597416933436362, "train_speed(iter/s)": 0.228406 }, { "epoch": 0.2718405029373696, "grad_norm": 0.759699821472168, "learning_rate": 9.799531319948807e-05, "loss": 0.9660114288330078, "memory(GiB)": 91.52, "step": 20950, "token_acc": 0.7511310552837753, "train_speed(iter/s)": 0.228362 }, { "epoch": 0.2719053813390253, "grad_norm": 0.9168879985809326, "learning_rate": 9.799380934993943e-05, "loss": 0.9732624053955078, "memory(GiB)": 91.52, "step": 20955, "token_acc": 0.737877094972067, "train_speed(iter/s)": 0.228319 }, { "epoch": 0.271970259740681, "grad_norm": 0.8958728313446045, "learning_rate": 9.799230494808113e-05, "loss": 0.9768226623535157, "memory(GiB)": 91.52, "step": 20960, "token_acc": 0.736523869743914, "train_speed(iter/s)": 0.228271 }, { "epoch": 0.2720351381423367, "grad_norm": 0.7459481358528137, "learning_rate": 9.799079999393046e-05, "loss": 0.9150368690490722, "memory(GiB)": 91.52, "step": 20965, "token_acc": 0.7459219925889139, "train_speed(iter/s)": 0.228224 }, { "epoch": 0.2721000165439924, "grad_norm": 0.9173232913017273, "learning_rate": 9.798929448750476e-05, "loss": 0.9632033348083496, "memory(GiB)": 91.52, "step": 20970, "token_acc": 0.74078134343233, "train_speed(iter/s)": 0.228177 }, { "epoch": 0.2721648949456481, "grad_norm": 0.8073248863220215, "learning_rate": 9.798778842882136e-05, "loss": 0.9504834175109863, "memory(GiB)": 91.52, "step": 20975, "token_acc": 0.7478558433682752, "train_speed(iter/s)": 0.228128 }, { "epoch": 0.2722297733473038, "grad_norm": 0.7741050124168396, "learning_rate": 9.798628181789757e-05, "loss": 0.9556239128112793, "memory(GiB)": 91.52, "step": 20980, "token_acc": 0.7446779798252421, "train_speed(iter/s)": 0.228083 }, { "epoch": 0.2722946517489595, "grad_norm": 0.8960683345794678, "learning_rate": 9.798477465475074e-05, "loss": 1.0087343215942384, "memory(GiB)": 91.52, "step": 20985, "token_acc": 0.7226241689479859, "train_speed(iter/s)": 0.228036 }, { "epoch": 0.2723595301506152, "grad_norm": 0.8860099911689758, "learning_rate": 9.798326693939821e-05, "loss": 0.9373861312866211, "memory(GiB)": 91.52, "step": 20990, "token_acc": 0.7394647536406203, "train_speed(iter/s)": 0.227991 }, { "epoch": 0.2724244085522709, "grad_norm": 0.806562066078186, "learning_rate": 9.798175867185732e-05, "loss": 0.9662758827209472, "memory(GiB)": 91.52, "step": 20995, "token_acc": 0.74639475600874, "train_speed(iter/s)": 0.227948 }, { "epoch": 0.2724892869539266, "grad_norm": 0.7937402725219727, "learning_rate": 9.798024985214545e-05, "loss": 0.9376752853393555, "memory(GiB)": 91.52, "step": 21000, "token_acc": 0.7453282223287014, "train_speed(iter/s)": 0.227907 }, { "epoch": 0.2725541653555823, "grad_norm": 0.8361474871635437, "learning_rate": 9.797874048027996e-05, "loss": 0.9595085144042969, "memory(GiB)": 91.52, "step": 21005, "token_acc": 0.7454336708068051, "train_speed(iter/s)": 0.227859 }, { "epoch": 0.272619043757238, "grad_norm": 0.8884007334709167, "learning_rate": 9.797723055627821e-05, "loss": 0.9720357894897461, "memory(GiB)": 91.52, "step": 21010, "token_acc": 0.7445574480251603, "train_speed(iter/s)": 0.227815 }, { "epoch": 0.2726839221588937, "grad_norm": 0.9172837138175964, "learning_rate": 9.797572008015757e-05, "loss": 0.9561245918273926, "memory(GiB)": 91.52, "step": 21015, "token_acc": 0.7518126745974449, "train_speed(iter/s)": 0.227763 }, { "epoch": 0.2727488005605494, "grad_norm": 0.8234944939613342, "learning_rate": 9.797420905193544e-05, "loss": 0.9266239166259765, "memory(GiB)": 91.52, "step": 21020, "token_acc": 0.7704410441044104, "train_speed(iter/s)": 0.227713 }, { "epoch": 0.2728136789622051, "grad_norm": 0.8700910806655884, "learning_rate": 9.797269747162919e-05, "loss": 1.0398063659667969, "memory(GiB)": 91.52, "step": 21025, "token_acc": 0.717224640752013, "train_speed(iter/s)": 0.227667 }, { "epoch": 0.2728785573638608, "grad_norm": 0.9071058034896851, "learning_rate": 9.797118533925622e-05, "loss": 0.983216381072998, "memory(GiB)": 91.52, "step": 21030, "token_acc": 0.7479150995846552, "train_speed(iter/s)": 0.227622 }, { "epoch": 0.2729434357655165, "grad_norm": 0.8239136934280396, "learning_rate": 9.796967265483396e-05, "loss": 0.9656274795532227, "memory(GiB)": 91.52, "step": 21035, "token_acc": 0.7467457189228578, "train_speed(iter/s)": 0.227573 }, { "epoch": 0.2730083141671722, "grad_norm": 0.7565280795097351, "learning_rate": 9.796815941837978e-05, "loss": 1.0148361206054688, "memory(GiB)": 91.52, "step": 21040, "token_acc": 0.728869690424766, "train_speed(iter/s)": 0.227527 }, { "epoch": 0.2730731925688279, "grad_norm": 0.7847192287445068, "learning_rate": 9.79666456299111e-05, "loss": 0.9767778396606446, "memory(GiB)": 91.52, "step": 21045, "token_acc": 0.7571838606446505, "train_speed(iter/s)": 0.227476 }, { "epoch": 0.2731380709704836, "grad_norm": 0.780741274356842, "learning_rate": 9.796513128944535e-05, "loss": 0.9471027374267578, "memory(GiB)": 91.52, "step": 21050, "token_acc": 0.7331137845285302, "train_speed(iter/s)": 0.227428 }, { "epoch": 0.2732029493721393, "grad_norm": 0.8241477608680725, "learning_rate": 9.796361639699998e-05, "loss": 0.9949974060058594, "memory(GiB)": 91.52, "step": 21055, "token_acc": 0.7278002650714618, "train_speed(iter/s)": 0.227381 }, { "epoch": 0.273267827773795, "grad_norm": 0.8688598871231079, "learning_rate": 9.796210095259239e-05, "loss": 0.9368365287780762, "memory(GiB)": 91.52, "step": 21060, "token_acc": 0.7354440846137945, "train_speed(iter/s)": 0.227335 }, { "epoch": 0.27333270617545063, "grad_norm": 0.8730815052986145, "learning_rate": 9.796058495624001e-05, "loss": 1.0088716506958009, "memory(GiB)": 91.52, "step": 21065, "token_acc": 0.725769884753448, "train_speed(iter/s)": 0.227289 }, { "epoch": 0.27339758457710633, "grad_norm": 0.8407790064811707, "learning_rate": 9.795906840796031e-05, "loss": 0.9539487838745118, "memory(GiB)": 91.52, "step": 21070, "token_acc": 0.745316115548257, "train_speed(iter/s)": 0.227244 }, { "epoch": 0.27346246297876203, "grad_norm": 0.8047857284545898, "learning_rate": 9.795755130777075e-05, "loss": 0.9087621688842773, "memory(GiB)": 91.52, "step": 21075, "token_acc": 0.7740092103936267, "train_speed(iter/s)": 0.227198 }, { "epoch": 0.27352734138041773, "grad_norm": 0.7512270212173462, "learning_rate": 9.795603365568877e-05, "loss": 0.9698062896728515, "memory(GiB)": 91.52, "step": 21080, "token_acc": 0.7398055051920224, "train_speed(iter/s)": 0.227147 }, { "epoch": 0.27359221978207343, "grad_norm": 0.8184128403663635, "learning_rate": 9.795451545173185e-05, "loss": 1.0219975471496583, "memory(GiB)": 91.52, "step": 21085, "token_acc": 0.7443440428380187, "train_speed(iter/s)": 0.2271 }, { "epoch": 0.27365709818372913, "grad_norm": 0.7722994089126587, "learning_rate": 9.795299669591745e-05, "loss": 0.9874195098876953, "memory(GiB)": 91.52, "step": 21090, "token_acc": 0.7354203935599285, "train_speed(iter/s)": 0.227052 }, { "epoch": 0.27372197658538483, "grad_norm": 0.7827503085136414, "learning_rate": 9.795147738826305e-05, "loss": 0.9787317276000976, "memory(GiB)": 91.52, "step": 21095, "token_acc": 0.7357669756005296, "train_speed(iter/s)": 0.227007 }, { "epoch": 0.27378685498704053, "grad_norm": 0.751297652721405, "learning_rate": 9.794995752878613e-05, "loss": 0.9281147003173829, "memory(GiB)": 91.52, "step": 21100, "token_acc": 0.7544633406252205, "train_speed(iter/s)": 0.226957 }, { "epoch": 0.27385173338869623, "grad_norm": 0.8522372841835022, "learning_rate": 9.794843711750417e-05, "loss": 0.9782655715942383, "memory(GiB)": 91.52, "step": 21105, "token_acc": 0.730701844790271, "train_speed(iter/s)": 0.226909 }, { "epoch": 0.2739166117903519, "grad_norm": 0.817785382270813, "learning_rate": 9.794691615443469e-05, "loss": 1.0187429428100585, "memory(GiB)": 91.52, "step": 21110, "token_acc": 0.7099477216269326, "train_speed(iter/s)": 0.226865 }, { "epoch": 0.2739814901920076, "grad_norm": 0.767064094543457, "learning_rate": 9.79453946395952e-05, "loss": 0.94723482131958, "memory(GiB)": 91.52, "step": 21115, "token_acc": 0.7570195142496139, "train_speed(iter/s)": 0.22682 }, { "epoch": 0.2740463685936633, "grad_norm": 0.9192124605178833, "learning_rate": 9.79438725730032e-05, "loss": 1.025661087036133, "memory(GiB)": 91.52, "step": 21120, "token_acc": 0.720554926387316, "train_speed(iter/s)": 0.226776 }, { "epoch": 0.274111246995319, "grad_norm": 0.8544924259185791, "learning_rate": 9.794234995467618e-05, "loss": 0.9771215438842773, "memory(GiB)": 91.52, "step": 21125, "token_acc": 0.7359368345182321, "train_speed(iter/s)": 0.226733 }, { "epoch": 0.2741761253969747, "grad_norm": 0.8948453664779663, "learning_rate": 9.794082678463168e-05, "loss": 0.9826698303222656, "memory(GiB)": 91.52, "step": 21130, "token_acc": 0.75002645409333, "train_speed(iter/s)": 0.226688 }, { "epoch": 0.2742410037986304, "grad_norm": 0.8750212788581848, "learning_rate": 9.793930306288723e-05, "loss": 0.9955509185791016, "memory(GiB)": 91.52, "step": 21135, "token_acc": 0.738590341353844, "train_speed(iter/s)": 0.226637 }, { "epoch": 0.2743058822002861, "grad_norm": 0.8051549196243286, "learning_rate": 9.793777878946038e-05, "loss": 1.0257444381713867, "memory(GiB)": 91.52, "step": 21140, "token_acc": 0.7212643678160919, "train_speed(iter/s)": 0.226595 }, { "epoch": 0.2743707606019418, "grad_norm": 0.8338591456413269, "learning_rate": 9.793625396436865e-05, "loss": 0.9958317756652832, "memory(GiB)": 91.52, "step": 21145, "token_acc": 0.751412632715494, "train_speed(iter/s)": 0.226552 }, { "epoch": 0.2744356390035975, "grad_norm": 0.9074296355247498, "learning_rate": 9.79347285876296e-05, "loss": 0.9695674896240234, "memory(GiB)": 91.52, "step": 21150, "token_acc": 0.7418872758326217, "train_speed(iter/s)": 0.226507 }, { "epoch": 0.2745005174052532, "grad_norm": 0.8221033811569214, "learning_rate": 9.793320265926078e-05, "loss": 0.9840201377868653, "memory(GiB)": 91.52, "step": 21155, "token_acc": 0.736162193379738, "train_speed(iter/s)": 0.226464 }, { "epoch": 0.2745653958069089, "grad_norm": 0.7946078181266785, "learning_rate": 9.793167617927974e-05, "loss": 0.9624773979187011, "memory(GiB)": 91.52, "step": 21160, "token_acc": 0.722217625155151, "train_speed(iter/s)": 0.226424 }, { "epoch": 0.2746302742085646, "grad_norm": 0.8472609519958496, "learning_rate": 9.793014914770405e-05, "loss": 0.9636016845703125, "memory(GiB)": 91.52, "step": 21165, "token_acc": 0.7488072699734949, "train_speed(iter/s)": 0.226382 }, { "epoch": 0.2746951526102203, "grad_norm": 0.7715975046157837, "learning_rate": 9.79286215645513e-05, "loss": 0.9740085601806641, "memory(GiB)": 91.52, "step": 21170, "token_acc": 0.7552092453160567, "train_speed(iter/s)": 0.226334 }, { "epoch": 0.274760031011876, "grad_norm": 0.7533130645751953, "learning_rate": 9.792709342983906e-05, "loss": 0.9074460983276367, "memory(GiB)": 91.52, "step": 21175, "token_acc": 0.7652337827316313, "train_speed(iter/s)": 0.226285 }, { "epoch": 0.2748249094135317, "grad_norm": 0.7438366413116455, "learning_rate": 9.79255647435849e-05, "loss": 0.9520670890808105, "memory(GiB)": 91.52, "step": 21180, "token_acc": 0.7389166376272893, "train_speed(iter/s)": 0.226241 }, { "epoch": 0.27488978781518736, "grad_norm": 0.8735015392303467, "learning_rate": 9.792403550580643e-05, "loss": 1.0102268218994142, "memory(GiB)": 91.52, "step": 21185, "token_acc": 0.7344316170894076, "train_speed(iter/s)": 0.2262 }, { "epoch": 0.27495466621684306, "grad_norm": 0.8656796813011169, "learning_rate": 9.792250571652125e-05, "loss": 0.9649183273315429, "memory(GiB)": 91.52, "step": 21190, "token_acc": 0.7614803942126231, "train_speed(iter/s)": 0.226158 }, { "epoch": 0.27501954461849876, "grad_norm": 0.7966575622558594, "learning_rate": 9.792097537574696e-05, "loss": 0.9902006149291992, "memory(GiB)": 91.52, "step": 21195, "token_acc": 0.7196987848707855, "train_speed(iter/s)": 0.226115 }, { "epoch": 0.27508442302015446, "grad_norm": 0.8548538684844971, "learning_rate": 9.791944448350116e-05, "loss": 0.958040428161621, "memory(GiB)": 91.52, "step": 21200, "token_acc": 0.7488376361189282, "train_speed(iter/s)": 0.226069 }, { "epoch": 0.27514930142181016, "grad_norm": 0.7592957019805908, "learning_rate": 9.791791303980148e-05, "loss": 0.9831403732299805, "memory(GiB)": 91.52, "step": 21205, "token_acc": 0.7404227003390474, "train_speed(iter/s)": 0.226022 }, { "epoch": 0.27521417982346585, "grad_norm": 0.8459081649780273, "learning_rate": 9.791638104466554e-05, "loss": 0.9713271141052247, "memory(GiB)": 91.52, "step": 21210, "token_acc": 0.7515941321997283, "train_speed(iter/s)": 0.225979 }, { "epoch": 0.27527905822512155, "grad_norm": 0.9397168755531311, "learning_rate": 9.791484849811098e-05, "loss": 0.9739777565002441, "memory(GiB)": 91.52, "step": 21215, "token_acc": 0.7346995101083281, "train_speed(iter/s)": 0.225934 }, { "epoch": 0.27534393662677725, "grad_norm": 0.8808263540267944, "learning_rate": 9.791331540015542e-05, "loss": 0.9643915176391602, "memory(GiB)": 91.52, "step": 21220, "token_acc": 0.7390500622477082, "train_speed(iter/s)": 0.225891 }, { "epoch": 0.27540881502843295, "grad_norm": 0.9752432107925415, "learning_rate": 9.79117817508165e-05, "loss": 0.9749538421630859, "memory(GiB)": 91.52, "step": 21225, "token_acc": 0.7335042353990192, "train_speed(iter/s)": 0.225847 }, { "epoch": 0.27547369343008865, "grad_norm": 0.8423094749450684, "learning_rate": 9.79102475501119e-05, "loss": 0.9871770858764648, "memory(GiB)": 91.52, "step": 21230, "token_acc": 0.742165483991444, "train_speed(iter/s)": 0.225798 }, { "epoch": 0.27553857183174435, "grad_norm": 0.7540364861488342, "learning_rate": 9.790871279805925e-05, "loss": 0.890168571472168, "memory(GiB)": 91.52, "step": 21235, "token_acc": 0.7757254917114262, "train_speed(iter/s)": 0.22575 }, { "epoch": 0.27560345023340005, "grad_norm": 0.8091983795166016, "learning_rate": 9.79071774946762e-05, "loss": 0.9065481185913086, "memory(GiB)": 91.52, "step": 21240, "token_acc": 0.752714507306719, "train_speed(iter/s)": 0.225703 }, { "epoch": 0.27566832863505575, "grad_norm": 0.7303376793861389, "learning_rate": 9.790564163998045e-05, "loss": 0.9769521713256836, "memory(GiB)": 91.52, "step": 21245, "token_acc": 0.730624980301932, "train_speed(iter/s)": 0.225654 }, { "epoch": 0.27573320703671145, "grad_norm": 0.7923125624656677, "learning_rate": 9.790410523398966e-05, "loss": 0.9540234565734863, "memory(GiB)": 91.52, "step": 21250, "token_acc": 0.7508078016117102, "train_speed(iter/s)": 0.225606 }, { "epoch": 0.27579808543836715, "grad_norm": 0.9261542558670044, "learning_rate": 9.79025682767215e-05, "loss": 0.9334018707275391, "memory(GiB)": 91.52, "step": 21255, "token_acc": 0.7717074841582463, "train_speed(iter/s)": 0.225558 }, { "epoch": 0.27586296384002285, "grad_norm": 0.9398972988128662, "learning_rate": 9.790103076819367e-05, "loss": 0.9810643196105957, "memory(GiB)": 91.52, "step": 21260, "token_acc": 0.7298309980094454, "train_speed(iter/s)": 0.225515 }, { "epoch": 0.27592784224167854, "grad_norm": 0.8049576878547668, "learning_rate": 9.789949270842387e-05, "loss": 0.9374639511108398, "memory(GiB)": 91.52, "step": 21265, "token_acc": 0.7496724780812254, "train_speed(iter/s)": 0.225469 }, { "epoch": 0.27599272064333424, "grad_norm": 0.7497462630271912, "learning_rate": 9.789795409742978e-05, "loss": 0.9541641235351562, "memory(GiB)": 91.52, "step": 21270, "token_acc": 0.7480861669930567, "train_speed(iter/s)": 0.225423 }, { "epoch": 0.27605759904498994, "grad_norm": 0.8438789248466492, "learning_rate": 9.789641493522912e-05, "loss": 0.9732002258300781, "memory(GiB)": 91.52, "step": 21275, "token_acc": 0.7513965499068966, "train_speed(iter/s)": 0.225378 }, { "epoch": 0.27612247744664564, "grad_norm": 0.7441282868385315, "learning_rate": 9.78948752218396e-05, "loss": 0.9524846076965332, "memory(GiB)": 91.52, "step": 21280, "token_acc": 0.7636944961012692, "train_speed(iter/s)": 0.225333 }, { "epoch": 0.27618735584830134, "grad_norm": 0.8671967387199402, "learning_rate": 9.789333495727893e-05, "loss": 0.9927375793457032, "memory(GiB)": 91.52, "step": 21285, "token_acc": 0.738320192871571, "train_speed(iter/s)": 0.22529 }, { "epoch": 0.27625223424995704, "grad_norm": 0.8296175003051758, "learning_rate": 9.789179414156486e-05, "loss": 1.0030193328857422, "memory(GiB)": 91.52, "step": 21290, "token_acc": 0.7414871730276693, "train_speed(iter/s)": 0.225251 }, { "epoch": 0.27631711265161274, "grad_norm": 0.777190625667572, "learning_rate": 9.78902527747151e-05, "loss": 0.9952360153198242, "memory(GiB)": 91.52, "step": 21295, "token_acc": 0.7186883701763165, "train_speed(iter/s)": 0.225208 }, { "epoch": 0.27638199105326844, "grad_norm": 0.8124163746833801, "learning_rate": 9.788871085674738e-05, "loss": 1.0143193244934081, "memory(GiB)": 91.52, "step": 21300, "token_acc": 0.7230168167147953, "train_speed(iter/s)": 0.225164 }, { "epoch": 0.2764468694549241, "grad_norm": 0.8104662299156189, "learning_rate": 9.788716838767947e-05, "loss": 0.977332878112793, "memory(GiB)": 91.52, "step": 21305, "token_acc": 0.7404992199687987, "train_speed(iter/s)": 0.225119 }, { "epoch": 0.2765117478565798, "grad_norm": 0.9081911444664001, "learning_rate": 9.788562536752911e-05, "loss": 0.9810894966125489, "memory(GiB)": 91.52, "step": 21310, "token_acc": 0.7487939967839914, "train_speed(iter/s)": 0.225074 }, { "epoch": 0.2765766262582355, "grad_norm": 0.7084597945213318, "learning_rate": 9.788408179631405e-05, "loss": 0.9405341148376465, "memory(GiB)": 91.52, "step": 21315, "token_acc": 0.7620788372605153, "train_speed(iter/s)": 0.22503 }, { "epoch": 0.2766415046598912, "grad_norm": 0.762983500957489, "learning_rate": 9.788253767405206e-05, "loss": 0.9512006759643554, "memory(GiB)": 91.52, "step": 21320, "token_acc": 0.7363879365448311, "train_speed(iter/s)": 0.224982 }, { "epoch": 0.2767063830615469, "grad_norm": 0.8306678533554077, "learning_rate": 9.78809930007609e-05, "loss": 0.9652883529663085, "memory(GiB)": 91.52, "step": 21325, "token_acc": 0.7655621301775148, "train_speed(iter/s)": 0.224939 }, { "epoch": 0.2767712614632026, "grad_norm": 0.7925214767456055, "learning_rate": 9.787944777645837e-05, "loss": 0.946841812133789, "memory(GiB)": 91.52, "step": 21330, "token_acc": 0.744331641285956, "train_speed(iter/s)": 0.224894 }, { "epoch": 0.2768361398648583, "grad_norm": 0.8093237280845642, "learning_rate": 9.787790200116222e-05, "loss": 0.9728555679321289, "memory(GiB)": 91.52, "step": 21335, "token_acc": 0.7458148787628557, "train_speed(iter/s)": 0.224853 }, { "epoch": 0.276901018266514, "grad_norm": 0.9194319248199463, "learning_rate": 9.787635567489028e-05, "loss": 0.9707344055175782, "memory(GiB)": 91.52, "step": 21340, "token_acc": 0.7369580074417192, "train_speed(iter/s)": 0.224805 }, { "epoch": 0.2769658966681697, "grad_norm": 0.7561703324317932, "learning_rate": 9.787480879766031e-05, "loss": 0.9778984069824219, "memory(GiB)": 91.52, "step": 21345, "token_acc": 0.7469314885070297, "train_speed(iter/s)": 0.224757 }, { "epoch": 0.2770307750698254, "grad_norm": 0.8783120512962341, "learning_rate": 9.787326136949011e-05, "loss": 0.9943760871887207, "memory(GiB)": 91.52, "step": 21350, "token_acc": 0.7255230763367506, "train_speed(iter/s)": 0.224715 }, { "epoch": 0.2770956534714811, "grad_norm": 0.8432942032814026, "learning_rate": 9.78717133903975e-05, "loss": 0.9915300369262695, "memory(GiB)": 91.52, "step": 21355, "token_acc": 0.7380149441248429, "train_speed(iter/s)": 0.224669 }, { "epoch": 0.2771605318731368, "grad_norm": 0.8839889168739319, "learning_rate": 9.78701648604003e-05, "loss": 0.9606081962585449, "memory(GiB)": 91.52, "step": 21360, "token_acc": 0.767835848513508, "train_speed(iter/s)": 0.224628 }, { "epoch": 0.27722541027479247, "grad_norm": 0.9101589918136597, "learning_rate": 9.786861577951632e-05, "loss": 1.0081117630004883, "memory(GiB)": 91.52, "step": 21365, "token_acc": 0.752293254304262, "train_speed(iter/s)": 0.224585 }, { "epoch": 0.27729028867644817, "grad_norm": 0.8122881650924683, "learning_rate": 9.78670661477634e-05, "loss": 0.9849809646606446, "memory(GiB)": 91.52, "step": 21370, "token_acc": 0.755278869957769, "train_speed(iter/s)": 0.22454 }, { "epoch": 0.27735516707810387, "grad_norm": 0.7549201846122742, "learning_rate": 9.786551596515935e-05, "loss": 0.931862735748291, "memory(GiB)": 91.52, "step": 21375, "token_acc": 0.7430081855388813, "train_speed(iter/s)": 0.224492 }, { "epoch": 0.27742004547975957, "grad_norm": 0.8233435153961182, "learning_rate": 9.786396523172203e-05, "loss": 0.9293117523193359, "memory(GiB)": 91.52, "step": 21380, "token_acc": 0.7515062092708718, "train_speed(iter/s)": 0.224445 }, { "epoch": 0.27748492388141527, "grad_norm": 0.8164054751396179, "learning_rate": 9.786241394746927e-05, "loss": 0.9765100479125977, "memory(GiB)": 91.52, "step": 21385, "token_acc": 0.73207209510418, "train_speed(iter/s)": 0.224397 }, { "epoch": 0.27754980228307097, "grad_norm": 0.8532091379165649, "learning_rate": 9.786086211241894e-05, "loss": 0.9677194595336914, "memory(GiB)": 91.52, "step": 21390, "token_acc": 0.7479317021651117, "train_speed(iter/s)": 0.224352 }, { "epoch": 0.27761468068472667, "grad_norm": 0.8198643922805786, "learning_rate": 9.785930972658889e-05, "loss": 0.9567408561706543, "memory(GiB)": 91.52, "step": 21395, "token_acc": 0.7358027117311849, "train_speed(iter/s)": 0.224307 }, { "epoch": 0.27767955908638237, "grad_norm": 1.0518698692321777, "learning_rate": 9.785775678999698e-05, "loss": 1.0203115463256835, "memory(GiB)": 91.52, "step": 21400, "token_acc": 0.7226967180703836, "train_speed(iter/s)": 0.22427 }, { "epoch": 0.27774443748803807, "grad_norm": 0.7431901693344116, "learning_rate": 9.785620330266109e-05, "loss": 0.955318832397461, "memory(GiB)": 91.52, "step": 21405, "token_acc": 0.7654643823264202, "train_speed(iter/s)": 0.224223 }, { "epoch": 0.27780931588969376, "grad_norm": 0.7449613213539124, "learning_rate": 9.785464926459909e-05, "loss": 0.9235830307006836, "memory(GiB)": 91.52, "step": 21410, "token_acc": 0.7633569513596323, "train_speed(iter/s)": 0.224183 }, { "epoch": 0.27787419429134946, "grad_norm": 0.9069463014602661, "learning_rate": 9.785309467582886e-05, "loss": 0.9633302688598633, "memory(GiB)": 91.52, "step": 21415, "token_acc": 0.7793519377631861, "train_speed(iter/s)": 0.224138 }, { "epoch": 0.27793907269300516, "grad_norm": 0.9035757184028625, "learning_rate": 9.78515395363683e-05, "loss": 0.9474138259887696, "memory(GiB)": 91.52, "step": 21420, "token_acc": 0.7444854844834133, "train_speed(iter/s)": 0.22409 }, { "epoch": 0.2780039510946608, "grad_norm": 0.7802601456642151, "learning_rate": 9.78499838462353e-05, "loss": 0.9528528213500976, "memory(GiB)": 91.52, "step": 21425, "token_acc": 0.7289643485628625, "train_speed(iter/s)": 0.224046 }, { "epoch": 0.2780688294963165, "grad_norm": 0.7713150978088379, "learning_rate": 9.784842760544777e-05, "loss": 0.9480582237243652, "memory(GiB)": 91.52, "step": 21430, "token_acc": 0.7288469399130467, "train_speed(iter/s)": 0.224 }, { "epoch": 0.2781337078979722, "grad_norm": 0.8631779551506042, "learning_rate": 9.784687081402362e-05, "loss": 0.9968036651611328, "memory(GiB)": 91.52, "step": 21435, "token_acc": 0.7493182921818385, "train_speed(iter/s)": 0.223956 }, { "epoch": 0.2781985862996279, "grad_norm": 0.7642533779144287, "learning_rate": 9.784531347198075e-05, "loss": 0.9621688842773437, "memory(GiB)": 91.52, "step": 21440, "token_acc": 0.7348733034699472, "train_speed(iter/s)": 0.223917 }, { "epoch": 0.2782634647012836, "grad_norm": 0.7542057633399963, "learning_rate": 9.78437555793371e-05, "loss": 0.9557703018188477, "memory(GiB)": 91.52, "step": 21445, "token_acc": 0.7506587004119197, "train_speed(iter/s)": 0.223875 }, { "epoch": 0.2783283431029393, "grad_norm": 0.7438122034072876, "learning_rate": 9.784219713611058e-05, "loss": 0.9539024353027343, "memory(GiB)": 91.52, "step": 21450, "token_acc": 0.7316679300003296, "train_speed(iter/s)": 0.223835 }, { "epoch": 0.278393221504595, "grad_norm": 0.8743171691894531, "learning_rate": 9.784063814231914e-05, "loss": 0.937199592590332, "memory(GiB)": 91.52, "step": 21455, "token_acc": 0.7573257248611968, "train_speed(iter/s)": 0.22379 }, { "epoch": 0.2784580999062507, "grad_norm": 0.7967373728752136, "learning_rate": 9.783907859798071e-05, "loss": 1.0104249000549317, "memory(GiB)": 91.52, "step": 21460, "token_acc": 0.7335187551680072, "train_speed(iter/s)": 0.223749 }, { "epoch": 0.2785229783079064, "grad_norm": 0.7768226861953735, "learning_rate": 9.783751850311324e-05, "loss": 0.9282342910766601, "memory(GiB)": 91.52, "step": 21465, "token_acc": 0.748433968356474, "train_speed(iter/s)": 0.223705 }, { "epoch": 0.2785878567095621, "grad_norm": 0.7184181809425354, "learning_rate": 9.78359578577347e-05, "loss": 0.9625404357910157, "memory(GiB)": 91.52, "step": 21470, "token_acc": 0.7594947915129696, "train_speed(iter/s)": 0.223661 }, { "epoch": 0.2786527351112178, "grad_norm": 0.7923840284347534, "learning_rate": 9.783439666186304e-05, "loss": 0.9336822509765625, "memory(GiB)": 91.52, "step": 21475, "token_acc": 0.7621290459664282, "train_speed(iter/s)": 0.223613 }, { "epoch": 0.2787176135128735, "grad_norm": 0.8790590167045593, "learning_rate": 9.783283491551621e-05, "loss": 0.9530932426452636, "memory(GiB)": 91.52, "step": 21480, "token_acc": 0.7532190536377281, "train_speed(iter/s)": 0.223568 }, { "epoch": 0.2787824919145292, "grad_norm": 0.8344899415969849, "learning_rate": 9.783127261871218e-05, "loss": 0.9359639167785645, "memory(GiB)": 91.52, "step": 21485, "token_acc": 0.7630144453529573, "train_speed(iter/s)": 0.223525 }, { "epoch": 0.2788473703161849, "grad_norm": 0.8179435133934021, "learning_rate": 9.782970977146896e-05, "loss": 0.9545145988464355, "memory(GiB)": 91.52, "step": 21490, "token_acc": 0.7571891523111035, "train_speed(iter/s)": 0.223484 }, { "epoch": 0.2789122487178406, "grad_norm": 0.7586804032325745, "learning_rate": 9.782814637380453e-05, "loss": 0.981259536743164, "memory(GiB)": 91.52, "step": 21495, "token_acc": 0.7449260140248988, "train_speed(iter/s)": 0.223436 }, { "epoch": 0.2789771271194963, "grad_norm": 0.7908377647399902, "learning_rate": 9.782658242573685e-05, "loss": 0.9538786888122559, "memory(GiB)": 91.52, "step": 21500, "token_acc": 0.7650250928230447, "train_speed(iter/s)": 0.223396 }, { "epoch": 0.279042005521152, "grad_norm": 0.7823823690414429, "learning_rate": 9.782501792728394e-05, "loss": 1.017143154144287, "memory(GiB)": 91.52, "step": 21505, "token_acc": 0.7374648076149618, "train_speed(iter/s)": 0.223354 }, { "epoch": 0.2791068839228077, "grad_norm": 0.9606181383132935, "learning_rate": 9.782345287846382e-05, "loss": 0.947971248626709, "memory(GiB)": 91.52, "step": 21510, "token_acc": 0.7623234119660414, "train_speed(iter/s)": 0.223308 }, { "epoch": 0.2791717623244634, "grad_norm": 0.774479866027832, "learning_rate": 9.782188727929447e-05, "loss": 0.9654895782470703, "memory(GiB)": 91.52, "step": 21515, "token_acc": 0.7516971029735157, "train_speed(iter/s)": 0.223265 }, { "epoch": 0.2792366407261191, "grad_norm": 0.8631022572517395, "learning_rate": 9.782032112979391e-05, "loss": 0.9550060272216797, "memory(GiB)": 91.52, "step": 21520, "token_acc": 0.7490958695514244, "train_speed(iter/s)": 0.223224 }, { "epoch": 0.2793015191277748, "grad_norm": 0.7679449319839478, "learning_rate": 9.781875442998018e-05, "loss": 0.9532633781433105, "memory(GiB)": 91.52, "step": 21525, "token_acc": 0.7635467980295566, "train_speed(iter/s)": 0.223178 }, { "epoch": 0.2793663975294305, "grad_norm": 0.9000017642974854, "learning_rate": 9.78171871798713e-05, "loss": 0.9509708404541015, "memory(GiB)": 91.52, "step": 21530, "token_acc": 0.7571448106112403, "train_speed(iter/s)": 0.223132 }, { "epoch": 0.2794312759310862, "grad_norm": 0.8588171005249023, "learning_rate": 9.78156193794853e-05, "loss": 0.9947460174560547, "memory(GiB)": 91.52, "step": 21535, "token_acc": 0.7534471193262279, "train_speed(iter/s)": 0.223088 }, { "epoch": 0.2794961543327419, "grad_norm": 0.7317374348640442, "learning_rate": 9.781405102884025e-05, "loss": 0.9348701477050781, "memory(GiB)": 91.52, "step": 21540, "token_acc": 0.7503439481896581, "train_speed(iter/s)": 0.223043 }, { "epoch": 0.27956103273439753, "grad_norm": 0.806836724281311, "learning_rate": 9.781248212795416e-05, "loss": 0.9931005477905274, "memory(GiB)": 91.52, "step": 21545, "token_acc": 0.7679068329382666, "train_speed(iter/s)": 0.222997 }, { "epoch": 0.27962591113605323, "grad_norm": 0.8810785412788391, "learning_rate": 9.78109126768451e-05, "loss": 1.0072389602661134, "memory(GiB)": 91.52, "step": 21550, "token_acc": 0.7449609339373119, "train_speed(iter/s)": 0.222952 }, { "epoch": 0.27969078953770893, "grad_norm": 0.8332881927490234, "learning_rate": 9.780934267553115e-05, "loss": 0.9042935371398926, "memory(GiB)": 91.52, "step": 21555, "token_acc": 0.7622023107757604, "train_speed(iter/s)": 0.222909 }, { "epoch": 0.2797556679393646, "grad_norm": 0.898820698261261, "learning_rate": 9.780777212403036e-05, "loss": 0.9327257156372071, "memory(GiB)": 91.52, "step": 21560, "token_acc": 0.7421011973735033, "train_speed(iter/s)": 0.222867 }, { "epoch": 0.2798205463410203, "grad_norm": 0.820088803768158, "learning_rate": 9.78062010223608e-05, "loss": 0.9805275917053222, "memory(GiB)": 91.52, "step": 21565, "token_acc": 0.7525118104997006, "train_speed(iter/s)": 0.222828 }, { "epoch": 0.279885424742676, "grad_norm": 0.7718658447265625, "learning_rate": 9.780462937054055e-05, "loss": 0.9552275657653808, "memory(GiB)": 91.52, "step": 21570, "token_acc": 0.7617707267144319, "train_speed(iter/s)": 0.222781 }, { "epoch": 0.2799503031443317, "grad_norm": 0.9367533326148987, "learning_rate": 9.78030571685877e-05, "loss": 0.9577156066894531, "memory(GiB)": 91.52, "step": 21575, "token_acc": 0.7488706650913892, "train_speed(iter/s)": 0.22274 }, { "epoch": 0.2800151815459874, "grad_norm": 0.8117251992225647, "learning_rate": 9.780148441652035e-05, "loss": 0.9710678100585938, "memory(GiB)": 91.52, "step": 21580, "token_acc": 0.7225732537147619, "train_speed(iter/s)": 0.222699 }, { "epoch": 0.2800800599476431, "grad_norm": 0.7594039440155029, "learning_rate": 9.77999111143566e-05, "loss": 0.9688579559326171, "memory(GiB)": 91.52, "step": 21585, "token_acc": 0.7420672027928433, "train_speed(iter/s)": 0.222657 }, { "epoch": 0.2801449383492988, "grad_norm": 0.8398513197898865, "learning_rate": 9.779833726211453e-05, "loss": 0.929775333404541, "memory(GiB)": 91.52, "step": 21590, "token_acc": 0.7480271896241893, "train_speed(iter/s)": 0.222613 }, { "epoch": 0.2802098167509545, "grad_norm": 0.9727306365966797, "learning_rate": 9.779676285981229e-05, "loss": 0.9896013259887695, "memory(GiB)": 91.52, "step": 21595, "token_acc": 0.7465195784724915, "train_speed(iter/s)": 0.222569 }, { "epoch": 0.2802746951526102, "grad_norm": 0.7402079105377197, "learning_rate": 9.779518790746798e-05, "loss": 0.9435401916503906, "memory(GiB)": 91.52, "step": 21600, "token_acc": 0.7489906856979369, "train_speed(iter/s)": 0.222523 }, { "epoch": 0.2803395735542659, "grad_norm": 0.9162473678588867, "learning_rate": 9.779361240509973e-05, "loss": 0.9903450012207031, "memory(GiB)": 91.52, "step": 21605, "token_acc": 0.7042660024512711, "train_speed(iter/s)": 0.222483 }, { "epoch": 0.2804044519559216, "grad_norm": 0.7938041090965271, "learning_rate": 9.779203635272564e-05, "loss": 0.9839879989624023, "memory(GiB)": 91.52, "step": 21610, "token_acc": 0.7398822960335376, "train_speed(iter/s)": 0.22244 }, { "epoch": 0.2804693303575773, "grad_norm": 0.844192624092102, "learning_rate": 9.779045975036389e-05, "loss": 0.9969748497009278, "memory(GiB)": 91.52, "step": 21615, "token_acc": 0.7552316012226663, "train_speed(iter/s)": 0.222402 }, { "epoch": 0.280534208759233, "grad_norm": 0.8085711002349854, "learning_rate": 9.778888259803259e-05, "loss": 0.9978261947631836, "memory(GiB)": 91.52, "step": 21620, "token_acc": 0.7248568950563746, "train_speed(iter/s)": 0.222355 }, { "epoch": 0.2805990871608887, "grad_norm": 0.8565436601638794, "learning_rate": 9.778730489574991e-05, "loss": 0.9122110366821289, "memory(GiB)": 91.52, "step": 21625, "token_acc": 0.7435388300554715, "train_speed(iter/s)": 0.222317 }, { "epoch": 0.2806639655625444, "grad_norm": 1.0578759908676147, "learning_rate": 9.778572664353401e-05, "loss": 0.9588779449462891, "memory(GiB)": 91.52, "step": 21630, "token_acc": 0.7436722353579596, "train_speed(iter/s)": 0.222276 }, { "epoch": 0.2807288439642001, "grad_norm": 0.9940432906150818, "learning_rate": 9.778414784140304e-05, "loss": 0.988309383392334, "memory(GiB)": 91.52, "step": 21635, "token_acc": 0.7328258029640975, "train_speed(iter/s)": 0.222237 }, { "epoch": 0.2807937223658558, "grad_norm": 0.8193584680557251, "learning_rate": 9.778256848937517e-05, "loss": 0.9285437583923339, "memory(GiB)": 91.52, "step": 21640, "token_acc": 0.7695478443743428, "train_speed(iter/s)": 0.222197 }, { "epoch": 0.2808586007675115, "grad_norm": 0.858870804309845, "learning_rate": 9.778098858746858e-05, "loss": 0.9435853958129883, "memory(GiB)": 91.52, "step": 21645, "token_acc": 0.7562109434537418, "train_speed(iter/s)": 0.222155 }, { "epoch": 0.2809234791691672, "grad_norm": 0.8117131590843201, "learning_rate": 9.777940813570145e-05, "loss": 0.9332549095153808, "memory(GiB)": 91.52, "step": 21650, "token_acc": 0.7586528420110399, "train_speed(iter/s)": 0.222112 }, { "epoch": 0.2809883575708229, "grad_norm": 0.7273964285850525, "learning_rate": 9.777782713409195e-05, "loss": 0.9453163146972656, "memory(GiB)": 91.52, "step": 21655, "token_acc": 0.7558797762652469, "train_speed(iter/s)": 0.222071 }, { "epoch": 0.28105323597247855, "grad_norm": 0.8597024083137512, "learning_rate": 9.777624558265832e-05, "loss": 0.9926103591918946, "memory(GiB)": 91.52, "step": 21660, "token_acc": 0.733999524149417, "train_speed(iter/s)": 0.222029 }, { "epoch": 0.28111811437413425, "grad_norm": 0.8744332790374756, "learning_rate": 9.777466348141871e-05, "loss": 0.9966981887817383, "memory(GiB)": 91.52, "step": 21665, "token_acc": 0.7467144022478021, "train_speed(iter/s)": 0.221989 }, { "epoch": 0.28118299277578995, "grad_norm": 0.8115676045417786, "learning_rate": 9.777308083039134e-05, "loss": 0.9401262283325196, "memory(GiB)": 91.52, "step": 21670, "token_acc": 0.7539548456458302, "train_speed(iter/s)": 0.221947 }, { "epoch": 0.28124787117744565, "grad_norm": 0.854759693145752, "learning_rate": 9.777149762959444e-05, "loss": 0.9522994041442872, "memory(GiB)": 91.52, "step": 21675, "token_acc": 0.7560370849504097, "train_speed(iter/s)": 0.221906 }, { "epoch": 0.28131274957910135, "grad_norm": 1.0358778238296509, "learning_rate": 9.776991387904621e-05, "loss": 0.9540512084960937, "memory(GiB)": 91.52, "step": 21680, "token_acc": 0.7539704653106715, "train_speed(iter/s)": 0.221865 }, { "epoch": 0.28137762798075705, "grad_norm": 0.8610665202140808, "learning_rate": 9.77683295787649e-05, "loss": 0.9591175079345703, "memory(GiB)": 91.52, "step": 21685, "token_acc": 0.7360745876165431, "train_speed(iter/s)": 0.221827 }, { "epoch": 0.28144250638241275, "grad_norm": 0.8305655717849731, "learning_rate": 9.776674472876874e-05, "loss": 0.9621681213378906, "memory(GiB)": 91.52, "step": 21690, "token_acc": 0.7391602791442626, "train_speed(iter/s)": 0.221786 }, { "epoch": 0.28150738478406845, "grad_norm": 0.8505256772041321, "learning_rate": 9.776515932907593e-05, "loss": 0.96937255859375, "memory(GiB)": 91.52, "step": 21695, "token_acc": 0.7514707514707515, "train_speed(iter/s)": 0.221744 }, { "epoch": 0.28157226318572415, "grad_norm": 0.8851627111434937, "learning_rate": 9.776357337970474e-05, "loss": 0.98544921875, "memory(GiB)": 91.52, "step": 21700, "token_acc": 0.756748395080826, "train_speed(iter/s)": 0.221702 }, { "epoch": 0.28163714158737985, "grad_norm": 0.7512015700340271, "learning_rate": 9.776198688067344e-05, "loss": 0.9581916809082032, "memory(GiB)": 91.52, "step": 21705, "token_acc": 0.7296288757717619, "train_speed(iter/s)": 0.221657 }, { "epoch": 0.28170201998903555, "grad_norm": 0.9332983493804932, "learning_rate": 9.776039983200026e-05, "loss": 1.0161014556884767, "memory(GiB)": 91.52, "step": 21710, "token_acc": 0.7400822071370742, "train_speed(iter/s)": 0.221616 }, { "epoch": 0.28176689839069124, "grad_norm": 0.8477906584739685, "learning_rate": 9.775881223370347e-05, "loss": 0.9535416603088379, "memory(GiB)": 91.52, "step": 21715, "token_acc": 0.7324136440586392, "train_speed(iter/s)": 0.221577 }, { "epoch": 0.28183177679234694, "grad_norm": 0.8223069310188293, "learning_rate": 9.775722408580135e-05, "loss": 0.9728676795959472, "memory(GiB)": 91.52, "step": 21720, "token_acc": 0.7418895098075985, "train_speed(iter/s)": 0.221535 }, { "epoch": 0.28189665519400264, "grad_norm": 0.8373615741729736, "learning_rate": 9.775563538831216e-05, "loss": 0.9350790023803711, "memory(GiB)": 91.52, "step": 21725, "token_acc": 0.7516176814531846, "train_speed(iter/s)": 0.221496 }, { "epoch": 0.28196153359565834, "grad_norm": 0.743557870388031, "learning_rate": 9.77540461412542e-05, "loss": 0.923221492767334, "memory(GiB)": 91.52, "step": 21730, "token_acc": 0.7574625632210505, "train_speed(iter/s)": 0.221451 }, { "epoch": 0.28202641199731404, "grad_norm": 0.8537276983261108, "learning_rate": 9.775245634464574e-05, "loss": 0.9665790557861328, "memory(GiB)": 91.52, "step": 21735, "token_acc": 0.7388477919748935, "train_speed(iter/s)": 0.221412 }, { "epoch": 0.28209129039896974, "grad_norm": 0.8026741147041321, "learning_rate": 9.775086599850509e-05, "loss": 0.9679250717163086, "memory(GiB)": 91.52, "step": 21740, "token_acc": 0.7535007242877837, "train_speed(iter/s)": 0.221371 }, { "epoch": 0.28215616880062544, "grad_norm": 0.868885338306427, "learning_rate": 9.774927510285056e-05, "loss": 0.9768373489379882, "memory(GiB)": 91.52, "step": 21745, "token_acc": 0.7279537265313863, "train_speed(iter/s)": 0.221332 }, { "epoch": 0.28222104720228114, "grad_norm": 0.8317494988441467, "learning_rate": 9.774768365770044e-05, "loss": 0.9748526573181152, "memory(GiB)": 91.52, "step": 21750, "token_acc": 0.7556748294755675, "train_speed(iter/s)": 0.22129 }, { "epoch": 0.28228592560393684, "grad_norm": 0.8048713207244873, "learning_rate": 9.774609166307304e-05, "loss": 1.005009651184082, "memory(GiB)": 91.52, "step": 21755, "token_acc": 0.7096440175670368, "train_speed(iter/s)": 0.221249 }, { "epoch": 0.28235080400559254, "grad_norm": 0.8065807819366455, "learning_rate": 9.774449911898669e-05, "loss": 0.9855700492858886, "memory(GiB)": 91.52, "step": 21760, "token_acc": 0.7422898645935984, "train_speed(iter/s)": 0.221212 }, { "epoch": 0.28241568240724824, "grad_norm": 0.7968733906745911, "learning_rate": 9.774290602545972e-05, "loss": 0.9801658630371094, "memory(GiB)": 91.52, "step": 21765, "token_acc": 0.7353339928914294, "train_speed(iter/s)": 0.221173 }, { "epoch": 0.28248056080890394, "grad_norm": 0.8136852383613586, "learning_rate": 9.774131238251047e-05, "loss": 0.922767448425293, "memory(GiB)": 91.52, "step": 21770, "token_acc": 0.7458432304038005, "train_speed(iter/s)": 0.221131 }, { "epoch": 0.28254543921055963, "grad_norm": 0.7343787550926208, "learning_rate": 9.773971819015725e-05, "loss": 0.9826709747314453, "memory(GiB)": 91.52, "step": 21775, "token_acc": 0.7456953207150369, "train_speed(iter/s)": 0.221095 }, { "epoch": 0.2826103176122153, "grad_norm": 0.8243134617805481, "learning_rate": 9.773812344841844e-05, "loss": 0.949644660949707, "memory(GiB)": 91.52, "step": 21780, "token_acc": 0.7319615540814945, "train_speed(iter/s)": 0.22105 }, { "epoch": 0.282675196013871, "grad_norm": 0.8962975144386292, "learning_rate": 9.773652815731237e-05, "loss": 0.9226154327392578, "memory(GiB)": 91.52, "step": 21785, "token_acc": 0.7441887359777958, "train_speed(iter/s)": 0.221015 }, { "epoch": 0.2827400744155267, "grad_norm": 0.9113373160362244, "learning_rate": 9.773493231685742e-05, "loss": 0.9860810279846192, "memory(GiB)": 91.52, "step": 21790, "token_acc": 0.7338803023081637, "train_speed(iter/s)": 0.220976 }, { "epoch": 0.2828049528171824, "grad_norm": 0.7991740107536316, "learning_rate": 9.773333592707192e-05, "loss": 0.995457935333252, "memory(GiB)": 91.52, "step": 21795, "token_acc": 0.733045122939746, "train_speed(iter/s)": 0.220933 }, { "epoch": 0.2828698312188381, "grad_norm": 0.8038498759269714, "learning_rate": 9.773173898797428e-05, "loss": 0.9509033203125, "memory(GiB)": 91.52, "step": 21800, "token_acc": 0.7368716427483364, "train_speed(iter/s)": 0.220894 }, { "epoch": 0.2829347096204938, "grad_norm": 0.8929453492164612, "learning_rate": 9.773014149958286e-05, "loss": 0.9474960327148437, "memory(GiB)": 91.52, "step": 21805, "token_acc": 0.7218784378437844, "train_speed(iter/s)": 0.220853 }, { "epoch": 0.2829995880221495, "grad_norm": 0.9359064102172852, "learning_rate": 9.772854346191604e-05, "loss": 0.9835193634033204, "memory(GiB)": 91.52, "step": 21810, "token_acc": 0.7337976782752902, "train_speed(iter/s)": 0.220806 }, { "epoch": 0.28306446642380517, "grad_norm": 0.942180871963501, "learning_rate": 9.77269448749922e-05, "loss": 0.9570097923278809, "memory(GiB)": 91.52, "step": 21815, "token_acc": 0.7434390651085142, "train_speed(iter/s)": 0.220765 }, { "epoch": 0.28312934482546087, "grad_norm": 0.9299966096878052, "learning_rate": 9.772534573882977e-05, "loss": 0.9299263000488281, "memory(GiB)": 91.52, "step": 21820, "token_acc": 0.7552903388336063, "train_speed(iter/s)": 0.220725 }, { "epoch": 0.28319422322711657, "grad_norm": 0.8325894474983215, "learning_rate": 9.772374605344711e-05, "loss": 0.9701330184936523, "memory(GiB)": 91.52, "step": 21825, "token_acc": 0.7416435556853014, "train_speed(iter/s)": 0.220681 }, { "epoch": 0.28325910162877227, "grad_norm": 0.8137956857681274, "learning_rate": 9.772214581886268e-05, "loss": 0.9713286399841309, "memory(GiB)": 91.52, "step": 21830, "token_acc": 0.7522638031893581, "train_speed(iter/s)": 0.220642 }, { "epoch": 0.28332398003042797, "grad_norm": 0.8434600234031677, "learning_rate": 9.772054503509485e-05, "loss": 0.9494356155395508, "memory(GiB)": 91.52, "step": 21835, "token_acc": 0.7506150506512301, "train_speed(iter/s)": 0.220601 }, { "epoch": 0.28338885843208367, "grad_norm": 0.9599152207374573, "learning_rate": 9.771894370216206e-05, "loss": 0.955567455291748, "memory(GiB)": 91.52, "step": 21840, "token_acc": 0.7374114420659709, "train_speed(iter/s)": 0.220561 }, { "epoch": 0.28345373683373937, "grad_norm": 0.8614458441734314, "learning_rate": 9.771734182008274e-05, "loss": 1.0048998832702636, "memory(GiB)": 91.52, "step": 21845, "token_acc": 0.7249623711937015, "train_speed(iter/s)": 0.220521 }, { "epoch": 0.28351861523539507, "grad_norm": 0.8350766897201538, "learning_rate": 9.771573938887532e-05, "loss": 0.9622482299804688, "memory(GiB)": 91.52, "step": 21850, "token_acc": 0.7539933636854695, "train_speed(iter/s)": 0.220482 }, { "epoch": 0.28358349363705077, "grad_norm": 0.8537085652351379, "learning_rate": 9.771413640855824e-05, "loss": 0.9573246002197265, "memory(GiB)": 91.52, "step": 21855, "token_acc": 0.7568206162182597, "train_speed(iter/s)": 0.220441 }, { "epoch": 0.28364837203870646, "grad_norm": 0.7827755212783813, "learning_rate": 9.771253287914995e-05, "loss": 0.9750970840454102, "memory(GiB)": 91.52, "step": 21860, "token_acc": 0.74140972040763, "train_speed(iter/s)": 0.2204 }, { "epoch": 0.28371325044036216, "grad_norm": 0.7918134927749634, "learning_rate": 9.77109288006689e-05, "loss": 0.9280236244201661, "memory(GiB)": 91.52, "step": 21865, "token_acc": 0.7501367989056088, "train_speed(iter/s)": 0.220359 }, { "epoch": 0.28377812884201786, "grad_norm": 0.6846869587898254, "learning_rate": 9.770932417313356e-05, "loss": 0.9650531768798828, "memory(GiB)": 91.52, "step": 21870, "token_acc": 0.7370462396218411, "train_speed(iter/s)": 0.220315 }, { "epoch": 0.28384300724367356, "grad_norm": 0.8424803018569946, "learning_rate": 9.770771899656238e-05, "loss": 1.0072148323059082, "memory(GiB)": 91.52, "step": 21875, "token_acc": 0.7368244826893457, "train_speed(iter/s)": 0.220278 }, { "epoch": 0.28390788564532926, "grad_norm": 0.7896029949188232, "learning_rate": 9.770611327097385e-05, "loss": 0.9776512145996094, "memory(GiB)": 91.52, "step": 21880, "token_acc": 0.7279471469698088, "train_speed(iter/s)": 0.220241 }, { "epoch": 0.28397276404698496, "grad_norm": 0.8236000537872314, "learning_rate": 9.770450699638643e-05, "loss": 0.9521888732910156, "memory(GiB)": 91.52, "step": 21885, "token_acc": 0.7473794549266247, "train_speed(iter/s)": 0.220197 }, { "epoch": 0.28403764244864066, "grad_norm": 0.754245400428772, "learning_rate": 9.770290017281862e-05, "loss": 0.9174995422363281, "memory(GiB)": 91.52, "step": 21890, "token_acc": 0.7448525214081827, "train_speed(iter/s)": 0.220158 }, { "epoch": 0.28410252085029636, "grad_norm": 0.7679694890975952, "learning_rate": 9.77012928002889e-05, "loss": 0.9752352714538575, "memory(GiB)": 91.52, "step": 21895, "token_acc": 0.7403042346169659, "train_speed(iter/s)": 0.220118 }, { "epoch": 0.284167399251952, "grad_norm": 0.7564762830734253, "learning_rate": 9.769968487881578e-05, "loss": 0.9615440368652344, "memory(GiB)": 91.52, "step": 21900, "token_acc": 0.7539329751459163, "train_speed(iter/s)": 0.220078 }, { "epoch": 0.2842322776536077, "grad_norm": 0.7657825350761414, "learning_rate": 9.769807640841775e-05, "loss": 0.9902444839477539, "memory(GiB)": 91.52, "step": 21905, "token_acc": 0.7148825065274151, "train_speed(iter/s)": 0.220037 }, { "epoch": 0.2842971560552634, "grad_norm": 0.7800065875053406, "learning_rate": 9.769646738911331e-05, "loss": 0.9709840774536133, "memory(GiB)": 91.52, "step": 21910, "token_acc": 0.7454855433005114, "train_speed(iter/s)": 0.219997 }, { "epoch": 0.2843620344569191, "grad_norm": 0.8687646389007568, "learning_rate": 9.769485782092103e-05, "loss": 0.983340072631836, "memory(GiB)": 91.52, "step": 21915, "token_acc": 0.7406050420168068, "train_speed(iter/s)": 0.219955 }, { "epoch": 0.2844269128585748, "grad_norm": 0.8106768131256104, "learning_rate": 9.769324770385936e-05, "loss": 0.9810133934020996, "memory(GiB)": 91.52, "step": 21920, "token_acc": 0.7339579560664514, "train_speed(iter/s)": 0.219915 }, { "epoch": 0.2844917912602305, "grad_norm": 0.819119393825531, "learning_rate": 9.769163703794689e-05, "loss": 1.024553108215332, "memory(GiB)": 91.52, "step": 21925, "token_acc": 0.7270571580251283, "train_speed(iter/s)": 0.219878 }, { "epoch": 0.2845566696618862, "grad_norm": 0.820268988609314, "learning_rate": 9.769002582320212e-05, "loss": 0.944996452331543, "memory(GiB)": 91.52, "step": 21930, "token_acc": 0.7530551312317734, "train_speed(iter/s)": 0.219836 }, { "epoch": 0.2846215480635419, "grad_norm": 0.7205905318260193, "learning_rate": 9.768841405964359e-05, "loss": 0.9204631805419922, "memory(GiB)": 91.52, "step": 21935, "token_acc": 0.7721431052448767, "train_speed(iter/s)": 0.219795 }, { "epoch": 0.2846864264651976, "grad_norm": 0.821998119354248, "learning_rate": 9.768680174728988e-05, "loss": 0.9806231498718262, "memory(GiB)": 91.52, "step": 21940, "token_acc": 0.7439342154241819, "train_speed(iter/s)": 0.219757 }, { "epoch": 0.2847513048668533, "grad_norm": 0.8154707551002502, "learning_rate": 9.768518888615952e-05, "loss": 0.9464645385742188, "memory(GiB)": 91.52, "step": 21945, "token_acc": 0.7566819605272741, "train_speed(iter/s)": 0.21972 }, { "epoch": 0.284816183268509, "grad_norm": 0.7711706757545471, "learning_rate": 9.768357547627107e-05, "loss": 0.9687013626098633, "memory(GiB)": 91.52, "step": 21950, "token_acc": 0.763874256584537, "train_speed(iter/s)": 0.219678 }, { "epoch": 0.2848810616701647, "grad_norm": 0.7397475838661194, "learning_rate": 9.768196151764311e-05, "loss": 0.9988219261169433, "memory(GiB)": 91.52, "step": 21955, "token_acc": 0.7349711520565791, "train_speed(iter/s)": 0.219641 }, { "epoch": 0.2849459400718204, "grad_norm": 0.7525886297225952, "learning_rate": 9.768034701029421e-05, "loss": 0.9229602813720703, "memory(GiB)": 91.52, "step": 21960, "token_acc": 0.7621520265397375, "train_speed(iter/s)": 0.219602 }, { "epoch": 0.2850108184734761, "grad_norm": 0.7517659068107605, "learning_rate": 9.767873195424293e-05, "loss": 0.9720605850219727, "memory(GiB)": 91.52, "step": 21965, "token_acc": 0.7383363598267915, "train_speed(iter/s)": 0.219558 }, { "epoch": 0.2850756968751318, "grad_norm": 0.8642855882644653, "learning_rate": 9.767711634950788e-05, "loss": 0.9309101104736328, "memory(GiB)": 91.52, "step": 21970, "token_acc": 0.7379582893140822, "train_speed(iter/s)": 0.219518 }, { "epoch": 0.2851405752767875, "grad_norm": 0.8133112192153931, "learning_rate": 9.767550019610763e-05, "loss": 0.9657167434692383, "memory(GiB)": 91.52, "step": 21975, "token_acc": 0.7182423244829313, "train_speed(iter/s)": 0.21948 }, { "epoch": 0.2852054536784432, "grad_norm": 0.7605153918266296, "learning_rate": 9.767388349406081e-05, "loss": 0.9556680679321289, "memory(GiB)": 91.52, "step": 21980, "token_acc": 0.7521243606781846, "train_speed(iter/s)": 0.219439 }, { "epoch": 0.2852703320800989, "grad_norm": 1.25392746925354, "learning_rate": 9.767226624338599e-05, "loss": 1.0011610031127929, "memory(GiB)": 91.52, "step": 21985, "token_acc": 0.7513037150826697, "train_speed(iter/s)": 0.219401 }, { "epoch": 0.2853352104817546, "grad_norm": 0.8141369223594666, "learning_rate": 9.76706484441018e-05, "loss": 0.9653793334960937, "memory(GiB)": 91.52, "step": 21990, "token_acc": 0.7359275496815898, "train_speed(iter/s)": 0.219361 }, { "epoch": 0.2854000888834103, "grad_norm": 0.8491421937942505, "learning_rate": 9.766903009622688e-05, "loss": 0.9400522232055664, "memory(GiB)": 91.52, "step": 21995, "token_acc": 0.7637142367654617, "train_speed(iter/s)": 0.219322 }, { "epoch": 0.285464967285066, "grad_norm": 0.7240492701530457, "learning_rate": 9.766741119977982e-05, "loss": 1.0010656356811523, "memory(GiB)": 91.52, "step": 22000, "token_acc": 0.7187147688838782, "train_speed(iter/s)": 0.219285 }, { "epoch": 0.2855298456867217, "grad_norm": 0.7777644991874695, "learning_rate": 9.766579175477925e-05, "loss": 0.9652616500854492, "memory(GiB)": 91.52, "step": 22005, "token_acc": 0.7386045191727857, "train_speed(iter/s)": 0.219249 }, { "epoch": 0.2855947240883774, "grad_norm": 0.8412004113197327, "learning_rate": 9.766417176124381e-05, "loss": 0.9563894271850586, "memory(GiB)": 91.52, "step": 22010, "token_acc": 0.7452567577466852, "train_speed(iter/s)": 0.219208 }, { "epoch": 0.2856596024900331, "grad_norm": 0.7814468741416931, "learning_rate": 9.766255121919217e-05, "loss": 0.9581088066101074, "memory(GiB)": 91.52, "step": 22015, "token_acc": 0.7401032024264387, "train_speed(iter/s)": 0.219167 }, { "epoch": 0.2857244808916887, "grad_norm": 0.8047492504119873, "learning_rate": 9.766093012864295e-05, "loss": 0.9935239791870117, "memory(GiB)": 91.52, "step": 22020, "token_acc": 0.7393680536419583, "train_speed(iter/s)": 0.219125 }, { "epoch": 0.2857893592933444, "grad_norm": 0.9052708148956299, "learning_rate": 9.76593084896148e-05, "loss": 0.958537483215332, "memory(GiB)": 91.52, "step": 22025, "token_acc": 0.7637667433364903, "train_speed(iter/s)": 0.219088 }, { "epoch": 0.2858542376950001, "grad_norm": 0.797186017036438, "learning_rate": 9.765768630212642e-05, "loss": 1.0109514236450194, "memory(GiB)": 91.52, "step": 22030, "token_acc": 0.734815652283148, "train_speed(iter/s)": 0.219049 }, { "epoch": 0.2859191160966558, "grad_norm": 0.869403600692749, "learning_rate": 9.765606356619643e-05, "loss": 0.9280590057373047, "memory(GiB)": 91.52, "step": 22035, "token_acc": 0.7496953358691236, "train_speed(iter/s)": 0.219012 }, { "epoch": 0.2859839944983115, "grad_norm": 0.6739431619644165, "learning_rate": 9.765444028184355e-05, "loss": 0.9641224861145019, "memory(GiB)": 91.52, "step": 22040, "token_acc": 0.7477743057602736, "train_speed(iter/s)": 0.218972 }, { "epoch": 0.2860488728999672, "grad_norm": 0.8404438495635986, "learning_rate": 9.765281644908642e-05, "loss": 0.9451364517211914, "memory(GiB)": 91.52, "step": 22045, "token_acc": 0.7454611594298753, "train_speed(iter/s)": 0.218928 }, { "epoch": 0.2861137513016229, "grad_norm": 0.8129997253417969, "learning_rate": 9.765119206794375e-05, "loss": 0.9312982559204102, "memory(GiB)": 91.52, "step": 22050, "token_acc": 0.7411188958912791, "train_speed(iter/s)": 0.21889 }, { "epoch": 0.2861786297032786, "grad_norm": 0.7927801609039307, "learning_rate": 9.764956713843421e-05, "loss": 0.969898796081543, "memory(GiB)": 91.52, "step": 22055, "token_acc": 0.7423417021948524, "train_speed(iter/s)": 0.218852 }, { "epoch": 0.2862435081049343, "grad_norm": 0.867654025554657, "learning_rate": 9.764794166057655e-05, "loss": 0.9747716903686523, "memory(GiB)": 91.52, "step": 22060, "token_acc": 0.7470466125684091, "train_speed(iter/s)": 0.218813 }, { "epoch": 0.28630838650659, "grad_norm": 0.8290441036224365, "learning_rate": 9.764631563438943e-05, "loss": 1.0020797729492188, "memory(GiB)": 91.52, "step": 22065, "token_acc": 0.7332414532368592, "train_speed(iter/s)": 0.218776 }, { "epoch": 0.2863732649082457, "grad_norm": 0.7442308664321899, "learning_rate": 9.764468905989157e-05, "loss": 0.9884443283081055, "memory(GiB)": 91.52, "step": 22070, "token_acc": 0.7553184504267892, "train_speed(iter/s)": 0.218737 }, { "epoch": 0.2864381433099014, "grad_norm": 0.8151142597198486, "learning_rate": 9.764306193710171e-05, "loss": 0.9713533401489258, "memory(GiB)": 91.52, "step": 22075, "token_acc": 0.7512042083914311, "train_speed(iter/s)": 0.218695 }, { "epoch": 0.2865030217115571, "grad_norm": 0.941923975944519, "learning_rate": 9.764143426603854e-05, "loss": 0.9908472061157226, "memory(GiB)": 91.52, "step": 22080, "token_acc": 0.7400372682990618, "train_speed(iter/s)": 0.218659 }, { "epoch": 0.2865679001132128, "grad_norm": 0.8273646235466003, "learning_rate": 9.763980604672083e-05, "loss": 1.0134233474731444, "memory(GiB)": 91.52, "step": 22085, "token_acc": 0.7419905536903204, "train_speed(iter/s)": 0.218618 }, { "epoch": 0.2866327785148685, "grad_norm": 0.8832806348800659, "learning_rate": 9.763817727916729e-05, "loss": 0.9597549438476562, "memory(GiB)": 91.52, "step": 22090, "token_acc": 0.7561245164855406, "train_speed(iter/s)": 0.218581 }, { "epoch": 0.2866976569165242, "grad_norm": 0.7673356533050537, "learning_rate": 9.763654796339667e-05, "loss": 0.9330352783203125, "memory(GiB)": 91.52, "step": 22095, "token_acc": 0.7540555594235188, "train_speed(iter/s)": 0.218541 }, { "epoch": 0.2867625353181799, "grad_norm": 0.767941415309906, "learning_rate": 9.763491809942773e-05, "loss": 0.974670124053955, "memory(GiB)": 91.52, "step": 22100, "token_acc": 0.7523698749636523, "train_speed(iter/s)": 0.218497 }, { "epoch": 0.2868274137198356, "grad_norm": 0.813898503780365, "learning_rate": 9.763328768727921e-05, "loss": 0.949135684967041, "memory(GiB)": 91.52, "step": 22105, "token_acc": 0.750189225655708, "train_speed(iter/s)": 0.218456 }, { "epoch": 0.2868922921214913, "grad_norm": 0.826178789138794, "learning_rate": 9.763165672696987e-05, "loss": 0.9808331489562988, "memory(GiB)": 91.52, "step": 22110, "token_acc": 0.7306922040563667, "train_speed(iter/s)": 0.218414 }, { "epoch": 0.286957170523147, "grad_norm": 0.864671528339386, "learning_rate": 9.763002521851852e-05, "loss": 1.0163267135620118, "memory(GiB)": 91.52, "step": 22115, "token_acc": 0.7307154183378416, "train_speed(iter/s)": 0.218375 }, { "epoch": 0.2870220489248027, "grad_norm": 0.8403864502906799, "learning_rate": 9.762839316194387e-05, "loss": 0.9684842109680176, "memory(GiB)": 91.52, "step": 22120, "token_acc": 0.7349653891608982, "train_speed(iter/s)": 0.218335 }, { "epoch": 0.2870869273264584, "grad_norm": 0.8114408254623413, "learning_rate": 9.762676055726477e-05, "loss": 0.9541257858276367, "memory(GiB)": 91.52, "step": 22125, "token_acc": 0.7361242845461978, "train_speed(iter/s)": 0.218299 }, { "epoch": 0.2871518057281141, "grad_norm": 0.8723393082618713, "learning_rate": 9.762512740449996e-05, "loss": 0.9239099502563477, "memory(GiB)": 91.52, "step": 22130, "token_acc": 0.7426930604125894, "train_speed(iter/s)": 0.218259 }, { "epoch": 0.2872166841297698, "grad_norm": 0.866812527179718, "learning_rate": 9.762349370366824e-05, "loss": 0.9726869583129882, "memory(GiB)": 91.52, "step": 22135, "token_acc": 0.7523551075789882, "train_speed(iter/s)": 0.21822 }, { "epoch": 0.28728156253142545, "grad_norm": 0.831412136554718, "learning_rate": 9.762185945478843e-05, "loss": 0.9498489379882813, "memory(GiB)": 91.52, "step": 22140, "token_acc": 0.7710936773233279, "train_speed(iter/s)": 0.218179 }, { "epoch": 0.28734644093308115, "grad_norm": 0.7793892025947571, "learning_rate": 9.762022465787933e-05, "loss": 0.9315713882446289, "memory(GiB)": 91.52, "step": 22145, "token_acc": 0.7705783267827063, "train_speed(iter/s)": 0.218139 }, { "epoch": 0.28741131933473685, "grad_norm": 1.0063402652740479, "learning_rate": 9.761858931295973e-05, "loss": 0.9849365234375, "memory(GiB)": 91.52, "step": 22150, "token_acc": 0.7472200090068244, "train_speed(iter/s)": 0.218102 }, { "epoch": 0.28747619773639255, "grad_norm": 0.7274678945541382, "learning_rate": 9.761695342004849e-05, "loss": 0.9457477569580078, "memory(GiB)": 91.52, "step": 22155, "token_acc": 0.7373369213475491, "train_speed(iter/s)": 0.218062 }, { "epoch": 0.28754107613804825, "grad_norm": 0.8459072709083557, "learning_rate": 9.761531697916441e-05, "loss": 0.9926108360290528, "memory(GiB)": 91.52, "step": 22160, "token_acc": 0.7410060069971616, "train_speed(iter/s)": 0.218022 }, { "epoch": 0.28760595453970395, "grad_norm": 0.879242479801178, "learning_rate": 9.761367999032631e-05, "loss": 0.8806673049926758, "memory(GiB)": 91.52, "step": 22165, "token_acc": 0.7679063360881543, "train_speed(iter/s)": 0.217982 }, { "epoch": 0.28767083294135964, "grad_norm": 0.773952066898346, "learning_rate": 9.761204245355307e-05, "loss": 0.9699097633361816, "memory(GiB)": 91.52, "step": 22170, "token_acc": 0.7515424437983121, "train_speed(iter/s)": 0.217946 }, { "epoch": 0.28773571134301534, "grad_norm": 0.7613015174865723, "learning_rate": 9.761040436886349e-05, "loss": 0.9523642539978028, "memory(GiB)": 91.52, "step": 22175, "token_acc": 0.7412784968136128, "train_speed(iter/s)": 0.217911 }, { "epoch": 0.28780058974467104, "grad_norm": 0.7634870409965515, "learning_rate": 9.760876573627646e-05, "loss": 0.9765961647033692, "memory(GiB)": 91.52, "step": 22180, "token_acc": 0.7432622379759213, "train_speed(iter/s)": 0.217871 }, { "epoch": 0.28786546814632674, "grad_norm": 1.001990556716919, "learning_rate": 9.76071265558108e-05, "loss": 0.9751616477966308, "memory(GiB)": 91.52, "step": 22185, "token_acc": 0.740956049728594, "train_speed(iter/s)": 0.217828 }, { "epoch": 0.28793034654798244, "grad_norm": 0.745844304561615, "learning_rate": 9.76054868274854e-05, "loss": 0.9395033836364746, "memory(GiB)": 91.52, "step": 22190, "token_acc": 0.7353931931699385, "train_speed(iter/s)": 0.217789 }, { "epoch": 0.28799522494963814, "grad_norm": 0.7082571387290955, "learning_rate": 9.760384655131912e-05, "loss": 0.9474024772644043, "memory(GiB)": 91.52, "step": 22195, "token_acc": 0.7457352798368563, "train_speed(iter/s)": 0.21775 }, { "epoch": 0.28806010335129384, "grad_norm": 0.8332637548446655, "learning_rate": 9.760220572733085e-05, "loss": 0.958015251159668, "memory(GiB)": 91.52, "step": 22200, "token_acc": 0.7599420237340339, "train_speed(iter/s)": 0.217708 }, { "epoch": 0.28812498175294954, "grad_norm": 0.8116134405136108, "learning_rate": 9.760056435553943e-05, "loss": 0.9549520492553711, "memory(GiB)": 91.52, "step": 22205, "token_acc": 0.7473903966597077, "train_speed(iter/s)": 0.217672 }, { "epoch": 0.28818986015460524, "grad_norm": 0.7042473554611206, "learning_rate": 9.75989224359638e-05, "loss": 0.9217071533203125, "memory(GiB)": 91.52, "step": 22210, "token_acc": 0.7377791606167462, "train_speed(iter/s)": 0.217632 }, { "epoch": 0.28825473855626094, "grad_norm": 0.7442321181297302, "learning_rate": 9.759727996862284e-05, "loss": 1.0070361137390136, "memory(GiB)": 91.52, "step": 22215, "token_acc": 0.7415202907328892, "train_speed(iter/s)": 0.217598 }, { "epoch": 0.28831961695791664, "grad_norm": 0.8230230808258057, "learning_rate": 9.759563695353544e-05, "loss": 0.9738799095153808, "memory(GiB)": 91.52, "step": 22220, "token_acc": 0.7478747203579418, "train_speed(iter/s)": 0.217562 }, { "epoch": 0.28838449535957233, "grad_norm": 0.7955189943313599, "learning_rate": 9.759399339072051e-05, "loss": 0.9598669052124024, "memory(GiB)": 91.52, "step": 22225, "token_acc": 0.7227418883367437, "train_speed(iter/s)": 0.217525 }, { "epoch": 0.28844937376122803, "grad_norm": 0.8135888576507568, "learning_rate": 9.759234928019697e-05, "loss": 0.9904165267944336, "memory(GiB)": 91.52, "step": 22230, "token_acc": 0.743405838388578, "train_speed(iter/s)": 0.217488 }, { "epoch": 0.28851425216288373, "grad_norm": 0.8676417469978333, "learning_rate": 9.759070462198372e-05, "loss": 0.9922430038452148, "memory(GiB)": 91.52, "step": 22235, "token_acc": 0.7489916227117592, "train_speed(iter/s)": 0.217449 }, { "epoch": 0.28857913056453943, "grad_norm": 0.7635696530342102, "learning_rate": 9.758905941609973e-05, "loss": 0.9350639343261719, "memory(GiB)": 91.52, "step": 22240, "token_acc": 0.7598930669386894, "train_speed(iter/s)": 0.217412 }, { "epoch": 0.28864400896619513, "grad_norm": 0.778617799282074, "learning_rate": 9.758741366256389e-05, "loss": 0.9731840133666992, "memory(GiB)": 91.52, "step": 22245, "token_acc": 0.7228952380952381, "train_speed(iter/s)": 0.217377 }, { "epoch": 0.28870888736785083, "grad_norm": 0.8423578143119812, "learning_rate": 9.758576736139515e-05, "loss": 0.981232738494873, "memory(GiB)": 91.52, "step": 22250, "token_acc": 0.7454637686814688, "train_speed(iter/s)": 0.217342 }, { "epoch": 0.28877376576950653, "grad_norm": 0.8489612340927124, "learning_rate": 9.758412051261248e-05, "loss": 0.9393376350402832, "memory(GiB)": 91.52, "step": 22255, "token_acc": 0.7523708754604593, "train_speed(iter/s)": 0.217302 }, { "epoch": 0.2888386441711622, "grad_norm": 0.7663694620132446, "learning_rate": 9.75824731162348e-05, "loss": 0.9932147979736328, "memory(GiB)": 91.52, "step": 22260, "token_acc": 0.7429309231762428, "train_speed(iter/s)": 0.217262 }, { "epoch": 0.2889035225728179, "grad_norm": 0.8035334348678589, "learning_rate": 9.758082517228112e-05, "loss": 0.9678279876708984, "memory(GiB)": 91.52, "step": 22265, "token_acc": 0.7379162556549542, "train_speed(iter/s)": 0.217221 }, { "epoch": 0.28896840097447357, "grad_norm": 0.8229765892028809, "learning_rate": 9.757917668077032e-05, "loss": 1.0014341354370118, "memory(GiB)": 91.52, "step": 22270, "token_acc": 0.712113976440624, "train_speed(iter/s)": 0.217183 }, { "epoch": 0.28903327937612927, "grad_norm": 0.7749277949333191, "learning_rate": 9.757752764172143e-05, "loss": 0.9679708480834961, "memory(GiB)": 91.52, "step": 22275, "token_acc": 0.743513350429929, "train_speed(iter/s)": 0.217143 }, { "epoch": 0.28909815777778497, "grad_norm": 0.7549753189086914, "learning_rate": 9.757587805515341e-05, "loss": 0.952879810333252, "memory(GiB)": 91.52, "step": 22280, "token_acc": 0.7502632872288142, "train_speed(iter/s)": 0.217106 }, { "epoch": 0.28916303617944067, "grad_norm": 0.8697067499160767, "learning_rate": 9.757422792108526e-05, "loss": 0.9651503562927246, "memory(GiB)": 91.52, "step": 22285, "token_acc": 0.7378541212665043, "train_speed(iter/s)": 0.217065 }, { "epoch": 0.28922791458109637, "grad_norm": 0.8291692733764648, "learning_rate": 9.757257723953595e-05, "loss": 0.9793960571289062, "memory(GiB)": 91.52, "step": 22290, "token_acc": 0.737294513765102, "train_speed(iter/s)": 0.217032 }, { "epoch": 0.28929279298275207, "grad_norm": 0.7211694121360779, "learning_rate": 9.757092601052449e-05, "loss": 0.9892619132995606, "memory(GiB)": 91.52, "step": 22295, "token_acc": 0.7432815788670958, "train_speed(iter/s)": 0.216994 }, { "epoch": 0.28935767138440777, "grad_norm": 0.7920428514480591, "learning_rate": 9.756927423406986e-05, "loss": 0.9628633499145508, "memory(GiB)": 91.52, "step": 22300, "token_acc": 0.746098644492539, "train_speed(iter/s)": 0.216956 }, { "epoch": 0.28942254978606347, "grad_norm": 0.9370814561843872, "learning_rate": 9.756762191019109e-05, "loss": 0.907956314086914, "memory(GiB)": 91.52, "step": 22305, "token_acc": 0.7430444230700723, "train_speed(iter/s)": 0.216914 }, { "epoch": 0.28948742818771916, "grad_norm": 0.8053135275840759, "learning_rate": 9.75659690389072e-05, "loss": 0.9568510055541992, "memory(GiB)": 91.52, "step": 22310, "token_acc": 0.7420637764242207, "train_speed(iter/s)": 0.216878 }, { "epoch": 0.28955230658937486, "grad_norm": 0.8576846718788147, "learning_rate": 9.756431562023718e-05, "loss": 0.9982093811035156, "memory(GiB)": 91.52, "step": 22315, "token_acc": 0.7448103320035161, "train_speed(iter/s)": 0.216846 }, { "epoch": 0.28961718499103056, "grad_norm": 0.8132678866386414, "learning_rate": 9.75626616542001e-05, "loss": 0.9116452217102051, "memory(GiB)": 91.52, "step": 22320, "token_acc": 0.7595257127631229, "train_speed(iter/s)": 0.216813 }, { "epoch": 0.28968206339268626, "grad_norm": 0.9185969233512878, "learning_rate": 9.756100714081495e-05, "loss": 0.9451148986816407, "memory(GiB)": 91.52, "step": 22325, "token_acc": 0.7511137909087864, "train_speed(iter/s)": 0.216774 }, { "epoch": 0.28974694179434196, "grad_norm": 0.765958845615387, "learning_rate": 9.75593520801008e-05, "loss": 0.9596561431884766, "memory(GiB)": 91.52, "step": 22330, "token_acc": 0.7487370793502759, "train_speed(iter/s)": 0.216734 }, { "epoch": 0.28981182019599766, "grad_norm": 0.8873052000999451, "learning_rate": 9.755769647207668e-05, "loss": 0.9470781326293946, "memory(GiB)": 91.52, "step": 22335, "token_acc": 0.7484994196650638, "train_speed(iter/s)": 0.216698 }, { "epoch": 0.28987669859765336, "grad_norm": 0.799800455570221, "learning_rate": 9.755604031676166e-05, "loss": 0.9859766006469727, "memory(GiB)": 91.52, "step": 22340, "token_acc": 0.7434613660140488, "train_speed(iter/s)": 0.216665 }, { "epoch": 0.28994157699930906, "grad_norm": 0.7415028810501099, "learning_rate": 9.755438361417478e-05, "loss": 0.9692964553833008, "memory(GiB)": 91.52, "step": 22345, "token_acc": 0.7365069422695479, "train_speed(iter/s)": 0.216626 }, { "epoch": 0.29000645540096476, "grad_norm": 0.8482421636581421, "learning_rate": 9.755272636433512e-05, "loss": 0.9815614700317383, "memory(GiB)": 91.52, "step": 22350, "token_acc": 0.737928007023705, "train_speed(iter/s)": 0.216588 }, { "epoch": 0.29007133380262046, "grad_norm": 0.7993071675300598, "learning_rate": 9.755106856726175e-05, "loss": 0.9589183807373047, "memory(GiB)": 91.52, "step": 22355, "token_acc": 0.7303865724918357, "train_speed(iter/s)": 0.216551 }, { "epoch": 0.29013621220427616, "grad_norm": 0.856344997882843, "learning_rate": 9.754941022297374e-05, "loss": 1.0112578392028808, "memory(GiB)": 91.52, "step": 22360, "token_acc": 0.7357502095557418, "train_speed(iter/s)": 0.216517 }, { "epoch": 0.29020109060593186, "grad_norm": 0.8335652351379395, "learning_rate": 9.754775133149017e-05, "loss": 0.9899507522583008, "memory(GiB)": 91.52, "step": 22365, "token_acc": 0.7253897915640899, "train_speed(iter/s)": 0.216479 }, { "epoch": 0.29026596900758755, "grad_norm": 0.8931208848953247, "learning_rate": 9.754609189283015e-05, "loss": 0.9305184364318848, "memory(GiB)": 91.52, "step": 22370, "token_acc": 0.783398754737412, "train_speed(iter/s)": 0.216441 }, { "epoch": 0.29033084740924325, "grad_norm": 0.8062232732772827, "learning_rate": 9.754443190701274e-05, "loss": 1.0240602493286133, "memory(GiB)": 91.52, "step": 22375, "token_acc": 0.7340675384029453, "train_speed(iter/s)": 0.216407 }, { "epoch": 0.2903957258108989, "grad_norm": 0.8830860257148743, "learning_rate": 9.75427713740571e-05, "loss": 0.9353116989135742, "memory(GiB)": 91.52, "step": 22380, "token_acc": 0.7489349306339439, "train_speed(iter/s)": 0.216371 }, { "epoch": 0.2904606042125546, "grad_norm": 0.8751125335693359, "learning_rate": 9.754111029398229e-05, "loss": 0.9589010238647461, "memory(GiB)": 91.52, "step": 22385, "token_acc": 0.7485567932352224, "train_speed(iter/s)": 0.21633 }, { "epoch": 0.2905254826142103, "grad_norm": 0.7811453342437744, "learning_rate": 9.753944866680744e-05, "loss": 0.971958065032959, "memory(GiB)": 91.52, "step": 22390, "token_acc": 0.7574990854773808, "train_speed(iter/s)": 0.216293 }, { "epoch": 0.290590361015866, "grad_norm": 0.9379813075065613, "learning_rate": 9.753778649255168e-05, "loss": 1.0843942642211915, "memory(GiB)": 91.52, "step": 22395, "token_acc": 0.7205500329049912, "train_speed(iter/s)": 0.216257 }, { "epoch": 0.2906552394175217, "grad_norm": 0.891674280166626, "learning_rate": 9.753612377123413e-05, "loss": 0.9992159843444824, "memory(GiB)": 91.52, "step": 22400, "token_acc": 0.7352540989182937, "train_speed(iter/s)": 0.216217 }, { "epoch": 0.2907201178191774, "grad_norm": 0.7862419486045837, "learning_rate": 9.753446050287393e-05, "loss": 0.9919497489929199, "memory(GiB)": 91.52, "step": 22405, "token_acc": 0.738187265917603, "train_speed(iter/s)": 0.216176 }, { "epoch": 0.2907849962208331, "grad_norm": 0.8193061947822571, "learning_rate": 9.753279668749022e-05, "loss": 0.9268470764160156, "memory(GiB)": 91.52, "step": 22410, "token_acc": 0.7596790142237773, "train_speed(iter/s)": 0.21614 }, { "epoch": 0.2908498746224888, "grad_norm": 0.7886427044868469, "learning_rate": 9.753113232510214e-05, "loss": 0.9749114990234375, "memory(GiB)": 91.52, "step": 22415, "token_acc": 0.7502174543345898, "train_speed(iter/s)": 0.216103 }, { "epoch": 0.2909147530241445, "grad_norm": 0.9100260734558105, "learning_rate": 9.752946741572884e-05, "loss": 0.952372932434082, "memory(GiB)": 91.52, "step": 22420, "token_acc": 0.7446133890610362, "train_speed(iter/s)": 0.216063 }, { "epoch": 0.2909796314258002, "grad_norm": 0.8253270387649536, "learning_rate": 9.75278019593895e-05, "loss": 0.9559485435485839, "memory(GiB)": 91.52, "step": 22425, "token_acc": 0.7379735682819383, "train_speed(iter/s)": 0.216028 }, { "epoch": 0.2910445098274559, "grad_norm": 0.7755334377288818, "learning_rate": 9.752613595610327e-05, "loss": 0.9394211769104004, "memory(GiB)": 91.52, "step": 22430, "token_acc": 0.7536557583966998, "train_speed(iter/s)": 0.215993 }, { "epoch": 0.2911093882291116, "grad_norm": 0.8145129084587097, "learning_rate": 9.752446940588933e-05, "loss": 1.0109460830688477, "memory(GiB)": 91.52, "step": 22435, "token_acc": 0.7450755840586349, "train_speed(iter/s)": 0.215954 }, { "epoch": 0.2911742666307673, "grad_norm": 0.8326326012611389, "learning_rate": 9.752280230876685e-05, "loss": 0.9433183670043945, "memory(GiB)": 91.52, "step": 22440, "token_acc": 0.7603108188882247, "train_speed(iter/s)": 0.215918 }, { "epoch": 0.291239145032423, "grad_norm": 0.780939519405365, "learning_rate": 9.7521134664755e-05, "loss": 0.9471885681152343, "memory(GiB)": 91.52, "step": 22445, "token_acc": 0.7549367925307775, "train_speed(iter/s)": 0.21588 }, { "epoch": 0.2913040234340787, "grad_norm": 0.8631105422973633, "learning_rate": 9.751946647387302e-05, "loss": 1.0067092895507812, "memory(GiB)": 91.52, "step": 22450, "token_acc": 0.7168089835068429, "train_speed(iter/s)": 0.215844 }, { "epoch": 0.2913689018357344, "grad_norm": 0.8649609088897705, "learning_rate": 9.751779773614008e-05, "loss": 0.9946430206298829, "memory(GiB)": 91.52, "step": 22455, "token_acc": 0.7281570953276436, "train_speed(iter/s)": 0.21581 }, { "epoch": 0.2914337802373901, "grad_norm": 0.7827351689338684, "learning_rate": 9.751612845157538e-05, "loss": 0.9248789787292481, "memory(GiB)": 91.52, "step": 22460, "token_acc": 0.7526060480957787, "train_speed(iter/s)": 0.215771 }, { "epoch": 0.2914986586390458, "grad_norm": 0.893903374671936, "learning_rate": 9.751445862019812e-05, "loss": 0.9922880172729492, "memory(GiB)": 91.52, "step": 22465, "token_acc": 0.7601433979964486, "train_speed(iter/s)": 0.215736 }, { "epoch": 0.2915635370407015, "grad_norm": 0.8585269451141357, "learning_rate": 9.751278824202751e-05, "loss": 0.9799441337585449, "memory(GiB)": 91.52, "step": 22470, "token_acc": 0.7491966646329062, "train_speed(iter/s)": 0.215698 }, { "epoch": 0.2916284154423572, "grad_norm": 0.8210256099700928, "learning_rate": 9.751111731708283e-05, "loss": 0.9473297119140625, "memory(GiB)": 91.52, "step": 22475, "token_acc": 0.7776487663280116, "train_speed(iter/s)": 0.215657 }, { "epoch": 0.2916932938440129, "grad_norm": 0.8450677394866943, "learning_rate": 9.750944584538325e-05, "loss": 0.9834068298339844, "memory(GiB)": 91.52, "step": 22480, "token_acc": 0.7585296139513007, "train_speed(iter/s)": 0.215623 }, { "epoch": 0.2917581722456686, "grad_norm": 0.8273333311080933, "learning_rate": 9.750777382694802e-05, "loss": 0.8657312393188477, "memory(GiB)": 91.52, "step": 22485, "token_acc": 0.7478537027989112, "train_speed(iter/s)": 0.215582 }, { "epoch": 0.2918230506473243, "grad_norm": 0.8264237642288208, "learning_rate": 9.75061012617964e-05, "loss": 0.9760722160339356, "memory(GiB)": 91.52, "step": 22490, "token_acc": 0.7586229148517407, "train_speed(iter/s)": 0.215546 }, { "epoch": 0.2918879290489799, "grad_norm": 0.8227353692054749, "learning_rate": 9.750442814994762e-05, "loss": 0.9790445327758789, "memory(GiB)": 91.52, "step": 22495, "token_acc": 0.7403259611474333, "train_speed(iter/s)": 0.215504 }, { "epoch": 0.2919528074506356, "grad_norm": 0.90206378698349, "learning_rate": 9.750275449142095e-05, "loss": 0.9703187942504883, "memory(GiB)": 91.52, "step": 22500, "token_acc": 0.7548259777538572, "train_speed(iter/s)": 0.215467 }, { "epoch": 0.2920176858522913, "grad_norm": 0.7387325167655945, "learning_rate": 9.750108028623561e-05, "loss": 0.9812475204467773, "memory(GiB)": 91.52, "step": 22505, "token_acc": 0.7463073247871875, "train_speed(iter/s)": 0.215432 }, { "epoch": 0.292082564253947, "grad_norm": 0.8793684840202332, "learning_rate": 9.749940553441091e-05, "loss": 0.9969667434692383, "memory(GiB)": 91.52, "step": 22510, "token_acc": 0.7407455683003128, "train_speed(iter/s)": 0.215396 }, { "epoch": 0.2921474426556027, "grad_norm": 0.8139348030090332, "learning_rate": 9.749773023596612e-05, "loss": 0.9762872695922852, "memory(GiB)": 91.52, "step": 22515, "token_acc": 0.7632882139319908, "train_speed(iter/s)": 0.215362 }, { "epoch": 0.2922123210572584, "grad_norm": 0.7907500267028809, "learning_rate": 9.749605439092048e-05, "loss": 0.9507976531982422, "memory(GiB)": 91.52, "step": 22520, "token_acc": 0.7754554829838433, "train_speed(iter/s)": 0.215327 }, { "epoch": 0.2922771994589141, "grad_norm": 0.753102719783783, "learning_rate": 9.749437799929332e-05, "loss": 0.955440902709961, "memory(GiB)": 91.52, "step": 22525, "token_acc": 0.7558890438100829, "train_speed(iter/s)": 0.21529 }, { "epoch": 0.2923420778605698, "grad_norm": 0.8486889004707336, "learning_rate": 9.749270106110393e-05, "loss": 0.9930364608764648, "memory(GiB)": 91.52, "step": 22530, "token_acc": 0.7315177681833278, "train_speed(iter/s)": 0.215256 }, { "epoch": 0.2924069562622255, "grad_norm": 0.85170578956604, "learning_rate": 9.749102357637156e-05, "loss": 0.9446914672851563, "memory(GiB)": 91.52, "step": 22535, "token_acc": 0.7366614774978771, "train_speed(iter/s)": 0.21522 }, { "epoch": 0.2924718346638812, "grad_norm": 0.8755830526351929, "learning_rate": 9.748934554511558e-05, "loss": 1.0540770530700683, "memory(GiB)": 91.52, "step": 22540, "token_acc": 0.7091913611250628, "train_speed(iter/s)": 0.215185 }, { "epoch": 0.2925367130655369, "grad_norm": 0.8213351368904114, "learning_rate": 9.748766696735524e-05, "loss": 1.0196294784545898, "memory(GiB)": 91.52, "step": 22545, "token_acc": 0.7388093106535363, "train_speed(iter/s)": 0.215147 }, { "epoch": 0.2926015914671926, "grad_norm": 0.7959758639335632, "learning_rate": 9.748598784310989e-05, "loss": 0.9649934768676758, "memory(GiB)": 91.52, "step": 22550, "token_acc": 0.7488014414878214, "train_speed(iter/s)": 0.215112 }, { "epoch": 0.2926664698688483, "grad_norm": 0.7845593094825745, "learning_rate": 9.748430817239886e-05, "loss": 0.9613603591918946, "memory(GiB)": 91.52, "step": 22555, "token_acc": 0.7343671295963158, "train_speed(iter/s)": 0.215078 }, { "epoch": 0.292731348270504, "grad_norm": 0.7543190717697144, "learning_rate": 9.748262795524146e-05, "loss": 0.9612462997436524, "memory(GiB)": 91.52, "step": 22560, "token_acc": 0.7484759857641213, "train_speed(iter/s)": 0.215042 }, { "epoch": 0.2927962266721597, "grad_norm": 0.809224545955658, "learning_rate": 9.748094719165704e-05, "loss": 0.9611921310424805, "memory(GiB)": 91.52, "step": 22565, "token_acc": 0.7476694214876033, "train_speed(iter/s)": 0.215005 }, { "epoch": 0.2928611050738154, "grad_norm": 0.7949914932250977, "learning_rate": 9.747926588166492e-05, "loss": 0.9396793365478515, "memory(GiB)": 91.52, "step": 22570, "token_acc": 0.7458496181290114, "train_speed(iter/s)": 0.214971 }, { "epoch": 0.2929259834754711, "grad_norm": 0.7911983728408813, "learning_rate": 9.747758402528449e-05, "loss": 0.9540175437927246, "memory(GiB)": 91.52, "step": 22575, "token_acc": 0.7414507555146289, "train_speed(iter/s)": 0.21493 }, { "epoch": 0.2929908618771268, "grad_norm": 0.8519187569618225, "learning_rate": 9.747590162253507e-05, "loss": 0.9362342834472657, "memory(GiB)": 91.52, "step": 22580, "token_acc": 0.7397553755063883, "train_speed(iter/s)": 0.214893 }, { "epoch": 0.2930557402787825, "grad_norm": 0.8059895634651184, "learning_rate": 9.7474218673436e-05, "loss": 1.0062926292419434, "memory(GiB)": 91.52, "step": 22585, "token_acc": 0.7248121371843727, "train_speed(iter/s)": 0.214859 }, { "epoch": 0.2931206186804382, "grad_norm": 0.8068774342536926, "learning_rate": 9.74725351780067e-05, "loss": 0.9095612525939941, "memory(GiB)": 91.52, "step": 22590, "token_acc": 0.7811412898046561, "train_speed(iter/s)": 0.214825 }, { "epoch": 0.2931854970820939, "grad_norm": 0.7700682282447815, "learning_rate": 9.747085113626653e-05, "loss": 0.9630577087402343, "memory(GiB)": 91.52, "step": 22595, "token_acc": 0.7597315436241611, "train_speed(iter/s)": 0.214786 }, { "epoch": 0.2932503754837496, "grad_norm": 0.8591657876968384, "learning_rate": 9.746916654823485e-05, "loss": 0.9712109565734863, "memory(GiB)": 91.52, "step": 22600, "token_acc": 0.7502192397147978, "train_speed(iter/s)": 0.21475 }, { "epoch": 0.2933152538854053, "grad_norm": 0.9289772510528564, "learning_rate": 9.746748141393105e-05, "loss": 0.8824058532714844, "memory(GiB)": 91.52, "step": 22605, "token_acc": 0.7670171903532523, "train_speed(iter/s)": 0.214716 }, { "epoch": 0.293380132287061, "grad_norm": 0.9411822557449341, "learning_rate": 9.746579573337453e-05, "loss": 0.9712099075317383, "memory(GiB)": 91.52, "step": 22610, "token_acc": 0.7298961746476649, "train_speed(iter/s)": 0.214682 }, { "epoch": 0.29344501068871665, "grad_norm": 0.726144015789032, "learning_rate": 9.74641095065847e-05, "loss": 0.9827608108520508, "memory(GiB)": 91.52, "step": 22615, "token_acc": 0.7497727520634113, "train_speed(iter/s)": 0.214647 }, { "epoch": 0.29350988909037234, "grad_norm": 0.835130512714386, "learning_rate": 9.746242273358095e-05, "loss": 0.9611032485961915, "memory(GiB)": 91.52, "step": 22620, "token_acc": 0.754278611721962, "train_speed(iter/s)": 0.214609 }, { "epoch": 0.29357476749202804, "grad_norm": 0.835271418094635, "learning_rate": 9.746073541438269e-05, "loss": 0.9458924293518066, "memory(GiB)": 91.52, "step": 22625, "token_acc": 0.7534756431308155, "train_speed(iter/s)": 0.214571 }, { "epoch": 0.29363964589368374, "grad_norm": 0.8467349410057068, "learning_rate": 9.745904754900934e-05, "loss": 0.9410566329956055, "memory(GiB)": 91.52, "step": 22630, "token_acc": 0.740574841358716, "train_speed(iter/s)": 0.214532 }, { "epoch": 0.29370452429533944, "grad_norm": 0.8445695042610168, "learning_rate": 9.745735913748033e-05, "loss": 0.9680734634399414, "memory(GiB)": 91.52, "step": 22635, "token_acc": 0.7550656777027307, "train_speed(iter/s)": 0.214499 }, { "epoch": 0.29376940269699514, "grad_norm": 0.8225666284561157, "learning_rate": 9.745567017981507e-05, "loss": 0.9836173057556152, "memory(GiB)": 91.52, "step": 22640, "token_acc": 0.7277378159018348, "train_speed(iter/s)": 0.214465 }, { "epoch": 0.29383428109865084, "grad_norm": 0.7579591274261475, "learning_rate": 9.745398067603303e-05, "loss": 0.9485374450683594, "memory(GiB)": 91.52, "step": 22645, "token_acc": 0.754888875993733, "train_speed(iter/s)": 0.214428 }, { "epoch": 0.29389915950030654, "grad_norm": 0.8562624454498291, "learning_rate": 9.745229062615362e-05, "loss": 0.9564484596252442, "memory(GiB)": 91.52, "step": 22650, "token_acc": 0.7343822151856982, "train_speed(iter/s)": 0.214389 }, { "epoch": 0.29396403790196224, "grad_norm": 0.832277774810791, "learning_rate": 9.745060003019633e-05, "loss": 0.9386022567749024, "memory(GiB)": 91.52, "step": 22655, "token_acc": 0.7398173025255239, "train_speed(iter/s)": 0.214354 }, { "epoch": 0.29402891630361794, "grad_norm": 0.8277179598808289, "learning_rate": 9.744890888818057e-05, "loss": 0.9819917678833008, "memory(GiB)": 91.52, "step": 22660, "token_acc": 0.757217565728204, "train_speed(iter/s)": 0.21432 }, { "epoch": 0.29409379470527364, "grad_norm": 0.8538158535957336, "learning_rate": 9.744721720012582e-05, "loss": 0.9753553390502929, "memory(GiB)": 91.52, "step": 22665, "token_acc": 0.7297531740399023, "train_speed(iter/s)": 0.214286 }, { "epoch": 0.29415867310692934, "grad_norm": 0.7826161980628967, "learning_rate": 9.744552496605153e-05, "loss": 0.9692039489746094, "memory(GiB)": 91.52, "step": 22670, "token_acc": 0.7250690507341183, "train_speed(iter/s)": 0.214251 }, { "epoch": 0.29422355150858504, "grad_norm": 0.8311330080032349, "learning_rate": 9.744383218597721e-05, "loss": 0.9617868423461914, "memory(GiB)": 91.52, "step": 22675, "token_acc": 0.711442995372318, "train_speed(iter/s)": 0.214215 }, { "epoch": 0.29428842991024073, "grad_norm": 0.8685145378112793, "learning_rate": 9.744213885992233e-05, "loss": 0.9966928482055664, "memory(GiB)": 91.52, "step": 22680, "token_acc": 0.7392094502498864, "train_speed(iter/s)": 0.21418 }, { "epoch": 0.29435330831189643, "grad_norm": 0.7427253127098083, "learning_rate": 9.744044498790636e-05, "loss": 0.9187726974487305, "memory(GiB)": 91.52, "step": 22685, "token_acc": 0.7635696898808206, "train_speed(iter/s)": 0.214145 }, { "epoch": 0.29441818671355213, "grad_norm": 0.8129703402519226, "learning_rate": 9.743875056994882e-05, "loss": 0.9188488006591797, "memory(GiB)": 91.52, "step": 22690, "token_acc": 0.7627800305790069, "train_speed(iter/s)": 0.214108 }, { "epoch": 0.29448306511520783, "grad_norm": 0.8208444714546204, "learning_rate": 9.743705560606917e-05, "loss": 0.9720636367797851, "memory(GiB)": 91.52, "step": 22695, "token_acc": 0.7507453990883854, "train_speed(iter/s)": 0.214074 }, { "epoch": 0.29454794351686353, "grad_norm": 0.7262234091758728, "learning_rate": 9.743536009628694e-05, "loss": 0.9589697837829589, "memory(GiB)": 91.52, "step": 22700, "token_acc": 0.7457045982704102, "train_speed(iter/s)": 0.214039 }, { "epoch": 0.29461282191851923, "grad_norm": 0.8703934550285339, "learning_rate": 9.743366404062164e-05, "loss": 0.9733591079711914, "memory(GiB)": 91.52, "step": 22705, "token_acc": 0.7688662585354368, "train_speed(iter/s)": 0.213998 }, { "epoch": 0.29467770032017493, "grad_norm": 0.8280273675918579, "learning_rate": 9.74319674390928e-05, "loss": 0.9791082382202149, "memory(GiB)": 91.52, "step": 22710, "token_acc": 0.7547099421749673, "train_speed(iter/s)": 0.213964 }, { "epoch": 0.29474257872183063, "grad_norm": 0.7726371884346008, "learning_rate": 9.743027029171992e-05, "loss": 0.967892074584961, "memory(GiB)": 91.52, "step": 22715, "token_acc": 0.7438170959586707, "train_speed(iter/s)": 0.213929 }, { "epoch": 0.2948074571234863, "grad_norm": 0.80361008644104, "learning_rate": 9.742857259852254e-05, "loss": 0.9283902168273925, "memory(GiB)": 91.52, "step": 22720, "token_acc": 0.7509876543209877, "train_speed(iter/s)": 0.213896 }, { "epoch": 0.294872335525142, "grad_norm": 0.8290703296661377, "learning_rate": 9.742687435952021e-05, "loss": 0.9709705352783203, "memory(GiB)": 91.52, "step": 22725, "token_acc": 0.7485389771188959, "train_speed(iter/s)": 0.213858 }, { "epoch": 0.2949372139267977, "grad_norm": 0.8462821841239929, "learning_rate": 9.742517557473246e-05, "loss": 0.9554977416992188, "memory(GiB)": 91.52, "step": 22730, "token_acc": 0.7480880566078193, "train_speed(iter/s)": 0.213822 }, { "epoch": 0.29500209232845337, "grad_norm": 0.8790311217308044, "learning_rate": 9.742347624417882e-05, "loss": 1.0392610549926757, "memory(GiB)": 91.52, "step": 22735, "token_acc": 0.7445927289461574, "train_speed(iter/s)": 0.213786 }, { "epoch": 0.29506697073010907, "grad_norm": 0.8883122801780701, "learning_rate": 9.74217763678789e-05, "loss": 0.9458722114562989, "memory(GiB)": 91.52, "step": 22740, "token_acc": 0.7442227596399301, "train_speed(iter/s)": 0.21375 }, { "epoch": 0.29513184913176477, "grad_norm": 0.9219024181365967, "learning_rate": 9.742007594585221e-05, "loss": 0.9942684173583984, "memory(GiB)": 91.52, "step": 22745, "token_acc": 0.7397544152670811, "train_speed(iter/s)": 0.213715 }, { "epoch": 0.29519672753342047, "grad_norm": 0.8369412422180176, "learning_rate": 9.741837497811835e-05, "loss": 0.9605014801025391, "memory(GiB)": 91.52, "step": 22750, "token_acc": 0.747866563227308, "train_speed(iter/s)": 0.213678 }, { "epoch": 0.29526160593507617, "grad_norm": 0.7950398921966553, "learning_rate": 9.741667346469686e-05, "loss": 0.9462100982666015, "memory(GiB)": 91.52, "step": 22755, "token_acc": 0.7511685455045367, "train_speed(iter/s)": 0.213643 }, { "epoch": 0.29532648433673186, "grad_norm": 0.7433353662490845, "learning_rate": 9.741497140560736e-05, "loss": 0.9366922378540039, "memory(GiB)": 91.52, "step": 22760, "token_acc": 0.7532120102784329, "train_speed(iter/s)": 0.213608 }, { "epoch": 0.29539136273838756, "grad_norm": 0.7013775706291199, "learning_rate": 9.741326880086942e-05, "loss": 0.9128274917602539, "memory(GiB)": 91.52, "step": 22765, "token_acc": 0.7623951803305381, "train_speed(iter/s)": 0.213575 }, { "epoch": 0.29545624114004326, "grad_norm": 0.8280256390571594, "learning_rate": 9.741156565050263e-05, "loss": 0.9667691230773926, "memory(GiB)": 91.52, "step": 22770, "token_acc": 0.7568689977381438, "train_speed(iter/s)": 0.213541 }, { "epoch": 0.29552111954169896, "grad_norm": 0.8304272890090942, "learning_rate": 9.74098619545266e-05, "loss": 0.9362658500671387, "memory(GiB)": 91.52, "step": 22775, "token_acc": 0.7561172967374418, "train_speed(iter/s)": 0.213507 }, { "epoch": 0.29558599794335466, "grad_norm": 0.816639244556427, "learning_rate": 9.740815771296092e-05, "loss": 0.9712350845336915, "memory(GiB)": 91.52, "step": 22780, "token_acc": 0.7468354430379747, "train_speed(iter/s)": 0.213467 }, { "epoch": 0.29565087634501036, "grad_norm": 0.8289127349853516, "learning_rate": 9.740645292582522e-05, "loss": 1.0008901596069335, "memory(GiB)": 91.52, "step": 22785, "token_acc": 0.7427473438324166, "train_speed(iter/s)": 0.213429 }, { "epoch": 0.29571575474666606, "grad_norm": 0.8031802773475647, "learning_rate": 9.74047475931391e-05, "loss": 0.9460955619812011, "memory(GiB)": 91.52, "step": 22790, "token_acc": 0.7409845828095017, "train_speed(iter/s)": 0.213395 }, { "epoch": 0.29578063314832176, "grad_norm": 0.830144464969635, "learning_rate": 9.740304171492221e-05, "loss": 0.9938412666320801, "memory(GiB)": 91.52, "step": 22795, "token_acc": 0.7537798553794464, "train_speed(iter/s)": 0.213362 }, { "epoch": 0.29584551154997746, "grad_norm": 0.7394850850105286, "learning_rate": 9.740133529119417e-05, "loss": 0.9799063682556153, "memory(GiB)": 91.52, "step": 22800, "token_acc": 0.7442701178512038, "train_speed(iter/s)": 0.213329 }, { "epoch": 0.29591038995163316, "grad_norm": 0.7449653148651123, "learning_rate": 9.73996283219746e-05, "loss": 0.921320629119873, "memory(GiB)": 91.52, "step": 22805, "token_acc": 0.755289052890529, "train_speed(iter/s)": 0.213293 }, { "epoch": 0.29597526835328886, "grad_norm": 0.7979636192321777, "learning_rate": 9.739792080728315e-05, "loss": 0.9773059844970703, "memory(GiB)": 91.52, "step": 22810, "token_acc": 0.7550477533491443, "train_speed(iter/s)": 0.213254 }, { "epoch": 0.29604014675494456, "grad_norm": 0.8259201645851135, "learning_rate": 9.73962127471395e-05, "loss": 0.9831817626953125, "memory(GiB)": 91.52, "step": 22815, "token_acc": 0.7400982228032364, "train_speed(iter/s)": 0.21322 }, { "epoch": 0.29610502515660025, "grad_norm": 0.7727831602096558, "learning_rate": 9.739450414156328e-05, "loss": 0.9053201675415039, "memory(GiB)": 91.52, "step": 22820, "token_acc": 0.7671658660638598, "train_speed(iter/s)": 0.213182 }, { "epoch": 0.29616990355825595, "grad_norm": 0.8357605338096619, "learning_rate": 9.739279499057415e-05, "loss": 0.945576286315918, "memory(GiB)": 91.52, "step": 22825, "token_acc": 0.7576552081310287, "train_speed(iter/s)": 0.213145 }, { "epoch": 0.29623478195991165, "grad_norm": 0.875241219997406, "learning_rate": 9.739108529419178e-05, "loss": 0.9568386077880859, "memory(GiB)": 91.52, "step": 22830, "token_acc": 0.7598123920019748, "train_speed(iter/s)": 0.21311 }, { "epoch": 0.29629966036156735, "grad_norm": 0.8164471387863159, "learning_rate": 9.738937505243587e-05, "loss": 0.9720930099487305, "memory(GiB)": 91.52, "step": 22835, "token_acc": 0.7312701252236136, "train_speed(iter/s)": 0.213074 }, { "epoch": 0.29636453876322305, "grad_norm": 0.7637648582458496, "learning_rate": 9.738766426532606e-05, "loss": 0.8948090553283692, "memory(GiB)": 91.52, "step": 22840, "token_acc": 0.7369969151302102, "train_speed(iter/s)": 0.213038 }, { "epoch": 0.29642941716487875, "grad_norm": 0.8140031099319458, "learning_rate": 9.738595293288209e-05, "loss": 0.9392415046691894, "memory(GiB)": 91.52, "step": 22845, "token_acc": 0.7588847506004386, "train_speed(iter/s)": 0.213004 }, { "epoch": 0.29649429556653445, "grad_norm": 0.7146490216255188, "learning_rate": 9.738424105512359e-05, "loss": 0.9336771011352539, "memory(GiB)": 91.52, "step": 22850, "token_acc": 0.7600012347203359, "train_speed(iter/s)": 0.212968 }, { "epoch": 0.2965591739681901, "grad_norm": 0.7837404012680054, "learning_rate": 9.738252863207031e-05, "loss": 0.9739635467529297, "memory(GiB)": 91.52, "step": 22855, "token_acc": 0.7512244553284918, "train_speed(iter/s)": 0.212927 }, { "epoch": 0.2966240523698458, "grad_norm": 0.8442492485046387, "learning_rate": 9.738081566374193e-05, "loss": 0.9701244354248046, "memory(GiB)": 91.52, "step": 22860, "token_acc": 0.727545677565816, "train_speed(iter/s)": 0.212894 }, { "epoch": 0.2966889307715015, "grad_norm": 0.9420937895774841, "learning_rate": 9.737910215015818e-05, "loss": 0.9510702133178711, "memory(GiB)": 91.52, "step": 22865, "token_acc": 0.7519622726832262, "train_speed(iter/s)": 0.212861 }, { "epoch": 0.2967538091731572, "grad_norm": 0.8537925481796265, "learning_rate": 9.737738809133878e-05, "loss": 0.9420301437377929, "memory(GiB)": 91.52, "step": 22870, "token_acc": 0.7398535990239935, "train_speed(iter/s)": 0.212826 }, { "epoch": 0.2968186875748129, "grad_norm": 0.950160562992096, "learning_rate": 9.737567348730344e-05, "loss": 0.9648783683776856, "memory(GiB)": 91.52, "step": 22875, "token_acc": 0.7413683399752186, "train_speed(iter/s)": 0.212793 }, { "epoch": 0.2968835659764686, "grad_norm": 0.8919036388397217, "learning_rate": 9.73739583380719e-05, "loss": 0.9564715385437011, "memory(GiB)": 91.52, "step": 22880, "token_acc": 0.7230892678034102, "train_speed(iter/s)": 0.212759 }, { "epoch": 0.2969484443781243, "grad_norm": 0.7553275227546692, "learning_rate": 9.737224264366388e-05, "loss": 0.9693070411682129, "memory(GiB)": 91.52, "step": 22885, "token_acc": 0.7614249363867684, "train_speed(iter/s)": 0.212728 }, { "epoch": 0.29701332277978, "grad_norm": 0.7847737073898315, "learning_rate": 9.737052640409915e-05, "loss": 0.952237319946289, "memory(GiB)": 91.52, "step": 22890, "token_acc": 0.7332040933070616, "train_speed(iter/s)": 0.212691 }, { "epoch": 0.2970782011814357, "grad_norm": 0.8502553701400757, "learning_rate": 9.736880961939746e-05, "loss": 1.027536392211914, "memory(GiB)": 91.52, "step": 22895, "token_acc": 0.7474555451567618, "train_speed(iter/s)": 0.212658 }, { "epoch": 0.2971430795830914, "grad_norm": 0.9268811345100403, "learning_rate": 9.736709228957856e-05, "loss": 0.9736621856689454, "memory(GiB)": 91.52, "step": 22900, "token_acc": 0.7548483807501327, "train_speed(iter/s)": 0.212624 }, { "epoch": 0.2972079579847471, "grad_norm": 0.7417436838150024, "learning_rate": 9.73653744146622e-05, "loss": 0.8920705795288086, "memory(GiB)": 91.52, "step": 22905, "token_acc": 0.7759014771603043, "train_speed(iter/s)": 0.212586 }, { "epoch": 0.2972728363864028, "grad_norm": 0.7416030764579773, "learning_rate": 9.736365599466816e-05, "loss": 0.9606976509094238, "memory(GiB)": 91.52, "step": 22910, "token_acc": 0.7590734378720365, "train_speed(iter/s)": 0.212552 }, { "epoch": 0.2973377147880585, "grad_norm": 0.8293197154998779, "learning_rate": 9.736193702961622e-05, "loss": 0.9582114219665527, "memory(GiB)": 91.52, "step": 22915, "token_acc": 0.7632639954329753, "train_speed(iter/s)": 0.21252 }, { "epoch": 0.2974025931897142, "grad_norm": 0.7860613465309143, "learning_rate": 9.736021751952616e-05, "loss": 0.9672127723693847, "memory(GiB)": 91.52, "step": 22920, "token_acc": 0.7359951689105183, "train_speed(iter/s)": 0.212482 }, { "epoch": 0.2974674715913699, "grad_norm": 0.7532865405082703, "learning_rate": 9.735849746441775e-05, "loss": 0.969087791442871, "memory(GiB)": 91.52, "step": 22925, "token_acc": 0.760710944808232, "train_speed(iter/s)": 0.212444 }, { "epoch": 0.2975323499930256, "grad_norm": 0.8500006198883057, "learning_rate": 9.73567768643108e-05, "loss": 0.9357025146484375, "memory(GiB)": 91.52, "step": 22930, "token_acc": 0.7412326418076056, "train_speed(iter/s)": 0.212411 }, { "epoch": 0.2975972283946813, "grad_norm": 0.7747793793678284, "learning_rate": 9.735505571922512e-05, "loss": 0.9930515289306641, "memory(GiB)": 91.52, "step": 22935, "token_acc": 0.7378992522172627, "train_speed(iter/s)": 0.212373 }, { "epoch": 0.297662106796337, "grad_norm": 0.8246597051620483, "learning_rate": 9.735333402918051e-05, "loss": 0.9619876861572265, "memory(GiB)": 91.52, "step": 22940, "token_acc": 0.751266370150729, "train_speed(iter/s)": 0.212337 }, { "epoch": 0.2977269851979927, "grad_norm": 0.8938852548599243, "learning_rate": 9.735161179419678e-05, "loss": 1.0028481483459473, "memory(GiB)": 91.52, "step": 22945, "token_acc": 0.7361383709519136, "train_speed(iter/s)": 0.212307 }, { "epoch": 0.2977918635996484, "grad_norm": 0.7540754079818726, "learning_rate": 9.734988901429373e-05, "loss": 0.9767558097839355, "memory(GiB)": 91.52, "step": 22950, "token_acc": 0.7520430848232458, "train_speed(iter/s)": 0.212274 }, { "epoch": 0.2978567420013041, "grad_norm": 0.7984681129455566, "learning_rate": 9.734816568949122e-05, "loss": 0.946112060546875, "memory(GiB)": 91.52, "step": 22955, "token_acc": 0.7366869414101291, "train_speed(iter/s)": 0.212241 }, { "epoch": 0.2979216204029598, "grad_norm": 0.7175408601760864, "learning_rate": 9.734644181980906e-05, "loss": 0.952877140045166, "memory(GiB)": 91.52, "step": 22960, "token_acc": 0.7489658317318654, "train_speed(iter/s)": 0.212204 }, { "epoch": 0.2979864988046155, "grad_norm": 0.8554843068122864, "learning_rate": 9.73447174052671e-05, "loss": 0.9361610412597656, "memory(GiB)": 91.52, "step": 22965, "token_acc": 0.7485495870589038, "train_speed(iter/s)": 0.212169 }, { "epoch": 0.2980513772062712, "grad_norm": 0.7703229188919067, "learning_rate": 9.734299244588519e-05, "loss": 0.9065696716308593, "memory(GiB)": 91.52, "step": 22970, "token_acc": 0.7607200720072007, "train_speed(iter/s)": 0.212136 }, { "epoch": 0.2981162556079268, "grad_norm": 0.8144422173500061, "learning_rate": 9.734126694168316e-05, "loss": 0.9197742462158203, "memory(GiB)": 91.52, "step": 22975, "token_acc": 0.7460941183867216, "train_speed(iter/s)": 0.212103 }, { "epoch": 0.2981811340095825, "grad_norm": 0.9143841862678528, "learning_rate": 9.733954089268088e-05, "loss": 1.0035043716430665, "memory(GiB)": 91.52, "step": 22980, "token_acc": 0.7539273884096946, "train_speed(iter/s)": 0.212071 }, { "epoch": 0.2982460124112382, "grad_norm": 0.8039466142654419, "learning_rate": 9.733781429889819e-05, "loss": 0.8975257873535156, "memory(GiB)": 91.52, "step": 22985, "token_acc": 0.7584032829695952, "train_speed(iter/s)": 0.212039 }, { "epoch": 0.2983108908128939, "grad_norm": 0.7855985760688782, "learning_rate": 9.733608716035499e-05, "loss": 0.9840813636779785, "memory(GiB)": 91.52, "step": 22990, "token_acc": 0.7542800837541569, "train_speed(iter/s)": 0.212007 }, { "epoch": 0.2983757692145496, "grad_norm": 0.9092545509338379, "learning_rate": 9.733435947707116e-05, "loss": 0.954001522064209, "memory(GiB)": 91.52, "step": 22995, "token_acc": 0.7436123011015912, "train_speed(iter/s)": 0.211974 }, { "epoch": 0.2984406476162053, "grad_norm": 0.9041509032249451, "learning_rate": 9.733263124906655e-05, "loss": 0.9753281593322753, "memory(GiB)": 91.52, "step": 23000, "token_acc": 0.7529380341880342, "train_speed(iter/s)": 0.211933 }, { "epoch": 0.298505526017861, "grad_norm": 0.804768979549408, "learning_rate": 9.733090247636108e-05, "loss": 0.9462771415710449, "memory(GiB)": 91.52, "step": 23005, "token_acc": 0.7477447127082164, "train_speed(iter/s)": 0.211899 }, { "epoch": 0.2985704044195167, "grad_norm": 0.8701196312904358, "learning_rate": 9.732917315897463e-05, "loss": 0.9534487724304199, "memory(GiB)": 91.52, "step": 23010, "token_acc": 0.7342573962292273, "train_speed(iter/s)": 0.211865 }, { "epoch": 0.2986352828211724, "grad_norm": 0.7513986229896545, "learning_rate": 9.732744329692709e-05, "loss": 0.961749267578125, "memory(GiB)": 91.52, "step": 23015, "token_acc": 0.748269718358117, "train_speed(iter/s)": 0.211833 }, { "epoch": 0.2987001612228281, "grad_norm": 0.810889482498169, "learning_rate": 9.732571289023837e-05, "loss": 0.987004280090332, "memory(GiB)": 91.52, "step": 23020, "token_acc": 0.7287511394712853, "train_speed(iter/s)": 0.211803 }, { "epoch": 0.2987650396244838, "grad_norm": 0.7398605942726135, "learning_rate": 9.732398193892842e-05, "loss": 0.9696962356567382, "memory(GiB)": 91.52, "step": 23025, "token_acc": 0.7516264948261974, "train_speed(iter/s)": 0.211767 }, { "epoch": 0.2988299180261395, "grad_norm": 0.9218107461929321, "learning_rate": 9.732225044301711e-05, "loss": 0.943022346496582, "memory(GiB)": 91.52, "step": 23030, "token_acc": 0.7758434218330251, "train_speed(iter/s)": 0.211737 }, { "epoch": 0.2988947964277952, "grad_norm": 0.7748141884803772, "learning_rate": 9.73205184025244e-05, "loss": 0.9011056900024415, "memory(GiB)": 91.52, "step": 23035, "token_acc": 0.7638324535753617, "train_speed(iter/s)": 0.211698 }, { "epoch": 0.2989596748294509, "grad_norm": 0.7898619174957275, "learning_rate": 9.731878581747021e-05, "loss": 0.9312491416931152, "memory(GiB)": 91.52, "step": 23040, "token_acc": 0.7635181015689217, "train_speed(iter/s)": 0.211663 }, { "epoch": 0.2990245532311066, "grad_norm": 0.7935343384742737, "learning_rate": 9.731705268787447e-05, "loss": 0.9595559120178223, "memory(GiB)": 91.52, "step": 23045, "token_acc": 0.728049562059389, "train_speed(iter/s)": 0.211634 }, { "epoch": 0.2990894316327623, "grad_norm": 0.8161603808403015, "learning_rate": 9.731531901375713e-05, "loss": 0.9358865737915039, "memory(GiB)": 91.52, "step": 23050, "token_acc": 0.757814587229495, "train_speed(iter/s)": 0.211599 }, { "epoch": 0.299154310034418, "grad_norm": 0.8511160016059875, "learning_rate": 9.731358479513814e-05, "loss": 0.9727437973022461, "memory(GiB)": 91.52, "step": 23055, "token_acc": 0.7513672696676483, "train_speed(iter/s)": 0.211566 }, { "epoch": 0.2992191884360737, "grad_norm": 0.8723353743553162, "learning_rate": 9.731185003203749e-05, "loss": 0.9670475006103516, "memory(GiB)": 91.52, "step": 23060, "token_acc": 0.7366513908131728, "train_speed(iter/s)": 0.211533 }, { "epoch": 0.2992840668377294, "grad_norm": 0.8189582824707031, "learning_rate": 9.731011472447509e-05, "loss": 0.9553266525268554, "memory(GiB)": 91.52, "step": 23065, "token_acc": 0.7346808646304916, "train_speed(iter/s)": 0.211496 }, { "epoch": 0.2993489452393851, "grad_norm": 0.8794007897377014, "learning_rate": 9.730837887247094e-05, "loss": 0.9156485557556152, "memory(GiB)": 91.52, "step": 23070, "token_acc": 0.7589789768988329, "train_speed(iter/s)": 0.211465 }, { "epoch": 0.2994138236410408, "grad_norm": 0.6622955799102783, "learning_rate": 9.730664247604501e-05, "loss": 0.9540709495544434, "memory(GiB)": 91.52, "step": 23075, "token_acc": 0.7782620922384702, "train_speed(iter/s)": 0.21143 }, { "epoch": 0.2994787020426965, "grad_norm": 0.8272372484207153, "learning_rate": 9.730490553521729e-05, "loss": 0.9650933265686035, "memory(GiB)": 91.52, "step": 23080, "token_acc": 0.7231771976630307, "train_speed(iter/s)": 0.211397 }, { "epoch": 0.2995435804443522, "grad_norm": 0.9623756408691406, "learning_rate": 9.730316805000775e-05, "loss": 0.9796133041381836, "memory(GiB)": 91.52, "step": 23085, "token_acc": 0.7223800383877159, "train_speed(iter/s)": 0.211363 }, { "epoch": 0.2996084588460079, "grad_norm": 0.825222373008728, "learning_rate": 9.730143002043641e-05, "loss": 0.9602171897888183, "memory(GiB)": 91.52, "step": 23090, "token_acc": 0.7284502321684181, "train_speed(iter/s)": 0.211331 }, { "epoch": 0.29967333724766354, "grad_norm": 0.7607092261314392, "learning_rate": 9.729969144652324e-05, "loss": 0.9871995925903321, "memory(GiB)": 91.52, "step": 23095, "token_acc": 0.7365572252688555, "train_speed(iter/s)": 0.211299 }, { "epoch": 0.29973821564931924, "grad_norm": 0.7834564447402954, "learning_rate": 9.729795232828829e-05, "loss": 0.9210414886474609, "memory(GiB)": 91.52, "step": 23100, "token_acc": 0.7612722824187347, "train_speed(iter/s)": 0.211267 }, { "epoch": 0.29980309405097494, "grad_norm": 0.8380100727081299, "learning_rate": 9.729621266575153e-05, "loss": 0.93489990234375, "memory(GiB)": 91.52, "step": 23105, "token_acc": 0.7476553783058412, "train_speed(iter/s)": 0.211232 }, { "epoch": 0.29986797245263064, "grad_norm": 0.9599993228912354, "learning_rate": 9.7294472458933e-05, "loss": 0.9184208869934082, "memory(GiB)": 91.52, "step": 23110, "token_acc": 0.7596978061891617, "train_speed(iter/s)": 0.211195 }, { "epoch": 0.29993285085428634, "grad_norm": 0.9701860547065735, "learning_rate": 9.729273170785273e-05, "loss": 0.9952497482299805, "memory(GiB)": 91.52, "step": 23115, "token_acc": 0.7420173466109862, "train_speed(iter/s)": 0.21116 }, { "epoch": 0.29999772925594204, "grad_norm": 0.9316595196723938, "learning_rate": 9.729099041253074e-05, "loss": 0.9810340881347657, "memory(GiB)": 91.52, "step": 23120, "token_acc": 0.7418184815761273, "train_speed(iter/s)": 0.211127 }, { "epoch": 0.30006260765759774, "grad_norm": 0.819197416305542, "learning_rate": 9.728924857298707e-05, "loss": 0.9320234298706055, "memory(GiB)": 91.52, "step": 23125, "token_acc": 0.7513163090693695, "train_speed(iter/s)": 0.211091 }, { "epoch": 0.30012748605925343, "grad_norm": 0.8384889364242554, "learning_rate": 9.728750618924179e-05, "loss": 0.97781400680542, "memory(GiB)": 91.52, "step": 23130, "token_acc": 0.774920800910405, "train_speed(iter/s)": 0.211056 }, { "epoch": 0.30019236446090913, "grad_norm": 0.771294891834259, "learning_rate": 9.728576326131492e-05, "loss": 0.9704307556152344, "memory(GiB)": 91.52, "step": 23135, "token_acc": 0.73947711957097, "train_speed(iter/s)": 0.211023 }, { "epoch": 0.30025724286256483, "grad_norm": 0.8684874176979065, "learning_rate": 9.728401978922654e-05, "loss": 0.9958017349243165, "memory(GiB)": 91.52, "step": 23140, "token_acc": 0.7331715304988596, "train_speed(iter/s)": 0.21099 }, { "epoch": 0.30032212126422053, "grad_norm": 0.8670652508735657, "learning_rate": 9.728227577299669e-05, "loss": 0.9541094779968262, "memory(GiB)": 91.52, "step": 23145, "token_acc": 0.7475027746947835, "train_speed(iter/s)": 0.210958 }, { "epoch": 0.30038699966587623, "grad_norm": 0.7959731221199036, "learning_rate": 9.728053121264546e-05, "loss": 0.9650596618652344, "memory(GiB)": 91.52, "step": 23150, "token_acc": 0.7604936330765603, "train_speed(iter/s)": 0.210924 }, { "epoch": 0.30045187806753193, "grad_norm": 0.7990418672561646, "learning_rate": 9.727878610819292e-05, "loss": 0.9723050117492675, "memory(GiB)": 91.52, "step": 23155, "token_acc": 0.7417425489734888, "train_speed(iter/s)": 0.210891 }, { "epoch": 0.30051675646918763, "grad_norm": 0.8317358493804932, "learning_rate": 9.727704045965915e-05, "loss": 1.0116759300231934, "memory(GiB)": 91.52, "step": 23160, "token_acc": 0.7307183251112983, "train_speed(iter/s)": 0.210858 }, { "epoch": 0.30058163487084333, "grad_norm": 0.8441217541694641, "learning_rate": 9.727529426706425e-05, "loss": 0.9553965568542481, "memory(GiB)": 91.52, "step": 23165, "token_acc": 0.7282057151577582, "train_speed(iter/s)": 0.210828 }, { "epoch": 0.300646513272499, "grad_norm": 0.7570053935050964, "learning_rate": 9.727354753042829e-05, "loss": 0.9747281074523926, "memory(GiB)": 91.52, "step": 23170, "token_acc": 0.7219835567735723, "train_speed(iter/s)": 0.210798 }, { "epoch": 0.3007113916741547, "grad_norm": 0.8020026683807373, "learning_rate": 9.727180024977141e-05, "loss": 0.9261730194091797, "memory(GiB)": 91.52, "step": 23175, "token_acc": 0.7408131482156466, "train_speed(iter/s)": 0.210764 }, { "epoch": 0.3007762700758104, "grad_norm": 0.8363534808158875, "learning_rate": 9.727005242511368e-05, "loss": 0.9559893608093262, "memory(GiB)": 91.52, "step": 23180, "token_acc": 0.7484821633684486, "train_speed(iter/s)": 0.210732 }, { "epoch": 0.3008411484774661, "grad_norm": 0.7624680399894714, "learning_rate": 9.726830405647522e-05, "loss": 0.9578144073486328, "memory(GiB)": 91.52, "step": 23185, "token_acc": 0.7579644187008688, "train_speed(iter/s)": 0.210696 }, { "epoch": 0.3009060268791218, "grad_norm": 0.8686038851737976, "learning_rate": 9.726655514387617e-05, "loss": 0.9663526535034179, "memory(GiB)": 91.52, "step": 23190, "token_acc": 0.7473145189234974, "train_speed(iter/s)": 0.210664 }, { "epoch": 0.3009709052807775, "grad_norm": 0.8001667857170105, "learning_rate": 9.726480568733664e-05, "loss": 0.9632026672363281, "memory(GiB)": 91.52, "step": 23195, "token_acc": 0.7379466119096509, "train_speed(iter/s)": 0.210635 }, { "epoch": 0.3010357836824332, "grad_norm": 0.8242963552474976, "learning_rate": 9.726305568687679e-05, "loss": 0.9607961654663086, "memory(GiB)": 91.52, "step": 23200, "token_acc": 0.7371333333333333, "train_speed(iter/s)": 0.210599 }, { "epoch": 0.3011006620840889, "grad_norm": 0.811130702495575, "learning_rate": 9.726130514251673e-05, "loss": 1.013311004638672, "memory(GiB)": 91.52, "step": 23205, "token_acc": 0.759027266028003, "train_speed(iter/s)": 0.210565 }, { "epoch": 0.3011655404857446, "grad_norm": 0.7410358190536499, "learning_rate": 9.725955405427659e-05, "loss": 0.9522405624389648, "memory(GiB)": 91.52, "step": 23210, "token_acc": 0.7486329904960292, "train_speed(iter/s)": 0.21053 }, { "epoch": 0.30123041888740026, "grad_norm": 0.8573095798492432, "learning_rate": 9.725780242217656e-05, "loss": 0.960759162902832, "memory(GiB)": 91.52, "step": 23215, "token_acc": 0.7445141725982395, "train_speed(iter/s)": 0.210495 }, { "epoch": 0.30129529728905596, "grad_norm": 0.8981263041496277, "learning_rate": 9.725605024623677e-05, "loss": 0.9856636047363281, "memory(GiB)": 91.52, "step": 23220, "token_acc": 0.7394409055868868, "train_speed(iter/s)": 0.210464 }, { "epoch": 0.30136017569071166, "grad_norm": 0.7878942489624023, "learning_rate": 9.725429752647741e-05, "loss": 0.9538589477539062, "memory(GiB)": 91.52, "step": 23225, "token_acc": 0.737022679677405, "train_speed(iter/s)": 0.210432 }, { "epoch": 0.30142505409236736, "grad_norm": 0.7385645508766174, "learning_rate": 9.725254426291864e-05, "loss": 0.943882942199707, "memory(GiB)": 91.52, "step": 23230, "token_acc": 0.736016800840042, "train_speed(iter/s)": 0.210399 }, { "epoch": 0.30148993249402306, "grad_norm": 0.8664681911468506, "learning_rate": 9.725079045558061e-05, "loss": 0.9504215240478515, "memory(GiB)": 91.52, "step": 23235, "token_acc": 0.7394432734937132, "train_speed(iter/s)": 0.210366 }, { "epoch": 0.30155481089567876, "grad_norm": 0.7619251608848572, "learning_rate": 9.724903610448353e-05, "loss": 0.975886344909668, "memory(GiB)": 91.52, "step": 23240, "token_acc": 0.722138283329736, "train_speed(iter/s)": 0.210332 }, { "epoch": 0.30161968929733446, "grad_norm": 0.7527866959571838, "learning_rate": 9.72472812096476e-05, "loss": 0.9090178489685059, "memory(GiB)": 91.52, "step": 23245, "token_acc": 0.7554653341661461, "train_speed(iter/s)": 0.210294 }, { "epoch": 0.30168456769899016, "grad_norm": 0.7576496601104736, "learning_rate": 9.724552577109297e-05, "loss": 0.9514869689941406, "memory(GiB)": 91.52, "step": 23250, "token_acc": 0.7533338087427797, "train_speed(iter/s)": 0.210262 }, { "epoch": 0.30174944610064586, "grad_norm": 0.8200225830078125, "learning_rate": 9.72437697888399e-05, "loss": 0.987999153137207, "memory(GiB)": 91.52, "step": 23255, "token_acc": 0.741750732183534, "train_speed(iter/s)": 0.21023 }, { "epoch": 0.30181432450230156, "grad_norm": 0.8426377773284912, "learning_rate": 9.724201326290854e-05, "loss": 0.9791064262390137, "memory(GiB)": 91.52, "step": 23260, "token_acc": 0.7458214365917039, "train_speed(iter/s)": 0.210196 }, { "epoch": 0.30187920290395726, "grad_norm": 0.9119437336921692, "learning_rate": 9.724025619331916e-05, "loss": 0.9568349838256835, "memory(GiB)": 91.52, "step": 23265, "token_acc": 0.7411745566943224, "train_speed(iter/s)": 0.210165 }, { "epoch": 0.30194408130561295, "grad_norm": 0.819904088973999, "learning_rate": 9.723849858009193e-05, "loss": 0.980406379699707, "memory(GiB)": 91.52, "step": 23270, "token_acc": 0.7265908785147316, "train_speed(iter/s)": 0.210132 }, { "epoch": 0.30200895970726865, "grad_norm": 0.792681872844696, "learning_rate": 9.72367404232471e-05, "loss": 0.9681156158447266, "memory(GiB)": 91.52, "step": 23275, "token_acc": 0.7412159578535922, "train_speed(iter/s)": 0.2101 }, { "epoch": 0.30207383810892435, "grad_norm": 0.8281283974647522, "learning_rate": 9.723498172280492e-05, "loss": 0.9817686080932617, "memory(GiB)": 91.52, "step": 23280, "token_acc": 0.7399462859566047, "train_speed(iter/s)": 0.210068 }, { "epoch": 0.30213871651058005, "grad_norm": 0.7817476391792297, "learning_rate": 9.723322247878559e-05, "loss": 0.9872019767761231, "memory(GiB)": 91.52, "step": 23285, "token_acc": 0.7338358261688847, "train_speed(iter/s)": 0.210032 }, { "epoch": 0.30220359491223575, "grad_norm": 0.7709742188453674, "learning_rate": 9.723146269120938e-05, "loss": 0.9591838836669921, "memory(GiB)": 91.52, "step": 23290, "token_acc": 0.7680897020519662, "train_speed(iter/s)": 0.209999 }, { "epoch": 0.30226847331389145, "grad_norm": 0.9715495705604553, "learning_rate": 9.722970236009654e-05, "loss": 0.966066551208496, "memory(GiB)": 91.52, "step": 23295, "token_acc": 0.7391575446861577, "train_speed(iter/s)": 0.209963 }, { "epoch": 0.30233335171554715, "grad_norm": 0.8567602038383484, "learning_rate": 9.722794148546733e-05, "loss": 0.9887104988098144, "memory(GiB)": 91.52, "step": 23300, "token_acc": 0.7403545359749739, "train_speed(iter/s)": 0.209929 }, { "epoch": 0.30239823011720285, "grad_norm": 0.9114549160003662, "learning_rate": 9.722618006734201e-05, "loss": 0.975316047668457, "memory(GiB)": 91.52, "step": 23305, "token_acc": 0.7406168565983431, "train_speed(iter/s)": 0.2099 }, { "epoch": 0.30246310851885855, "grad_norm": 0.8389105200767517, "learning_rate": 9.722441810574084e-05, "loss": 1.0061162948608398, "memory(GiB)": 91.52, "step": 23310, "token_acc": 0.7202204040741359, "train_speed(iter/s)": 0.20987 }, { "epoch": 0.30252798692051425, "grad_norm": 0.7460615038871765, "learning_rate": 9.72226556006841e-05, "loss": 0.9351237297058106, "memory(GiB)": 91.52, "step": 23315, "token_acc": 0.7762860687812172, "train_speed(iter/s)": 0.20984 }, { "epoch": 0.30259286532216995, "grad_norm": 0.9013120532035828, "learning_rate": 9.72208925521921e-05, "loss": 0.9810935974121093, "memory(GiB)": 91.52, "step": 23320, "token_acc": 0.7489892572484694, "train_speed(iter/s)": 0.209805 }, { "epoch": 0.30265774372382565, "grad_norm": 0.747162401676178, "learning_rate": 9.72191289602851e-05, "loss": 0.921291446685791, "memory(GiB)": 91.52, "step": 23325, "token_acc": 0.7574993363419167, "train_speed(iter/s)": 0.209771 }, { "epoch": 0.30272262212548134, "grad_norm": 0.8073141574859619, "learning_rate": 9.72173648249834e-05, "loss": 0.9758679389953613, "memory(GiB)": 91.52, "step": 23330, "token_acc": 0.7368649399106252, "train_speed(iter/s)": 0.209739 }, { "epoch": 0.302787500527137, "grad_norm": 0.9491227865219116, "learning_rate": 9.721560014630731e-05, "loss": 0.9454194068908691, "memory(GiB)": 91.52, "step": 23335, "token_acc": 0.7586184403639803, "train_speed(iter/s)": 0.209707 }, { "epoch": 0.3028523789287927, "grad_norm": 0.7993320822715759, "learning_rate": 9.721383492427711e-05, "loss": 0.9543560028076172, "memory(GiB)": 91.52, "step": 23340, "token_acc": 0.7364169778251924, "train_speed(iter/s)": 0.209676 }, { "epoch": 0.3029172573304484, "grad_norm": 0.722276508808136, "learning_rate": 9.721206915891316e-05, "loss": 0.9769651412963867, "memory(GiB)": 91.52, "step": 23345, "token_acc": 0.7494914603722894, "train_speed(iter/s)": 0.209643 }, { "epoch": 0.3029821357321041, "grad_norm": 0.9121442437171936, "learning_rate": 9.721030285023575e-05, "loss": 0.9478347778320313, "memory(GiB)": 91.52, "step": 23350, "token_acc": 0.7298926767676768, "train_speed(iter/s)": 0.209608 }, { "epoch": 0.3030470141337598, "grad_norm": 0.7982528805732727, "learning_rate": 9.72085359982652e-05, "loss": 0.9527305603027344, "memory(GiB)": 91.52, "step": 23355, "token_acc": 0.7544510385756676, "train_speed(iter/s)": 0.209576 }, { "epoch": 0.3031118925354155, "grad_norm": 0.6982985138893127, "learning_rate": 9.720676860302188e-05, "loss": 0.9894152641296386, "memory(GiB)": 91.52, "step": 23360, "token_acc": 0.7170879541745763, "train_speed(iter/s)": 0.209542 }, { "epoch": 0.3031767709370712, "grad_norm": 0.7398912310600281, "learning_rate": 9.72050006645261e-05, "loss": 0.9243969917297363, "memory(GiB)": 91.52, "step": 23365, "token_acc": 0.7390119381213773, "train_speed(iter/s)": 0.20951 }, { "epoch": 0.3032416493387269, "grad_norm": 0.8265808820724487, "learning_rate": 9.72032321827982e-05, "loss": 0.9803247451782227, "memory(GiB)": 91.52, "step": 23370, "token_acc": 0.7520769567118496, "train_speed(iter/s)": 0.209477 }, { "epoch": 0.3033065277403826, "grad_norm": 0.8261830806732178, "learning_rate": 9.720146315785854e-05, "loss": 0.965755844116211, "memory(GiB)": 91.52, "step": 23375, "token_acc": 0.7500490164041566, "train_speed(iter/s)": 0.209443 }, { "epoch": 0.3033714061420383, "grad_norm": 0.8985952734947205, "learning_rate": 9.71996935897275e-05, "loss": 1.0123379707336426, "memory(GiB)": 91.52, "step": 23380, "token_acc": 0.7285896156625262, "train_speed(iter/s)": 0.209411 }, { "epoch": 0.303436284543694, "grad_norm": 0.9018717408180237, "learning_rate": 9.71979234784254e-05, "loss": 0.975128173828125, "memory(GiB)": 91.52, "step": 23385, "token_acc": 0.7353667664670659, "train_speed(iter/s)": 0.209381 }, { "epoch": 0.3035011629453497, "grad_norm": 0.918644905090332, "learning_rate": 9.719615282397264e-05, "loss": 0.9961194038391114, "memory(GiB)": 91.52, "step": 23390, "token_acc": 0.7463864618182999, "train_speed(iter/s)": 0.209348 }, { "epoch": 0.3035660413470054, "grad_norm": 1.130361795425415, "learning_rate": 9.719438162638959e-05, "loss": 0.9285065650939941, "memory(GiB)": 91.52, "step": 23395, "token_acc": 0.7369676765695983, "train_speed(iter/s)": 0.209318 }, { "epoch": 0.3036309197486611, "grad_norm": 0.7764386534690857, "learning_rate": 9.719260988569665e-05, "loss": 1.0270099639892578, "memory(GiB)": 91.52, "step": 23400, "token_acc": 0.7305889759373251, "train_speed(iter/s)": 0.209285 }, { "epoch": 0.3036957981503168, "grad_norm": 0.9239597916603088, "learning_rate": 9.719083760191417e-05, "loss": 0.9506369590759277, "memory(GiB)": 91.52, "step": 23405, "token_acc": 0.755576772849579, "train_speed(iter/s)": 0.209253 }, { "epoch": 0.3037606765519725, "grad_norm": 0.7499092221260071, "learning_rate": 9.718906477506259e-05, "loss": 0.9602323532104492, "memory(GiB)": 91.52, "step": 23410, "token_acc": 0.7155164069305973, "train_speed(iter/s)": 0.209223 }, { "epoch": 0.3038255549536282, "grad_norm": 0.9391205906867981, "learning_rate": 9.718729140516228e-05, "loss": 0.9423129081726074, "memory(GiB)": 91.52, "step": 23415, "token_acc": 0.7613233262387513, "train_speed(iter/s)": 0.209189 }, { "epoch": 0.3038904333552839, "grad_norm": 0.7977522015571594, "learning_rate": 9.718551749223366e-05, "loss": 0.9597214698791504, "memory(GiB)": 91.52, "step": 23420, "token_acc": 0.754761402768993, "train_speed(iter/s)": 0.209154 }, { "epoch": 0.3039553117569396, "grad_norm": 0.7702010869979858, "learning_rate": 9.718374303629713e-05, "loss": 0.988039207458496, "memory(GiB)": 91.52, "step": 23425, "token_acc": 0.7298380937834122, "train_speed(iter/s)": 0.209125 }, { "epoch": 0.30402019015859527, "grad_norm": 0.8848450779914856, "learning_rate": 9.718196803737312e-05, "loss": 0.9545924186706543, "memory(GiB)": 91.52, "step": 23430, "token_acc": 0.7365014862475072, "train_speed(iter/s)": 0.209093 }, { "epoch": 0.30408506856025097, "grad_norm": 0.825915515422821, "learning_rate": 9.718019249548207e-05, "loss": 0.9750959396362304, "memory(GiB)": 91.52, "step": 23435, "token_acc": 0.731609554161054, "train_speed(iter/s)": 0.209063 }, { "epoch": 0.30414994696190667, "grad_norm": 0.7654256224632263, "learning_rate": 9.717841641064441e-05, "loss": 0.964167308807373, "memory(GiB)": 91.52, "step": 23440, "token_acc": 0.7379256664720762, "train_speed(iter/s)": 0.209032 }, { "epoch": 0.30421482536356237, "grad_norm": 0.7777856588363647, "learning_rate": 9.717663978288056e-05, "loss": 0.9646549224853516, "memory(GiB)": 91.52, "step": 23445, "token_acc": 0.7530900314488408, "train_speed(iter/s)": 0.209002 }, { "epoch": 0.304279703765218, "grad_norm": 0.6709178686141968, "learning_rate": 9.717486261221097e-05, "loss": 0.9826086044311524, "memory(GiB)": 91.52, "step": 23450, "token_acc": 0.7323789641117407, "train_speed(iter/s)": 0.208972 }, { "epoch": 0.3043445821668737, "grad_norm": 0.7251468300819397, "learning_rate": 9.717308489865611e-05, "loss": 0.9266515731811523, "memory(GiB)": 91.52, "step": 23455, "token_acc": 0.7510617185949593, "train_speed(iter/s)": 0.20894 }, { "epoch": 0.3044094605685294, "grad_norm": 0.776201069355011, "learning_rate": 9.717130664223642e-05, "loss": 0.9635016441345214, "memory(GiB)": 91.52, "step": 23460, "token_acc": 0.7292475565671442, "train_speed(iter/s)": 0.208905 }, { "epoch": 0.3044743389701851, "grad_norm": 0.9228861927986145, "learning_rate": 9.716952784297235e-05, "loss": 0.9583370208740234, "memory(GiB)": 91.52, "step": 23465, "token_acc": 0.7445198907668462, "train_speed(iter/s)": 0.208871 }, { "epoch": 0.3045392173718408, "grad_norm": 0.7897010445594788, "learning_rate": 9.716774850088441e-05, "loss": 0.929173469543457, "memory(GiB)": 91.52, "step": 23470, "token_acc": 0.7446159638554217, "train_speed(iter/s)": 0.208842 }, { "epoch": 0.3046040957734965, "grad_norm": 0.8615785837173462, "learning_rate": 9.716596861599306e-05, "loss": 0.9897237777709961, "memory(GiB)": 91.52, "step": 23475, "token_acc": 0.7276291429193742, "train_speed(iter/s)": 0.208808 }, { "epoch": 0.3046689741751522, "grad_norm": 0.8333358764648438, "learning_rate": 9.716418818831876e-05, "loss": 1.0164466857910157, "memory(GiB)": 91.52, "step": 23480, "token_acc": 0.7506820566631689, "train_speed(iter/s)": 0.208775 }, { "epoch": 0.3047338525768079, "grad_norm": 0.7453902959823608, "learning_rate": 9.716240721788203e-05, "loss": 0.968167781829834, "memory(GiB)": 91.52, "step": 23485, "token_acc": 0.7478172893168958, "train_speed(iter/s)": 0.208745 }, { "epoch": 0.3047987309784636, "grad_norm": 0.7900436520576477, "learning_rate": 9.716062570470335e-05, "loss": 0.9494176864624023, "memory(GiB)": 91.52, "step": 23490, "token_acc": 0.7674379098291034, "train_speed(iter/s)": 0.208715 }, { "epoch": 0.3048636093801193, "grad_norm": 0.7457712888717651, "learning_rate": 9.715884364880324e-05, "loss": 0.9870138168334961, "memory(GiB)": 91.52, "step": 23495, "token_acc": 0.7446568492941859, "train_speed(iter/s)": 0.208684 }, { "epoch": 0.304928487781775, "grad_norm": 0.7524645924568176, "learning_rate": 9.715706105020217e-05, "loss": 0.9532528877258301, "memory(GiB)": 91.52, "step": 23500, "token_acc": 0.7232504224134544, "train_speed(iter/s)": 0.208653 }, { "epoch": 0.3049933661834307, "grad_norm": 0.8227887153625488, "learning_rate": 9.715527790892069e-05, "loss": 0.9798107147216797, "memory(GiB)": 91.52, "step": 23505, "token_acc": 0.7323171166037467, "train_speed(iter/s)": 0.208622 }, { "epoch": 0.3050582445850864, "grad_norm": 0.8447132110595703, "learning_rate": 9.71534942249793e-05, "loss": 0.9540661811828614, "memory(GiB)": 91.52, "step": 23510, "token_acc": 0.7608005388242086, "train_speed(iter/s)": 0.208589 }, { "epoch": 0.3051231229867421, "grad_norm": 0.8346050381660461, "learning_rate": 9.715170999839853e-05, "loss": 0.9685181617736817, "memory(GiB)": 91.52, "step": 23515, "token_acc": 0.7413793103448276, "train_speed(iter/s)": 0.208558 }, { "epoch": 0.3051880013883978, "grad_norm": 0.8251469731330872, "learning_rate": 9.714992522919892e-05, "loss": 0.9540866851806641, "memory(GiB)": 91.52, "step": 23520, "token_acc": 0.7408811557187326, "train_speed(iter/s)": 0.208521 }, { "epoch": 0.3052528797900535, "grad_norm": 0.784329891204834, "learning_rate": 9.714813991740101e-05, "loss": 0.9778289794921875, "memory(GiB)": 91.52, "step": 23525, "token_acc": 0.7562640013785973, "train_speed(iter/s)": 0.208485 }, { "epoch": 0.3053177581917092, "grad_norm": 0.811507523059845, "learning_rate": 9.714635406302534e-05, "loss": 0.9843647956848145, "memory(GiB)": 91.52, "step": 23530, "token_acc": 0.7218950647771741, "train_speed(iter/s)": 0.208453 }, { "epoch": 0.3053826365933649, "grad_norm": 0.8585150837898254, "learning_rate": 9.714456766609245e-05, "loss": 0.9196295738220215, "memory(GiB)": 91.52, "step": 23535, "token_acc": 0.75718838002075, "train_speed(iter/s)": 0.208421 }, { "epoch": 0.3054475149950206, "grad_norm": 0.8045168519020081, "learning_rate": 9.714278072662293e-05, "loss": 0.9231925964355469, "memory(GiB)": 91.52, "step": 23540, "token_acc": 0.7466091651303661, "train_speed(iter/s)": 0.208387 }, { "epoch": 0.3055123933966763, "grad_norm": 0.8299906849861145, "learning_rate": 9.71409932446373e-05, "loss": 0.9838041305541992, "memory(GiB)": 91.52, "step": 23545, "token_acc": 0.7282389242314579, "train_speed(iter/s)": 0.208357 }, { "epoch": 0.305577271798332, "grad_norm": 0.7397000789642334, "learning_rate": 9.713920522015617e-05, "loss": 0.9345165252685547, "memory(GiB)": 91.52, "step": 23550, "token_acc": 0.730123858944737, "train_speed(iter/s)": 0.208319 }, { "epoch": 0.3056421501999877, "grad_norm": 0.8285713791847229, "learning_rate": 9.713741665320009e-05, "loss": 0.9728466033935547, "memory(GiB)": 91.52, "step": 23555, "token_acc": 0.7340536772394562, "train_speed(iter/s)": 0.208291 }, { "epoch": 0.3057070286016434, "grad_norm": 0.9340431094169617, "learning_rate": 9.713562754378966e-05, "loss": 0.9721416473388672, "memory(GiB)": 91.52, "step": 23560, "token_acc": 0.7352344334080957, "train_speed(iter/s)": 0.208263 }, { "epoch": 0.3057719070032991, "grad_norm": 0.8589568138122559, "learning_rate": 9.713383789194546e-05, "loss": 0.9241482734680175, "memory(GiB)": 91.52, "step": 23565, "token_acc": 0.7682858319913112, "train_speed(iter/s)": 0.208233 }, { "epoch": 0.30583678540495474, "grad_norm": 0.7982679009437561, "learning_rate": 9.713204769768809e-05, "loss": 0.9708732604980469, "memory(GiB)": 91.52, "step": 23570, "token_acc": 0.7512687094494764, "train_speed(iter/s)": 0.208203 }, { "epoch": 0.30590166380661044, "grad_norm": 0.7926076054573059, "learning_rate": 9.713025696103814e-05, "loss": 0.9120923042297363, "memory(GiB)": 91.52, "step": 23575, "token_acc": 0.7606360483802758, "train_speed(iter/s)": 0.208175 }, { "epoch": 0.30596654220826613, "grad_norm": 0.8266451954841614, "learning_rate": 9.712846568201623e-05, "loss": 0.959352970123291, "memory(GiB)": 91.52, "step": 23580, "token_acc": 0.7415422347255284, "train_speed(iter/s)": 0.208146 }, { "epoch": 0.30603142060992183, "grad_norm": 0.8925368189811707, "learning_rate": 9.712667386064298e-05, "loss": 0.9810032844543457, "memory(GiB)": 91.52, "step": 23585, "token_acc": 0.7477480460988211, "train_speed(iter/s)": 0.208117 }, { "epoch": 0.30609629901157753, "grad_norm": 0.72893226146698, "learning_rate": 9.712488149693899e-05, "loss": 0.9336724281311035, "memory(GiB)": 91.52, "step": 23590, "token_acc": 0.7493721183041572, "train_speed(iter/s)": 0.208087 }, { "epoch": 0.30616117741323323, "grad_norm": 0.7661433219909668, "learning_rate": 9.712308859092489e-05, "loss": 0.9722753524780273, "memory(GiB)": 91.52, "step": 23595, "token_acc": 0.7376931504779176, "train_speed(iter/s)": 0.208057 }, { "epoch": 0.30622605581488893, "grad_norm": 0.8832687139511108, "learning_rate": 9.712129514262134e-05, "loss": 0.9512702941894531, "memory(GiB)": 91.52, "step": 23600, "token_acc": 0.7680709683947785, "train_speed(iter/s)": 0.208023 }, { "epoch": 0.30629093421654463, "grad_norm": 0.801197350025177, "learning_rate": 9.711950115204894e-05, "loss": 0.9316116333007812, "memory(GiB)": 91.52, "step": 23605, "token_acc": 0.7661461901842703, "train_speed(iter/s)": 0.207992 }, { "epoch": 0.30635581261820033, "grad_norm": 0.8068462014198303, "learning_rate": 9.711770661922837e-05, "loss": 0.9448387145996093, "memory(GiB)": 91.52, "step": 23610, "token_acc": 0.7613001010441226, "train_speed(iter/s)": 0.20796 }, { "epoch": 0.30642069101985603, "grad_norm": 0.8358851075172424, "learning_rate": 9.711591154418025e-05, "loss": 0.971809196472168, "memory(GiB)": 91.52, "step": 23615, "token_acc": 0.7471821379854979, "train_speed(iter/s)": 0.207932 }, { "epoch": 0.3064855694215117, "grad_norm": 0.7782471179962158, "learning_rate": 9.711411592692525e-05, "loss": 0.9739277839660645, "memory(GiB)": 91.52, "step": 23620, "token_acc": 0.7444011339475549, "train_speed(iter/s)": 0.207899 }, { "epoch": 0.3065504478231674, "grad_norm": 0.8006710410118103, "learning_rate": 9.711231976748405e-05, "loss": 0.9288595199584961, "memory(GiB)": 91.52, "step": 23625, "token_acc": 0.7379729098552078, "train_speed(iter/s)": 0.207866 }, { "epoch": 0.3066153262248231, "grad_norm": 0.9634734988212585, "learning_rate": 9.711052306587728e-05, "loss": 0.9925265312194824, "memory(GiB)": 91.52, "step": 23630, "token_acc": 0.7517399525284628, "train_speed(iter/s)": 0.207835 }, { "epoch": 0.3066802046264788, "grad_norm": 0.8024837970733643, "learning_rate": 9.710872582212566e-05, "loss": 0.9868622779846191, "memory(GiB)": 91.52, "step": 23635, "token_acc": 0.7620459347100116, "train_speed(iter/s)": 0.207804 }, { "epoch": 0.3067450830281345, "grad_norm": 0.8757681250572205, "learning_rate": 9.710692803624987e-05, "loss": 0.9603732109069825, "memory(GiB)": 91.52, "step": 23640, "token_acc": 0.7514827560957751, "train_speed(iter/s)": 0.207771 }, { "epoch": 0.3068099614297902, "grad_norm": 0.7780416011810303, "learning_rate": 9.710512970827057e-05, "loss": 0.9320760726928711, "memory(GiB)": 91.52, "step": 23645, "token_acc": 0.7542247612959929, "train_speed(iter/s)": 0.20774 }, { "epoch": 0.3068748398314459, "grad_norm": 0.8444448113441467, "learning_rate": 9.710333083820847e-05, "loss": 0.9231999397277832, "memory(GiB)": 91.52, "step": 23650, "token_acc": 0.7618562460972162, "train_speed(iter/s)": 0.207706 }, { "epoch": 0.3069397182331016, "grad_norm": 0.757046639919281, "learning_rate": 9.710153142608427e-05, "loss": 0.9465600967407226, "memory(GiB)": 91.52, "step": 23655, "token_acc": 0.7553896915142391, "train_speed(iter/s)": 0.207673 }, { "epoch": 0.3070045966347573, "grad_norm": 0.8572490811347961, "learning_rate": 9.709973147191868e-05, "loss": 0.9543664932250977, "memory(GiB)": 91.52, "step": 23660, "token_acc": 0.7605518817862683, "train_speed(iter/s)": 0.207641 }, { "epoch": 0.307069475036413, "grad_norm": 0.8690835237503052, "learning_rate": 9.709793097573241e-05, "loss": 0.9639226913452148, "memory(GiB)": 91.52, "step": 23665, "token_acc": 0.7517280695503457, "train_speed(iter/s)": 0.20761 }, { "epoch": 0.3071343534380687, "grad_norm": 0.8036525845527649, "learning_rate": 9.709612993754618e-05, "loss": 0.9555124282836914, "memory(GiB)": 91.52, "step": 23670, "token_acc": 0.731678606262402, "train_speed(iter/s)": 0.207579 }, { "epoch": 0.3071992318397244, "grad_norm": 1.0137662887573242, "learning_rate": 9.709432835738072e-05, "loss": 0.9079198837280273, "memory(GiB)": 91.52, "step": 23675, "token_acc": 0.7497281178740571, "train_speed(iter/s)": 0.207549 }, { "epoch": 0.3072641102413801, "grad_norm": 1.0096018314361572, "learning_rate": 9.709252623525676e-05, "loss": 1.013978862762451, "memory(GiB)": 91.52, "step": 23680, "token_acc": 0.7177129819145723, "train_speed(iter/s)": 0.20752 }, { "epoch": 0.3073289886430358, "grad_norm": 0.8528391122817993, "learning_rate": 9.709072357119502e-05, "loss": 0.9384541511535645, "memory(GiB)": 91.52, "step": 23685, "token_acc": 0.7713250009440018, "train_speed(iter/s)": 0.207491 }, { "epoch": 0.30739386704469146, "grad_norm": 0.774128794670105, "learning_rate": 9.708892036521628e-05, "loss": 0.9179856300354003, "memory(GiB)": 91.52, "step": 23690, "token_acc": 0.7529560522178935, "train_speed(iter/s)": 0.207461 }, { "epoch": 0.30745874544634716, "grad_norm": 0.9608277082443237, "learning_rate": 9.708711661734129e-05, "loss": 0.9860341072082519, "memory(GiB)": 91.52, "step": 23695, "token_acc": 0.7459707230519, "train_speed(iter/s)": 0.207433 }, { "epoch": 0.30752362384800286, "grad_norm": 0.7529504299163818, "learning_rate": 9.708531232759077e-05, "loss": 0.9170396804809571, "memory(GiB)": 91.52, "step": 23700, "token_acc": 0.7496384784658913, "train_speed(iter/s)": 0.207402 }, { "epoch": 0.30758850224965856, "grad_norm": 0.8314902782440186, "learning_rate": 9.708350749598552e-05, "loss": 0.9635326385498046, "memory(GiB)": 91.52, "step": 23705, "token_acc": 0.7211013264964887, "train_speed(iter/s)": 0.207368 }, { "epoch": 0.30765338065131426, "grad_norm": 0.8655456900596619, "learning_rate": 9.708170212254629e-05, "loss": 0.910562801361084, "memory(GiB)": 91.52, "step": 23710, "token_acc": 0.7460311822992506, "train_speed(iter/s)": 0.207336 }, { "epoch": 0.30771825905296996, "grad_norm": 0.7820054888725281, "learning_rate": 9.707989620729387e-05, "loss": 0.9547372817993164, "memory(GiB)": 91.52, "step": 23715, "token_acc": 0.7364285470260667, "train_speed(iter/s)": 0.207303 }, { "epoch": 0.30778313745462565, "grad_norm": 0.829888105392456, "learning_rate": 9.707808975024902e-05, "loss": 0.9612070083618164, "memory(GiB)": 91.52, "step": 23720, "token_acc": 0.722958147094677, "train_speed(iter/s)": 0.207274 }, { "epoch": 0.30784801585628135, "grad_norm": 0.8510831594467163, "learning_rate": 9.707628275143254e-05, "loss": 1.021530532836914, "memory(GiB)": 91.52, "step": 23725, "token_acc": 0.7384199584199584, "train_speed(iter/s)": 0.207248 }, { "epoch": 0.30791289425793705, "grad_norm": 0.8610604405403137, "learning_rate": 9.707447521086526e-05, "loss": 0.9778693199157715, "memory(GiB)": 91.52, "step": 23730, "token_acc": 0.7579765971364198, "train_speed(iter/s)": 0.207216 }, { "epoch": 0.30797777265959275, "grad_norm": 0.7426246404647827, "learning_rate": 9.707266712856792e-05, "loss": 0.9780624389648438, "memory(GiB)": 91.52, "step": 23735, "token_acc": 0.75501622011294, "train_speed(iter/s)": 0.207185 }, { "epoch": 0.30804265106124845, "grad_norm": 0.7593613266944885, "learning_rate": 9.707085850456136e-05, "loss": 1.002646255493164, "memory(GiB)": 91.52, "step": 23740, "token_acc": 0.7512415659143062, "train_speed(iter/s)": 0.207156 }, { "epoch": 0.30810752946290415, "grad_norm": 0.8845605254173279, "learning_rate": 9.70690493388664e-05, "loss": 0.9930041313171387, "memory(GiB)": 91.52, "step": 23745, "token_acc": 0.7298664434659194, "train_speed(iter/s)": 0.207125 }, { "epoch": 0.30817240786455985, "grad_norm": 0.7363818287849426, "learning_rate": 9.706723963150383e-05, "loss": 0.9673439025878906, "memory(GiB)": 91.52, "step": 23750, "token_acc": 0.7331859924551309, "train_speed(iter/s)": 0.207094 }, { "epoch": 0.30823728626621555, "grad_norm": 0.9189772009849548, "learning_rate": 9.706542938249452e-05, "loss": 0.952426815032959, "memory(GiB)": 91.52, "step": 23755, "token_acc": 0.7473350815652352, "train_speed(iter/s)": 0.207064 }, { "epoch": 0.30830216466787125, "grad_norm": 0.7819016575813293, "learning_rate": 9.706361859185926e-05, "loss": 0.913817024230957, "memory(GiB)": 91.52, "step": 23760, "token_acc": 0.7503220177005858, "train_speed(iter/s)": 0.207038 }, { "epoch": 0.30836704306952695, "grad_norm": 0.7627517580986023, "learning_rate": 9.70618072596189e-05, "loss": 0.9287461280822754, "memory(GiB)": 91.52, "step": 23765, "token_acc": 0.7328707134676279, "train_speed(iter/s)": 0.207006 }, { "epoch": 0.30843192147118265, "grad_norm": 0.8194600939750671, "learning_rate": 9.705999538579432e-05, "loss": 0.9561979293823242, "memory(GiB)": 91.52, "step": 23770, "token_acc": 0.7315462733699357, "train_speed(iter/s)": 0.206977 }, { "epoch": 0.30849679987283835, "grad_norm": 0.8198071122169495, "learning_rate": 9.705818297040631e-05, "loss": 0.9825592041015625, "memory(GiB)": 91.52, "step": 23775, "token_acc": 0.7319216116964684, "train_speed(iter/s)": 0.206948 }, { "epoch": 0.30856167827449404, "grad_norm": 0.7906047701835632, "learning_rate": 9.705637001347578e-05, "loss": 0.9257602691650391, "memory(GiB)": 91.52, "step": 23780, "token_acc": 0.7496238338850436, "train_speed(iter/s)": 0.206921 }, { "epoch": 0.30862655667614974, "grad_norm": 0.7685394287109375, "learning_rate": 9.705455651502358e-05, "loss": 0.9385066986083984, "memory(GiB)": 91.52, "step": 23785, "token_acc": 0.7671549730146492, "train_speed(iter/s)": 0.206888 }, { "epoch": 0.30869143507780544, "grad_norm": 0.8631800413131714, "learning_rate": 9.705274247507055e-05, "loss": 1.002027702331543, "memory(GiB)": 91.52, "step": 23790, "token_acc": 0.7203770197486535, "train_speed(iter/s)": 0.206859 }, { "epoch": 0.30875631347946114, "grad_norm": 0.9422376155853271, "learning_rate": 9.70509278936376e-05, "loss": 0.9530475616455079, "memory(GiB)": 91.52, "step": 23795, "token_acc": 0.7490756716785802, "train_speed(iter/s)": 0.20683 }, { "epoch": 0.30882119188111684, "grad_norm": 0.9204165935516357, "learning_rate": 9.70491127707456e-05, "loss": 0.9328662872314453, "memory(GiB)": 91.52, "step": 23800, "token_acc": 0.7449647622795407, "train_speed(iter/s)": 0.2068 }, { "epoch": 0.30888607028277254, "grad_norm": 0.859462559223175, "learning_rate": 9.704729710641542e-05, "loss": 0.9860171318054199, "memory(GiB)": 91.52, "step": 23805, "token_acc": 0.7254829941967237, "train_speed(iter/s)": 0.206775 }, { "epoch": 0.3089509486844282, "grad_norm": 0.851243257522583, "learning_rate": 9.7045480900668e-05, "loss": 0.9772563934326172, "memory(GiB)": 91.52, "step": 23810, "token_acc": 0.7537039987255059, "train_speed(iter/s)": 0.206744 }, { "epoch": 0.3090158270860839, "grad_norm": 0.7808548212051392, "learning_rate": 9.70436641535242e-05, "loss": 0.9397651672363281, "memory(GiB)": 91.52, "step": 23815, "token_acc": 0.744136078782453, "train_speed(iter/s)": 0.206712 }, { "epoch": 0.3090807054877396, "grad_norm": 0.7614309191703796, "learning_rate": 9.704184686500494e-05, "loss": 0.9465002059936524, "memory(GiB)": 91.52, "step": 23820, "token_acc": 0.7396072881655757, "train_speed(iter/s)": 0.206684 }, { "epoch": 0.3091455838893953, "grad_norm": 0.8324408531188965, "learning_rate": 9.704002903513114e-05, "loss": 0.9484503746032715, "memory(GiB)": 91.52, "step": 23825, "token_acc": 0.7443291648585743, "train_speed(iter/s)": 0.206655 }, { "epoch": 0.309210462291051, "grad_norm": 0.7682905793190002, "learning_rate": 9.703821066392371e-05, "loss": 0.9663994789123536, "memory(GiB)": 91.52, "step": 23830, "token_acc": 0.7396506768582348, "train_speed(iter/s)": 0.206623 }, { "epoch": 0.3092753406927067, "grad_norm": 0.8169693946838379, "learning_rate": 9.703639175140357e-05, "loss": 0.951821231842041, "memory(GiB)": 91.52, "step": 23835, "token_acc": 0.7348324542841803, "train_speed(iter/s)": 0.206593 }, { "epoch": 0.3093402190943624, "grad_norm": 0.7247498035430908, "learning_rate": 9.703457229759167e-05, "loss": 0.968470573425293, "memory(GiB)": 91.52, "step": 23840, "token_acc": 0.7531287502142979, "train_speed(iter/s)": 0.206565 }, { "epoch": 0.3094050974960181, "grad_norm": 0.8377633690834045, "learning_rate": 9.703275230250895e-05, "loss": 0.9585165977478027, "memory(GiB)": 91.52, "step": 23845, "token_acc": 0.7374660223497432, "train_speed(iter/s)": 0.206535 }, { "epoch": 0.3094699758976738, "grad_norm": 0.7822192311286926, "learning_rate": 9.703093176617633e-05, "loss": 0.9391091346740723, "memory(GiB)": 91.52, "step": 23850, "token_acc": 0.7718192500801196, "train_speed(iter/s)": 0.206504 }, { "epoch": 0.3095348542993295, "grad_norm": 0.8935903906822205, "learning_rate": 9.702911068861478e-05, "loss": 0.9801076889038086, "memory(GiB)": 91.52, "step": 23855, "token_acc": 0.7598817080207733, "train_speed(iter/s)": 0.20647 }, { "epoch": 0.3095997327009852, "grad_norm": 0.847277820110321, "learning_rate": 9.702728906984524e-05, "loss": 0.9942638397216796, "memory(GiB)": 91.52, "step": 23860, "token_acc": 0.730300787295606, "train_speed(iter/s)": 0.206439 }, { "epoch": 0.3096646111026409, "grad_norm": 0.7931514978408813, "learning_rate": 9.70254669098887e-05, "loss": 0.9686379432678223, "memory(GiB)": 91.52, "step": 23865, "token_acc": 0.742753983238928, "train_speed(iter/s)": 0.206404 }, { "epoch": 0.3097294895042966, "grad_norm": 0.8322303295135498, "learning_rate": 9.70236442087661e-05, "loss": 0.9415926933288574, "memory(GiB)": 91.52, "step": 23870, "token_acc": 0.7528655297735533, "train_speed(iter/s)": 0.206376 }, { "epoch": 0.3097943679059523, "grad_norm": 0.8903275728225708, "learning_rate": 9.702182096649843e-05, "loss": 0.9690334320068359, "memory(GiB)": 91.52, "step": 23875, "token_acc": 0.7388809693513899, "train_speed(iter/s)": 0.206346 }, { "epoch": 0.30985924630760797, "grad_norm": 0.8822260499000549, "learning_rate": 9.701999718310666e-05, "loss": 0.9957776069641113, "memory(GiB)": 91.52, "step": 23880, "token_acc": 0.7484394976310446, "train_speed(iter/s)": 0.206317 }, { "epoch": 0.30992412470926367, "grad_norm": 0.728428304195404, "learning_rate": 9.701817285861181e-05, "loss": 0.9320911407470703, "memory(GiB)": 91.52, "step": 23885, "token_acc": 0.7538622333997452, "train_speed(iter/s)": 0.206287 }, { "epoch": 0.30998900311091937, "grad_norm": 0.8152517080307007, "learning_rate": 9.701634799303485e-05, "loss": 0.9732606887817383, "memory(GiB)": 91.52, "step": 23890, "token_acc": 0.7332736630118595, "train_speed(iter/s)": 0.20626 }, { "epoch": 0.31005388151257507, "grad_norm": 0.8174904584884644, "learning_rate": 9.701452258639678e-05, "loss": 1.0031919479370117, "memory(GiB)": 91.52, "step": 23895, "token_acc": 0.7292114242385948, "train_speed(iter/s)": 0.20623 }, { "epoch": 0.31011875991423077, "grad_norm": 0.7198225259780884, "learning_rate": 9.701269663871862e-05, "loss": 0.9706045150756836, "memory(GiB)": 91.52, "step": 23900, "token_acc": 0.7361192456366406, "train_speed(iter/s)": 0.206197 }, { "epoch": 0.31018363831588647, "grad_norm": 0.7758810520172119, "learning_rate": 9.701087015002135e-05, "loss": 0.9227646827697754, "memory(GiB)": 91.52, "step": 23905, "token_acc": 0.7635333203048661, "train_speed(iter/s)": 0.206166 }, { "epoch": 0.31024851671754217, "grad_norm": 0.958066463470459, "learning_rate": 9.700904312032601e-05, "loss": 0.9534929275512696, "memory(GiB)": 91.52, "step": 23910, "token_acc": 0.7485241600466438, "train_speed(iter/s)": 0.206138 }, { "epoch": 0.31031339511919787, "grad_norm": 0.7780290842056274, "learning_rate": 9.700721554965365e-05, "loss": 0.9899269104003906, "memory(GiB)": 91.52, "step": 23915, "token_acc": 0.7284950343773873, "train_speed(iter/s)": 0.206107 }, { "epoch": 0.31037827352085356, "grad_norm": 0.8290567994117737, "learning_rate": 9.700538743802528e-05, "loss": 0.9495754241943359, "memory(GiB)": 91.52, "step": 23920, "token_acc": 0.7499652101308099, "train_speed(iter/s)": 0.20608 }, { "epoch": 0.31044315192250926, "grad_norm": 0.8566648364067078, "learning_rate": 9.700355878546192e-05, "loss": 0.9309377670288086, "memory(GiB)": 91.52, "step": 23925, "token_acc": 0.741039339485187, "train_speed(iter/s)": 0.206051 }, { "epoch": 0.3105080303241649, "grad_norm": 0.911312460899353, "learning_rate": 9.700172959198463e-05, "loss": 1.0136948585510255, "memory(GiB)": 91.52, "step": 23930, "token_acc": 0.7263539232929934, "train_speed(iter/s)": 0.206022 }, { "epoch": 0.3105729087258206, "grad_norm": 0.8034507632255554, "learning_rate": 9.699989985761446e-05, "loss": 0.9171573638916015, "memory(GiB)": 91.52, "step": 23935, "token_acc": 0.7647917680595147, "train_speed(iter/s)": 0.205994 }, { "epoch": 0.3106377871274763, "grad_norm": 0.8510453104972839, "learning_rate": 9.699806958237249e-05, "loss": 0.9759517669677734, "memory(GiB)": 91.52, "step": 23940, "token_acc": 0.7446331304069208, "train_speed(iter/s)": 0.205963 }, { "epoch": 0.310702665529132, "grad_norm": 0.7647672891616821, "learning_rate": 9.699623876627974e-05, "loss": 0.9511814117431641, "memory(GiB)": 91.52, "step": 23945, "token_acc": 0.7506322225423349, "train_speed(iter/s)": 0.205935 }, { "epoch": 0.3107675439307877, "grad_norm": 0.7665718793869019, "learning_rate": 9.699440740935731e-05, "loss": 0.9753751754760742, "memory(GiB)": 91.52, "step": 23950, "token_acc": 0.7328854244806106, "train_speed(iter/s)": 0.205904 }, { "epoch": 0.3108324223324434, "grad_norm": 0.8726953864097595, "learning_rate": 9.699257551162625e-05, "loss": 0.9896463394165039, "memory(GiB)": 91.52, "step": 23955, "token_acc": 0.722961396857799, "train_speed(iter/s)": 0.205874 }, { "epoch": 0.3108973007340991, "grad_norm": 0.6853528618812561, "learning_rate": 9.699074307310768e-05, "loss": 0.9392516136169433, "memory(GiB)": 91.52, "step": 23960, "token_acc": 0.7707133003327112, "train_speed(iter/s)": 0.205844 }, { "epoch": 0.3109621791357548, "grad_norm": 0.7847853302955627, "learning_rate": 9.698891009382262e-05, "loss": 0.9404144287109375, "memory(GiB)": 91.52, "step": 23965, "token_acc": 0.7656648329531732, "train_speed(iter/s)": 0.205815 }, { "epoch": 0.3110270575374105, "grad_norm": 0.7757482528686523, "learning_rate": 9.698707657379224e-05, "loss": 0.9876983642578125, "memory(GiB)": 91.52, "step": 23970, "token_acc": 0.7482083487255264, "train_speed(iter/s)": 0.205789 }, { "epoch": 0.3110919359390662, "grad_norm": 0.7814602255821228, "learning_rate": 9.69852425130376e-05, "loss": 0.9974325180053711, "memory(GiB)": 91.52, "step": 23975, "token_acc": 0.7467945059544192, "train_speed(iter/s)": 0.20576 }, { "epoch": 0.3111568143407219, "grad_norm": 0.8080165386199951, "learning_rate": 9.698340791157982e-05, "loss": 0.9587543487548829, "memory(GiB)": 91.52, "step": 23980, "token_acc": 0.755322358936534, "train_speed(iter/s)": 0.205733 }, { "epoch": 0.3112216927423776, "grad_norm": 0.8678333163261414, "learning_rate": 9.698157276944e-05, "loss": 1.0360956192016602, "memory(GiB)": 91.52, "step": 23985, "token_acc": 0.7348539705527396, "train_speed(iter/s)": 0.205704 }, { "epoch": 0.3112865711440333, "grad_norm": 0.8840807676315308, "learning_rate": 9.697973708663925e-05, "loss": 0.9416976928710937, "memory(GiB)": 91.52, "step": 23990, "token_acc": 0.7433651605142471, "train_speed(iter/s)": 0.205673 }, { "epoch": 0.311351449545689, "grad_norm": 0.8486595153808594, "learning_rate": 9.697790086319874e-05, "loss": 0.9621174812316895, "memory(GiB)": 91.52, "step": 23995, "token_acc": 0.751124728513807, "train_speed(iter/s)": 0.205648 }, { "epoch": 0.3114163279473447, "grad_norm": 0.7985245585441589, "learning_rate": 9.697606409913954e-05, "loss": 1.036082649230957, "memory(GiB)": 91.52, "step": 24000, "token_acc": 0.7493748867548469, "train_speed(iter/s)": 0.205621 }, { "epoch": 0.3114812063490004, "grad_norm": 0.7507560849189758, "learning_rate": 9.697422679448282e-05, "loss": 0.9479855537414551, "memory(GiB)": 91.52, "step": 24005, "token_acc": 0.7429241442204345, "train_speed(iter/s)": 0.205589 }, { "epoch": 0.3115460847506561, "grad_norm": 0.7732161283493042, "learning_rate": 9.697238894924974e-05, "loss": 0.952604866027832, "memory(GiB)": 91.52, "step": 24010, "token_acc": 0.7545389703021658, "train_speed(iter/s)": 0.205558 }, { "epoch": 0.3116109631523118, "grad_norm": 0.8581015467643738, "learning_rate": 9.697055056346143e-05, "loss": 0.9637198448181152, "memory(GiB)": 91.52, "step": 24015, "token_acc": 0.733559806901484, "train_speed(iter/s)": 0.205528 }, { "epoch": 0.3116758415539675, "grad_norm": 0.7947702407836914, "learning_rate": 9.696871163713903e-05, "loss": 0.9542680740356445, "memory(GiB)": 91.52, "step": 24020, "token_acc": 0.747130846200569, "train_speed(iter/s)": 0.205498 }, { "epoch": 0.3117407199556232, "grad_norm": 0.778858482837677, "learning_rate": 9.696687217030373e-05, "loss": 0.894718074798584, "memory(GiB)": 91.52, "step": 24025, "token_acc": 0.753958082026538, "train_speed(iter/s)": 0.20547 }, { "epoch": 0.3118055983572789, "grad_norm": 0.7993784546852112, "learning_rate": 9.696503216297669e-05, "loss": 0.9811070442199707, "memory(GiB)": 91.52, "step": 24030, "token_acc": 0.7322407796195908, "train_speed(iter/s)": 0.205441 }, { "epoch": 0.3118704767589346, "grad_norm": 0.8396738171577454, "learning_rate": 9.696319161517909e-05, "loss": 0.9675531387329102, "memory(GiB)": 91.52, "step": 24035, "token_acc": 0.7541265822784811, "train_speed(iter/s)": 0.205412 }, { "epoch": 0.3119353551605903, "grad_norm": 0.8035851716995239, "learning_rate": 9.696135052693209e-05, "loss": 0.9825681686401367, "memory(GiB)": 91.52, "step": 24040, "token_acc": 0.7354633323105247, "train_speed(iter/s)": 0.205385 }, { "epoch": 0.312000233562246, "grad_norm": 0.9041162729263306, "learning_rate": 9.695950889825689e-05, "loss": 0.9705188751220704, "memory(GiB)": 91.52, "step": 24045, "token_acc": 0.7411007914423042, "train_speed(iter/s)": 0.205357 }, { "epoch": 0.31206511196390163, "grad_norm": 0.8396075367927551, "learning_rate": 9.695766672917468e-05, "loss": 0.9392017364501953, "memory(GiB)": 91.52, "step": 24050, "token_acc": 0.7261234424383333, "train_speed(iter/s)": 0.205324 }, { "epoch": 0.31212999036555733, "grad_norm": 0.8248077034950256, "learning_rate": 9.695582401970666e-05, "loss": 0.9397212028503418, "memory(GiB)": 91.52, "step": 24055, "token_acc": 0.7686888029772013, "train_speed(iter/s)": 0.205295 }, { "epoch": 0.31219486876721303, "grad_norm": 0.8511793613433838, "learning_rate": 9.695398076987405e-05, "loss": 0.9955093383789062, "memory(GiB)": 91.52, "step": 24060, "token_acc": 0.7405089695452649, "train_speed(iter/s)": 0.205268 }, { "epoch": 0.31225974716886873, "grad_norm": 0.9031813144683838, "learning_rate": 9.695213697969805e-05, "loss": 0.9997931480407715, "memory(GiB)": 91.52, "step": 24065, "token_acc": 0.7531168171337007, "train_speed(iter/s)": 0.205239 }, { "epoch": 0.31232462557052443, "grad_norm": 0.8521786332130432, "learning_rate": 9.695029264919989e-05, "loss": 0.9709190368652344, "memory(GiB)": 91.52, "step": 24070, "token_acc": 0.7542761452892628, "train_speed(iter/s)": 0.205211 }, { "epoch": 0.3123895039721801, "grad_norm": 0.7074304819107056, "learning_rate": 9.694844777840078e-05, "loss": 0.9662748336791992, "memory(GiB)": 91.52, "step": 24075, "token_acc": 0.7544431811381724, "train_speed(iter/s)": 0.205181 }, { "epoch": 0.3124543823738358, "grad_norm": 0.7454667687416077, "learning_rate": 9.694660236732193e-05, "loss": 0.9643716812133789, "memory(GiB)": 91.52, "step": 24080, "token_acc": 0.7468117029257314, "train_speed(iter/s)": 0.205151 }, { "epoch": 0.3125192607754915, "grad_norm": 0.8160332441329956, "learning_rate": 9.694475641598464e-05, "loss": 0.9820039749145508, "memory(GiB)": 91.52, "step": 24085, "token_acc": 0.7504478541942519, "train_speed(iter/s)": 0.205124 }, { "epoch": 0.3125841391771472, "grad_norm": 0.8512982130050659, "learning_rate": 9.694290992441009e-05, "loss": 0.9651872634887695, "memory(GiB)": 91.52, "step": 24090, "token_acc": 0.7470734562481709, "train_speed(iter/s)": 0.205097 }, { "epoch": 0.3126490175788029, "grad_norm": 0.748232901096344, "learning_rate": 9.694106289261956e-05, "loss": 0.9497271537780761, "memory(GiB)": 91.52, "step": 24095, "token_acc": 0.7443105327264569, "train_speed(iter/s)": 0.205068 }, { "epoch": 0.3127138959804586, "grad_norm": 0.8284322619438171, "learning_rate": 9.69392153206343e-05, "loss": 0.9259363174438476, "memory(GiB)": 91.52, "step": 24100, "token_acc": 0.7373440696983697, "train_speed(iter/s)": 0.205039 }, { "epoch": 0.3127787743821143, "grad_norm": 0.8310114741325378, "learning_rate": 9.693736720847557e-05, "loss": 0.9446971893310547, "memory(GiB)": 91.52, "step": 24105, "token_acc": 0.7759839893262175, "train_speed(iter/s)": 0.205015 }, { "epoch": 0.31284365278377, "grad_norm": 0.852583110332489, "learning_rate": 9.693551855616463e-05, "loss": 0.9775672912597656, "memory(GiB)": 91.52, "step": 24110, "token_acc": 0.7284993735342308, "train_speed(iter/s)": 0.204985 }, { "epoch": 0.3129085311854257, "grad_norm": 0.8835745453834534, "learning_rate": 9.693366936372276e-05, "loss": 0.9848126411437989, "memory(GiB)": 91.52, "step": 24115, "token_acc": 0.7399889642125177, "train_speed(iter/s)": 0.204961 }, { "epoch": 0.3129734095870814, "grad_norm": 0.7635022401809692, "learning_rate": 9.693181963117127e-05, "loss": 0.9343812942504883, "memory(GiB)": 91.52, "step": 24120, "token_acc": 0.7471504393978944, "train_speed(iter/s)": 0.204936 }, { "epoch": 0.3130382879887371, "grad_norm": 0.8000413179397583, "learning_rate": 9.69299693585314e-05, "loss": 0.9936605453491211, "memory(GiB)": 91.52, "step": 24125, "token_acc": 0.732546502398454, "train_speed(iter/s)": 0.204908 }, { "epoch": 0.3131031663903928, "grad_norm": 0.9046226143836975, "learning_rate": 9.692811854582446e-05, "loss": 0.9688000679016113, "memory(GiB)": 91.52, "step": 24130, "token_acc": 0.7395241806777271, "train_speed(iter/s)": 0.204876 }, { "epoch": 0.3131680447920485, "grad_norm": 0.81219482421875, "learning_rate": 9.692626719307177e-05, "loss": 0.9522791862487793, "memory(GiB)": 91.52, "step": 24135, "token_acc": 0.7479310344827587, "train_speed(iter/s)": 0.204848 }, { "epoch": 0.3132329231937042, "grad_norm": 0.8486524820327759, "learning_rate": 9.692441530029461e-05, "loss": 0.9633472442626954, "memory(GiB)": 91.52, "step": 24140, "token_acc": 0.731275942970583, "train_speed(iter/s)": 0.204819 }, { "epoch": 0.3132978015953599, "grad_norm": 0.8700555562973022, "learning_rate": 9.69225628675143e-05, "loss": 0.9420134544372558, "memory(GiB)": 91.52, "step": 24145, "token_acc": 0.7506398662253012, "train_speed(iter/s)": 0.20479 }, { "epoch": 0.3133626799970156, "grad_norm": 0.7911447882652283, "learning_rate": 9.692070989475215e-05, "loss": 1.0096479415893556, "memory(GiB)": 91.52, "step": 24150, "token_acc": 0.7330879775674728, "train_speed(iter/s)": 0.204763 }, { "epoch": 0.3134275583986713, "grad_norm": 0.8318676948547363, "learning_rate": 9.691885638202948e-05, "loss": 0.9760992050170898, "memory(GiB)": 91.52, "step": 24155, "token_acc": 0.7495982694684796, "train_speed(iter/s)": 0.204736 }, { "epoch": 0.313492436800327, "grad_norm": 0.828407347202301, "learning_rate": 9.691700232936766e-05, "loss": 0.9746641159057617, "memory(GiB)": 91.52, "step": 24160, "token_acc": 0.7200423376427005, "train_speed(iter/s)": 0.204708 }, { "epoch": 0.3135573152019827, "grad_norm": 0.8242773413658142, "learning_rate": 9.691514773678798e-05, "loss": 0.9208789825439453, "memory(GiB)": 91.52, "step": 24165, "token_acc": 0.7577186794023225, "train_speed(iter/s)": 0.204681 }, { "epoch": 0.31362219360363836, "grad_norm": 0.8296617865562439, "learning_rate": 9.691329260431179e-05, "loss": 0.9773204803466797, "memory(GiB)": 91.52, "step": 24170, "token_acc": 0.7442318878045628, "train_speed(iter/s)": 0.204651 }, { "epoch": 0.31368707200529405, "grad_norm": 0.6710308194160461, "learning_rate": 9.691143693196047e-05, "loss": 0.9321788787841797, "memory(GiB)": 91.52, "step": 24175, "token_acc": 0.7669728011303426, "train_speed(iter/s)": 0.204622 }, { "epoch": 0.31375195040694975, "grad_norm": 0.7472512125968933, "learning_rate": 9.690958071975533e-05, "loss": 1.0092548370361327, "memory(GiB)": 91.52, "step": 24180, "token_acc": 0.7351870783070946, "train_speed(iter/s)": 0.204592 }, { "epoch": 0.31381682880860545, "grad_norm": 0.7430112361907959, "learning_rate": 9.690772396771776e-05, "loss": 0.8932841300964356, "memory(GiB)": 91.52, "step": 24185, "token_acc": 0.7535397168226542, "train_speed(iter/s)": 0.204564 }, { "epoch": 0.31388170721026115, "grad_norm": 0.7200297117233276, "learning_rate": 9.690586667586914e-05, "loss": 0.9392762184143066, "memory(GiB)": 91.52, "step": 24190, "token_acc": 0.7471428571428571, "train_speed(iter/s)": 0.204533 }, { "epoch": 0.31394658561191685, "grad_norm": 0.8095786571502686, "learning_rate": 9.690400884423081e-05, "loss": 1.0026844024658204, "memory(GiB)": 91.52, "step": 24195, "token_acc": 0.7249168714760734, "train_speed(iter/s)": 0.204504 }, { "epoch": 0.31401146401357255, "grad_norm": 0.6884185671806335, "learning_rate": 9.690215047282416e-05, "loss": 0.9496096611022949, "memory(GiB)": 91.52, "step": 24200, "token_acc": 0.7458040891058896, "train_speed(iter/s)": 0.204475 }, { "epoch": 0.31407634241522825, "grad_norm": 0.8328260183334351, "learning_rate": 9.690029156167058e-05, "loss": 1.007779026031494, "memory(GiB)": 91.52, "step": 24205, "token_acc": 0.7272521843928894, "train_speed(iter/s)": 0.204445 }, { "epoch": 0.31414122081688395, "grad_norm": 0.8022722601890564, "learning_rate": 9.689843211079148e-05, "loss": 0.9096578598022461, "memory(GiB)": 91.52, "step": 24210, "token_acc": 0.7533656957928803, "train_speed(iter/s)": 0.204415 }, { "epoch": 0.31420609921853965, "grad_norm": 0.8222896456718445, "learning_rate": 9.689657212020823e-05, "loss": 1.0130792617797852, "memory(GiB)": 91.52, "step": 24215, "token_acc": 0.7284791544333529, "train_speed(iter/s)": 0.204388 }, { "epoch": 0.31427097762019535, "grad_norm": 0.8415164351463318, "learning_rate": 9.689471158994224e-05, "loss": 0.9428159713745117, "memory(GiB)": 91.52, "step": 24220, "token_acc": 0.744310149844501, "train_speed(iter/s)": 0.204359 }, { "epoch": 0.31433585602185105, "grad_norm": 0.7324197292327881, "learning_rate": 9.689285052001494e-05, "loss": 0.9539170265197754, "memory(GiB)": 91.52, "step": 24225, "token_acc": 0.7392998657366473, "train_speed(iter/s)": 0.204329 }, { "epoch": 0.31440073442350674, "grad_norm": 0.8102641105651855, "learning_rate": 9.689098891044773e-05, "loss": 0.9674654006958008, "memory(GiB)": 91.52, "step": 24230, "token_acc": 0.7385202606315302, "train_speed(iter/s)": 0.2043 }, { "epoch": 0.31446561282516244, "grad_norm": 0.8494684100151062, "learning_rate": 9.688912676126204e-05, "loss": 0.9867901802062988, "memory(GiB)": 91.52, "step": 24235, "token_acc": 0.7381541066892464, "train_speed(iter/s)": 0.204269 }, { "epoch": 0.31453049122681814, "grad_norm": 0.7673850655555725, "learning_rate": 9.688726407247929e-05, "loss": 0.9226781845092773, "memory(GiB)": 91.52, "step": 24240, "token_acc": 0.754824740326984, "train_speed(iter/s)": 0.204239 }, { "epoch": 0.31459536962847384, "grad_norm": 0.7302546501159668, "learning_rate": 9.688540084412092e-05, "loss": 1.0296024322509765, "memory(GiB)": 91.52, "step": 24245, "token_acc": 0.7254536204849924, "train_speed(iter/s)": 0.204212 }, { "epoch": 0.31466024803012954, "grad_norm": 0.7430723309516907, "learning_rate": 9.688353707620839e-05, "loss": 0.9758020401000976, "memory(GiB)": 91.52, "step": 24250, "token_acc": 0.7364460978974209, "train_speed(iter/s)": 0.204186 }, { "epoch": 0.31472512643178524, "grad_norm": 0.8528281450271606, "learning_rate": 9.688167276876313e-05, "loss": 0.9847065925598144, "memory(GiB)": 91.52, "step": 24255, "token_acc": 0.7391030221512337, "train_speed(iter/s)": 0.204159 }, { "epoch": 0.31479000483344094, "grad_norm": 0.8877397179603577, "learning_rate": 9.687980792180658e-05, "loss": 0.998499870300293, "memory(GiB)": 91.52, "step": 24260, "token_acc": 0.7320300751879699, "train_speed(iter/s)": 0.204129 }, { "epoch": 0.31485488323509664, "grad_norm": 0.7908616662025452, "learning_rate": 9.687794253536024e-05, "loss": 0.9225336074829101, "memory(GiB)": 91.52, "step": 24265, "token_acc": 0.7584532242751771, "train_speed(iter/s)": 0.204102 }, { "epoch": 0.31491976163675234, "grad_norm": 0.7995375990867615, "learning_rate": 9.687607660944554e-05, "loss": 0.9487100601196289, "memory(GiB)": 91.52, "step": 24270, "token_acc": 0.7220968050506983, "train_speed(iter/s)": 0.204075 }, { "epoch": 0.31498464003840804, "grad_norm": 0.7607510685920715, "learning_rate": 9.687421014408396e-05, "loss": 0.9554931640625, "memory(GiB)": 91.52, "step": 24275, "token_acc": 0.7415294276457883, "train_speed(iter/s)": 0.204046 }, { "epoch": 0.31504951844006374, "grad_norm": 0.8510801196098328, "learning_rate": 9.687234313929701e-05, "loss": 0.924609375, "memory(GiB)": 91.52, "step": 24280, "token_acc": 0.7624116292000953, "train_speed(iter/s)": 0.204021 }, { "epoch": 0.3151143968417194, "grad_norm": 0.7268886566162109, "learning_rate": 9.687047559510613e-05, "loss": 0.956977367401123, "memory(GiB)": 91.52, "step": 24285, "token_acc": 0.7513120850856766, "train_speed(iter/s)": 0.203994 }, { "epoch": 0.3151792752433751, "grad_norm": 0.7896832227706909, "learning_rate": 9.686860751153284e-05, "loss": 0.9250061988830567, "memory(GiB)": 91.52, "step": 24290, "token_acc": 0.7574549112596105, "train_speed(iter/s)": 0.203968 }, { "epoch": 0.3152441536450308, "grad_norm": 0.8236477375030518, "learning_rate": 9.686673888859864e-05, "loss": 0.9895030975341796, "memory(GiB)": 91.52, "step": 24295, "token_acc": 0.7526576653563494, "train_speed(iter/s)": 0.203942 }, { "epoch": 0.3153090320466865, "grad_norm": 0.8119866251945496, "learning_rate": 9.686486972632502e-05, "loss": 1.0018793106079102, "memory(GiB)": 91.52, "step": 24300, "token_acc": 0.7351467670892035, "train_speed(iter/s)": 0.203917 }, { "epoch": 0.3153739104483422, "grad_norm": 0.7072678208351135, "learning_rate": 9.68630000247335e-05, "loss": 0.950859260559082, "memory(GiB)": 91.52, "step": 24305, "token_acc": 0.7626537753887226, "train_speed(iter/s)": 0.203889 }, { "epoch": 0.3154387888499979, "grad_norm": 0.866725504398346, "learning_rate": 9.686112978384558e-05, "loss": 0.9933620452880859, "memory(GiB)": 91.52, "step": 24310, "token_acc": 0.7343225511138236, "train_speed(iter/s)": 0.203863 }, { "epoch": 0.3155036672516536, "grad_norm": 0.9523745179176331, "learning_rate": 9.68592590036828e-05, "loss": 0.9959915161132813, "memory(GiB)": 91.52, "step": 24315, "token_acc": 0.7297510147210274, "train_speed(iter/s)": 0.203834 }, { "epoch": 0.3155685456533093, "grad_norm": 0.745993971824646, "learning_rate": 9.685738768426669e-05, "loss": 0.9771503448486328, "memory(GiB)": 91.52, "step": 24320, "token_acc": 0.7226132134566491, "train_speed(iter/s)": 0.203808 }, { "epoch": 0.315633424054965, "grad_norm": 0.7319842576980591, "learning_rate": 9.685551582561876e-05, "loss": 0.9431485176086426, "memory(GiB)": 91.52, "step": 24325, "token_acc": 0.741374014538753, "train_speed(iter/s)": 0.20378 }, { "epoch": 0.31569830245662067, "grad_norm": 0.7692922949790955, "learning_rate": 9.685364342776058e-05, "loss": 0.942568588256836, "memory(GiB)": 91.52, "step": 24330, "token_acc": 0.739292997960571, "train_speed(iter/s)": 0.203749 }, { "epoch": 0.31576318085827637, "grad_norm": 0.9562738537788391, "learning_rate": 9.68517704907137e-05, "loss": 0.9140712738037109, "memory(GiB)": 91.52, "step": 24335, "token_acc": 0.7440919599280761, "train_speed(iter/s)": 0.203719 }, { "epoch": 0.31582805925993207, "grad_norm": 0.8211258053779602, "learning_rate": 9.684989701449963e-05, "loss": 0.9231910705566406, "memory(GiB)": 91.52, "step": 24340, "token_acc": 0.7268958956238109, "train_speed(iter/s)": 0.203693 }, { "epoch": 0.31589293766158777, "grad_norm": 0.8781353831291199, "learning_rate": 9.684802299913999e-05, "loss": 0.9118831634521485, "memory(GiB)": 91.52, "step": 24345, "token_acc": 0.7553747821034282, "train_speed(iter/s)": 0.203667 }, { "epoch": 0.31595781606324347, "grad_norm": 0.9261753559112549, "learning_rate": 9.684614844465631e-05, "loss": 0.9456196784973144, "memory(GiB)": 91.52, "step": 24350, "token_acc": 0.7495079816313143, "train_speed(iter/s)": 0.203641 }, { "epoch": 0.31602269446489917, "grad_norm": 0.8781790733337402, "learning_rate": 9.684427335107016e-05, "loss": 0.9660089492797852, "memory(GiB)": 91.52, "step": 24355, "token_acc": 0.7368999870617157, "train_speed(iter/s)": 0.203611 }, { "epoch": 0.31608757286655487, "grad_norm": 0.7936322689056396, "learning_rate": 9.684239771840314e-05, "loss": 0.9754812240600585, "memory(GiB)": 91.52, "step": 24360, "token_acc": 0.7463422762872854, "train_speed(iter/s)": 0.203581 }, { "epoch": 0.31615245126821057, "grad_norm": 0.82807856798172, "learning_rate": 9.68405215466768e-05, "loss": 0.9957136154174805, "memory(GiB)": 91.52, "step": 24365, "token_acc": 0.7370188847379785, "train_speed(iter/s)": 0.203554 }, { "epoch": 0.31621732966986627, "grad_norm": 0.8251848816871643, "learning_rate": 9.683864483591276e-05, "loss": 1.0429065704345704, "memory(GiB)": 91.52, "step": 24370, "token_acc": 0.7113963828435798, "train_speed(iter/s)": 0.203529 }, { "epoch": 0.31628220807152196, "grad_norm": 0.8852742314338684, "learning_rate": 9.683676758613263e-05, "loss": 1.0149824142456054, "memory(GiB)": 91.52, "step": 24375, "token_acc": 0.7605440208348405, "train_speed(iter/s)": 0.203502 }, { "epoch": 0.31634708647317766, "grad_norm": 0.8124488592147827, "learning_rate": 9.683488979735798e-05, "loss": 0.9853408813476563, "memory(GiB)": 91.52, "step": 24380, "token_acc": 0.7209384998222538, "train_speed(iter/s)": 0.203474 }, { "epoch": 0.31641196487483336, "grad_norm": 0.8644793629646301, "learning_rate": 9.683301146961042e-05, "loss": 0.9726459503173828, "memory(GiB)": 91.52, "step": 24385, "token_acc": 0.7304660402345967, "train_speed(iter/s)": 0.203446 }, { "epoch": 0.31647684327648906, "grad_norm": 0.8139976859092712, "learning_rate": 9.683113260291158e-05, "loss": 1.0234867095947267, "memory(GiB)": 91.52, "step": 24390, "token_acc": 0.7328840136171383, "train_speed(iter/s)": 0.203415 }, { "epoch": 0.31654172167814476, "grad_norm": 0.8838977217674255, "learning_rate": 9.68292531972831e-05, "loss": 0.9331104278564453, "memory(GiB)": 91.52, "step": 24395, "token_acc": 0.7478781988986071, "train_speed(iter/s)": 0.203388 }, { "epoch": 0.31660660007980046, "grad_norm": 0.7836819291114807, "learning_rate": 9.682737325274658e-05, "loss": 0.9661739349365235, "memory(GiB)": 91.52, "step": 24400, "token_acc": 0.7403274084475814, "train_speed(iter/s)": 0.203362 }, { "epoch": 0.3166714784814561, "grad_norm": 0.8711850047111511, "learning_rate": 9.682549276932366e-05, "loss": 0.9284831047058105, "memory(GiB)": 91.52, "step": 24405, "token_acc": 0.7756457813448439, "train_speed(iter/s)": 0.203327 }, { "epoch": 0.3167363568831118, "grad_norm": 0.8442559838294983, "learning_rate": 9.682361174703598e-05, "loss": 0.9574794769287109, "memory(GiB)": 91.52, "step": 24410, "token_acc": 0.7521604743581657, "train_speed(iter/s)": 0.2033 }, { "epoch": 0.3168012352847675, "grad_norm": 0.7480787634849548, "learning_rate": 9.682173018590519e-05, "loss": 0.9791942596435547, "memory(GiB)": 91.52, "step": 24415, "token_acc": 0.722012926416203, "train_speed(iter/s)": 0.203274 }, { "epoch": 0.3168661136864232, "grad_norm": 0.840904712677002, "learning_rate": 9.681984808595294e-05, "loss": 0.9665184020996094, "memory(GiB)": 91.52, "step": 24420, "token_acc": 0.7432051770079939, "train_speed(iter/s)": 0.203249 }, { "epoch": 0.3169309920880789, "grad_norm": 0.735558032989502, "learning_rate": 9.68179654472009e-05, "loss": 0.9651675224304199, "memory(GiB)": 91.52, "step": 24425, "token_acc": 0.7572492303286318, "train_speed(iter/s)": 0.203224 }, { "epoch": 0.3169958704897346, "grad_norm": 0.8358290195465088, "learning_rate": 9.681608226967072e-05, "loss": 0.9579867362976074, "memory(GiB)": 91.52, "step": 24430, "token_acc": 0.7710531907358879, "train_speed(iter/s)": 0.203193 }, { "epoch": 0.3170607488913903, "grad_norm": 0.9840350151062012, "learning_rate": 9.681419855338409e-05, "loss": 0.9625758171081543, "memory(GiB)": 91.52, "step": 24435, "token_acc": 0.7519720576115537, "train_speed(iter/s)": 0.203166 }, { "epoch": 0.317125627293046, "grad_norm": 0.8311500549316406, "learning_rate": 9.681231429836267e-05, "loss": 0.9591984748840332, "memory(GiB)": 91.52, "step": 24440, "token_acc": 0.7366502501260325, "train_speed(iter/s)": 0.203142 }, { "epoch": 0.3171905056947017, "grad_norm": 0.9161118865013123, "learning_rate": 9.681042950462813e-05, "loss": 0.9738130569458008, "memory(GiB)": 91.52, "step": 24445, "token_acc": 0.7349981364144614, "train_speed(iter/s)": 0.203114 }, { "epoch": 0.3172553840963574, "grad_norm": 0.9193580746650696, "learning_rate": 9.68085441722022e-05, "loss": 0.995789909362793, "memory(GiB)": 91.52, "step": 24450, "token_acc": 0.7383877513373917, "train_speed(iter/s)": 0.203089 }, { "epoch": 0.3173202624980131, "grad_norm": 0.873167872428894, "learning_rate": 9.680665830110656e-05, "loss": 0.9964106559753418, "memory(GiB)": 91.52, "step": 24455, "token_acc": 0.7437767874882916, "train_speed(iter/s)": 0.203059 }, { "epoch": 0.3173851408996688, "grad_norm": 0.8130576014518738, "learning_rate": 9.68047718913629e-05, "loss": 0.9594551086425781, "memory(GiB)": 91.52, "step": 24460, "token_acc": 0.7527441173508778, "train_speed(iter/s)": 0.203033 }, { "epoch": 0.3174500193013245, "grad_norm": 0.8322123885154724, "learning_rate": 9.680288494299295e-05, "loss": 0.9769960403442383, "memory(GiB)": 91.52, "step": 24465, "token_acc": 0.7211328168439782, "train_speed(iter/s)": 0.203007 }, { "epoch": 0.3175148977029802, "grad_norm": 0.8304184079170227, "learning_rate": 9.680099745601838e-05, "loss": 0.9609914779663086, "memory(GiB)": 91.52, "step": 24470, "token_acc": 0.7612217226860772, "train_speed(iter/s)": 0.202976 }, { "epoch": 0.3175797761046359, "grad_norm": 0.7174819707870483, "learning_rate": 9.679910943046098e-05, "loss": 0.8831771850585938, "memory(GiB)": 91.52, "step": 24475, "token_acc": 0.7677461871587272, "train_speed(iter/s)": 0.202945 }, { "epoch": 0.3176446545062916, "grad_norm": 0.8269783854484558, "learning_rate": 9.679722086634242e-05, "loss": 0.9844657897949218, "memory(GiB)": 91.52, "step": 24480, "token_acc": 0.7526946787186566, "train_speed(iter/s)": 0.202919 }, { "epoch": 0.3177095329079473, "grad_norm": 0.8580841422080994, "learning_rate": 9.679533176368446e-05, "loss": 0.9924222946166992, "memory(GiB)": 91.52, "step": 24485, "token_acc": 0.7308061881425963, "train_speed(iter/s)": 0.202893 }, { "epoch": 0.317774411309603, "grad_norm": 0.8152750730514526, "learning_rate": 9.679344212250883e-05, "loss": 0.9461716651916504, "memory(GiB)": 91.52, "step": 24490, "token_acc": 0.7382526178010471, "train_speed(iter/s)": 0.202868 }, { "epoch": 0.3178392897112587, "grad_norm": 0.8175463676452637, "learning_rate": 9.679155194283729e-05, "loss": 0.98189697265625, "memory(GiB)": 91.52, "step": 24495, "token_acc": 0.7413521760077084, "train_speed(iter/s)": 0.202839 }, { "epoch": 0.3179041681129144, "grad_norm": 0.8078638911247253, "learning_rate": 9.678966122469158e-05, "loss": 0.9533044815063476, "memory(GiB)": 91.52, "step": 24500, "token_acc": 0.7651222405019704, "train_speed(iter/s)": 0.202813 }, { "epoch": 0.3179690465145701, "grad_norm": 0.8546619415283203, "learning_rate": 9.678776996809344e-05, "loss": 0.9577627182006836, "memory(GiB)": 91.52, "step": 24505, "token_acc": 0.7477797513321492, "train_speed(iter/s)": 0.202784 }, { "epoch": 0.3180339249162258, "grad_norm": 0.7789976596832275, "learning_rate": 9.678587817306468e-05, "loss": 0.9462429046630859, "memory(GiB)": 91.52, "step": 24510, "token_acc": 0.7636006187959262, "train_speed(iter/s)": 0.202756 }, { "epoch": 0.3180988033178815, "grad_norm": 0.7737367153167725, "learning_rate": 9.678398583962703e-05, "loss": 1.024977970123291, "memory(GiB)": 91.52, "step": 24515, "token_acc": 0.7266604817372779, "train_speed(iter/s)": 0.202727 }, { "epoch": 0.3181636817195372, "grad_norm": 0.7948070168495178, "learning_rate": 9.678209296780229e-05, "loss": 1.0084484100341797, "memory(GiB)": 91.52, "step": 24520, "token_acc": 0.7249082785426277, "train_speed(iter/s)": 0.202701 }, { "epoch": 0.3182285601211928, "grad_norm": 0.8137603402137756, "learning_rate": 9.678019955761223e-05, "loss": 0.975837516784668, "memory(GiB)": 91.52, "step": 24525, "token_acc": 0.7319678361403369, "train_speed(iter/s)": 0.202674 }, { "epoch": 0.3182934385228485, "grad_norm": 0.813693106174469, "learning_rate": 9.677830560907864e-05, "loss": 0.9636585235595703, "memory(GiB)": 91.52, "step": 24530, "token_acc": 0.7228076639646278, "train_speed(iter/s)": 0.202648 }, { "epoch": 0.3183583169245042, "grad_norm": 0.8563336133956909, "learning_rate": 9.677641112222334e-05, "loss": 0.9807395935058594, "memory(GiB)": 91.52, "step": 24535, "token_acc": 0.7248794640952341, "train_speed(iter/s)": 0.20262 }, { "epoch": 0.3184231953261599, "grad_norm": 0.8154063820838928, "learning_rate": 9.67745160970681e-05, "loss": 0.9851629257202148, "memory(GiB)": 91.52, "step": 24540, "token_acc": 0.7430967958396242, "train_speed(iter/s)": 0.202592 }, { "epoch": 0.3184880737278156, "grad_norm": 0.8329765200614929, "learning_rate": 9.677262053363472e-05, "loss": 1.0050107955932617, "memory(GiB)": 91.52, "step": 24545, "token_acc": 0.7346443575257134, "train_speed(iter/s)": 0.202566 }, { "epoch": 0.3185529521294713, "grad_norm": 0.8694037795066833, "learning_rate": 9.677072443194505e-05, "loss": 0.9332071304321289, "memory(GiB)": 91.52, "step": 24550, "token_acc": 0.7549736173539183, "train_speed(iter/s)": 0.202539 }, { "epoch": 0.318617830531127, "grad_norm": 0.8104178309440613, "learning_rate": 9.676882779202089e-05, "loss": 0.9462142944335937, "memory(GiB)": 91.52, "step": 24555, "token_acc": 0.7540789189966218, "train_speed(iter/s)": 0.20251 }, { "epoch": 0.3186827089327827, "grad_norm": 0.7376154065132141, "learning_rate": 9.676693061388406e-05, "loss": 0.9181730270385742, "memory(GiB)": 91.52, "step": 24560, "token_acc": 0.7567430541472318, "train_speed(iter/s)": 0.202481 }, { "epoch": 0.3187475873344384, "grad_norm": 0.7499054074287415, "learning_rate": 9.676503289755642e-05, "loss": 0.9292974472045898, "memory(GiB)": 91.52, "step": 24565, "token_acc": 0.746807128941482, "train_speed(iter/s)": 0.202452 }, { "epoch": 0.3188124657360941, "grad_norm": 0.829652726650238, "learning_rate": 9.67631346430598e-05, "loss": 0.9795698165893555, "memory(GiB)": 91.52, "step": 24570, "token_acc": 0.7464677910437549, "train_speed(iter/s)": 0.202426 }, { "epoch": 0.3188773441377498, "grad_norm": 0.8306915760040283, "learning_rate": 9.676123585041601e-05, "loss": 0.9363426208496094, "memory(GiB)": 91.52, "step": 24575, "token_acc": 0.7508282411838327, "train_speed(iter/s)": 0.202402 }, { "epoch": 0.3189422225394055, "grad_norm": 0.8362636566162109, "learning_rate": 9.675933651964695e-05, "loss": 0.9890119552612304, "memory(GiB)": 91.52, "step": 24580, "token_acc": 0.7565491362220335, "train_speed(iter/s)": 0.202375 }, { "epoch": 0.3190071009410612, "grad_norm": 0.8852688670158386, "learning_rate": 9.675743665077444e-05, "loss": 1.0161416053771972, "memory(GiB)": 91.52, "step": 24585, "token_acc": 0.7357520786092214, "train_speed(iter/s)": 0.202345 }, { "epoch": 0.3190719793427169, "grad_norm": 0.7956507802009583, "learning_rate": 9.675553624382036e-05, "loss": 0.9640334129333497, "memory(GiB)": 91.52, "step": 24590, "token_acc": 0.7487706750111757, "train_speed(iter/s)": 0.202319 }, { "epoch": 0.3191368577443726, "grad_norm": 0.8748773336410522, "learning_rate": 9.675363529880658e-05, "loss": 0.9372756958007813, "memory(GiB)": 91.52, "step": 24595, "token_acc": 0.7677053824362606, "train_speed(iter/s)": 0.202291 }, { "epoch": 0.3192017361460283, "grad_norm": 0.7399943470954895, "learning_rate": 9.675173381575497e-05, "loss": 0.9482410430908204, "memory(GiB)": 91.52, "step": 24600, "token_acc": 0.7511872579820268, "train_speed(iter/s)": 0.202264 }, { "epoch": 0.319266614547684, "grad_norm": 0.8345412015914917, "learning_rate": 9.674983179468741e-05, "loss": 0.9643815994262696, "memory(GiB)": 91.52, "step": 24605, "token_acc": 0.7147948843385282, "train_speed(iter/s)": 0.202238 }, { "epoch": 0.3193314929493397, "grad_norm": 0.7217229008674622, "learning_rate": 9.674792923562581e-05, "loss": 0.9325069427490235, "memory(GiB)": 91.52, "step": 24610, "token_acc": 0.7653364837665179, "train_speed(iter/s)": 0.202212 }, { "epoch": 0.3193963713509954, "grad_norm": 0.7993184924125671, "learning_rate": 9.674602613859205e-05, "loss": 0.9491767883300781, "memory(GiB)": 91.52, "step": 24615, "token_acc": 0.7484181982524857, "train_speed(iter/s)": 0.202185 }, { "epoch": 0.3194612497526511, "grad_norm": 0.7661146521568298, "learning_rate": 9.674412250360803e-05, "loss": 0.9915791511535644, "memory(GiB)": 91.52, "step": 24620, "token_acc": 0.7271926092505926, "train_speed(iter/s)": 0.20216 }, { "epoch": 0.3195261281543068, "grad_norm": 0.772305965423584, "learning_rate": 9.674221833069566e-05, "loss": 0.9456523895263672, "memory(GiB)": 91.52, "step": 24625, "token_acc": 0.7490727581545288, "train_speed(iter/s)": 0.202131 }, { "epoch": 0.3195910065559625, "grad_norm": 0.8234891295433044, "learning_rate": 9.674031361987684e-05, "loss": 0.9177295684814453, "memory(GiB)": 91.52, "step": 24630, "token_acc": 0.756730802208732, "train_speed(iter/s)": 0.202104 }, { "epoch": 0.3196558849576182, "grad_norm": 0.7598605155944824, "learning_rate": 9.673840837117349e-05, "loss": 0.9636235237121582, "memory(GiB)": 91.52, "step": 24635, "token_acc": 0.7308993006533802, "train_speed(iter/s)": 0.202074 }, { "epoch": 0.3197207633592739, "grad_norm": 0.8364713191986084, "learning_rate": 9.673650258460756e-05, "loss": 0.9799036026000977, "memory(GiB)": 91.52, "step": 24640, "token_acc": 0.72628338225818, "train_speed(iter/s)": 0.202045 }, { "epoch": 0.31978564176092955, "grad_norm": 0.7576773762702942, "learning_rate": 9.673459626020095e-05, "loss": 0.9396811485290527, "memory(GiB)": 91.52, "step": 24645, "token_acc": 0.7406012233328361, "train_speed(iter/s)": 0.202017 }, { "epoch": 0.31985052016258525, "grad_norm": 0.8342211842536926, "learning_rate": 9.673268939797562e-05, "loss": 0.9498773574829101, "memory(GiB)": 91.52, "step": 24650, "token_acc": 0.7452544881281771, "train_speed(iter/s)": 0.201989 }, { "epoch": 0.31991539856424095, "grad_norm": 0.8204934000968933, "learning_rate": 9.673078199795351e-05, "loss": 0.965362548828125, "memory(GiB)": 91.52, "step": 24655, "token_acc": 0.7519583224017091, "train_speed(iter/s)": 0.201963 }, { "epoch": 0.31998027696589665, "grad_norm": 0.8791012167930603, "learning_rate": 9.672887406015658e-05, "loss": 0.9932579040527344, "memory(GiB)": 91.52, "step": 24660, "token_acc": 0.724588679245283, "train_speed(iter/s)": 0.201938 }, { "epoch": 0.32004515536755235, "grad_norm": 0.7720385789871216, "learning_rate": 9.672696558460676e-05, "loss": 0.9594195365905762, "memory(GiB)": 91.52, "step": 24665, "token_acc": 0.7406634100220612, "train_speed(iter/s)": 0.201913 }, { "epoch": 0.32011003376920805, "grad_norm": 1.0532950162887573, "learning_rate": 9.672505657132604e-05, "loss": 0.9617527008056641, "memory(GiB)": 91.52, "step": 24670, "token_acc": 0.7642582921177326, "train_speed(iter/s)": 0.201886 }, { "epoch": 0.32017491217086375, "grad_norm": 0.8618873357772827, "learning_rate": 9.672314702033636e-05, "loss": 0.9446417808532714, "memory(GiB)": 91.52, "step": 24675, "token_acc": 0.7512405327761817, "train_speed(iter/s)": 0.201861 }, { "epoch": 0.32023979057251944, "grad_norm": 0.8044174909591675, "learning_rate": 9.672123693165971e-05, "loss": 1.0281580924987792, "memory(GiB)": 91.52, "step": 24680, "token_acc": 0.7359762486901851, "train_speed(iter/s)": 0.201835 }, { "epoch": 0.32030466897417514, "grad_norm": 0.8074529767036438, "learning_rate": 9.671932630531809e-05, "loss": 0.934505844116211, "memory(GiB)": 91.52, "step": 24685, "token_acc": 0.7538490902150401, "train_speed(iter/s)": 0.201807 }, { "epoch": 0.32036954737583084, "grad_norm": 0.8237971663475037, "learning_rate": 9.671741514133346e-05, "loss": 0.9215030670166016, "memory(GiB)": 91.52, "step": 24690, "token_acc": 0.7732991520410175, "train_speed(iter/s)": 0.20178 }, { "epoch": 0.32043442577748654, "grad_norm": 0.8853472471237183, "learning_rate": 9.671550343972784e-05, "loss": 0.9384156227111816, "memory(GiB)": 91.52, "step": 24695, "token_acc": 0.7375078829246577, "train_speed(iter/s)": 0.201753 }, { "epoch": 0.32049930417914224, "grad_norm": 0.8761937022209167, "learning_rate": 9.67135912005232e-05, "loss": 1.0041385650634767, "memory(GiB)": 91.52, "step": 24700, "token_acc": 0.7365853658536585, "train_speed(iter/s)": 0.201727 }, { "epoch": 0.32056418258079794, "grad_norm": 0.816615104675293, "learning_rate": 9.671167842374154e-05, "loss": 0.9298195838928223, "memory(GiB)": 91.52, "step": 24705, "token_acc": 0.7579351931614869, "train_speed(iter/s)": 0.201702 }, { "epoch": 0.32062906098245364, "grad_norm": 0.779143214225769, "learning_rate": 9.670976510940492e-05, "loss": 0.9387312889099121, "memory(GiB)": 91.52, "step": 24710, "token_acc": 0.7543751807925947, "train_speed(iter/s)": 0.201676 }, { "epoch": 0.32069393938410934, "grad_norm": 0.8008418679237366, "learning_rate": 9.670785125753532e-05, "loss": 0.9476779937744141, "memory(GiB)": 91.52, "step": 24715, "token_acc": 0.7551809263951147, "train_speed(iter/s)": 0.201647 }, { "epoch": 0.32075881778576504, "grad_norm": 0.9485731720924377, "learning_rate": 9.670593686815477e-05, "loss": 0.9173348426818848, "memory(GiB)": 91.52, "step": 24720, "token_acc": 0.7400109269714078, "train_speed(iter/s)": 0.201618 }, { "epoch": 0.32082369618742074, "grad_norm": 0.9010252356529236, "learning_rate": 9.67040219412853e-05, "loss": 1.006731128692627, "memory(GiB)": 91.52, "step": 24725, "token_acc": 0.7372360844529751, "train_speed(iter/s)": 0.201592 }, { "epoch": 0.32088857458907644, "grad_norm": 0.8478607535362244, "learning_rate": 9.670210647694895e-05, "loss": 0.9490633010864258, "memory(GiB)": 91.52, "step": 24730, "token_acc": 0.7367077698223283, "train_speed(iter/s)": 0.201566 }, { "epoch": 0.32095345299073214, "grad_norm": 0.8735868334770203, "learning_rate": 9.670019047516779e-05, "loss": 0.9349748611450195, "memory(GiB)": 91.52, "step": 24735, "token_acc": 0.7324372493687806, "train_speed(iter/s)": 0.201545 }, { "epoch": 0.32101833139238783, "grad_norm": 0.6889681816101074, "learning_rate": 9.669827393596381e-05, "loss": 0.89991455078125, "memory(GiB)": 91.52, "step": 24740, "token_acc": 0.7583448476474438, "train_speed(iter/s)": 0.201515 }, { "epoch": 0.32108320979404353, "grad_norm": 0.6652535200119019, "learning_rate": 9.669635685935912e-05, "loss": 0.9400872230529785, "memory(GiB)": 91.52, "step": 24745, "token_acc": 0.735932704331022, "train_speed(iter/s)": 0.201492 }, { "epoch": 0.32114808819569923, "grad_norm": 0.8150721788406372, "learning_rate": 9.669443924537576e-05, "loss": 0.9648160934448242, "memory(GiB)": 91.52, "step": 24750, "token_acc": 0.7366128718646704, "train_speed(iter/s)": 0.201468 }, { "epoch": 0.32121296659735493, "grad_norm": 0.7926726937294006, "learning_rate": 9.669252109403578e-05, "loss": 0.9408201217651367, "memory(GiB)": 91.52, "step": 24755, "token_acc": 0.7539390280712346, "train_speed(iter/s)": 0.201442 }, { "epoch": 0.32127784499901063, "grad_norm": 0.7488014698028564, "learning_rate": 9.66906024053613e-05, "loss": 0.9052616119384765, "memory(GiB)": 91.52, "step": 24760, "token_acc": 0.771349975365413, "train_speed(iter/s)": 0.201417 }, { "epoch": 0.3213427234006663, "grad_norm": 0.7641466856002808, "learning_rate": 9.668868317937435e-05, "loss": 0.9814586639404297, "memory(GiB)": 91.52, "step": 24765, "token_acc": 0.7597727676193857, "train_speed(iter/s)": 0.201391 }, { "epoch": 0.321407601802322, "grad_norm": 0.8238776326179504, "learning_rate": 9.668676341609705e-05, "loss": 1.0190093994140625, "memory(GiB)": 91.52, "step": 24770, "token_acc": 0.7455115848357713, "train_speed(iter/s)": 0.201364 }, { "epoch": 0.3214724802039777, "grad_norm": 0.7464550733566284, "learning_rate": 9.668484311555147e-05, "loss": 0.9293401718139649, "memory(GiB)": 91.52, "step": 24775, "token_acc": 0.7427055702917772, "train_speed(iter/s)": 0.201337 }, { "epoch": 0.32153735860563337, "grad_norm": 0.7474105358123779, "learning_rate": 9.668292227775973e-05, "loss": 0.9538436889648437, "memory(GiB)": 91.52, "step": 24780, "token_acc": 0.7706455653661667, "train_speed(iter/s)": 0.201312 }, { "epoch": 0.32160223700728907, "grad_norm": 0.7963415384292603, "learning_rate": 9.668100090274392e-05, "loss": 0.9657169342041015, "memory(GiB)": 91.52, "step": 24785, "token_acc": 0.7608898859599925, "train_speed(iter/s)": 0.201284 }, { "epoch": 0.32166711540894477, "grad_norm": 0.8173549771308899, "learning_rate": 9.667907899052616e-05, "loss": 0.9754177093505859, "memory(GiB)": 91.52, "step": 24790, "token_acc": 0.732542716704377, "train_speed(iter/s)": 0.201257 }, { "epoch": 0.32173199381060047, "grad_norm": 0.8394323587417603, "learning_rate": 9.667715654112857e-05, "loss": 0.9522571563720703, "memory(GiB)": 91.52, "step": 24795, "token_acc": 0.7557828006490321, "train_speed(iter/s)": 0.20123 }, { "epoch": 0.32179687221225617, "grad_norm": 0.8024957776069641, "learning_rate": 9.667523355457327e-05, "loss": 1.000351333618164, "memory(GiB)": 91.52, "step": 24800, "token_acc": 0.7416847372810675, "train_speed(iter/s)": 0.201204 }, { "epoch": 0.32186175061391187, "grad_norm": 0.775836706161499, "learning_rate": 9.667331003088236e-05, "loss": 0.934514331817627, "memory(GiB)": 91.52, "step": 24805, "token_acc": 0.7546789315407748, "train_speed(iter/s)": 0.201179 }, { "epoch": 0.32192662901556757, "grad_norm": 0.8436768054962158, "learning_rate": 9.667138597007801e-05, "loss": 0.9768821716308593, "memory(GiB)": 91.52, "step": 24810, "token_acc": 0.7641977283634619, "train_speed(iter/s)": 0.201153 }, { "epoch": 0.32199150741722327, "grad_norm": 0.731018602848053, "learning_rate": 9.666946137218237e-05, "loss": 0.9572758674621582, "memory(GiB)": 91.52, "step": 24815, "token_acc": 0.7466649645752218, "train_speed(iter/s)": 0.201125 }, { "epoch": 0.32205638581887897, "grad_norm": 0.7705028057098389, "learning_rate": 9.666753623721755e-05, "loss": 0.9534688949584961, "memory(GiB)": 91.52, "step": 24820, "token_acc": 0.7372622282608695, "train_speed(iter/s)": 0.2011 }, { "epoch": 0.32212126422053466, "grad_norm": 0.7217574715614319, "learning_rate": 9.666561056520576e-05, "loss": 0.9138279914855957, "memory(GiB)": 91.52, "step": 24825, "token_acc": 0.7579068609139584, "train_speed(iter/s)": 0.201073 }, { "epoch": 0.32218614262219036, "grad_norm": 0.7520654201507568, "learning_rate": 9.66636843561691e-05, "loss": 0.9678354263305664, "memory(GiB)": 91.52, "step": 24830, "token_acc": 0.7345121566150932, "train_speed(iter/s)": 0.201048 }, { "epoch": 0.32225102102384606, "grad_norm": 0.7967449426651001, "learning_rate": 9.666175761012977e-05, "loss": 0.9774991989135742, "memory(GiB)": 91.52, "step": 24835, "token_acc": 0.7261166360580754, "train_speed(iter/s)": 0.201021 }, { "epoch": 0.32231589942550176, "grad_norm": 0.8866094350814819, "learning_rate": 9.665983032710994e-05, "loss": 0.9561822891235352, "memory(GiB)": 91.52, "step": 24840, "token_acc": 0.7439294267438687, "train_speed(iter/s)": 0.200995 }, { "epoch": 0.32238077782715746, "grad_norm": 0.8144985437393188, "learning_rate": 9.665790250713178e-05, "loss": 0.8951604843139649, "memory(GiB)": 91.52, "step": 24845, "token_acc": 0.7487200427175927, "train_speed(iter/s)": 0.200968 }, { "epoch": 0.32244565622881316, "grad_norm": 0.7422394156455994, "learning_rate": 9.665597415021747e-05, "loss": 0.9519770622253418, "memory(GiB)": 91.52, "step": 24850, "token_acc": 0.7466671789158247, "train_speed(iter/s)": 0.200943 }, { "epoch": 0.32251053463046886, "grad_norm": 0.8714608550071716, "learning_rate": 9.665404525638924e-05, "loss": 1.014956283569336, "memory(GiB)": 91.52, "step": 24855, "token_acc": 0.7434651112551307, "train_speed(iter/s)": 0.200916 }, { "epoch": 0.32257541303212456, "grad_norm": 0.8065102100372314, "learning_rate": 9.665211582566924e-05, "loss": 1.0112253189086915, "memory(GiB)": 91.52, "step": 24860, "token_acc": 0.727246219852543, "train_speed(iter/s)": 0.200895 }, { "epoch": 0.32264029143378026, "grad_norm": 0.7007826566696167, "learning_rate": 9.66501858580797e-05, "loss": 0.96670503616333, "memory(GiB)": 91.52, "step": 24865, "token_acc": 0.7337324776825727, "train_speed(iter/s)": 0.200869 }, { "epoch": 0.32270516983543596, "grad_norm": 0.9231230616569519, "learning_rate": 9.664825535364283e-05, "loss": 0.9806740760803223, "memory(GiB)": 91.52, "step": 24870, "token_acc": 0.7293128741921415, "train_speed(iter/s)": 0.200844 }, { "epoch": 0.32277004823709166, "grad_norm": 0.829390287399292, "learning_rate": 9.664632431238083e-05, "loss": 0.981536865234375, "memory(GiB)": 91.52, "step": 24875, "token_acc": 0.7317817234523891, "train_speed(iter/s)": 0.200815 }, { "epoch": 0.32283492663874735, "grad_norm": 0.8119911551475525, "learning_rate": 9.664439273431594e-05, "loss": 0.948145580291748, "memory(GiB)": 91.52, "step": 24880, "token_acc": 0.7353236327145007, "train_speed(iter/s)": 0.200787 }, { "epoch": 0.322899805040403, "grad_norm": 0.929973840713501, "learning_rate": 9.664246061947038e-05, "loss": 0.978018856048584, "memory(GiB)": 91.52, "step": 24885, "token_acc": 0.7435794248920758, "train_speed(iter/s)": 0.200764 }, { "epoch": 0.3229646834420587, "grad_norm": 0.7806313633918762, "learning_rate": 9.664052796786636e-05, "loss": 0.938160514831543, "memory(GiB)": 91.52, "step": 24890, "token_acc": 0.7568652386440409, "train_speed(iter/s)": 0.20074 }, { "epoch": 0.3230295618437144, "grad_norm": 0.8262535929679871, "learning_rate": 9.663859477952617e-05, "loss": 0.9292884826660156, "memory(GiB)": 91.52, "step": 24895, "token_acc": 0.7586638428956488, "train_speed(iter/s)": 0.200713 }, { "epoch": 0.3230944402453701, "grad_norm": 0.7645235657691956, "learning_rate": 9.663666105447203e-05, "loss": 0.9369125366210938, "memory(GiB)": 91.52, "step": 24900, "token_acc": 0.7370231973065966, "train_speed(iter/s)": 0.200689 }, { "epoch": 0.3231593186470258, "grad_norm": 0.7281742095947266, "learning_rate": 9.66347267927262e-05, "loss": 0.960874366760254, "memory(GiB)": 91.52, "step": 24905, "token_acc": 0.7462487609802783, "train_speed(iter/s)": 0.200663 }, { "epoch": 0.3232241970486815, "grad_norm": 0.8319474458694458, "learning_rate": 9.663279199431092e-05, "loss": 0.9840591430664063, "memory(GiB)": 91.52, "step": 24910, "token_acc": 0.7502685284640171, "train_speed(iter/s)": 0.200637 }, { "epoch": 0.3232890754503372, "grad_norm": 0.7574676871299744, "learning_rate": 9.663085665924849e-05, "loss": 0.9456605911254883, "memory(GiB)": 91.52, "step": 24915, "token_acc": 0.7628049204253249, "train_speed(iter/s)": 0.200609 }, { "epoch": 0.3233539538519929, "grad_norm": 0.8123035430908203, "learning_rate": 9.662892078756116e-05, "loss": 1.0041069030761718, "memory(GiB)": 91.52, "step": 24920, "token_acc": 0.7390506850340646, "train_speed(iter/s)": 0.200585 }, { "epoch": 0.3234188322536486, "grad_norm": 0.7889302372932434, "learning_rate": 9.66269843792712e-05, "loss": 0.8965390205383301, "memory(GiB)": 91.52, "step": 24925, "token_acc": 0.7613214550853749, "train_speed(iter/s)": 0.200559 }, { "epoch": 0.3234837106553043, "grad_norm": 0.7991968393325806, "learning_rate": 9.662504743440091e-05, "loss": 0.8844247817993164, "memory(GiB)": 91.52, "step": 24930, "token_acc": 0.7561733785058153, "train_speed(iter/s)": 0.200533 }, { "epoch": 0.32354858905696, "grad_norm": 0.8078881502151489, "learning_rate": 9.662310995297256e-05, "loss": 0.9558114051818848, "memory(GiB)": 91.52, "step": 24935, "token_acc": 0.7370191735143503, "train_speed(iter/s)": 0.200508 }, { "epoch": 0.3236134674586157, "grad_norm": 0.7637928128242493, "learning_rate": 9.662117193500848e-05, "loss": 0.9391481399536132, "memory(GiB)": 91.52, "step": 24940, "token_acc": 0.7375756993175046, "train_speed(iter/s)": 0.200482 }, { "epoch": 0.3236783458602714, "grad_norm": 0.7720791697502136, "learning_rate": 9.661923338053096e-05, "loss": 0.94786376953125, "memory(GiB)": 91.52, "step": 24945, "token_acc": 0.7372678522737601, "train_speed(iter/s)": 0.200454 }, { "epoch": 0.3237432242619271, "grad_norm": 0.8181898593902588, "learning_rate": 9.661729428956228e-05, "loss": 0.9666799545288086, "memory(GiB)": 91.52, "step": 24950, "token_acc": 0.7541759053954176, "train_speed(iter/s)": 0.200429 }, { "epoch": 0.3238081026635828, "grad_norm": 0.7661532163619995, "learning_rate": 9.661535466212479e-05, "loss": 0.9154224395751953, "memory(GiB)": 91.52, "step": 24955, "token_acc": 0.763040722805421, "train_speed(iter/s)": 0.200402 }, { "epoch": 0.3238729810652385, "grad_norm": 0.7637906074523926, "learning_rate": 9.66134144982408e-05, "loss": 0.9766718864440918, "memory(GiB)": 91.52, "step": 24960, "token_acc": 0.749391567453792, "train_speed(iter/s)": 0.200378 }, { "epoch": 0.3239378594668942, "grad_norm": 0.8227773904800415, "learning_rate": 9.661147379793264e-05, "loss": 0.9613493919372559, "memory(GiB)": 91.52, "step": 24965, "token_acc": 0.7187249398556536, "train_speed(iter/s)": 0.200354 }, { "epoch": 0.3240027378685499, "grad_norm": 0.8331441283226013, "learning_rate": 9.660953256122263e-05, "loss": 1.0058723449707032, "memory(GiB)": 91.52, "step": 24970, "token_acc": 0.7253186158817763, "train_speed(iter/s)": 0.200332 }, { "epoch": 0.3240676162702056, "grad_norm": 0.7898324728012085, "learning_rate": 9.660759078813313e-05, "loss": 0.9484224319458008, "memory(GiB)": 91.52, "step": 24975, "token_acc": 0.7364831782891516, "train_speed(iter/s)": 0.200306 }, { "epoch": 0.3241324946718613, "grad_norm": 0.9478586316108704, "learning_rate": 9.660564847868647e-05, "loss": 0.9836338043212891, "memory(GiB)": 91.52, "step": 24980, "token_acc": 0.7402477175509161, "train_speed(iter/s)": 0.20028 }, { "epoch": 0.324197373073517, "grad_norm": 0.7023670077323914, "learning_rate": 9.6603705632905e-05, "loss": 0.9734432220458984, "memory(GiB)": 91.52, "step": 24985, "token_acc": 0.756882676831851, "train_speed(iter/s)": 0.200251 }, { "epoch": 0.3242622514751727, "grad_norm": 0.8484435677528381, "learning_rate": 9.660176225081107e-05, "loss": 0.972563362121582, "memory(GiB)": 91.52, "step": 24990, "token_acc": 0.7623897046001464, "train_speed(iter/s)": 0.200227 }, { "epoch": 0.3243271298768284, "grad_norm": 0.7910907864570618, "learning_rate": 9.659981833242709e-05, "loss": 0.9901381492614746, "memory(GiB)": 91.52, "step": 24995, "token_acc": 0.7432134490836434, "train_speed(iter/s)": 0.200201 }, { "epoch": 0.3243920082784841, "grad_norm": 0.7756918668746948, "learning_rate": 9.65978738777754e-05, "loss": 0.9691893577575683, "memory(GiB)": 91.52, "step": 25000, "token_acc": 0.7344337365866995, "train_speed(iter/s)": 0.200177 }, { "epoch": 0.3244568866801397, "grad_norm": 0.7432878017425537, "learning_rate": 9.659592888687836e-05, "loss": 0.9844182968139649, "memory(GiB)": 91.52, "step": 25005, "token_acc": 0.7185513458534636, "train_speed(iter/s)": 0.200154 }, { "epoch": 0.3245217650817954, "grad_norm": 0.8158232569694519, "learning_rate": 9.659398335975838e-05, "loss": 0.9756648063659668, "memory(GiB)": 91.52, "step": 25010, "token_acc": 0.7577469611367916, "train_speed(iter/s)": 0.200132 }, { "epoch": 0.3245866434834511, "grad_norm": 0.8150123953819275, "learning_rate": 9.659203729643782e-05, "loss": 0.9501436233520508, "memory(GiB)": 91.52, "step": 25015, "token_acc": 0.745450035655598, "train_speed(iter/s)": 0.200107 }, { "epoch": 0.3246515218851068, "grad_norm": 0.8106909990310669, "learning_rate": 9.659009069693912e-05, "loss": 0.9212479591369629, "memory(GiB)": 91.52, "step": 25020, "token_acc": 0.773527191832519, "train_speed(iter/s)": 0.20008 }, { "epoch": 0.3247164002867625, "grad_norm": 0.8421273827552795, "learning_rate": 9.658814356128462e-05, "loss": 0.9047286033630371, "memory(GiB)": 91.52, "step": 25025, "token_acc": 0.7336086404066073, "train_speed(iter/s)": 0.200055 }, { "epoch": 0.3247812786884182, "grad_norm": 0.7082775235176086, "learning_rate": 9.65861958894968e-05, "loss": 0.9389800071716309, "memory(GiB)": 91.52, "step": 25030, "token_acc": 0.7453542392566783, "train_speed(iter/s)": 0.20003 }, { "epoch": 0.3248461570900739, "grad_norm": 0.9274479746818542, "learning_rate": 9.658424768159802e-05, "loss": 0.9793360710144043, "memory(GiB)": 91.52, "step": 25035, "token_acc": 0.750125547026329, "train_speed(iter/s)": 0.200006 }, { "epoch": 0.3249110354917296, "grad_norm": 0.8122207522392273, "learning_rate": 9.65822989376107e-05, "loss": 0.9084961891174317, "memory(GiB)": 91.52, "step": 25040, "token_acc": 0.7612730295524718, "train_speed(iter/s)": 0.199978 }, { "epoch": 0.3249759138933853, "grad_norm": 0.7363495826721191, "learning_rate": 9.65803496575573e-05, "loss": 0.9448455810546875, "memory(GiB)": 91.52, "step": 25045, "token_acc": 0.7331470735726054, "train_speed(iter/s)": 0.199952 }, { "epoch": 0.325040792295041, "grad_norm": 0.8811445832252502, "learning_rate": 9.657839984146023e-05, "loss": 0.9821912765502929, "memory(GiB)": 91.52, "step": 25050, "token_acc": 0.7462436730301701, "train_speed(iter/s)": 0.199927 }, { "epoch": 0.3251056706966967, "grad_norm": 0.7559829354286194, "learning_rate": 9.657644948934194e-05, "loss": 0.8949678421020508, "memory(GiB)": 91.52, "step": 25055, "token_acc": 0.7825710486439682, "train_speed(iter/s)": 0.199902 }, { "epoch": 0.3251705490983524, "grad_norm": 0.7357130646705627, "learning_rate": 9.657449860122486e-05, "loss": 0.9475173950195312, "memory(GiB)": 91.52, "step": 25060, "token_acc": 0.7350018280320404, "train_speed(iter/s)": 0.199877 }, { "epoch": 0.3252354275000081, "grad_norm": 0.7908957600593567, "learning_rate": 9.657254717713144e-05, "loss": 1.0018461227416993, "memory(GiB)": 91.52, "step": 25065, "token_acc": 0.7362120907812814, "train_speed(iter/s)": 0.199851 }, { "epoch": 0.3253003059016638, "grad_norm": 0.862975001335144, "learning_rate": 9.657059521708416e-05, "loss": 0.9049875259399414, "memory(GiB)": 91.52, "step": 25070, "token_acc": 0.7289838294058334, "train_speed(iter/s)": 0.199823 }, { "epoch": 0.3253651843033195, "grad_norm": 0.8051312565803528, "learning_rate": 9.656864272110545e-05, "loss": 0.9773950576782227, "memory(GiB)": 91.52, "step": 25075, "token_acc": 0.7426292491439072, "train_speed(iter/s)": 0.199796 }, { "epoch": 0.3254300627049752, "grad_norm": 0.7183454632759094, "learning_rate": 9.65666896892178e-05, "loss": 0.9965639114379883, "memory(GiB)": 91.52, "step": 25080, "token_acc": 0.7566832504145937, "train_speed(iter/s)": 0.199774 }, { "epoch": 0.3254949411066309, "grad_norm": 0.7660866379737854, "learning_rate": 9.65647361214437e-05, "loss": 0.9347340583801269, "memory(GiB)": 91.52, "step": 25085, "token_acc": 0.7591405184174624, "train_speed(iter/s)": 0.199748 }, { "epoch": 0.3255598195082866, "grad_norm": 0.7852791547775269, "learning_rate": 9.65627820178056e-05, "loss": 0.9837148666381836, "memory(GiB)": 91.52, "step": 25090, "token_acc": 0.7493727968496935, "train_speed(iter/s)": 0.199722 }, { "epoch": 0.3256246979099423, "grad_norm": 0.9296635389328003, "learning_rate": 9.6560827378326e-05, "loss": 0.9694708824157715, "memory(GiB)": 91.52, "step": 25095, "token_acc": 0.7453196890369665, "train_speed(iter/s)": 0.199698 }, { "epoch": 0.325689576311598, "grad_norm": 0.766595721244812, "learning_rate": 9.655887220302739e-05, "loss": 0.9155280113220214, "memory(GiB)": 91.52, "step": 25100, "token_acc": 0.7506825305874819, "train_speed(iter/s)": 0.199671 }, { "epoch": 0.3257544547132537, "grad_norm": 0.8685328960418701, "learning_rate": 9.655691649193229e-05, "loss": 0.9892011642456054, "memory(GiB)": 91.52, "step": 25105, "token_acc": 0.7519174452656533, "train_speed(iter/s)": 0.199646 }, { "epoch": 0.3258193331149094, "grad_norm": 0.768908679485321, "learning_rate": 9.655496024506319e-05, "loss": 0.9489130973815918, "memory(GiB)": 91.52, "step": 25110, "token_acc": 0.7562795324546133, "train_speed(iter/s)": 0.19962 }, { "epoch": 0.3258842115165651, "grad_norm": 0.8936448097229004, "learning_rate": 9.655300346244261e-05, "loss": 0.9838628768920898, "memory(GiB)": 91.52, "step": 25115, "token_acc": 0.729879439556859, "train_speed(iter/s)": 0.199599 }, { "epoch": 0.3259490899182208, "grad_norm": 0.8169732093811035, "learning_rate": 9.655104614409304e-05, "loss": 0.9416132926940918, "memory(GiB)": 91.52, "step": 25120, "token_acc": 0.7464632577941372, "train_speed(iter/s)": 0.199576 }, { "epoch": 0.32601396831987645, "grad_norm": 0.828315258026123, "learning_rate": 9.654908829003704e-05, "loss": 0.9406463623046875, "memory(GiB)": 91.52, "step": 25125, "token_acc": 0.7490681906925561, "train_speed(iter/s)": 0.199551 }, { "epoch": 0.32607884672153215, "grad_norm": 0.728663444519043, "learning_rate": 9.654712990029713e-05, "loss": 0.9557942390441895, "memory(GiB)": 91.52, "step": 25130, "token_acc": 0.746954490942372, "train_speed(iter/s)": 0.199527 }, { "epoch": 0.32614372512318784, "grad_norm": 0.9101400971412659, "learning_rate": 9.654517097489584e-05, "loss": 0.9138846397399902, "memory(GiB)": 91.52, "step": 25135, "token_acc": 0.7413324868620604, "train_speed(iter/s)": 0.199503 }, { "epoch": 0.32620860352484354, "grad_norm": 0.7482179999351501, "learning_rate": 9.65432115138557e-05, "loss": 0.9360394477844238, "memory(GiB)": 91.52, "step": 25140, "token_acc": 0.7372961315541178, "train_speed(iter/s)": 0.199477 }, { "epoch": 0.32627348192649924, "grad_norm": 0.8606112599372864, "learning_rate": 9.65412515171993e-05, "loss": 0.9939521789550781, "memory(GiB)": 91.52, "step": 25145, "token_acc": 0.737360472751149, "train_speed(iter/s)": 0.199451 }, { "epoch": 0.32633836032815494, "grad_norm": 0.7650282979011536, "learning_rate": 9.653929098494917e-05, "loss": 0.9503829956054688, "memory(GiB)": 91.52, "step": 25150, "token_acc": 0.7398733573042289, "train_speed(iter/s)": 0.199429 }, { "epoch": 0.32640323872981064, "grad_norm": 0.7582418918609619, "learning_rate": 9.653732991712788e-05, "loss": 0.913463020324707, "memory(GiB)": 91.52, "step": 25155, "token_acc": 0.7485778414148059, "train_speed(iter/s)": 0.199402 }, { "epoch": 0.32646811713146634, "grad_norm": 0.8315255641937256, "learning_rate": 9.653536831375797e-05, "loss": 0.9747760772705079, "memory(GiB)": 91.52, "step": 25160, "token_acc": 0.7334653174952339, "train_speed(iter/s)": 0.199382 }, { "epoch": 0.32653299553312204, "grad_norm": 0.82606041431427, "learning_rate": 9.653340617486206e-05, "loss": 0.9673446655273438, "memory(GiB)": 91.52, "step": 25165, "token_acc": 0.7468728225886725, "train_speed(iter/s)": 0.199355 }, { "epoch": 0.32659787393477774, "grad_norm": 1.0049927234649658, "learning_rate": 9.65314435004627e-05, "loss": 0.9854526519775391, "memory(GiB)": 91.52, "step": 25170, "token_acc": 0.7480413258717176, "train_speed(iter/s)": 0.199331 }, { "epoch": 0.32666275233643344, "grad_norm": 0.8064778447151184, "learning_rate": 9.652948029058249e-05, "loss": 0.9523500442504883, "memory(GiB)": 91.52, "step": 25175, "token_acc": 0.7457101597009854, "train_speed(iter/s)": 0.199304 }, { "epoch": 0.32672763073808914, "grad_norm": 0.8043739199638367, "learning_rate": 9.6527516545244e-05, "loss": 0.9877938270568848, "memory(GiB)": 91.52, "step": 25180, "token_acc": 0.7387544328998943, "train_speed(iter/s)": 0.199282 }, { "epoch": 0.32679250913974484, "grad_norm": 0.834111750125885, "learning_rate": 9.652555226446985e-05, "loss": 1.0033580780029296, "memory(GiB)": 91.52, "step": 25185, "token_acc": 0.7648058940209691, "train_speed(iter/s)": 0.199258 }, { "epoch": 0.32685738754140053, "grad_norm": 0.7716458439826965, "learning_rate": 9.652358744828264e-05, "loss": 0.9251138687133789, "memory(GiB)": 91.52, "step": 25190, "token_acc": 0.754289007933817, "train_speed(iter/s)": 0.199233 }, { "epoch": 0.32692226594305623, "grad_norm": 0.7173829674720764, "learning_rate": 9.6521622096705e-05, "loss": 0.9296079635620117, "memory(GiB)": 91.52, "step": 25195, "token_acc": 0.7240803973304362, "train_speed(iter/s)": 0.199208 }, { "epoch": 0.32698714434471193, "grad_norm": 0.7481768727302551, "learning_rate": 9.651965620975949e-05, "loss": 0.917119026184082, "memory(GiB)": 91.52, "step": 25200, "token_acc": 0.7470346885490489, "train_speed(iter/s)": 0.199185 }, { "epoch": 0.32705202274636763, "grad_norm": 0.7625685334205627, "learning_rate": 9.651768978746879e-05, "loss": 0.9462787628173828, "memory(GiB)": 91.52, "step": 25205, "token_acc": 0.7423460842768138, "train_speed(iter/s)": 0.199159 }, { "epoch": 0.32711690114802333, "grad_norm": 0.6880432367324829, "learning_rate": 9.651572282985551e-05, "loss": 0.9767599105834961, "memory(GiB)": 91.52, "step": 25210, "token_acc": 0.7346179183135705, "train_speed(iter/s)": 0.199136 }, { "epoch": 0.32718177954967903, "grad_norm": 0.7362475991249084, "learning_rate": 9.651375533694227e-05, "loss": 0.97254638671875, "memory(GiB)": 91.52, "step": 25215, "token_acc": 0.7457070948421421, "train_speed(iter/s)": 0.199108 }, { "epoch": 0.32724665795133473, "grad_norm": 0.8429293036460876, "learning_rate": 9.651178730875175e-05, "loss": 0.9902599334716797, "memory(GiB)": 91.52, "step": 25220, "token_acc": 0.7424282155500197, "train_speed(iter/s)": 0.199084 }, { "epoch": 0.32731153635299043, "grad_norm": 0.7917014956474304, "learning_rate": 9.650981874530656e-05, "loss": 0.9247072219848633, "memory(GiB)": 91.52, "step": 25225, "token_acc": 0.7530094112497264, "train_speed(iter/s)": 0.199059 }, { "epoch": 0.32737641475464613, "grad_norm": 0.7637408971786499, "learning_rate": 9.650784964662938e-05, "loss": 0.971717643737793, "memory(GiB)": 91.52, "step": 25230, "token_acc": 0.7502848770958815, "train_speed(iter/s)": 0.199035 }, { "epoch": 0.3274412931563018, "grad_norm": 0.7123239040374756, "learning_rate": 9.650588001274285e-05, "loss": 0.9681990623474122, "memory(GiB)": 91.52, "step": 25235, "token_acc": 0.7316144546122504, "train_speed(iter/s)": 0.199009 }, { "epoch": 0.32750617155795747, "grad_norm": 0.8645555973052979, "learning_rate": 9.650390984366965e-05, "loss": 1.0106836318969727, "memory(GiB)": 91.52, "step": 25240, "token_acc": 0.7278194989034359, "train_speed(iter/s)": 0.198984 }, { "epoch": 0.32757104995961317, "grad_norm": 0.7932599782943726, "learning_rate": 9.650193913943246e-05, "loss": 0.9773198127746582, "memory(GiB)": 91.52, "step": 25245, "token_acc": 0.7362142197599262, "train_speed(iter/s)": 0.198958 }, { "epoch": 0.32763592836126887, "grad_norm": 0.7376984357833862, "learning_rate": 9.649996790005393e-05, "loss": 0.9322121620178223, "memory(GiB)": 91.52, "step": 25250, "token_acc": 0.7629257613804649, "train_speed(iter/s)": 0.198933 }, { "epoch": 0.32770080676292457, "grad_norm": 0.9315522313117981, "learning_rate": 9.649799612555676e-05, "loss": 0.9302845001220703, "memory(GiB)": 91.52, "step": 25255, "token_acc": 0.7561960912763402, "train_speed(iter/s)": 0.198907 }, { "epoch": 0.32776568516458027, "grad_norm": 0.7978411316871643, "learning_rate": 9.649602381596365e-05, "loss": 0.9682306289672852, "memory(GiB)": 91.52, "step": 25260, "token_acc": 0.7235848402965427, "train_speed(iter/s)": 0.198884 }, { "epoch": 0.32783056356623597, "grad_norm": 0.8665252327919006, "learning_rate": 9.64940509712973e-05, "loss": 0.9465263366699219, "memory(GiB)": 91.52, "step": 25265, "token_acc": 0.7588518553488811, "train_speed(iter/s)": 0.198859 }, { "epoch": 0.32789544196789167, "grad_norm": 0.7578467726707458, "learning_rate": 9.649207759158039e-05, "loss": 0.9521890640258789, "memory(GiB)": 91.52, "step": 25270, "token_acc": 0.7581920091980454, "train_speed(iter/s)": 0.198832 }, { "epoch": 0.32796032036954736, "grad_norm": 0.7955176830291748, "learning_rate": 9.649010367683564e-05, "loss": 0.9224810600280762, "memory(GiB)": 91.52, "step": 25275, "token_acc": 0.7697454196116105, "train_speed(iter/s)": 0.198804 }, { "epoch": 0.32802519877120306, "grad_norm": 0.8494170308113098, "learning_rate": 9.64881292270858e-05, "loss": 0.965762996673584, "memory(GiB)": 91.52, "step": 25280, "token_acc": 0.7537266326778345, "train_speed(iter/s)": 0.198783 }, { "epoch": 0.32809007717285876, "grad_norm": 0.823319673538208, "learning_rate": 9.648615424235352e-05, "loss": 0.935579490661621, "memory(GiB)": 91.52, "step": 25285, "token_acc": 0.7524474433480245, "train_speed(iter/s)": 0.198756 }, { "epoch": 0.32815495557451446, "grad_norm": 0.7906967997550964, "learning_rate": 9.64841787226616e-05, "loss": 0.9921481132507324, "memory(GiB)": 91.52, "step": 25290, "token_acc": 0.7355695228322217, "train_speed(iter/s)": 0.198731 }, { "epoch": 0.32821983397617016, "grad_norm": 0.7857763171195984, "learning_rate": 9.648220266803272e-05, "loss": 0.9496352195739746, "memory(GiB)": 91.52, "step": 25295, "token_acc": 0.772654858963546, "train_speed(iter/s)": 0.198704 }, { "epoch": 0.32828471237782586, "grad_norm": 0.7795878648757935, "learning_rate": 9.648022607848965e-05, "loss": 0.9610715866088867, "memory(GiB)": 91.52, "step": 25300, "token_acc": 0.7572194162779041, "train_speed(iter/s)": 0.198677 }, { "epoch": 0.32834959077948156, "grad_norm": 0.6361367106437683, "learning_rate": 9.647824895405513e-05, "loss": 0.9106060028076172, "memory(GiB)": 91.52, "step": 25305, "token_acc": 0.765199876045863, "train_speed(iter/s)": 0.19865 }, { "epoch": 0.32841446918113726, "grad_norm": 0.8055233359336853, "learning_rate": 9.647627129475192e-05, "loss": 0.926240348815918, "memory(GiB)": 91.52, "step": 25310, "token_acc": 0.7403097137494135, "train_speed(iter/s)": 0.198627 }, { "epoch": 0.32847934758279296, "grad_norm": 0.8558530211448669, "learning_rate": 9.647429310060276e-05, "loss": 0.948517894744873, "memory(GiB)": 91.52, "step": 25315, "token_acc": 0.7388958878444294, "train_speed(iter/s)": 0.198603 }, { "epoch": 0.32854422598444866, "grad_norm": 0.7729669213294983, "learning_rate": 9.647231437163043e-05, "loss": 0.9656593322753906, "memory(GiB)": 91.52, "step": 25320, "token_acc": 0.735150152136526, "train_speed(iter/s)": 0.198577 }, { "epoch": 0.32860910438610436, "grad_norm": 0.8446770906448364, "learning_rate": 9.64703351078577e-05, "loss": 0.9138184547424316, "memory(GiB)": 91.52, "step": 25325, "token_acc": 0.7477704364315131, "train_speed(iter/s)": 0.198553 }, { "epoch": 0.32867398278776006, "grad_norm": 0.7795440554618835, "learning_rate": 9.646835530930735e-05, "loss": 1.0080271720886231, "memory(GiB)": 91.52, "step": 25330, "token_acc": 0.7356860057865083, "train_speed(iter/s)": 0.198528 }, { "epoch": 0.32873886118941575, "grad_norm": 0.775061845779419, "learning_rate": 9.646637497600215e-05, "loss": 0.912891960144043, "memory(GiB)": 91.52, "step": 25335, "token_acc": 0.742781860016329, "train_speed(iter/s)": 0.198502 }, { "epoch": 0.32880373959107145, "grad_norm": 0.884864091873169, "learning_rate": 9.64643941079649e-05, "loss": 0.9875537872314453, "memory(GiB)": 91.52, "step": 25340, "token_acc": 0.7321543286853343, "train_speed(iter/s)": 0.198478 }, { "epoch": 0.32886861799272715, "grad_norm": 0.7697044014930725, "learning_rate": 9.646241270521839e-05, "loss": 0.9628303527832032, "memory(GiB)": 91.52, "step": 25345, "token_acc": 0.7452863749555318, "train_speed(iter/s)": 0.198453 }, { "epoch": 0.32893349639438285, "grad_norm": 0.7093477249145508, "learning_rate": 9.646043076778542e-05, "loss": 0.9303772926330567, "memory(GiB)": 91.52, "step": 25350, "token_acc": 0.7563030829933792, "train_speed(iter/s)": 0.198428 }, { "epoch": 0.32899837479603855, "grad_norm": 0.8236041069030762, "learning_rate": 9.645844829568882e-05, "loss": 0.9642651557922364, "memory(GiB)": 91.52, "step": 25355, "token_acc": 0.7578533516807548, "train_speed(iter/s)": 0.198403 }, { "epoch": 0.3290632531976942, "grad_norm": 0.8400858044624329, "learning_rate": 9.645646528895137e-05, "loss": 0.9843392372131348, "memory(GiB)": 91.52, "step": 25360, "token_acc": 0.7453147525228255, "train_speed(iter/s)": 0.198379 }, { "epoch": 0.3291281315993499, "grad_norm": 0.7454770803451538, "learning_rate": 9.645448174759591e-05, "loss": 0.920245361328125, "memory(GiB)": 91.52, "step": 25365, "token_acc": 0.7618158558932338, "train_speed(iter/s)": 0.198352 }, { "epoch": 0.3291930100010056, "grad_norm": 0.7714295983314514, "learning_rate": 9.645249767164529e-05, "loss": 0.9847503662109375, "memory(GiB)": 91.52, "step": 25370, "token_acc": 0.7492416956425242, "train_speed(iter/s)": 0.198325 }, { "epoch": 0.3292578884026613, "grad_norm": 0.8578242063522339, "learning_rate": 9.645051306112227e-05, "loss": 0.9719078063964843, "memory(GiB)": 91.52, "step": 25375, "token_acc": 0.771625780619715, "train_speed(iter/s)": 0.198299 }, { "epoch": 0.329322766804317, "grad_norm": 0.7966440320014954, "learning_rate": 9.644852791604976e-05, "loss": 0.9472247123718261, "memory(GiB)": 91.52, "step": 25380, "token_acc": 0.7409082125603865, "train_speed(iter/s)": 0.198277 }, { "epoch": 0.3293876452059727, "grad_norm": 0.8892982602119446, "learning_rate": 9.644654223645058e-05, "loss": 0.9965041160583497, "memory(GiB)": 91.52, "step": 25385, "token_acc": 0.7447198890972845, "train_speed(iter/s)": 0.198258 }, { "epoch": 0.3294525236076284, "grad_norm": 0.7797614336013794, "learning_rate": 9.644455602234759e-05, "loss": 0.9830646514892578, "memory(GiB)": 91.52, "step": 25390, "token_acc": 0.7570152061451638, "train_speed(iter/s)": 0.198236 }, { "epoch": 0.3295174020092841, "grad_norm": 0.7445060014724731, "learning_rate": 9.644256927376362e-05, "loss": 0.9239637374877929, "memory(GiB)": 91.52, "step": 25395, "token_acc": 0.7570350701078333, "train_speed(iter/s)": 0.198211 }, { "epoch": 0.3295822804109398, "grad_norm": 0.768990159034729, "learning_rate": 9.644058199072157e-05, "loss": 0.9846128463745117, "memory(GiB)": 91.52, "step": 25400, "token_acc": 0.7497809579439252, "train_speed(iter/s)": 0.198188 }, { "epoch": 0.3296471588125955, "grad_norm": 0.8159743547439575, "learning_rate": 9.643859417324428e-05, "loss": 0.9417332649230957, "memory(GiB)": 91.52, "step": 25405, "token_acc": 0.7573507174300211, "train_speed(iter/s)": 0.19816 }, { "epoch": 0.3297120372142512, "grad_norm": 0.8999021053314209, "learning_rate": 9.643660582135464e-05, "loss": 0.9259716987609863, "memory(GiB)": 91.52, "step": 25410, "token_acc": 0.7602445827075734, "train_speed(iter/s)": 0.198134 }, { "epoch": 0.3297769156159069, "grad_norm": 0.8455936908721924, "learning_rate": 9.643461693507553e-05, "loss": 0.9866893768310547, "memory(GiB)": 91.52, "step": 25415, "token_acc": 0.7266914115615009, "train_speed(iter/s)": 0.198109 }, { "epoch": 0.3298417940175626, "grad_norm": 0.7571890354156494, "learning_rate": 9.643262751442982e-05, "loss": 0.9723578453063965, "memory(GiB)": 91.52, "step": 25420, "token_acc": 0.7308064953687555, "train_speed(iter/s)": 0.19808 }, { "epoch": 0.3299066724192183, "grad_norm": 0.8303622007369995, "learning_rate": 9.643063755944045e-05, "loss": 0.9400784492492675, "memory(GiB)": 91.52, "step": 25425, "token_acc": 0.7199719582489484, "train_speed(iter/s)": 0.198057 }, { "epoch": 0.329971550820874, "grad_norm": 0.7179346680641174, "learning_rate": 9.642864707013027e-05, "loss": 0.9272555351257324, "memory(GiB)": 91.52, "step": 25430, "token_acc": 0.7487254652052001, "train_speed(iter/s)": 0.198036 }, { "epoch": 0.3300364292225297, "grad_norm": 0.7167167663574219, "learning_rate": 9.642665604652222e-05, "loss": 0.9254261016845703, "memory(GiB)": 91.52, "step": 25435, "token_acc": 0.7507144801306478, "train_speed(iter/s)": 0.198011 }, { "epoch": 0.3301013076241854, "grad_norm": 0.8033238649368286, "learning_rate": 9.64246644886392e-05, "loss": 0.9566244125366211, "memory(GiB)": 91.52, "step": 25440, "token_acc": 0.7346118384657938, "train_speed(iter/s)": 0.197987 }, { "epoch": 0.3301661860258411, "grad_norm": 0.8664199113845825, "learning_rate": 9.642267239650412e-05, "loss": 0.9807870864868165, "memory(GiB)": 91.52, "step": 25445, "token_acc": 0.73544474393531, "train_speed(iter/s)": 0.197963 }, { "epoch": 0.3302310644274968, "grad_norm": 0.7369681000709534, "learning_rate": 9.64206797701399e-05, "loss": 0.9869303703308105, "memory(GiB)": 91.52, "step": 25450, "token_acc": 0.7339947294993024, "train_speed(iter/s)": 0.197941 }, { "epoch": 0.3302959428291525, "grad_norm": 0.7469660043716431, "learning_rate": 9.641868660956952e-05, "loss": 0.9684820175170898, "memory(GiB)": 91.52, "step": 25455, "token_acc": 0.7441274572422113, "train_speed(iter/s)": 0.197917 }, { "epoch": 0.3303608212308082, "grad_norm": 0.7865340113639832, "learning_rate": 9.641669291481586e-05, "loss": 0.9523783683776855, "memory(GiB)": 91.52, "step": 25460, "token_acc": 0.7311396722139767, "train_speed(iter/s)": 0.197891 }, { "epoch": 0.3304256996324639, "grad_norm": 0.7763739824295044, "learning_rate": 9.641469868590189e-05, "loss": 0.9619263648986817, "memory(GiB)": 91.52, "step": 25465, "token_acc": 0.7280029700631139, "train_speed(iter/s)": 0.197868 }, { "epoch": 0.3304905780341196, "grad_norm": 0.8492141962051392, "learning_rate": 9.641270392285056e-05, "loss": 0.9744550704956054, "memory(GiB)": 91.52, "step": 25470, "token_acc": 0.7517068730086481, "train_speed(iter/s)": 0.197845 }, { "epoch": 0.3305554564357753, "grad_norm": 0.7962377667427063, "learning_rate": 9.641070862568482e-05, "loss": 0.9952751159667969, "memory(GiB)": 91.52, "step": 25475, "token_acc": 0.7556210953604161, "train_speed(iter/s)": 0.197824 }, { "epoch": 0.3306203348374309, "grad_norm": 0.7997536659240723, "learning_rate": 9.640871279442762e-05, "loss": 0.9207740783691406, "memory(GiB)": 91.52, "step": 25480, "token_acc": 0.7468200352991297, "train_speed(iter/s)": 0.197796 }, { "epoch": 0.3306852132390866, "grad_norm": 0.7944600582122803, "learning_rate": 9.640671642910195e-05, "loss": 0.9511593818664551, "memory(GiB)": 91.52, "step": 25485, "token_acc": 0.7313164295471962, "train_speed(iter/s)": 0.197773 }, { "epoch": 0.3307500916407423, "grad_norm": 0.7645272612571716, "learning_rate": 9.640471952973077e-05, "loss": 0.9387502670288086, "memory(GiB)": 91.52, "step": 25490, "token_acc": 0.7602341072236952, "train_speed(iter/s)": 0.197748 }, { "epoch": 0.330814970042398, "grad_norm": 0.6886767745018005, "learning_rate": 9.640272209633707e-05, "loss": 0.9362573623657227, "memory(GiB)": 91.52, "step": 25495, "token_acc": 0.736647257565955, "train_speed(iter/s)": 0.197725 }, { "epoch": 0.3308798484440537, "grad_norm": 0.7263028621673584, "learning_rate": 9.640072412894382e-05, "loss": 0.9302406311035156, "memory(GiB)": 91.52, "step": 25500, "token_acc": 0.7413214230709996, "train_speed(iter/s)": 0.1977 }, { "epoch": 0.3309447268457094, "grad_norm": 0.8222230076789856, "learning_rate": 9.639872562757404e-05, "loss": 0.9821551322937012, "memory(GiB)": 91.52, "step": 25505, "token_acc": 0.7409698718304388, "train_speed(iter/s)": 0.197671 }, { "epoch": 0.3310096052473651, "grad_norm": 0.78822922706604, "learning_rate": 9.63967265922507e-05, "loss": 0.934918212890625, "memory(GiB)": 91.52, "step": 25510, "token_acc": 0.7459178105192461, "train_speed(iter/s)": 0.197647 }, { "epoch": 0.3310744836490208, "grad_norm": 0.9112266302108765, "learning_rate": 9.639472702299682e-05, "loss": 0.9568321228027343, "memory(GiB)": 91.52, "step": 25515, "token_acc": 0.7533642927358395, "train_speed(iter/s)": 0.197622 }, { "epoch": 0.3311393620506765, "grad_norm": 0.807790219783783, "learning_rate": 9.639272691983538e-05, "loss": 0.9488540649414062, "memory(GiB)": 91.52, "step": 25520, "token_acc": 0.7635392829900839, "train_speed(iter/s)": 0.197599 }, { "epoch": 0.3312042404523322, "grad_norm": 0.7844066023826599, "learning_rate": 9.639072628278947e-05, "loss": 0.9819923400878906, "memory(GiB)": 91.52, "step": 25525, "token_acc": 0.7521359669269637, "train_speed(iter/s)": 0.197578 }, { "epoch": 0.3312691188539879, "grad_norm": 0.7754469513893127, "learning_rate": 9.638872511188204e-05, "loss": 0.9719512939453125, "memory(GiB)": 91.52, "step": 25530, "token_acc": 0.7385125532922785, "train_speed(iter/s)": 0.197554 }, { "epoch": 0.3313339972556436, "grad_norm": 0.7312098741531372, "learning_rate": 9.638672340713615e-05, "loss": 0.9994277954101562, "memory(GiB)": 91.52, "step": 25535, "token_acc": 0.723624695558047, "train_speed(iter/s)": 0.197534 }, { "epoch": 0.3313988756572993, "grad_norm": 0.8173902034759521, "learning_rate": 9.638472116857484e-05, "loss": 0.9748476982116699, "memory(GiB)": 91.52, "step": 25540, "token_acc": 0.7311569181402812, "train_speed(iter/s)": 0.19751 }, { "epoch": 0.331463754058955, "grad_norm": 0.8093696236610413, "learning_rate": 9.638271839622113e-05, "loss": 1.0214317321777344, "memory(GiB)": 91.52, "step": 25545, "token_acc": 0.7681877872619829, "train_speed(iter/s)": 0.19749 }, { "epoch": 0.3315286324606107, "grad_norm": 0.8502563238143921, "learning_rate": 9.638071509009809e-05, "loss": 0.9314352035522461, "memory(GiB)": 91.52, "step": 25550, "token_acc": 0.753870324954016, "train_speed(iter/s)": 0.197466 }, { "epoch": 0.3315935108622664, "grad_norm": 0.8943303823471069, "learning_rate": 9.637871125022878e-05, "loss": 0.9453289031982421, "memory(GiB)": 91.52, "step": 25555, "token_acc": 0.735774296510988, "train_speed(iter/s)": 0.19744 }, { "epoch": 0.3316583892639221, "grad_norm": 0.8692590594291687, "learning_rate": 9.637670687663622e-05, "loss": 0.9540526390075683, "memory(GiB)": 91.52, "step": 25560, "token_acc": 0.7453597915413911, "train_speed(iter/s)": 0.197416 }, { "epoch": 0.3317232676655778, "grad_norm": 0.7305054068565369, "learning_rate": 9.637470196934353e-05, "loss": 0.9480253219604492, "memory(GiB)": 91.52, "step": 25565, "token_acc": 0.7442365162942934, "train_speed(iter/s)": 0.197393 }, { "epoch": 0.3317881460672335, "grad_norm": 0.7616767883300781, "learning_rate": 9.637269652837373e-05, "loss": 0.9808658599853516, "memory(GiB)": 91.52, "step": 25570, "token_acc": 0.7411432511409528, "train_speed(iter/s)": 0.197369 }, { "epoch": 0.3318530244688892, "grad_norm": 0.7885214686393738, "learning_rate": 9.637069055374994e-05, "loss": 0.9892890930175782, "memory(GiB)": 91.52, "step": 25575, "token_acc": 0.7356753646732325, "train_speed(iter/s)": 0.197348 }, { "epoch": 0.3319179028705449, "grad_norm": 0.7522333264350891, "learning_rate": 9.636868404549521e-05, "loss": 0.9233948707580566, "memory(GiB)": 91.52, "step": 25580, "token_acc": 0.7313067389912231, "train_speed(iter/s)": 0.197325 }, { "epoch": 0.3319827812722006, "grad_norm": 0.8930941820144653, "learning_rate": 9.636667700363267e-05, "loss": 0.9879523277282715, "memory(GiB)": 91.52, "step": 25585, "token_acc": 0.7502050060525597, "train_speed(iter/s)": 0.197303 }, { "epoch": 0.3320476596738563, "grad_norm": 0.8220685124397278, "learning_rate": 9.636466942818538e-05, "loss": 0.9691505432128906, "memory(GiB)": 91.52, "step": 25590, "token_acc": 0.7364495470165573, "train_speed(iter/s)": 0.197279 }, { "epoch": 0.332112538075512, "grad_norm": 0.6823964715003967, "learning_rate": 9.636266131917646e-05, "loss": 0.9122845649719238, "memory(GiB)": 91.52, "step": 25595, "token_acc": 0.7419295939234589, "train_speed(iter/s)": 0.197254 }, { "epoch": 0.33217741647716764, "grad_norm": 0.8839128017425537, "learning_rate": 9.636065267662903e-05, "loss": 1.0204544067382812, "memory(GiB)": 91.52, "step": 25600, "token_acc": 0.7186977223008622, "train_speed(iter/s)": 0.197232 }, { "epoch": 0.33224229487882334, "grad_norm": 1.1204309463500977, "learning_rate": 9.635864350056618e-05, "loss": 0.9810886383056641, "memory(GiB)": 91.52, "step": 25605, "token_acc": 0.7533006093432634, "train_speed(iter/s)": 0.197208 }, { "epoch": 0.33230717328047904, "grad_norm": 0.8265655040740967, "learning_rate": 9.635663379101104e-05, "loss": 0.9611092567443847, "memory(GiB)": 91.52, "step": 25610, "token_acc": 0.7546570942754581, "train_speed(iter/s)": 0.197184 }, { "epoch": 0.33237205168213474, "grad_norm": 0.7790566086769104, "learning_rate": 9.635462354798674e-05, "loss": 0.9617527008056641, "memory(GiB)": 91.52, "step": 25615, "token_acc": 0.7457505205375734, "train_speed(iter/s)": 0.197161 }, { "epoch": 0.33243693008379044, "grad_norm": 0.7133930921554565, "learning_rate": 9.635261277151643e-05, "loss": 0.9143085479736328, "memory(GiB)": 91.52, "step": 25620, "token_acc": 0.7428484940214923, "train_speed(iter/s)": 0.197139 }, { "epoch": 0.33250180848544614, "grad_norm": 0.7467265725135803, "learning_rate": 9.635060146162323e-05, "loss": 0.9413793563842774, "memory(GiB)": 91.52, "step": 25625, "token_acc": 0.7424571090514691, "train_speed(iter/s)": 0.197118 }, { "epoch": 0.33256668688710184, "grad_norm": 0.7659470438957214, "learning_rate": 9.634858961833028e-05, "loss": 0.9076774597167969, "memory(GiB)": 91.52, "step": 25630, "token_acc": 0.7667194928684627, "train_speed(iter/s)": 0.197096 }, { "epoch": 0.33263156528875754, "grad_norm": 0.7305635809898376, "learning_rate": 9.634657724166076e-05, "loss": 0.9703838348388671, "memory(GiB)": 91.52, "step": 25635, "token_acc": 0.7425379287598944, "train_speed(iter/s)": 0.197072 }, { "epoch": 0.33269644369041323, "grad_norm": 0.754435658454895, "learning_rate": 9.634456433163781e-05, "loss": 0.9858170509338379, "memory(GiB)": 91.52, "step": 25640, "token_acc": 0.7406525516442499, "train_speed(iter/s)": 0.197049 }, { "epoch": 0.33276132209206893, "grad_norm": 0.7984645366668701, "learning_rate": 9.63425508882846e-05, "loss": 0.8910373687744141, "memory(GiB)": 91.52, "step": 25645, "token_acc": 0.7719602181584857, "train_speed(iter/s)": 0.197025 }, { "epoch": 0.33282620049372463, "grad_norm": 0.8285045623779297, "learning_rate": 9.63405369116243e-05, "loss": 1.019048023223877, "memory(GiB)": 91.52, "step": 25650, "token_acc": 0.7382107109689403, "train_speed(iter/s)": 0.197002 }, { "epoch": 0.33289107889538033, "grad_norm": 0.7468647956848145, "learning_rate": 9.633852240168006e-05, "loss": 0.9408937454223633, "memory(GiB)": 91.52, "step": 25655, "token_acc": 0.7565244194807863, "train_speed(iter/s)": 0.19698 }, { "epoch": 0.33295595729703603, "grad_norm": 0.7927359342575073, "learning_rate": 9.63365073584751e-05, "loss": 0.893547248840332, "memory(GiB)": 91.52, "step": 25660, "token_acc": 0.7668573148705599, "train_speed(iter/s)": 0.196958 }, { "epoch": 0.33302083569869173, "grad_norm": 0.7773644328117371, "learning_rate": 9.63344917820326e-05, "loss": 0.9712327003479004, "memory(GiB)": 91.52, "step": 25665, "token_acc": 0.7476471875683957, "train_speed(iter/s)": 0.196933 }, { "epoch": 0.33308571410034743, "grad_norm": 0.7502251267433167, "learning_rate": 9.633247567237574e-05, "loss": 0.997375774383545, "memory(GiB)": 91.52, "step": 25670, "token_acc": 0.7220205787353069, "train_speed(iter/s)": 0.196909 }, { "epoch": 0.33315059250200313, "grad_norm": 0.7957862019538879, "learning_rate": 9.633045902952775e-05, "loss": 0.9568531036376953, "memory(GiB)": 91.52, "step": 25675, "token_acc": 0.7450397877984085, "train_speed(iter/s)": 0.196886 }, { "epoch": 0.33321547090365883, "grad_norm": 0.8489786386489868, "learning_rate": 9.63284418535118e-05, "loss": 0.9530099868774414, "memory(GiB)": 91.52, "step": 25680, "token_acc": 0.7485746864310148, "train_speed(iter/s)": 0.196862 }, { "epoch": 0.3332803493053145, "grad_norm": 0.7310195565223694, "learning_rate": 9.632642414435114e-05, "loss": 0.9172304153442383, "memory(GiB)": 91.52, "step": 25685, "token_acc": 0.7467417983199087, "train_speed(iter/s)": 0.196838 }, { "epoch": 0.3333452277069702, "grad_norm": 0.8420218229293823, "learning_rate": 9.632440590206895e-05, "loss": 0.9880067825317382, "memory(GiB)": 91.52, "step": 25690, "token_acc": 0.7390329596787686, "train_speed(iter/s)": 0.196814 }, { "epoch": 0.3334101061086259, "grad_norm": 0.8763824105262756, "learning_rate": 9.632238712668848e-05, "loss": 0.966126537322998, "memory(GiB)": 91.52, "step": 25695, "token_acc": 0.7461910864303004, "train_speed(iter/s)": 0.196792 }, { "epoch": 0.3334749845102816, "grad_norm": 0.7358158230781555, "learning_rate": 9.632036781823298e-05, "loss": 0.9207464218139648, "memory(GiB)": 91.52, "step": 25700, "token_acc": 0.7654838071367324, "train_speed(iter/s)": 0.196769 }, { "epoch": 0.3335398629119373, "grad_norm": 0.8310243487358093, "learning_rate": 9.631834797672566e-05, "loss": 0.9816814422607422, "memory(GiB)": 91.52, "step": 25705, "token_acc": 0.7680921665858842, "train_speed(iter/s)": 0.196745 }, { "epoch": 0.333604741313593, "grad_norm": 0.811037003993988, "learning_rate": 9.631632760218978e-05, "loss": 0.9630594253540039, "memory(GiB)": 91.52, "step": 25710, "token_acc": 0.7475411993026545, "train_speed(iter/s)": 0.196721 }, { "epoch": 0.3336696197152487, "grad_norm": 0.8584509491920471, "learning_rate": 9.631430669464857e-05, "loss": 0.9741912841796875, "memory(GiB)": 91.52, "step": 25715, "token_acc": 0.7449160035366932, "train_speed(iter/s)": 0.196698 }, { "epoch": 0.33373449811690437, "grad_norm": 0.7804141640663147, "learning_rate": 9.63122852541253e-05, "loss": 0.9688706398010254, "memory(GiB)": 91.52, "step": 25720, "token_acc": 0.7538676005576145, "train_speed(iter/s)": 0.196675 }, { "epoch": 0.33379937651856006, "grad_norm": 0.7969862818717957, "learning_rate": 9.631026328064324e-05, "loss": 0.9521338462829589, "memory(GiB)": 91.52, "step": 25725, "token_acc": 0.7321226281506655, "train_speed(iter/s)": 0.196651 }, { "epoch": 0.33386425492021576, "grad_norm": 0.9100721478462219, "learning_rate": 9.630824077422564e-05, "loss": 0.935824966430664, "memory(GiB)": 91.52, "step": 25730, "token_acc": 0.7473112875665896, "train_speed(iter/s)": 0.196629 }, { "epoch": 0.33392913332187146, "grad_norm": 0.9024302959442139, "learning_rate": 9.63062177348958e-05, "loss": 1.0551294326782226, "memory(GiB)": 91.52, "step": 25735, "token_acc": 0.7228658536585366, "train_speed(iter/s)": 0.196607 }, { "epoch": 0.33399401172352716, "grad_norm": 0.8277751803398132, "learning_rate": 9.630419416267698e-05, "loss": 0.9912483215332031, "memory(GiB)": 91.52, "step": 25740, "token_acc": 0.7444307692307692, "train_speed(iter/s)": 0.196584 }, { "epoch": 0.33405889012518286, "grad_norm": 0.8166214227676392, "learning_rate": 9.630217005759248e-05, "loss": 0.9851282119750977, "memory(GiB)": 91.52, "step": 25745, "token_acc": 0.7401197604790419, "train_speed(iter/s)": 0.196561 }, { "epoch": 0.33412376852683856, "grad_norm": 0.8431749939918518, "learning_rate": 9.630014541966556e-05, "loss": 0.9881872177124024, "memory(GiB)": 91.52, "step": 25750, "token_acc": 0.730098855359001, "train_speed(iter/s)": 0.196539 }, { "epoch": 0.33418864692849426, "grad_norm": 0.8032903075218201, "learning_rate": 9.629812024891958e-05, "loss": 0.9880936622619629, "memory(GiB)": 91.52, "step": 25755, "token_acc": 0.7441802529963739, "train_speed(iter/s)": 0.196516 }, { "epoch": 0.33425352533014996, "grad_norm": 0.7962529063224792, "learning_rate": 9.62960945453778e-05, "loss": 0.9425052642822266, "memory(GiB)": 91.52, "step": 25760, "token_acc": 0.7526473175021988, "train_speed(iter/s)": 0.196495 }, { "epoch": 0.33431840373180566, "grad_norm": 0.8009610176086426, "learning_rate": 9.629406830906353e-05, "loss": 1.002413845062256, "memory(GiB)": 91.52, "step": 25765, "token_acc": 0.7340012523481528, "train_speed(iter/s)": 0.196473 }, { "epoch": 0.33438328213346136, "grad_norm": 0.7368749380111694, "learning_rate": 9.629204154000011e-05, "loss": 0.9337604522705079, "memory(GiB)": 91.52, "step": 25770, "token_acc": 0.7463679789161588, "train_speed(iter/s)": 0.19645 }, { "epoch": 0.33444816053511706, "grad_norm": 0.737764835357666, "learning_rate": 9.629001423821087e-05, "loss": 0.9504829406738281, "memory(GiB)": 91.52, "step": 25775, "token_acc": 0.7631416202844774, "train_speed(iter/s)": 0.196429 }, { "epoch": 0.33451303893677276, "grad_norm": 0.765794038772583, "learning_rate": 9.628798640371912e-05, "loss": 0.957304573059082, "memory(GiB)": 91.52, "step": 25780, "token_acc": 0.7651414186287576, "train_speed(iter/s)": 0.196405 }, { "epoch": 0.33457791733842845, "grad_norm": 0.7899957299232483, "learning_rate": 9.628595803654818e-05, "loss": 0.9381145477294922, "memory(GiB)": 91.52, "step": 25785, "token_acc": 0.7464103382259094, "train_speed(iter/s)": 0.196386 }, { "epoch": 0.33464279574008415, "grad_norm": 0.7714922428131104, "learning_rate": 9.628392913672143e-05, "loss": 0.9536528587341309, "memory(GiB)": 91.52, "step": 25790, "token_acc": 0.7289804520020348, "train_speed(iter/s)": 0.19636 }, { "epoch": 0.33470767414173985, "grad_norm": 0.764578640460968, "learning_rate": 9.62818997042622e-05, "loss": 0.9695390701293946, "memory(GiB)": 91.52, "step": 25795, "token_acc": 0.7308332484119705, "train_speed(iter/s)": 0.196336 }, { "epoch": 0.33477255254339555, "grad_norm": 0.8484553694725037, "learning_rate": 9.627986973919385e-05, "loss": 0.9975221633911133, "memory(GiB)": 91.52, "step": 25800, "token_acc": 0.7275446014231206, "train_speed(iter/s)": 0.196315 }, { "epoch": 0.33483743094505125, "grad_norm": 0.8474596738815308, "learning_rate": 9.627783924153973e-05, "loss": 0.945111083984375, "memory(GiB)": 91.52, "step": 25805, "token_acc": 0.7392694463910301, "train_speed(iter/s)": 0.196292 }, { "epoch": 0.33490230934670695, "grad_norm": 0.8049477338790894, "learning_rate": 9.627580821132322e-05, "loss": 0.966402244567871, "memory(GiB)": 91.52, "step": 25810, "token_acc": 0.7345317390257604, "train_speed(iter/s)": 0.196272 }, { "epoch": 0.33496718774836265, "grad_norm": 0.9000502228736877, "learning_rate": 9.627377664856769e-05, "loss": 0.9755807876586914, "memory(GiB)": 91.52, "step": 25815, "token_acc": 0.7364094091731571, "train_speed(iter/s)": 0.196249 }, { "epoch": 0.33503206615001835, "grad_norm": 0.8046472072601318, "learning_rate": 9.627174455329651e-05, "loss": 0.9786934852600098, "memory(GiB)": 91.52, "step": 25820, "token_acc": 0.7327654947444726, "train_speed(iter/s)": 0.196226 }, { "epoch": 0.33509694455167405, "grad_norm": 0.8493183851242065, "learning_rate": 9.626971192553308e-05, "loss": 1.015653419494629, "memory(GiB)": 91.52, "step": 25825, "token_acc": 0.7445906432748538, "train_speed(iter/s)": 0.196205 }, { "epoch": 0.33516182295332975, "grad_norm": 0.7678708434104919, "learning_rate": 9.626767876530078e-05, "loss": 0.9589082717895507, "memory(GiB)": 91.52, "step": 25830, "token_acc": 0.7706664092415613, "train_speed(iter/s)": 0.196181 }, { "epoch": 0.33522670135498545, "grad_norm": 0.8401504158973694, "learning_rate": 9.626564507262302e-05, "loss": 0.940827751159668, "memory(GiB)": 91.52, "step": 25835, "token_acc": 0.7548087913523399, "train_speed(iter/s)": 0.196159 }, { "epoch": 0.3352915797566411, "grad_norm": 0.7253766059875488, "learning_rate": 9.626361084752318e-05, "loss": 0.9517148971557617, "memory(GiB)": 91.52, "step": 25840, "token_acc": 0.7645480521290682, "train_speed(iter/s)": 0.196137 }, { "epoch": 0.3353564581582968, "grad_norm": 0.8041348457336426, "learning_rate": 9.626157609002469e-05, "loss": 0.9523476600646973, "memory(GiB)": 91.52, "step": 25845, "token_acc": 0.7212262925435178, "train_speed(iter/s)": 0.196114 }, { "epoch": 0.3354213365599525, "grad_norm": 0.7663188576698303, "learning_rate": 9.625954080015093e-05, "loss": 0.960511302947998, "memory(GiB)": 91.52, "step": 25850, "token_acc": 0.7490091976370299, "train_speed(iter/s)": 0.196091 }, { "epoch": 0.3354862149616082, "grad_norm": 0.7652328014373779, "learning_rate": 9.62575049779254e-05, "loss": 1.0019493103027344, "memory(GiB)": 91.52, "step": 25855, "token_acc": 0.7457709218272599, "train_speed(iter/s)": 0.196071 }, { "epoch": 0.3355510933632639, "grad_norm": 0.80768221616745, "learning_rate": 9.625546862337145e-05, "loss": 0.9481586456298828, "memory(GiB)": 91.52, "step": 25860, "token_acc": 0.7477445673491802, "train_speed(iter/s)": 0.196048 }, { "epoch": 0.3356159717649196, "grad_norm": 0.8671614527702332, "learning_rate": 9.625343173651255e-05, "loss": 0.9909869194030761, "memory(GiB)": 91.52, "step": 25865, "token_acc": 0.7365848725899959, "train_speed(iter/s)": 0.196023 }, { "epoch": 0.3356808501665753, "grad_norm": 0.8887770175933838, "learning_rate": 9.625139431737214e-05, "loss": 0.9845930099487304, "memory(GiB)": 91.52, "step": 25870, "token_acc": 0.7433547845551203, "train_speed(iter/s)": 0.196 }, { "epoch": 0.335745728568231, "grad_norm": 0.8861237168312073, "learning_rate": 9.624935636597366e-05, "loss": 0.9799201965332032, "memory(GiB)": 91.52, "step": 25875, "token_acc": 0.7571544228941532, "train_speed(iter/s)": 0.195976 }, { "epoch": 0.3358106069698867, "grad_norm": 0.7440738081932068, "learning_rate": 9.624731788234056e-05, "loss": 0.9455095291137695, "memory(GiB)": 91.52, "step": 25880, "token_acc": 0.7294659897014871, "train_speed(iter/s)": 0.195955 }, { "epoch": 0.3358754853715424, "grad_norm": 0.7714506983757019, "learning_rate": 9.62452788664963e-05, "loss": 0.9466708183288575, "memory(GiB)": 91.52, "step": 25885, "token_acc": 0.7536511156186613, "train_speed(iter/s)": 0.195931 }, { "epoch": 0.3359403637731981, "grad_norm": 0.7696134448051453, "learning_rate": 9.624323931846434e-05, "loss": 0.9053266525268555, "memory(GiB)": 91.52, "step": 25890, "token_acc": 0.7478269458711971, "train_speed(iter/s)": 0.195907 }, { "epoch": 0.3360052421748538, "grad_norm": 0.8504070043563843, "learning_rate": 9.624119923826819e-05, "loss": 0.9287114143371582, "memory(GiB)": 91.52, "step": 25895, "token_acc": 0.7603995825257194, "train_speed(iter/s)": 0.195884 }, { "epoch": 0.3360701205765095, "grad_norm": 0.7435270547866821, "learning_rate": 9.623915862593126e-05, "loss": 0.9522789001464844, "memory(GiB)": 91.52, "step": 25900, "token_acc": 0.718307979717107, "train_speed(iter/s)": 0.195862 }, { "epoch": 0.3361349989781652, "grad_norm": 0.8427472114562988, "learning_rate": 9.62371174814771e-05, "loss": 0.9211284637451171, "memory(GiB)": 91.52, "step": 25905, "token_acc": 0.7614543091309528, "train_speed(iter/s)": 0.195838 }, { "epoch": 0.3361998773798209, "grad_norm": 0.8349899053573608, "learning_rate": 9.623507580492914e-05, "loss": 0.9612213134765625, "memory(GiB)": 91.52, "step": 25910, "token_acc": 0.7453028737611449, "train_speed(iter/s)": 0.195814 }, { "epoch": 0.3362647557814766, "grad_norm": 0.7770682573318481, "learning_rate": 9.623303359631091e-05, "loss": 0.9291766166687012, "memory(GiB)": 91.52, "step": 25915, "token_acc": 0.7557824165777846, "train_speed(iter/s)": 0.195793 }, { "epoch": 0.3363296341831323, "grad_norm": 0.7097427845001221, "learning_rate": 9.623099085564591e-05, "loss": 0.9631203651428223, "memory(GiB)": 91.52, "step": 25920, "token_acc": 0.7286281561399566, "train_speed(iter/s)": 0.195772 }, { "epoch": 0.336394512584788, "grad_norm": 0.8290252089500427, "learning_rate": 9.622894758295764e-05, "loss": 0.9819976806640625, "memory(GiB)": 91.52, "step": 25925, "token_acc": 0.7394384590270976, "train_speed(iter/s)": 0.195751 }, { "epoch": 0.3364593909864437, "grad_norm": 0.7333557605743408, "learning_rate": 9.622690377826961e-05, "loss": 0.9423967361450195, "memory(GiB)": 91.52, "step": 25930, "token_acc": 0.7476302944418202, "train_speed(iter/s)": 0.195728 }, { "epoch": 0.3365242693880994, "grad_norm": 0.7844270467758179, "learning_rate": 9.622485944160536e-05, "loss": 0.9216768264770507, "memory(GiB)": 91.52, "step": 25935, "token_acc": 0.7563142568491116, "train_speed(iter/s)": 0.195706 }, { "epoch": 0.33658914778975507, "grad_norm": 0.8226174712181091, "learning_rate": 9.622281457298841e-05, "loss": 0.9852960586547852, "memory(GiB)": 91.52, "step": 25940, "token_acc": 0.7405899185560203, "train_speed(iter/s)": 0.195683 }, { "epoch": 0.33665402619141077, "grad_norm": 0.7929579019546509, "learning_rate": 9.622076917244227e-05, "loss": 0.9709591865539551, "memory(GiB)": 91.52, "step": 25945, "token_acc": 0.7264205457463885, "train_speed(iter/s)": 0.19566 }, { "epoch": 0.33671890459306647, "grad_norm": 0.7195313572883606, "learning_rate": 9.621872323999049e-05, "loss": 0.9546003341674805, "memory(GiB)": 91.52, "step": 25950, "token_acc": 0.7476268951878708, "train_speed(iter/s)": 0.195633 }, { "epoch": 0.33678378299472217, "grad_norm": 0.8228321075439453, "learning_rate": 9.621667677565662e-05, "loss": 0.9507225036621094, "memory(GiB)": 91.52, "step": 25955, "token_acc": 0.7441634241245136, "train_speed(iter/s)": 0.195612 }, { "epoch": 0.3368486613963778, "grad_norm": 0.8073694109916687, "learning_rate": 9.62146297794642e-05, "loss": 0.9887462615966797, "memory(GiB)": 91.52, "step": 25960, "token_acc": 0.7346200241254524, "train_speed(iter/s)": 0.195592 }, { "epoch": 0.3369135397980335, "grad_norm": 0.7493142485618591, "learning_rate": 9.621258225143681e-05, "loss": 0.9185948371887207, "memory(GiB)": 91.52, "step": 25965, "token_acc": 0.7471829609848238, "train_speed(iter/s)": 0.195566 }, { "epoch": 0.3369784181996892, "grad_norm": 0.7590370178222656, "learning_rate": 9.6210534191598e-05, "loss": 0.9563434600830079, "memory(GiB)": 91.52, "step": 25970, "token_acc": 0.7575321300653136, "train_speed(iter/s)": 0.195544 }, { "epoch": 0.3370432966013449, "grad_norm": 0.800693154335022, "learning_rate": 9.620848559997134e-05, "loss": 0.978452205657959, "memory(GiB)": 91.52, "step": 25975, "token_acc": 0.752018093249826, "train_speed(iter/s)": 0.195519 }, { "epoch": 0.3371081750030006, "grad_norm": 0.6834903359413147, "learning_rate": 9.620643647658039e-05, "loss": 0.9716640472412109, "memory(GiB)": 91.52, "step": 25980, "token_acc": 0.723842021689922, "train_speed(iter/s)": 0.1955 }, { "epoch": 0.3371730534046563, "grad_norm": 0.8063622117042542, "learning_rate": 9.620438682144874e-05, "loss": 0.931393051147461, "memory(GiB)": 91.52, "step": 25985, "token_acc": 0.7304145319049837, "train_speed(iter/s)": 0.195479 }, { "epoch": 0.337237931806312, "grad_norm": 0.7817704081535339, "learning_rate": 9.620233663459998e-05, "loss": 0.9231455802917481, "memory(GiB)": 91.52, "step": 25990, "token_acc": 0.7570955251287373, "train_speed(iter/s)": 0.195455 }, { "epoch": 0.3373028102079677, "grad_norm": 0.6243228316307068, "learning_rate": 9.62002859160577e-05, "loss": 0.9538049697875977, "memory(GiB)": 91.52, "step": 25995, "token_acc": 0.7481327080076429, "train_speed(iter/s)": 0.19543 }, { "epoch": 0.3373676886096234, "grad_norm": 0.7151818871498108, "learning_rate": 9.619823466584551e-05, "loss": 0.9944772720336914, "memory(GiB)": 91.52, "step": 26000, "token_acc": 0.7270748218173143, "train_speed(iter/s)": 0.195405 }, { "epoch": 0.3374325670112791, "grad_norm": 0.6917436718940735, "learning_rate": 9.619618288398702e-05, "loss": 0.9571702003479003, "memory(GiB)": 91.52, "step": 26005, "token_acc": 0.7515070536324654, "train_speed(iter/s)": 0.195383 }, { "epoch": 0.3374974454129348, "grad_norm": 0.8651410341262817, "learning_rate": 9.619413057050582e-05, "loss": 0.9359001159667969, "memory(GiB)": 91.52, "step": 26010, "token_acc": 0.7581119934282585, "train_speed(iter/s)": 0.19536 }, { "epoch": 0.3375623238145905, "grad_norm": 0.739396333694458, "learning_rate": 9.619207772542553e-05, "loss": 0.8737092971801758, "memory(GiB)": 91.52, "step": 26015, "token_acc": 0.7429617026740715, "train_speed(iter/s)": 0.195336 }, { "epoch": 0.3376272022162462, "grad_norm": 0.7500829696655273, "learning_rate": 9.61900243487698e-05, "loss": 0.9206311225891113, "memory(GiB)": 91.52, "step": 26020, "token_acc": 0.7623680667158929, "train_speed(iter/s)": 0.195313 }, { "epoch": 0.3376920806179019, "grad_norm": 0.7594709992408752, "learning_rate": 9.618797044056222e-05, "loss": 0.9616539001464843, "memory(GiB)": 91.52, "step": 26025, "token_acc": 0.7461783532575337, "train_speed(iter/s)": 0.19529 }, { "epoch": 0.3377569590195576, "grad_norm": 0.7850965261459351, "learning_rate": 9.618591600082647e-05, "loss": 0.9565049171447754, "memory(GiB)": 91.52, "step": 26030, "token_acc": 0.7240314747185723, "train_speed(iter/s)": 0.195267 }, { "epoch": 0.3378218374212133, "grad_norm": 0.8825547099113464, "learning_rate": 9.618386102958617e-05, "loss": 0.9260051727294922, "memory(GiB)": 91.52, "step": 26035, "token_acc": 0.7487224057013732, "train_speed(iter/s)": 0.195246 }, { "epoch": 0.337886715822869, "grad_norm": 0.8154387474060059, "learning_rate": 9.618180552686496e-05, "loss": 0.9778392791748047, "memory(GiB)": 91.52, "step": 26040, "token_acc": 0.7426980493970553, "train_speed(iter/s)": 0.195223 }, { "epoch": 0.3379515942245247, "grad_norm": 0.8413930535316467, "learning_rate": 9.617974949268652e-05, "loss": 0.9868715286254883, "memory(GiB)": 91.52, "step": 26045, "token_acc": 0.7086215328722141, "train_speed(iter/s)": 0.195199 }, { "epoch": 0.3380164726261804, "grad_norm": 0.7664192318916321, "learning_rate": 9.617769292707445e-05, "loss": 0.9877419471740723, "memory(GiB)": 91.52, "step": 26050, "token_acc": 0.7382021280356322, "train_speed(iter/s)": 0.19518 }, { "epoch": 0.3380813510278361, "grad_norm": 0.8358368277549744, "learning_rate": 9.617563583005252e-05, "loss": 0.9723416328430176, "memory(GiB)": 91.52, "step": 26055, "token_acc": 0.7556747833264548, "train_speed(iter/s)": 0.195156 }, { "epoch": 0.3381462294294918, "grad_norm": 0.8902273178100586, "learning_rate": 9.61735782016443e-05, "loss": 0.9662749290466308, "memory(GiB)": 91.52, "step": 26060, "token_acc": 0.772066780027825, "train_speed(iter/s)": 0.195133 }, { "epoch": 0.3382111078311475, "grad_norm": 0.8111072778701782, "learning_rate": 9.617152004187355e-05, "loss": 0.9757076263427734, "memory(GiB)": 91.52, "step": 26065, "token_acc": 0.7368761220825852, "train_speed(iter/s)": 0.195112 }, { "epoch": 0.3382759862328032, "grad_norm": 0.9602763652801514, "learning_rate": 9.616946135076389e-05, "loss": 0.950658130645752, "memory(GiB)": 91.52, "step": 26070, "token_acc": 0.7537230361924245, "train_speed(iter/s)": 0.19509 }, { "epoch": 0.33834086463445884, "grad_norm": 0.7681691646575928, "learning_rate": 9.616740212833905e-05, "loss": 0.9496084213256836, "memory(GiB)": 91.52, "step": 26075, "token_acc": 0.724812030075188, "train_speed(iter/s)": 0.19507 }, { "epoch": 0.33840574303611454, "grad_norm": 0.861526608467102, "learning_rate": 9.616534237462272e-05, "loss": 0.9628474235534668, "memory(GiB)": 91.52, "step": 26080, "token_acc": 0.7633896606043248, "train_speed(iter/s)": 0.195048 }, { "epoch": 0.33847062143777024, "grad_norm": 0.7298756241798401, "learning_rate": 9.61632820896386e-05, "loss": 0.9510915756225586, "memory(GiB)": 91.52, "step": 26085, "token_acc": 0.7473458519210153, "train_speed(iter/s)": 0.195023 }, { "epoch": 0.33853549983942594, "grad_norm": 0.7557476758956909, "learning_rate": 9.61612212734104e-05, "loss": 0.9353360176086426, "memory(GiB)": 91.52, "step": 26090, "token_acc": 0.7541118703430278, "train_speed(iter/s)": 0.195 }, { "epoch": 0.33860037824108163, "grad_norm": 0.7146530151367188, "learning_rate": 9.615915992596184e-05, "loss": 0.9125534057617187, "memory(GiB)": 91.52, "step": 26095, "token_acc": 0.7450230038268049, "train_speed(iter/s)": 0.194978 }, { "epoch": 0.33866525664273733, "grad_norm": 0.7470430731773376, "learning_rate": 9.615709804731662e-05, "loss": 0.9076296806335449, "memory(GiB)": 91.52, "step": 26100, "token_acc": 0.7725446350581741, "train_speed(iter/s)": 0.194954 }, { "epoch": 0.33873013504439303, "grad_norm": 0.7099696397781372, "learning_rate": 9.615503563749853e-05, "loss": 0.9553425788879395, "memory(GiB)": 91.52, "step": 26105, "token_acc": 0.7641048961649235, "train_speed(iter/s)": 0.194931 }, { "epoch": 0.33879501344604873, "grad_norm": 0.8364201784133911, "learning_rate": 9.615297269653123e-05, "loss": 0.9789146423339844, "memory(GiB)": 91.52, "step": 26110, "token_acc": 0.7571740816470423, "train_speed(iter/s)": 0.19491 }, { "epoch": 0.33885989184770443, "grad_norm": 0.7960774302482605, "learning_rate": 9.615090922443848e-05, "loss": 0.9623479843139648, "memory(GiB)": 91.52, "step": 26115, "token_acc": 0.7541600182356963, "train_speed(iter/s)": 0.19489 }, { "epoch": 0.33892477024936013, "grad_norm": 0.8075785040855408, "learning_rate": 9.614884522124406e-05, "loss": 0.9495451927185059, "memory(GiB)": 91.52, "step": 26120, "token_acc": 0.7375849279548364, "train_speed(iter/s)": 0.194869 }, { "epoch": 0.33898964865101583, "grad_norm": 0.8400453329086304, "learning_rate": 9.614678068697168e-05, "loss": 0.9842201232910156, "memory(GiB)": 91.52, "step": 26125, "token_acc": 0.7606404919938501, "train_speed(iter/s)": 0.194847 }, { "epoch": 0.33905452705267153, "grad_norm": 0.8368027210235596, "learning_rate": 9.614471562164513e-05, "loss": 0.9926823616027832, "memory(GiB)": 91.52, "step": 26130, "token_acc": 0.7287353817098859, "train_speed(iter/s)": 0.194824 }, { "epoch": 0.3391194054543272, "grad_norm": 0.7856016159057617, "learning_rate": 9.614265002528816e-05, "loss": 0.9535825729370118, "memory(GiB)": 91.52, "step": 26135, "token_acc": 0.7544328090450165, "train_speed(iter/s)": 0.194806 }, { "epoch": 0.3391842838559829, "grad_norm": 0.7950417399406433, "learning_rate": 9.614058389792456e-05, "loss": 0.9267484664916992, "memory(GiB)": 91.52, "step": 26140, "token_acc": 0.7663644091810711, "train_speed(iter/s)": 0.194784 }, { "epoch": 0.3392491622576386, "grad_norm": 0.7959907054901123, "learning_rate": 9.613851723957805e-05, "loss": 0.9403420448303222, "memory(GiB)": 91.52, "step": 26145, "token_acc": 0.714245247387637, "train_speed(iter/s)": 0.194762 }, { "epoch": 0.3393140406592943, "grad_norm": 0.8277950286865234, "learning_rate": 9.613645005027248e-05, "loss": 0.9701011657714844, "memory(GiB)": 91.52, "step": 26150, "token_acc": 0.7232424622637923, "train_speed(iter/s)": 0.194738 }, { "epoch": 0.33937891906095, "grad_norm": 0.6999348998069763, "learning_rate": 9.613438233003162e-05, "loss": 0.9221426010131836, "memory(GiB)": 91.52, "step": 26155, "token_acc": 0.7560257710043496, "train_speed(iter/s)": 0.194717 }, { "epoch": 0.3394437974626057, "grad_norm": 0.7785660028457642, "learning_rate": 9.613231407887923e-05, "loss": 1.0155855178833009, "memory(GiB)": 91.52, "step": 26160, "token_acc": 0.7295312691225064, "train_speed(iter/s)": 0.194697 }, { "epoch": 0.3395086758642614, "grad_norm": 0.7932148575782776, "learning_rate": 9.613024529683916e-05, "loss": 0.9745502471923828, "memory(GiB)": 91.52, "step": 26165, "token_acc": 0.7406239403187521, "train_speed(iter/s)": 0.194676 }, { "epoch": 0.3395735542659171, "grad_norm": 0.8563870191574097, "learning_rate": 9.61281759839352e-05, "loss": 0.9653169631958007, "memory(GiB)": 91.52, "step": 26170, "token_acc": 0.7374774123599567, "train_speed(iter/s)": 0.194653 }, { "epoch": 0.3396384326675728, "grad_norm": 0.7492132186889648, "learning_rate": 9.612610614019114e-05, "loss": 0.9594498634338379, "memory(GiB)": 91.52, "step": 26175, "token_acc": 0.745458788751021, "train_speed(iter/s)": 0.194633 }, { "epoch": 0.3397033110692285, "grad_norm": 0.8551552891731262, "learning_rate": 9.612403576563083e-05, "loss": 0.9926034927368164, "memory(GiB)": 91.52, "step": 26180, "token_acc": 0.743153848996637, "train_speed(iter/s)": 0.194613 }, { "epoch": 0.3397681894708842, "grad_norm": 0.8004754185676575, "learning_rate": 9.61219648602781e-05, "loss": 0.9266968727111816, "memory(GiB)": 91.52, "step": 26185, "token_acc": 0.760898282694848, "train_speed(iter/s)": 0.194589 }, { "epoch": 0.3398330678725399, "grad_norm": 0.7716733813285828, "learning_rate": 9.611989342415674e-05, "loss": 0.9508251190185547, "memory(GiB)": 91.52, "step": 26190, "token_acc": 0.7529113220009038, "train_speed(iter/s)": 0.194567 }, { "epoch": 0.33989794627419556, "grad_norm": 0.7981228232383728, "learning_rate": 9.611782145729062e-05, "loss": 0.9900264739990234, "memory(GiB)": 91.52, "step": 26195, "token_acc": 0.7328857202187306, "train_speed(iter/s)": 0.194548 }, { "epoch": 0.33996282467585126, "grad_norm": 0.9041018486022949, "learning_rate": 9.611574895970359e-05, "loss": 0.9743200302124023, "memory(GiB)": 91.52, "step": 26200, "token_acc": 0.7498692696531288, "train_speed(iter/s)": 0.194526 }, { "epoch": 0.34002770307750696, "grad_norm": 0.954732358455658, "learning_rate": 9.611367593141947e-05, "loss": 1.0283939361572265, "memory(GiB)": 91.52, "step": 26205, "token_acc": 0.7193734146843079, "train_speed(iter/s)": 0.194505 }, { "epoch": 0.34009258147916266, "grad_norm": 0.7032878994941711, "learning_rate": 9.611160237246215e-05, "loss": 0.881346607208252, "memory(GiB)": 91.52, "step": 26210, "token_acc": 0.7683285592629867, "train_speed(iter/s)": 0.194481 }, { "epoch": 0.34015745988081836, "grad_norm": 0.6942235827445984, "learning_rate": 9.610952828285549e-05, "loss": 0.9260347366333008, "memory(GiB)": 91.52, "step": 26215, "token_acc": 0.7428837409144616, "train_speed(iter/s)": 0.194457 }, { "epoch": 0.34022233828247406, "grad_norm": 0.8125453591346741, "learning_rate": 9.610745366262333e-05, "loss": 0.9726022720336914, "memory(GiB)": 91.52, "step": 26220, "token_acc": 0.7510482808194561, "train_speed(iter/s)": 0.194438 }, { "epoch": 0.34028721668412976, "grad_norm": 0.8061242699623108, "learning_rate": 9.610537851178955e-05, "loss": 0.9093279838562012, "memory(GiB)": 91.52, "step": 26225, "token_acc": 0.7328432166337403, "train_speed(iter/s)": 0.194415 }, { "epoch": 0.34035209508578546, "grad_norm": 0.7848788499832153, "learning_rate": 9.610330283037807e-05, "loss": 1.0113229751586914, "memory(GiB)": 91.52, "step": 26230, "token_acc": 0.7410904780719083, "train_speed(iter/s)": 0.194398 }, { "epoch": 0.34041697348744115, "grad_norm": 0.7448428273200989, "learning_rate": 9.610122661841272e-05, "loss": 0.9105652809143067, "memory(GiB)": 91.52, "step": 26235, "token_acc": 0.7702095025621362, "train_speed(iter/s)": 0.194378 }, { "epoch": 0.34048185188909685, "grad_norm": 0.9268027544021606, "learning_rate": 9.609914987591744e-05, "loss": 0.9713177680969238, "memory(GiB)": 91.52, "step": 26240, "token_acc": 0.7342995169082126, "train_speed(iter/s)": 0.194356 }, { "epoch": 0.34054673029075255, "grad_norm": 0.8438153862953186, "learning_rate": 9.609707260291611e-05, "loss": 0.989566421508789, "memory(GiB)": 91.52, "step": 26245, "token_acc": 0.7353384592190563, "train_speed(iter/s)": 0.194334 }, { "epoch": 0.34061160869240825, "grad_norm": 0.8643060922622681, "learning_rate": 9.609499479943263e-05, "loss": 0.9729673385620117, "memory(GiB)": 91.52, "step": 26250, "token_acc": 0.7370573799228701, "train_speed(iter/s)": 0.194314 }, { "epoch": 0.34067648709406395, "grad_norm": 0.7201687693595886, "learning_rate": 9.60929164654909e-05, "loss": 0.9305744171142578, "memory(GiB)": 91.52, "step": 26255, "token_acc": 0.7446222197227456, "train_speed(iter/s)": 0.194289 }, { "epoch": 0.34074136549571965, "grad_norm": 0.880236029624939, "learning_rate": 9.609083760111487e-05, "loss": 0.9880005836486816, "memory(GiB)": 91.52, "step": 26260, "token_acc": 0.7426776429809359, "train_speed(iter/s)": 0.194272 }, { "epoch": 0.34080624389737535, "grad_norm": 0.7939754724502563, "learning_rate": 9.608875820632846e-05, "loss": 0.9797431945800781, "memory(GiB)": 91.52, "step": 26265, "token_acc": 0.7557764806031861, "train_speed(iter/s)": 0.194249 }, { "epoch": 0.34087112229903105, "grad_norm": 1.030712604522705, "learning_rate": 9.608667828115557e-05, "loss": 0.9837569236755371, "memory(GiB)": 91.52, "step": 26270, "token_acc": 0.7312666413084823, "train_speed(iter/s)": 0.194229 }, { "epoch": 0.34093600070068675, "grad_norm": 0.8344554305076599, "learning_rate": 9.608459782562015e-05, "loss": 0.9443160057067871, "memory(GiB)": 91.52, "step": 26275, "token_acc": 0.7298383177267278, "train_speed(iter/s)": 0.194206 }, { "epoch": 0.34100087910234245, "grad_norm": 0.8506379127502441, "learning_rate": 9.608251683974616e-05, "loss": 0.9655063629150391, "memory(GiB)": 91.52, "step": 26280, "token_acc": 0.7297372722561097, "train_speed(iter/s)": 0.194182 }, { "epoch": 0.34106575750399815, "grad_norm": 0.7446388602256775, "learning_rate": 9.608043532355752e-05, "loss": 0.9406261444091797, "memory(GiB)": 91.52, "step": 26285, "token_acc": 0.7573298338916324, "train_speed(iter/s)": 0.194161 }, { "epoch": 0.34113063590565385, "grad_norm": 0.7931508421897888, "learning_rate": 9.607835327707819e-05, "loss": 0.9567532539367676, "memory(GiB)": 91.52, "step": 26290, "token_acc": 0.75880632726306, "train_speed(iter/s)": 0.194141 }, { "epoch": 0.34119551430730954, "grad_norm": 0.8332579135894775, "learning_rate": 9.607627070033215e-05, "loss": 0.9764391899108886, "memory(GiB)": 91.52, "step": 26295, "token_acc": 0.7353299726532898, "train_speed(iter/s)": 0.194118 }, { "epoch": 0.34126039270896524, "grad_norm": 0.7364090085029602, "learning_rate": 9.607418759334334e-05, "loss": 0.940480899810791, "memory(GiB)": 91.52, "step": 26300, "token_acc": 0.759054624045735, "train_speed(iter/s)": 0.194094 }, { "epoch": 0.34132527111062094, "grad_norm": 0.7909189462661743, "learning_rate": 9.607210395613574e-05, "loss": 0.9593048095703125, "memory(GiB)": 91.52, "step": 26305, "token_acc": 0.7493498171949795, "train_speed(iter/s)": 0.194073 }, { "epoch": 0.34139014951227664, "grad_norm": 0.8180692791938782, "learning_rate": 9.607001978873336e-05, "loss": 0.9098993301391601, "memory(GiB)": 91.52, "step": 26310, "token_acc": 0.7823233838308085, "train_speed(iter/s)": 0.194052 }, { "epoch": 0.3414550279139323, "grad_norm": 0.7876774072647095, "learning_rate": 9.606793509116014e-05, "loss": 0.9359163284301758, "memory(GiB)": 91.52, "step": 26315, "token_acc": 0.7660016186354199, "train_speed(iter/s)": 0.19403 }, { "epoch": 0.341519906315588, "grad_norm": 0.7628508806228638, "learning_rate": 9.606584986344009e-05, "loss": 0.9706830978393555, "memory(GiB)": 91.52, "step": 26320, "token_acc": 0.7448752090171549, "train_speed(iter/s)": 0.194009 }, { "epoch": 0.3415847847172437, "grad_norm": 0.8230528831481934, "learning_rate": 9.606376410559721e-05, "loss": 0.9475815773010254, "memory(GiB)": 91.52, "step": 26325, "token_acc": 0.744060475161987, "train_speed(iter/s)": 0.193986 }, { "epoch": 0.3416496631188994, "grad_norm": 0.8454538583755493, "learning_rate": 9.606167781765548e-05, "loss": 0.9358797073364258, "memory(GiB)": 91.52, "step": 26330, "token_acc": 0.7438076897536356, "train_speed(iter/s)": 0.193965 }, { "epoch": 0.3417145415205551, "grad_norm": 0.878609836101532, "learning_rate": 9.605959099963895e-05, "loss": 0.942901611328125, "memory(GiB)": 91.52, "step": 26335, "token_acc": 0.7584354976110226, "train_speed(iter/s)": 0.193945 }, { "epoch": 0.3417794199222108, "grad_norm": 0.8146042823791504, "learning_rate": 9.605750365157159e-05, "loss": 1.0084016799926758, "memory(GiB)": 91.52, "step": 26340, "token_acc": 0.7288179577007915, "train_speed(iter/s)": 0.193925 }, { "epoch": 0.3418442983238665, "grad_norm": 0.6925597190856934, "learning_rate": 9.605541577347746e-05, "loss": 0.9357156753540039, "memory(GiB)": 91.52, "step": 26345, "token_acc": 0.75989133908597, "train_speed(iter/s)": 0.193903 }, { "epoch": 0.3419091767255222, "grad_norm": 0.6867716908454895, "learning_rate": 9.605332736538055e-05, "loss": 0.9520357131958008, "memory(GiB)": 91.52, "step": 26350, "token_acc": 0.7367520358786734, "train_speed(iter/s)": 0.193879 }, { "epoch": 0.3419740551271779, "grad_norm": 0.7389063835144043, "learning_rate": 9.605123842730492e-05, "loss": 0.9864617347717285, "memory(GiB)": 91.52, "step": 26355, "token_acc": 0.7308957829707226, "train_speed(iter/s)": 0.193859 }, { "epoch": 0.3420389335288336, "grad_norm": 0.8157074451446533, "learning_rate": 9.60491489592746e-05, "loss": 0.9862539291381835, "memory(GiB)": 91.52, "step": 26360, "token_acc": 0.7436714462736768, "train_speed(iter/s)": 0.193839 }, { "epoch": 0.3421038119304893, "grad_norm": 0.7292616963386536, "learning_rate": 9.604705896131365e-05, "loss": 0.964625358581543, "memory(GiB)": 91.52, "step": 26365, "token_acc": 0.7253086419753086, "train_speed(iter/s)": 0.193819 }, { "epoch": 0.342168690332145, "grad_norm": 0.79029381275177, "learning_rate": 9.60449684334461e-05, "loss": 0.9432266235351563, "memory(GiB)": 91.52, "step": 26370, "token_acc": 0.7568284789644013, "train_speed(iter/s)": 0.193798 }, { "epoch": 0.3422335687338007, "grad_norm": 0.8480714559555054, "learning_rate": 9.604287737569602e-05, "loss": 0.9823694229125977, "memory(GiB)": 91.52, "step": 26375, "token_acc": 0.7368524952621605, "train_speed(iter/s)": 0.193776 }, { "epoch": 0.3422984471354564, "grad_norm": 0.7657633423805237, "learning_rate": 9.604078578808747e-05, "loss": 0.9269802093505859, "memory(GiB)": 91.52, "step": 26380, "token_acc": 0.7421110730463248, "train_speed(iter/s)": 0.193755 }, { "epoch": 0.3423633255371121, "grad_norm": 0.7572020292282104, "learning_rate": 9.60386936706445e-05, "loss": 0.9256546020507812, "memory(GiB)": 91.52, "step": 26385, "token_acc": 0.7645645890159997, "train_speed(iter/s)": 0.193736 }, { "epoch": 0.3424282039387678, "grad_norm": 0.7850778102874756, "learning_rate": 9.603660102339124e-05, "loss": 0.9657465934753418, "memory(GiB)": 91.52, "step": 26390, "token_acc": 0.7353047478232299, "train_speed(iter/s)": 0.193713 }, { "epoch": 0.34249308234042347, "grad_norm": 0.7684910297393799, "learning_rate": 9.603450784635173e-05, "loss": 0.9530702590942383, "memory(GiB)": 91.52, "step": 26395, "token_acc": 0.751555136663525, "train_speed(iter/s)": 0.193695 }, { "epoch": 0.34255796074207917, "grad_norm": 0.7731494903564453, "learning_rate": 9.603241413955005e-05, "loss": 0.9809242248535156, "memory(GiB)": 91.52, "step": 26400, "token_acc": 0.7634980230587767, "train_speed(iter/s)": 0.193671 }, { "epoch": 0.34262283914373487, "grad_norm": 0.8740250468254089, "learning_rate": 9.603031990301032e-05, "loss": 0.8807275772094727, "memory(GiB)": 91.52, "step": 26405, "token_acc": 0.7551020408163265, "train_speed(iter/s)": 0.193648 }, { "epoch": 0.34268771754539057, "grad_norm": 0.9317175149917603, "learning_rate": 9.602822513675663e-05, "loss": 0.9324419021606445, "memory(GiB)": 91.52, "step": 26410, "token_acc": 0.7482538834395633, "train_speed(iter/s)": 0.193628 }, { "epoch": 0.34275259594704627, "grad_norm": 0.8147608637809753, "learning_rate": 9.60261298408131e-05, "loss": 0.9830924987792968, "memory(GiB)": 91.52, "step": 26415, "token_acc": 0.7363032153763248, "train_speed(iter/s)": 0.193608 }, { "epoch": 0.34281747434870197, "grad_norm": 0.7352777719497681, "learning_rate": 9.602403401520381e-05, "loss": 0.9552839279174805, "memory(GiB)": 91.52, "step": 26420, "token_acc": 0.7377348444512056, "train_speed(iter/s)": 0.193586 }, { "epoch": 0.34288235275035767, "grad_norm": 0.8116835951805115, "learning_rate": 9.60219376599529e-05, "loss": 0.9707162857055665, "memory(GiB)": 91.52, "step": 26425, "token_acc": 0.7351437431900092, "train_speed(iter/s)": 0.193568 }, { "epoch": 0.34294723115201337, "grad_norm": 0.9487637877464294, "learning_rate": 9.60198407750845e-05, "loss": 0.9256143569946289, "memory(GiB)": 91.52, "step": 26430, "token_acc": 0.7635345625999832, "train_speed(iter/s)": 0.193551 }, { "epoch": 0.343012109553669, "grad_norm": 0.7252492308616638, "learning_rate": 9.601774336062274e-05, "loss": 0.9549442291259765, "memory(GiB)": 91.52, "step": 26435, "token_acc": 0.7517389325879217, "train_speed(iter/s)": 0.19353 }, { "epoch": 0.3430769879553247, "grad_norm": 0.834346354007721, "learning_rate": 9.601564541659175e-05, "loss": 0.9680373191833496, "memory(GiB)": 91.52, "step": 26440, "token_acc": 0.7546319843057473, "train_speed(iter/s)": 0.193506 }, { "epoch": 0.3431418663569804, "grad_norm": 0.9198235869407654, "learning_rate": 9.601354694301568e-05, "loss": 0.9522859573364257, "memory(GiB)": 91.52, "step": 26445, "token_acc": 0.7490459075919819, "train_speed(iter/s)": 0.193485 }, { "epoch": 0.3432067447586361, "grad_norm": 0.8572207093238831, "learning_rate": 9.601144793991865e-05, "loss": 0.9791057586669922, "memory(GiB)": 91.52, "step": 26450, "token_acc": 0.7692168674698795, "train_speed(iter/s)": 0.193464 }, { "epoch": 0.3432716231602918, "grad_norm": 0.8565637469291687, "learning_rate": 9.600934840732485e-05, "loss": 0.9708877563476562, "memory(GiB)": 91.52, "step": 26455, "token_acc": 0.7342708502682491, "train_speed(iter/s)": 0.193444 }, { "epoch": 0.3433365015619475, "grad_norm": 0.8119630217552185, "learning_rate": 9.600724834525844e-05, "loss": 0.9213257789611816, "memory(GiB)": 91.52, "step": 26460, "token_acc": 0.7628257887517147, "train_speed(iter/s)": 0.193424 }, { "epoch": 0.3434013799636032, "grad_norm": 0.9154926538467407, "learning_rate": 9.600514775374357e-05, "loss": 0.9731657981872559, "memory(GiB)": 91.52, "step": 26465, "token_acc": 0.7167749212399362, "train_speed(iter/s)": 0.193404 }, { "epoch": 0.3434662583652589, "grad_norm": 0.8651766777038574, "learning_rate": 9.600304663280441e-05, "loss": 0.9876510620117187, "memory(GiB)": 91.52, "step": 26470, "token_acc": 0.7547367001550805, "train_speed(iter/s)": 0.193385 }, { "epoch": 0.3435311367669146, "grad_norm": 0.8055830001831055, "learning_rate": 9.600094498246517e-05, "loss": 0.9419956207275391, "memory(GiB)": 91.52, "step": 26475, "token_acc": 0.7501583782071587, "train_speed(iter/s)": 0.193364 }, { "epoch": 0.3435960151685703, "grad_norm": 0.7273737192153931, "learning_rate": 9.599884280275e-05, "loss": 0.9234476089477539, "memory(GiB)": 91.52, "step": 26480, "token_acc": 0.7638728626875065, "train_speed(iter/s)": 0.193342 }, { "epoch": 0.343660893570226, "grad_norm": 0.8323845863342285, "learning_rate": 9.599674009368311e-05, "loss": 0.970094871520996, "memory(GiB)": 91.52, "step": 26485, "token_acc": 0.7410595812162206, "train_speed(iter/s)": 0.193321 }, { "epoch": 0.3437257719718817, "grad_norm": 0.7552823424339294, "learning_rate": 9.59946368552887e-05, "loss": 0.9393397331237793, "memory(GiB)": 91.52, "step": 26490, "token_acc": 0.7452733538232281, "train_speed(iter/s)": 0.193299 }, { "epoch": 0.3437906503735374, "grad_norm": 0.794983983039856, "learning_rate": 9.599253308759097e-05, "loss": 0.9395208358764648, "memory(GiB)": 91.52, "step": 26495, "token_acc": 0.7652356466972952, "train_speed(iter/s)": 0.193276 }, { "epoch": 0.3438555287751931, "grad_norm": 0.7996928095817566, "learning_rate": 9.599042879061412e-05, "loss": 0.9714487075805665, "memory(GiB)": 91.52, "step": 26500, "token_acc": 0.7275442442728688, "train_speed(iter/s)": 0.193256 }, { "epoch": 0.3439204071768488, "grad_norm": 0.8989397883415222, "learning_rate": 9.598832396438238e-05, "loss": 0.9961665153503418, "memory(GiB)": 91.52, "step": 26505, "token_acc": 0.7356678670474771, "train_speed(iter/s)": 0.193236 }, { "epoch": 0.3439852855785045, "grad_norm": 0.7529841661453247, "learning_rate": 9.598621860891996e-05, "loss": 0.9606090545654297, "memory(GiB)": 91.52, "step": 26510, "token_acc": 0.756238276587279, "train_speed(iter/s)": 0.193214 }, { "epoch": 0.3440501639801602, "grad_norm": 0.7893354892730713, "learning_rate": 9.598411272425112e-05, "loss": 0.9361675262451172, "memory(GiB)": 91.52, "step": 26515, "token_acc": 0.7831051899568113, "train_speed(iter/s)": 0.193192 }, { "epoch": 0.3441150423818159, "grad_norm": 0.7551672458648682, "learning_rate": 9.598200631040005e-05, "loss": 0.9443792343139649, "memory(GiB)": 91.52, "step": 26520, "token_acc": 0.7317398597133272, "train_speed(iter/s)": 0.193168 }, { "epoch": 0.3441799207834716, "grad_norm": 0.8198902606964111, "learning_rate": 9.597989936739102e-05, "loss": 0.9648595809936523, "memory(GiB)": 91.52, "step": 26525, "token_acc": 0.7444797458300239, "train_speed(iter/s)": 0.193149 }, { "epoch": 0.3442447991851273, "grad_norm": 0.8313034772872925, "learning_rate": 9.597779189524826e-05, "loss": 0.9369284629821777, "memory(GiB)": 91.52, "step": 26530, "token_acc": 0.7621768363900375, "train_speed(iter/s)": 0.193126 }, { "epoch": 0.344309677586783, "grad_norm": 0.7653738260269165, "learning_rate": 9.597568389399603e-05, "loss": 0.9838336944580078, "memory(GiB)": 91.52, "step": 26535, "token_acc": 0.7542091465190796, "train_speed(iter/s)": 0.193106 }, { "epoch": 0.3443745559884387, "grad_norm": 0.7477595806121826, "learning_rate": 9.597357536365858e-05, "loss": 0.9475923538208008, "memory(GiB)": 91.52, "step": 26540, "token_acc": 0.7337599050807238, "train_speed(iter/s)": 0.193085 }, { "epoch": 0.3444394343900944, "grad_norm": 0.7101085782051086, "learning_rate": 9.597146630426021e-05, "loss": 0.9380457878112793, "memory(GiB)": 91.52, "step": 26545, "token_acc": 0.7411827956989248, "train_speed(iter/s)": 0.193064 }, { "epoch": 0.3445043127917501, "grad_norm": 0.763917088508606, "learning_rate": 9.596935671582515e-05, "loss": 0.9346706390380859, "memory(GiB)": 91.52, "step": 26550, "token_acc": 0.7377450221410212, "train_speed(iter/s)": 0.193043 }, { "epoch": 0.34456919119340573, "grad_norm": 0.7096128463745117, "learning_rate": 9.596724659837769e-05, "loss": 0.9318658828735351, "memory(GiB)": 91.52, "step": 26555, "token_acc": 0.7573239086616782, "train_speed(iter/s)": 0.19302 }, { "epoch": 0.34463406959506143, "grad_norm": 0.7641149759292603, "learning_rate": 9.596513595194212e-05, "loss": 0.9379884719848632, "memory(GiB)": 91.52, "step": 26560, "token_acc": 0.7404301463943611, "train_speed(iter/s)": 0.192998 }, { "epoch": 0.34469894799671713, "grad_norm": 0.7569352984428406, "learning_rate": 9.596302477654272e-05, "loss": 0.9004598617553711, "memory(GiB)": 91.52, "step": 26565, "token_acc": 0.7457118786857624, "train_speed(iter/s)": 0.192977 }, { "epoch": 0.34476382639837283, "grad_norm": 0.7199397683143616, "learning_rate": 9.596091307220378e-05, "loss": 0.9003200531005859, "memory(GiB)": 91.52, "step": 26570, "token_acc": 0.7541123354276513, "train_speed(iter/s)": 0.192956 }, { "epoch": 0.34482870480002853, "grad_norm": 0.838091254234314, "learning_rate": 9.595880083894963e-05, "loss": 0.9342234611511231, "memory(GiB)": 91.52, "step": 26575, "token_acc": 0.759734426869752, "train_speed(iter/s)": 0.192937 }, { "epoch": 0.34489358320168423, "grad_norm": 0.7238042950630188, "learning_rate": 9.595668807680453e-05, "loss": 0.9671440124511719, "memory(GiB)": 91.52, "step": 26580, "token_acc": 0.7362594052311, "train_speed(iter/s)": 0.192918 }, { "epoch": 0.3449584616033399, "grad_norm": 0.857996940612793, "learning_rate": 9.595457478579285e-05, "loss": 0.9170661926269531, "memory(GiB)": 91.52, "step": 26585, "token_acc": 0.7608754943406518, "train_speed(iter/s)": 0.192895 }, { "epoch": 0.3450233400049956, "grad_norm": 0.8155100345611572, "learning_rate": 9.595246096593887e-05, "loss": 0.9100490570068359, "memory(GiB)": 91.52, "step": 26590, "token_acc": 0.7737547892720307, "train_speed(iter/s)": 0.192874 }, { "epoch": 0.3450882184066513, "grad_norm": 0.7429622411727905, "learning_rate": 9.595034661726693e-05, "loss": 0.9761404991149902, "memory(GiB)": 91.52, "step": 26595, "token_acc": 0.7439811457577955, "train_speed(iter/s)": 0.192851 }, { "epoch": 0.345153096808307, "grad_norm": 0.745826005935669, "learning_rate": 9.594823173980135e-05, "loss": 0.9475257873535157, "memory(GiB)": 91.52, "step": 26600, "token_acc": 0.7341354611711486, "train_speed(iter/s)": 0.192828 }, { "epoch": 0.3452179752099627, "grad_norm": 0.690211296081543, "learning_rate": 9.594611633356648e-05, "loss": 0.9260252952575684, "memory(GiB)": 91.52, "step": 26605, "token_acc": 0.7395313789515232, "train_speed(iter/s)": 0.192806 }, { "epoch": 0.3452828536116184, "grad_norm": 0.8038079142570496, "learning_rate": 9.594400039858667e-05, "loss": 0.9563442230224609, "memory(GiB)": 91.52, "step": 26610, "token_acc": 0.7496429591545273, "train_speed(iter/s)": 0.192784 }, { "epoch": 0.3453477320132741, "grad_norm": 0.802331268787384, "learning_rate": 9.594188393488625e-05, "loss": 0.9473374366760254, "memory(GiB)": 91.52, "step": 26615, "token_acc": 0.736664064532917, "train_speed(iter/s)": 0.192763 }, { "epoch": 0.3454126104149298, "grad_norm": 0.8122112154960632, "learning_rate": 9.593976694248958e-05, "loss": 0.8858049392700196, "memory(GiB)": 91.52, "step": 26620, "token_acc": 0.7628321678321678, "train_speed(iter/s)": 0.192743 }, { "epoch": 0.3454774888165855, "grad_norm": 0.891338586807251, "learning_rate": 9.593764942142104e-05, "loss": 0.9522270202636719, "memory(GiB)": 91.52, "step": 26625, "token_acc": 0.7490094688066617, "train_speed(iter/s)": 0.192724 }, { "epoch": 0.3455423672182412, "grad_norm": 0.7937016487121582, "learning_rate": 9.593553137170499e-05, "loss": 0.9656109809875488, "memory(GiB)": 91.52, "step": 26630, "token_acc": 0.7492302955665024, "train_speed(iter/s)": 0.192702 }, { "epoch": 0.3456072456198969, "grad_norm": 0.8175861835479736, "learning_rate": 9.593341279336578e-05, "loss": 0.9545498847961426, "memory(GiB)": 91.52, "step": 26635, "token_acc": 0.7413821204465814, "train_speed(iter/s)": 0.192678 }, { "epoch": 0.3456721240215526, "grad_norm": 0.751525342464447, "learning_rate": 9.593129368642784e-05, "loss": 0.9674808502197265, "memory(GiB)": 91.52, "step": 26640, "token_acc": 0.7562962962962962, "train_speed(iter/s)": 0.192657 }, { "epoch": 0.3457370024232083, "grad_norm": 0.8007870316505432, "learning_rate": 9.592917405091551e-05, "loss": 0.9481950759887695, "memory(GiB)": 91.52, "step": 26645, "token_acc": 0.7317082965223677, "train_speed(iter/s)": 0.192636 }, { "epoch": 0.345801880824864, "grad_norm": 0.9311871528625488, "learning_rate": 9.592705388685321e-05, "loss": 0.9053360939025878, "memory(GiB)": 91.52, "step": 26650, "token_acc": 0.7734548389266259, "train_speed(iter/s)": 0.192616 }, { "epoch": 0.3458667592265197, "grad_norm": 0.7181285619735718, "learning_rate": 9.592493319426532e-05, "loss": 0.9624095916748047, "memory(GiB)": 91.52, "step": 26655, "token_acc": 0.7635812711925547, "train_speed(iter/s)": 0.192595 }, { "epoch": 0.3459316376281754, "grad_norm": 0.7246310710906982, "learning_rate": 9.592281197317626e-05, "loss": 0.9662490844726562, "memory(GiB)": 91.52, "step": 26660, "token_acc": 0.7580938015028352, "train_speed(iter/s)": 0.192573 }, { "epoch": 0.3459965160298311, "grad_norm": 0.7342712879180908, "learning_rate": 9.592069022361043e-05, "loss": 0.903864860534668, "memory(GiB)": 91.52, "step": 26665, "token_acc": 0.7629934141532783, "train_speed(iter/s)": 0.192553 }, { "epoch": 0.3460613944314868, "grad_norm": 0.8079228401184082, "learning_rate": 9.591856794559226e-05, "loss": 1.0083539962768555, "memory(GiB)": 91.52, "step": 26670, "token_acc": 0.7434658739006565, "train_speed(iter/s)": 0.192534 }, { "epoch": 0.34612627283314246, "grad_norm": 0.8434061408042908, "learning_rate": 9.591644513914617e-05, "loss": 0.9634511947631836, "memory(GiB)": 91.52, "step": 26675, "token_acc": 0.736518746067582, "train_speed(iter/s)": 0.19251 }, { "epoch": 0.34619115123479816, "grad_norm": 0.7194541096687317, "learning_rate": 9.591432180429658e-05, "loss": 0.9164250373840332, "memory(GiB)": 91.52, "step": 26680, "token_acc": 0.7429545924343692, "train_speed(iter/s)": 0.192486 }, { "epoch": 0.34625602963645385, "grad_norm": 0.8131638765335083, "learning_rate": 9.591219794106793e-05, "loss": 0.9675992012023926, "memory(GiB)": 91.52, "step": 26685, "token_acc": 0.7308769963575231, "train_speed(iter/s)": 0.192466 }, { "epoch": 0.34632090803810955, "grad_norm": 0.8368979096412659, "learning_rate": 9.591007354948465e-05, "loss": 0.9963456153869629, "memory(GiB)": 91.52, "step": 26690, "token_acc": 0.7347670250896058, "train_speed(iter/s)": 0.192446 }, { "epoch": 0.34638578643976525, "grad_norm": 0.850847065448761, "learning_rate": 9.590794862957122e-05, "loss": 0.9330156326293946, "memory(GiB)": 91.52, "step": 26695, "token_acc": 0.7413274082568807, "train_speed(iter/s)": 0.192425 }, { "epoch": 0.34645066484142095, "grad_norm": 0.7220759987831116, "learning_rate": 9.590582318135206e-05, "loss": 0.9471920967102051, "memory(GiB)": 91.52, "step": 26700, "token_acc": 0.743621059580789, "train_speed(iter/s)": 0.192407 }, { "epoch": 0.34651554324307665, "grad_norm": 0.8625006675720215, "learning_rate": 9.590369720485164e-05, "loss": 0.9432865142822265, "memory(GiB)": 91.52, "step": 26705, "token_acc": 0.7613718411552347, "train_speed(iter/s)": 0.192387 }, { "epoch": 0.34658042164473235, "grad_norm": 0.8665882349014282, "learning_rate": 9.590157070009442e-05, "loss": 0.9779869079589844, "memory(GiB)": 91.52, "step": 26710, "token_acc": 0.7294975067126965, "train_speed(iter/s)": 0.192365 }, { "epoch": 0.34664530004638805, "grad_norm": 0.8614934682846069, "learning_rate": 9.589944366710488e-05, "loss": 0.9853262901306152, "memory(GiB)": 91.52, "step": 26715, "token_acc": 0.7417494705978173, "train_speed(iter/s)": 0.192345 }, { "epoch": 0.34671017844804375, "grad_norm": 0.774385392665863, "learning_rate": 9.58973161059075e-05, "loss": 0.9988182067871094, "memory(GiB)": 91.52, "step": 26720, "token_acc": 0.7208425023577492, "train_speed(iter/s)": 0.192326 }, { "epoch": 0.34677505684969945, "grad_norm": 0.9058374762535095, "learning_rate": 9.589518801652678e-05, "loss": 0.9670926094055176, "memory(GiB)": 91.52, "step": 26725, "token_acc": 0.7571958975113039, "train_speed(iter/s)": 0.192306 }, { "epoch": 0.34683993525135515, "grad_norm": 0.8070016503334045, "learning_rate": 9.589305939898716e-05, "loss": 1.0121725082397461, "memory(GiB)": 91.52, "step": 26730, "token_acc": 0.7341878841088674, "train_speed(iter/s)": 0.192286 }, { "epoch": 0.34690481365301085, "grad_norm": 0.7329708933830261, "learning_rate": 9.589093025331319e-05, "loss": 0.9538437843322753, "memory(GiB)": 91.52, "step": 26735, "token_acc": 0.724824044131634, "train_speed(iter/s)": 0.192263 }, { "epoch": 0.34696969205466655, "grad_norm": 0.8143279552459717, "learning_rate": 9.588880057952933e-05, "loss": 0.9615777969360352, "memory(GiB)": 91.52, "step": 26740, "token_acc": 0.7264167272475508, "train_speed(iter/s)": 0.192241 }, { "epoch": 0.34703457045632224, "grad_norm": 0.7821047902107239, "learning_rate": 9.58866703776601e-05, "loss": 0.9325762748718261, "memory(GiB)": 91.52, "step": 26745, "token_acc": 0.7409064455114215, "train_speed(iter/s)": 0.19222 }, { "epoch": 0.34709944885797794, "grad_norm": 0.7296066284179688, "learning_rate": 9.588453964773006e-05, "loss": 0.9548258781433105, "memory(GiB)": 91.52, "step": 26750, "token_acc": 0.7517467504586439, "train_speed(iter/s)": 0.1922 }, { "epoch": 0.34716432725963364, "grad_norm": 0.7573896050453186, "learning_rate": 9.588240838976366e-05, "loss": 0.9544898986816406, "memory(GiB)": 91.52, "step": 26755, "token_acc": 0.7642924425766249, "train_speed(iter/s)": 0.19218 }, { "epoch": 0.34722920566128934, "grad_norm": 0.949635922908783, "learning_rate": 9.588027660378545e-05, "loss": 0.9415806770324707, "memory(GiB)": 91.52, "step": 26760, "token_acc": 0.7458973916047676, "train_speed(iter/s)": 0.192159 }, { "epoch": 0.34729408406294504, "grad_norm": 0.8109501600265503, "learning_rate": 9.587814428981998e-05, "loss": 0.9297590255737305, "memory(GiB)": 91.52, "step": 26765, "token_acc": 0.7647497297901681, "train_speed(iter/s)": 0.192137 }, { "epoch": 0.34735896246460074, "grad_norm": 0.7742337584495544, "learning_rate": 9.587601144789178e-05, "loss": 0.9584834098815918, "memory(GiB)": 91.52, "step": 26770, "token_acc": 0.7296893667861409, "train_speed(iter/s)": 0.192115 }, { "epoch": 0.34742384086625644, "grad_norm": 0.805883526802063, "learning_rate": 9.587387807802539e-05, "loss": 0.9726727485656739, "memory(GiB)": 91.52, "step": 26775, "token_acc": 0.7309156115055259, "train_speed(iter/s)": 0.192095 }, { "epoch": 0.34748871926791214, "grad_norm": 0.7394514083862305, "learning_rate": 9.587174418024537e-05, "loss": 0.9414478302001953, "memory(GiB)": 91.52, "step": 26780, "token_acc": 0.7675724759534539, "train_speed(iter/s)": 0.19207 }, { "epoch": 0.34755359766956784, "grad_norm": 0.8178097009658813, "learning_rate": 9.586960975457626e-05, "loss": 0.9609342575073242, "memory(GiB)": 91.52, "step": 26785, "token_acc": 0.7558495128369248, "train_speed(iter/s)": 0.19205 }, { "epoch": 0.34761847607122354, "grad_norm": 0.8444105982780457, "learning_rate": 9.586747480104265e-05, "loss": 0.8920389175415039, "memory(GiB)": 91.52, "step": 26790, "token_acc": 0.7494059712800027, "train_speed(iter/s)": 0.192029 }, { "epoch": 0.3476833544728792, "grad_norm": 0.7300061583518982, "learning_rate": 9.586533931966907e-05, "loss": 0.9811543464660645, "memory(GiB)": 91.52, "step": 26795, "token_acc": 0.745042201331229, "train_speed(iter/s)": 0.192009 }, { "epoch": 0.3477482328745349, "grad_norm": 0.7648597955703735, "learning_rate": 9.586320331048013e-05, "loss": 0.9619629859924317, "memory(GiB)": 91.52, "step": 26800, "token_acc": 0.7501265639690461, "train_speed(iter/s)": 0.191987 }, { "epoch": 0.3478131112761906, "grad_norm": 0.689212441444397, "learning_rate": 9.58610667735004e-05, "loss": 0.9473455429077149, "memory(GiB)": 91.52, "step": 26805, "token_acc": 0.7533051295610788, "train_speed(iter/s)": 0.191967 }, { "epoch": 0.3478779896778463, "grad_norm": 0.740082859992981, "learning_rate": 9.585892970875446e-05, "loss": 0.972313117980957, "memory(GiB)": 91.52, "step": 26810, "token_acc": 0.7299272454406055, "train_speed(iter/s)": 0.191947 }, { "epoch": 0.347942868079502, "grad_norm": 0.7636255025863647, "learning_rate": 9.585679211626692e-05, "loss": 0.9760387420654297, "memory(GiB)": 91.52, "step": 26815, "token_acc": 0.737144283574638, "train_speed(iter/s)": 0.191928 }, { "epoch": 0.3480077464811577, "grad_norm": 0.6982622146606445, "learning_rate": 9.585465399606235e-05, "loss": 0.9727144241333008, "memory(GiB)": 91.52, "step": 26820, "token_acc": 0.7431674593596933, "train_speed(iter/s)": 0.191908 }, { "epoch": 0.3480726248828134, "grad_norm": 0.767592191696167, "learning_rate": 9.585251534816538e-05, "loss": 0.969500732421875, "memory(GiB)": 91.52, "step": 26825, "token_acc": 0.7445326988348088, "train_speed(iter/s)": 0.191888 }, { "epoch": 0.3481375032844691, "grad_norm": 0.7894752621650696, "learning_rate": 9.585037617260062e-05, "loss": 0.9189370155334473, "memory(GiB)": 91.52, "step": 26830, "token_acc": 0.7460469927589773, "train_speed(iter/s)": 0.191869 }, { "epoch": 0.3482023816861248, "grad_norm": 0.7912863492965698, "learning_rate": 9.584823646939267e-05, "loss": 0.9585217475891114, "memory(GiB)": 91.52, "step": 26835, "token_acc": 0.7463590103526935, "train_speed(iter/s)": 0.191848 }, { "epoch": 0.3482672600877805, "grad_norm": 0.887834906578064, "learning_rate": 9.584609623856618e-05, "loss": 0.9420835494995117, "memory(GiB)": 91.52, "step": 26840, "token_acc": 0.7493012732354376, "train_speed(iter/s)": 0.191832 }, { "epoch": 0.34833213848943617, "grad_norm": 0.8241045475006104, "learning_rate": 9.584395548014575e-05, "loss": 0.9091148376464844, "memory(GiB)": 91.52, "step": 26845, "token_acc": 0.764519535374868, "train_speed(iter/s)": 0.191812 }, { "epoch": 0.34839701689109187, "grad_norm": 0.7389832735061646, "learning_rate": 9.584181419415603e-05, "loss": 0.9655122756958008, "memory(GiB)": 91.52, "step": 26850, "token_acc": 0.7590097564213181, "train_speed(iter/s)": 0.191793 }, { "epoch": 0.34846189529274757, "grad_norm": 0.8731091618537903, "learning_rate": 9.583967238062168e-05, "loss": 0.9183799743652343, "memory(GiB)": 91.52, "step": 26855, "token_acc": 0.745066559636096, "train_speed(iter/s)": 0.191772 }, { "epoch": 0.34852677369440327, "grad_norm": 0.8639339804649353, "learning_rate": 9.583753003956733e-05, "loss": 0.9394423484802246, "memory(GiB)": 91.52, "step": 26860, "token_acc": 0.7357863845980199, "train_speed(iter/s)": 0.191753 }, { "epoch": 0.34859165209605897, "grad_norm": 0.8684532046318054, "learning_rate": 9.583538717101761e-05, "loss": 0.972871208190918, "memory(GiB)": 91.52, "step": 26865, "token_acc": 0.7417956046122006, "train_speed(iter/s)": 0.191732 }, { "epoch": 0.34865653049771467, "grad_norm": 0.7261451482772827, "learning_rate": 9.583324377499723e-05, "loss": 0.9054744720458985, "memory(GiB)": 91.52, "step": 26870, "token_acc": 0.7592400690846287, "train_speed(iter/s)": 0.191709 }, { "epoch": 0.34872140889937037, "grad_norm": 0.7926918864250183, "learning_rate": 9.583109985153081e-05, "loss": 0.9656330108642578, "memory(GiB)": 91.52, "step": 26875, "token_acc": 0.7404615384615385, "train_speed(iter/s)": 0.19169 }, { "epoch": 0.34878628730102607, "grad_norm": 0.8982170224189758, "learning_rate": 9.582895540064304e-05, "loss": 0.9178005218505859, "memory(GiB)": 91.52, "step": 26880, "token_acc": 0.7654570126300407, "train_speed(iter/s)": 0.19167 }, { "epoch": 0.34885116570268176, "grad_norm": 0.8539833426475525, "learning_rate": 9.582681042235864e-05, "loss": 0.9990373611450195, "memory(GiB)": 91.52, "step": 26885, "token_acc": 0.735108289064219, "train_speed(iter/s)": 0.191649 }, { "epoch": 0.34891604410433746, "grad_norm": 0.8351530432701111, "learning_rate": 9.58246649167022e-05, "loss": 0.9544861793518067, "memory(GiB)": 91.52, "step": 26890, "token_acc": 0.7518801804973277, "train_speed(iter/s)": 0.19163 }, { "epoch": 0.34898092250599316, "grad_norm": 0.7237114310264587, "learning_rate": 9.58225188836985e-05, "loss": 0.9307991027832031, "memory(GiB)": 91.52, "step": 26895, "token_acc": 0.7381734784692271, "train_speed(iter/s)": 0.191609 }, { "epoch": 0.34904580090764886, "grad_norm": 0.7555425763130188, "learning_rate": 9.582037232337221e-05, "loss": 0.9496807098388672, "memory(GiB)": 91.52, "step": 26900, "token_acc": 0.7435459917211742, "train_speed(iter/s)": 0.191589 }, { "epoch": 0.34911067930930456, "grad_norm": 0.7461119294166565, "learning_rate": 9.581822523574802e-05, "loss": 0.9683126449584961, "memory(GiB)": 91.52, "step": 26905, "token_acc": 0.7408466819221968, "train_speed(iter/s)": 0.191569 }, { "epoch": 0.34917555771096026, "grad_norm": 0.7957533001899719, "learning_rate": 9.581607762085062e-05, "loss": 0.9674774169921875, "memory(GiB)": 91.52, "step": 26910, "token_acc": 0.7384343006696944, "train_speed(iter/s)": 0.191553 }, { "epoch": 0.3492404361126159, "grad_norm": 0.731242835521698, "learning_rate": 9.581392947870475e-05, "loss": 0.9400537490844727, "memory(GiB)": 91.52, "step": 26915, "token_acc": 0.7491244123307986, "train_speed(iter/s)": 0.191533 }, { "epoch": 0.3493053145142716, "grad_norm": 0.7245625853538513, "learning_rate": 9.581178080933515e-05, "loss": 0.9666308403015137, "memory(GiB)": 91.52, "step": 26920, "token_acc": 0.7515907750566471, "train_speed(iter/s)": 0.19151 }, { "epoch": 0.3493701929159273, "grad_norm": 0.9097219705581665, "learning_rate": 9.580963161276651e-05, "loss": 0.9691783905029296, "memory(GiB)": 91.52, "step": 26925, "token_acc": 0.7470066863629296, "train_speed(iter/s)": 0.191489 }, { "epoch": 0.349435071317583, "grad_norm": 0.7912476658821106, "learning_rate": 9.580748188902358e-05, "loss": 0.9483194351196289, "memory(GiB)": 91.52, "step": 26930, "token_acc": 0.7536770921386307, "train_speed(iter/s)": 0.191468 }, { "epoch": 0.3494999497192387, "grad_norm": 0.7815747857093811, "learning_rate": 9.58053316381311e-05, "loss": 0.8754463195800781, "memory(GiB)": 91.52, "step": 26935, "token_acc": 0.7592917592917593, "train_speed(iter/s)": 0.191447 }, { "epoch": 0.3495648281208944, "grad_norm": 0.7476909160614014, "learning_rate": 9.580318086011382e-05, "loss": 0.9410383224487304, "memory(GiB)": 91.52, "step": 26940, "token_acc": 0.7524749152656129, "train_speed(iter/s)": 0.191429 }, { "epoch": 0.3496297065225501, "grad_norm": 0.8423792123794556, "learning_rate": 9.580102955499646e-05, "loss": 1.0030473709106444, "memory(GiB)": 91.52, "step": 26945, "token_acc": 0.7599116858539663, "train_speed(iter/s)": 0.191408 }, { "epoch": 0.3496945849242058, "grad_norm": 0.9736546277999878, "learning_rate": 9.579887772280381e-05, "loss": 0.9280685424804688, "memory(GiB)": 91.52, "step": 26950, "token_acc": 0.7771507863089732, "train_speed(iter/s)": 0.191387 }, { "epoch": 0.3497594633258615, "grad_norm": 0.9060323238372803, "learning_rate": 9.579672536356064e-05, "loss": 0.9693390846252441, "memory(GiB)": 91.52, "step": 26955, "token_acc": 0.748530530392089, "train_speed(iter/s)": 0.191368 }, { "epoch": 0.3498243417275172, "grad_norm": 0.7410199642181396, "learning_rate": 9.579457247729167e-05, "loss": 0.9892855644226074, "memory(GiB)": 91.52, "step": 26960, "token_acc": 0.7398178078166324, "train_speed(iter/s)": 0.191347 }, { "epoch": 0.3498892201291729, "grad_norm": 0.9028399586677551, "learning_rate": 9.579241906402173e-05, "loss": 0.961923885345459, "memory(GiB)": 91.52, "step": 26965, "token_acc": 0.7249463623951629, "train_speed(iter/s)": 0.191329 }, { "epoch": 0.3499540985308286, "grad_norm": 0.8132534027099609, "learning_rate": 9.579026512377557e-05, "loss": 0.9517366409301757, "memory(GiB)": 91.52, "step": 26970, "token_acc": 0.7363178469403098, "train_speed(iter/s)": 0.191311 }, { "epoch": 0.3500189769324843, "grad_norm": 0.8011821508407593, "learning_rate": 9.578811065657799e-05, "loss": 0.9030576705932617, "memory(GiB)": 91.52, "step": 26975, "token_acc": 0.7510167334140511, "train_speed(iter/s)": 0.191291 }, { "epoch": 0.35008385533414, "grad_norm": 0.7600702047348022, "learning_rate": 9.578595566245377e-05, "loss": 0.9330350875854492, "memory(GiB)": 91.52, "step": 26980, "token_acc": 0.7690187089579977, "train_speed(iter/s)": 0.19127 }, { "epoch": 0.3501487337357957, "grad_norm": 0.784947395324707, "learning_rate": 9.578380014142774e-05, "loss": 0.9558924674987793, "memory(GiB)": 91.52, "step": 26985, "token_acc": 0.7380538941411023, "train_speed(iter/s)": 0.19125 }, { "epoch": 0.3502136121374514, "grad_norm": 0.8159117102622986, "learning_rate": 9.578164409352466e-05, "loss": 0.959095573425293, "memory(GiB)": 91.52, "step": 26990, "token_acc": 0.7427970271829504, "train_speed(iter/s)": 0.191231 }, { "epoch": 0.3502784905391071, "grad_norm": 0.8109724521636963, "learning_rate": 9.577948751876938e-05, "loss": 0.9275239944458008, "memory(GiB)": 91.52, "step": 26995, "token_acc": 0.7708841413845804, "train_speed(iter/s)": 0.191211 }, { "epoch": 0.3503433689407628, "grad_norm": 0.8584492206573486, "learning_rate": 9.577733041718671e-05, "loss": 0.9467397689819336, "memory(GiB)": 91.52, "step": 27000, "token_acc": 0.7502306786725635, "train_speed(iter/s)": 0.191191 }, { "epoch": 0.3504082473424185, "grad_norm": 0.8117880821228027, "learning_rate": 9.577517278880145e-05, "loss": 0.9286307334899903, "memory(GiB)": 91.52, "step": 27005, "token_acc": 0.7612622282192033, "train_speed(iter/s)": 0.191171 }, { "epoch": 0.3504731257440742, "grad_norm": 0.7664446830749512, "learning_rate": 9.577301463363846e-05, "loss": 0.9439374923706054, "memory(GiB)": 91.52, "step": 27010, "token_acc": 0.7421663965424095, "train_speed(iter/s)": 0.191154 }, { "epoch": 0.3505380041457299, "grad_norm": 0.775600016117096, "learning_rate": 9.577085595172256e-05, "loss": 0.9089113235473633, "memory(GiB)": 91.52, "step": 27015, "token_acc": 0.7662252140191702, "train_speed(iter/s)": 0.191138 }, { "epoch": 0.3506028825473856, "grad_norm": 0.8732255101203918, "learning_rate": 9.57686967430786e-05, "loss": 0.9330266952514649, "memory(GiB)": 91.52, "step": 27020, "token_acc": 0.753442088091354, "train_speed(iter/s)": 0.191119 }, { "epoch": 0.3506677609490413, "grad_norm": 0.7510066032409668, "learning_rate": 9.576653700773142e-05, "loss": 0.9600887298583984, "memory(GiB)": 91.52, "step": 27025, "token_acc": 0.7471869208366428, "train_speed(iter/s)": 0.191098 }, { "epoch": 0.35073263935069693, "grad_norm": 0.7215787768363953, "learning_rate": 9.576437674570587e-05, "loss": 0.9454408645629883, "memory(GiB)": 91.52, "step": 27030, "token_acc": 0.7431473362142385, "train_speed(iter/s)": 0.191076 }, { "epoch": 0.35079751775235263, "grad_norm": 0.8128179311752319, "learning_rate": 9.576221595702682e-05, "loss": 0.9679193496704102, "memory(GiB)": 91.52, "step": 27035, "token_acc": 0.7415069982334557, "train_speed(iter/s)": 0.191056 }, { "epoch": 0.3508623961540083, "grad_norm": 0.7618628740310669, "learning_rate": 9.576005464171915e-05, "loss": 0.966855812072754, "memory(GiB)": 91.52, "step": 27040, "token_acc": 0.7393930519456748, "train_speed(iter/s)": 0.191037 }, { "epoch": 0.350927274555664, "grad_norm": 0.7409077286720276, "learning_rate": 9.575789279980769e-05, "loss": 0.9448225021362304, "memory(GiB)": 91.52, "step": 27045, "token_acc": 0.7384654764190184, "train_speed(iter/s)": 0.191019 }, { "epoch": 0.3509921529573197, "grad_norm": 0.7609830498695374, "learning_rate": 9.575573043131735e-05, "loss": 0.895725154876709, "memory(GiB)": 91.52, "step": 27050, "token_acc": 0.7774755389549755, "train_speed(iter/s)": 0.190997 }, { "epoch": 0.3510570313589754, "grad_norm": 0.8893769383430481, "learning_rate": 9.575356753627304e-05, "loss": 0.9791242599487304, "memory(GiB)": 91.52, "step": 27055, "token_acc": 0.7461738261738262, "train_speed(iter/s)": 0.190977 }, { "epoch": 0.3511219097606311, "grad_norm": 0.7348476648330688, "learning_rate": 9.575140411469959e-05, "loss": 0.9462888717651368, "memory(GiB)": 91.52, "step": 27060, "token_acc": 0.7407980902609762, "train_speed(iter/s)": 0.190958 }, { "epoch": 0.3511867881622868, "grad_norm": 0.9276631474494934, "learning_rate": 9.574924016662194e-05, "loss": 0.9484895706176758, "memory(GiB)": 91.52, "step": 27065, "token_acc": 0.7547553305721737, "train_speed(iter/s)": 0.190939 }, { "epoch": 0.3512516665639425, "grad_norm": 0.8322154879570007, "learning_rate": 9.574707569206496e-05, "loss": 0.906285285949707, "memory(GiB)": 91.52, "step": 27070, "token_acc": 0.7518802945134986, "train_speed(iter/s)": 0.190921 }, { "epoch": 0.3513165449655982, "grad_norm": 0.783076822757721, "learning_rate": 9.57449106910536e-05, "loss": 0.8978673934936523, "memory(GiB)": 91.52, "step": 27075, "token_acc": 0.7588243144058183, "train_speed(iter/s)": 0.1909 }, { "epoch": 0.3513814233672539, "grad_norm": 0.7702566981315613, "learning_rate": 9.574274516361275e-05, "loss": 0.9819491386413575, "memory(GiB)": 91.52, "step": 27080, "token_acc": 0.7426031081888822, "train_speed(iter/s)": 0.190881 }, { "epoch": 0.3514463017689096, "grad_norm": 0.9074148535728455, "learning_rate": 9.574057910976732e-05, "loss": 0.9810754776000976, "memory(GiB)": 91.52, "step": 27085, "token_acc": 0.7401799364930025, "train_speed(iter/s)": 0.190862 }, { "epoch": 0.3515111801705653, "grad_norm": 0.7968980669975281, "learning_rate": 9.573841252954225e-05, "loss": 0.9580232620239257, "memory(GiB)": 91.52, "step": 27090, "token_acc": 0.7309641285504734, "train_speed(iter/s)": 0.19084 }, { "epoch": 0.351576058572221, "grad_norm": 0.6886026859283447, "learning_rate": 9.573624542296246e-05, "loss": 0.9516361236572266, "memory(GiB)": 91.52, "step": 27095, "token_acc": 0.7748413964893162, "train_speed(iter/s)": 0.190819 }, { "epoch": 0.3516409369738767, "grad_norm": 0.7772578597068787, "learning_rate": 9.573407779005294e-05, "loss": 0.9510481834411622, "memory(GiB)": 91.52, "step": 27100, "token_acc": 0.746493098417686, "train_speed(iter/s)": 0.190801 }, { "epoch": 0.3517058153755324, "grad_norm": 0.7932158708572388, "learning_rate": 9.573190963083856e-05, "loss": 0.9842601776123047, "memory(GiB)": 91.52, "step": 27105, "token_acc": 0.7239433046897296, "train_speed(iter/s)": 0.190783 }, { "epoch": 0.3517706937771881, "grad_norm": 0.7835428714752197, "learning_rate": 9.572974094534432e-05, "loss": 0.9675462722778321, "memory(GiB)": 91.52, "step": 27110, "token_acc": 0.7326722481471315, "train_speed(iter/s)": 0.19076 }, { "epoch": 0.3518355721788438, "grad_norm": 0.7846666574478149, "learning_rate": 9.572757173359517e-05, "loss": 0.9902679443359375, "memory(GiB)": 91.52, "step": 27115, "token_acc": 0.7237870836457106, "train_speed(iter/s)": 0.190742 }, { "epoch": 0.3519004505804995, "grad_norm": 0.9614079594612122, "learning_rate": 9.572540199561606e-05, "loss": 0.9663874626159668, "memory(GiB)": 91.52, "step": 27120, "token_acc": 0.7451927044648016, "train_speed(iter/s)": 0.19072 }, { "epoch": 0.3519653289821552, "grad_norm": 0.8888713121414185, "learning_rate": 9.572323173143197e-05, "loss": 1.003868865966797, "memory(GiB)": 91.52, "step": 27125, "token_acc": 0.732174317720972, "train_speed(iter/s)": 0.190702 }, { "epoch": 0.3520302073838109, "grad_norm": 0.8335396647453308, "learning_rate": 9.572106094106788e-05, "loss": 1.0165712356567382, "memory(GiB)": 91.52, "step": 27130, "token_acc": 0.7342869074963458, "train_speed(iter/s)": 0.190681 }, { "epoch": 0.3520950857854666, "grad_norm": 0.7486433982849121, "learning_rate": 9.571888962454875e-05, "loss": 0.9605151176452636, "memory(GiB)": 91.52, "step": 27135, "token_acc": 0.7579369302259586, "train_speed(iter/s)": 0.190659 }, { "epoch": 0.3521599641871223, "grad_norm": 0.7625041604042053, "learning_rate": 9.57167177818996e-05, "loss": 0.9529394149780274, "memory(GiB)": 91.52, "step": 27140, "token_acc": 0.7337194055944056, "train_speed(iter/s)": 0.190639 }, { "epoch": 0.352224842588778, "grad_norm": 0.8035889267921448, "learning_rate": 9.571454541314538e-05, "loss": 1.0046758651733398, "memory(GiB)": 91.52, "step": 27145, "token_acc": 0.724703535409509, "train_speed(iter/s)": 0.190619 }, { "epoch": 0.35228972099043365, "grad_norm": 0.8538349270820618, "learning_rate": 9.571237251831111e-05, "loss": 0.995790672302246, "memory(GiB)": 91.52, "step": 27150, "token_acc": 0.7461776220894292, "train_speed(iter/s)": 0.190597 }, { "epoch": 0.35235459939208935, "grad_norm": 0.7365090250968933, "learning_rate": 9.571019909742182e-05, "loss": 0.8841464042663574, "memory(GiB)": 91.52, "step": 27155, "token_acc": 0.7562312762133013, "train_speed(iter/s)": 0.190577 }, { "epoch": 0.35241947779374505, "grad_norm": 0.7080572843551636, "learning_rate": 9.570802515050249e-05, "loss": 0.9519326210021972, "memory(GiB)": 91.52, "step": 27160, "token_acc": 0.7402517444246819, "train_speed(iter/s)": 0.190554 }, { "epoch": 0.35248435619540075, "grad_norm": 0.7590062022209167, "learning_rate": 9.570585067757816e-05, "loss": 0.9581903457641602, "memory(GiB)": 91.52, "step": 27165, "token_acc": 0.7404134711570524, "train_speed(iter/s)": 0.190535 }, { "epoch": 0.35254923459705645, "grad_norm": 0.6763535737991333, "learning_rate": 9.570367567867382e-05, "loss": 0.8969633102416992, "memory(GiB)": 91.52, "step": 27170, "token_acc": 0.7679006727987779, "train_speed(iter/s)": 0.190515 }, { "epoch": 0.35261411299871215, "grad_norm": 0.7507739663124084, "learning_rate": 9.570150015381453e-05, "loss": 0.9514151573181152, "memory(GiB)": 91.52, "step": 27175, "token_acc": 0.7488651620740362, "train_speed(iter/s)": 0.190498 }, { "epoch": 0.35267899140036785, "grad_norm": 0.7462843656539917, "learning_rate": 9.569932410302534e-05, "loss": 0.9716837882995606, "memory(GiB)": 91.52, "step": 27180, "token_acc": 0.7504322578540437, "train_speed(iter/s)": 0.19048 }, { "epoch": 0.35274386980202355, "grad_norm": 0.794845700263977, "learning_rate": 9.569714752633123e-05, "loss": 0.9307168006896973, "memory(GiB)": 91.52, "step": 27185, "token_acc": 0.7615992102665351, "train_speed(iter/s)": 0.190461 }, { "epoch": 0.35280874820367925, "grad_norm": 0.8091119527816772, "learning_rate": 9.569497042375732e-05, "loss": 0.9283651351928711, "memory(GiB)": 91.52, "step": 27190, "token_acc": 0.75, "train_speed(iter/s)": 0.190442 }, { "epoch": 0.35287362660533494, "grad_norm": 0.7512410283088684, "learning_rate": 9.569279279532862e-05, "loss": 0.9314334869384766, "memory(GiB)": 91.52, "step": 27195, "token_acc": 0.7481450004903082, "train_speed(iter/s)": 0.190422 }, { "epoch": 0.35293850500699064, "grad_norm": 0.8938506841659546, "learning_rate": 9.56906146410702e-05, "loss": 0.9060693740844726, "memory(GiB)": 91.52, "step": 27200, "token_acc": 0.7759291132785109, "train_speed(iter/s)": 0.1904 }, { "epoch": 0.35300338340864634, "grad_norm": 0.8422931432723999, "learning_rate": 9.568843596100713e-05, "loss": 0.9434219360351562, "memory(GiB)": 91.52, "step": 27205, "token_acc": 0.7551380170263995, "train_speed(iter/s)": 0.190381 }, { "epoch": 0.35306826181030204, "grad_norm": 0.8623948693275452, "learning_rate": 9.568625675516447e-05, "loss": 0.979678726196289, "memory(GiB)": 91.52, "step": 27210, "token_acc": 0.7586206896551724, "train_speed(iter/s)": 0.190363 }, { "epoch": 0.35313314021195774, "grad_norm": 0.857521116733551, "learning_rate": 9.568407702356732e-05, "loss": 0.9775978088378906, "memory(GiB)": 91.52, "step": 27215, "token_acc": 0.7423758460719446, "train_speed(iter/s)": 0.190344 }, { "epoch": 0.35319801861361344, "grad_norm": 0.8056617379188538, "learning_rate": 9.568189676624075e-05, "loss": 0.972038459777832, "memory(GiB)": 91.52, "step": 27220, "token_acc": 0.7549107142857143, "train_speed(iter/s)": 0.190324 }, { "epoch": 0.35326289701526914, "grad_norm": 0.7993339896202087, "learning_rate": 9.567971598320984e-05, "loss": 0.9422552108764648, "memory(GiB)": 91.52, "step": 27225, "token_acc": 0.7579707391221737, "train_speed(iter/s)": 0.190304 }, { "epoch": 0.35332777541692484, "grad_norm": 0.7250942587852478, "learning_rate": 9.567753467449971e-05, "loss": 0.9530336380004882, "memory(GiB)": 91.52, "step": 27230, "token_acc": 0.7390133563119345, "train_speed(iter/s)": 0.190287 }, { "epoch": 0.35339265381858054, "grad_norm": 0.7662692070007324, "learning_rate": 9.567535284013545e-05, "loss": 0.9740351676940918, "memory(GiB)": 91.52, "step": 27235, "token_acc": 0.7540193931811073, "train_speed(iter/s)": 0.190269 }, { "epoch": 0.35345753222023624, "grad_norm": 0.8721308708190918, "learning_rate": 9.567317048014217e-05, "loss": 0.9415456771850585, "memory(GiB)": 91.52, "step": 27240, "token_acc": 0.7344769874476987, "train_speed(iter/s)": 0.190252 }, { "epoch": 0.35352241062189194, "grad_norm": 0.7968454360961914, "learning_rate": 9.567098759454499e-05, "loss": 0.9298310279846191, "memory(GiB)": 91.52, "step": 27245, "token_acc": 0.7664180098574663, "train_speed(iter/s)": 0.190232 }, { "epoch": 0.35358728902354764, "grad_norm": 0.7567636966705322, "learning_rate": 9.5668804183369e-05, "loss": 0.955807876586914, "memory(GiB)": 91.52, "step": 27250, "token_acc": 0.7410432320592399, "train_speed(iter/s)": 0.19021 }, { "epoch": 0.35365216742520333, "grad_norm": 0.9118586778640747, "learning_rate": 9.566662024663937e-05, "loss": 0.9562811851501465, "memory(GiB)": 91.52, "step": 27255, "token_acc": 0.7595884773662551, "train_speed(iter/s)": 0.190189 }, { "epoch": 0.35371704582685903, "grad_norm": 0.8651736974716187, "learning_rate": 9.566443578438122e-05, "loss": 0.9740751266479493, "memory(GiB)": 91.52, "step": 27260, "token_acc": 0.7661064897450578, "train_speed(iter/s)": 0.190169 }, { "epoch": 0.35378192422851473, "grad_norm": 0.9112225770950317, "learning_rate": 9.566225079661967e-05, "loss": 0.9629279136657715, "memory(GiB)": 91.52, "step": 27265, "token_acc": 0.7275843406097781, "train_speed(iter/s)": 0.190152 }, { "epoch": 0.3538468026301704, "grad_norm": 0.9177769422531128, "learning_rate": 9.566006528337987e-05, "loss": 0.9380632400512695, "memory(GiB)": 91.52, "step": 27270, "token_acc": 0.7415206625327729, "train_speed(iter/s)": 0.190134 }, { "epoch": 0.3539116810318261, "grad_norm": 0.8557824492454529, "learning_rate": 9.565787924468698e-05, "loss": 0.9827388763427735, "memory(GiB)": 91.52, "step": 27275, "token_acc": 0.7316039647739415, "train_speed(iter/s)": 0.190115 }, { "epoch": 0.3539765594334818, "grad_norm": 0.7904674410820007, "learning_rate": 9.565569268056618e-05, "loss": 0.9571426391601563, "memory(GiB)": 91.52, "step": 27280, "token_acc": 0.7598550781711495, "train_speed(iter/s)": 0.190097 }, { "epoch": 0.3540414378351375, "grad_norm": 0.7973254919052124, "learning_rate": 9.565350559104258e-05, "loss": 0.9354253768920898, "memory(GiB)": 91.52, "step": 27285, "token_acc": 0.7684837314808252, "train_speed(iter/s)": 0.190076 }, { "epoch": 0.3541063162367932, "grad_norm": 0.8618850708007812, "learning_rate": 9.565131797614137e-05, "loss": 0.936886215209961, "memory(GiB)": 91.52, "step": 27290, "token_acc": 0.7576803356340506, "train_speed(iter/s)": 0.190055 }, { "epoch": 0.35417119463844887, "grad_norm": 0.8206112384796143, "learning_rate": 9.564912983588776e-05, "loss": 0.9304325103759765, "memory(GiB)": 91.52, "step": 27295, "token_acc": 0.7443175217812198, "train_speed(iter/s)": 0.190037 }, { "epoch": 0.35423607304010457, "grad_norm": 0.7850489616394043, "learning_rate": 9.564694117030688e-05, "loss": 0.943571662902832, "memory(GiB)": 91.52, "step": 27300, "token_acc": 0.7488360594948704, "train_speed(iter/s)": 0.19002 }, { "epoch": 0.35430095144176027, "grad_norm": 0.7711634635925293, "learning_rate": 9.564475197942396e-05, "loss": 0.9572771072387696, "memory(GiB)": 91.52, "step": 27305, "token_acc": 0.754514100223169, "train_speed(iter/s)": 0.189998 }, { "epoch": 0.35436582984341597, "grad_norm": 0.7641965746879578, "learning_rate": 9.564256226326414e-05, "loss": 0.9497550964355469, "memory(GiB)": 91.52, "step": 27310, "token_acc": 0.7416051895192063, "train_speed(iter/s)": 0.18998 }, { "epoch": 0.35443070824507167, "grad_norm": 0.7435631155967712, "learning_rate": 9.564037202185268e-05, "loss": 0.9999545097351075, "memory(GiB)": 91.52, "step": 27315, "token_acc": 0.7246497387600253, "train_speed(iter/s)": 0.18996 }, { "epoch": 0.35449558664672737, "grad_norm": 0.878481924533844, "learning_rate": 9.563818125521473e-05, "loss": 1.0057943344116211, "memory(GiB)": 91.52, "step": 27320, "token_acc": 0.7441711401330021, "train_speed(iter/s)": 0.189942 }, { "epoch": 0.35456046504838307, "grad_norm": 0.7587834000587463, "learning_rate": 9.563598996337555e-05, "loss": 0.9525842666625977, "memory(GiB)": 91.52, "step": 27325, "token_acc": 0.7708395578129669, "train_speed(iter/s)": 0.189921 }, { "epoch": 0.35462534345003877, "grad_norm": 0.7756629586219788, "learning_rate": 9.563379814636031e-05, "loss": 0.9260244369506836, "memory(GiB)": 91.52, "step": 27330, "token_acc": 0.7411713752665245, "train_speed(iter/s)": 0.189902 }, { "epoch": 0.35469022185169446, "grad_norm": 0.7748023271560669, "learning_rate": 9.563160580419428e-05, "loss": 0.9987078666687011, "memory(GiB)": 91.52, "step": 27335, "token_acc": 0.7409376349802306, "train_speed(iter/s)": 0.189883 }, { "epoch": 0.35475510025335016, "grad_norm": 0.8067600727081299, "learning_rate": 9.562941293690266e-05, "loss": 0.9708421707153321, "memory(GiB)": 91.52, "step": 27340, "token_acc": 0.7583727589641435, "train_speed(iter/s)": 0.189862 }, { "epoch": 0.35481997865500586, "grad_norm": 0.8045753240585327, "learning_rate": 9.562721954451069e-05, "loss": 0.9610343933105469, "memory(GiB)": 91.52, "step": 27345, "token_acc": 0.7304895256702486, "train_speed(iter/s)": 0.189844 }, { "epoch": 0.35488485705666156, "grad_norm": 0.7668873071670532, "learning_rate": 9.562502562704361e-05, "loss": 0.9634347915649414, "memory(GiB)": 91.52, "step": 27350, "token_acc": 0.7502601456815817, "train_speed(iter/s)": 0.189824 }, { "epoch": 0.35494973545831726, "grad_norm": 0.7007825970649719, "learning_rate": 9.562283118452668e-05, "loss": 0.9243105888366699, "memory(GiB)": 91.52, "step": 27355, "token_acc": 0.7666768406416387, "train_speed(iter/s)": 0.189806 }, { "epoch": 0.35501461385997296, "grad_norm": 0.7961570024490356, "learning_rate": 9.562063621698515e-05, "loss": 0.9865753173828125, "memory(GiB)": 91.52, "step": 27360, "token_acc": 0.7343112106651252, "train_speed(iter/s)": 0.189786 }, { "epoch": 0.35507949226162866, "grad_norm": 0.8617417216300964, "learning_rate": 9.561844072444426e-05, "loss": 0.9797248840332031, "memory(GiB)": 91.52, "step": 27365, "token_acc": 0.7486461665136017, "train_speed(iter/s)": 0.189766 }, { "epoch": 0.35514437066328436, "grad_norm": 0.7055057287216187, "learning_rate": 9.561624470692931e-05, "loss": 0.9478679656982422, "memory(GiB)": 91.52, "step": 27370, "token_acc": 0.7385832544216024, "train_speed(iter/s)": 0.189744 }, { "epoch": 0.35520924906494006, "grad_norm": 0.7628471255302429, "learning_rate": 9.561404816446554e-05, "loss": 0.9907445907592773, "memory(GiB)": 91.52, "step": 27375, "token_acc": 0.7452887231683967, "train_speed(iter/s)": 0.189727 }, { "epoch": 0.35527412746659576, "grad_norm": 0.7901588678359985, "learning_rate": 9.561185109707825e-05, "loss": 0.9602056503295898, "memory(GiB)": 91.52, "step": 27380, "token_acc": 0.7465802972510851, "train_speed(iter/s)": 0.189709 }, { "epoch": 0.35533900586825146, "grad_norm": 0.7361676096916199, "learning_rate": 9.560965350479269e-05, "loss": 0.9330926895141601, "memory(GiB)": 91.52, "step": 27385, "token_acc": 0.7403026427962489, "train_speed(iter/s)": 0.18969 }, { "epoch": 0.3554038842699071, "grad_norm": 0.7965372204780579, "learning_rate": 9.560745538763419e-05, "loss": 0.9397675514221191, "memory(GiB)": 91.52, "step": 27390, "token_acc": 0.7423111145498859, "train_speed(iter/s)": 0.189672 }, { "epoch": 0.3554687626715628, "grad_norm": 0.7468025088310242, "learning_rate": 9.560525674562802e-05, "loss": 0.9381653785705566, "memory(GiB)": 91.52, "step": 27395, "token_acc": 0.7513272318665929, "train_speed(iter/s)": 0.189651 }, { "epoch": 0.3555336410732185, "grad_norm": 0.7572433352470398, "learning_rate": 9.560305757879949e-05, "loss": 0.9458765029907227, "memory(GiB)": 91.52, "step": 27400, "token_acc": 0.7247953250209738, "train_speed(iter/s)": 0.189632 }, { "epoch": 0.3555985194748742, "grad_norm": 0.8822898864746094, "learning_rate": 9.560085788717392e-05, "loss": 0.9819944381713868, "memory(GiB)": 91.52, "step": 27405, "token_acc": 0.7323510424182584, "train_speed(iter/s)": 0.189614 }, { "epoch": 0.3556633978765299, "grad_norm": 0.8395500779151917, "learning_rate": 9.55986576707766e-05, "loss": 1.002618408203125, "memory(GiB)": 91.52, "step": 27410, "token_acc": 0.7410358565737052, "train_speed(iter/s)": 0.189595 }, { "epoch": 0.3557282762781856, "grad_norm": 0.8569678068161011, "learning_rate": 9.559645692963285e-05, "loss": 0.9535970687866211, "memory(GiB)": 91.52, "step": 27415, "token_acc": 0.7279307225022619, "train_speed(iter/s)": 0.189576 }, { "epoch": 0.3557931546798413, "grad_norm": 0.8682976365089417, "learning_rate": 9.559425566376805e-05, "loss": 0.9133070945739746, "memory(GiB)": 91.52, "step": 27420, "token_acc": 0.7546395780425864, "train_speed(iter/s)": 0.189553 }, { "epoch": 0.355858033081497, "grad_norm": 0.7775744199752808, "learning_rate": 9.559205387320747e-05, "loss": 0.955974006652832, "memory(GiB)": 91.52, "step": 27425, "token_acc": 0.7412798236514523, "train_speed(iter/s)": 0.189535 }, { "epoch": 0.3559229114831527, "grad_norm": 0.8897885084152222, "learning_rate": 9.558985155797645e-05, "loss": 0.9689780235290527, "memory(GiB)": 91.52, "step": 27430, "token_acc": 0.7414843671960843, "train_speed(iter/s)": 0.189516 }, { "epoch": 0.3559877898848084, "grad_norm": 0.7664679884910583, "learning_rate": 9.558764871810037e-05, "loss": 0.9699048042297364, "memory(GiB)": 91.52, "step": 27435, "token_acc": 0.7346281324536134, "train_speed(iter/s)": 0.189495 }, { "epoch": 0.3560526682864641, "grad_norm": 0.8378893733024597, "learning_rate": 9.558544535360457e-05, "loss": 0.9821043968200683, "memory(GiB)": 91.52, "step": 27440, "token_acc": 0.7317905602264214, "train_speed(iter/s)": 0.189478 }, { "epoch": 0.3561175466881198, "grad_norm": 0.8138142228126526, "learning_rate": 9.558324146451438e-05, "loss": 0.9395366668701172, "memory(GiB)": 91.52, "step": 27445, "token_acc": 0.7542872916738553, "train_speed(iter/s)": 0.189459 }, { "epoch": 0.3561824250897755, "grad_norm": 0.8209066987037659, "learning_rate": 9.558103705085519e-05, "loss": 0.952891731262207, "memory(GiB)": 91.52, "step": 27450, "token_acc": 0.7377065767284992, "train_speed(iter/s)": 0.189442 }, { "epoch": 0.3562473034914312, "grad_norm": 0.73843914270401, "learning_rate": 9.557883211265235e-05, "loss": 0.9609873771667481, "memory(GiB)": 91.52, "step": 27455, "token_acc": 0.7441845052687388, "train_speed(iter/s)": 0.189424 }, { "epoch": 0.3563121818930869, "grad_norm": 0.765644371509552, "learning_rate": 9.557662664993125e-05, "loss": 0.932757568359375, "memory(GiB)": 91.52, "step": 27460, "token_acc": 0.7307965355672299, "train_speed(iter/s)": 0.189403 }, { "epoch": 0.3563770602947426, "grad_norm": 0.826770544052124, "learning_rate": 9.557442066271727e-05, "loss": 0.9207127571105957, "memory(GiB)": 91.52, "step": 27465, "token_acc": 0.753479259559519, "train_speed(iter/s)": 0.189383 }, { "epoch": 0.3564419386963983, "grad_norm": 0.7509761452674866, "learning_rate": 9.557221415103578e-05, "loss": 0.9478527069091797, "memory(GiB)": 91.52, "step": 27470, "token_acc": 0.7474845829276209, "train_speed(iter/s)": 0.189364 }, { "epoch": 0.356506817098054, "grad_norm": 0.7250659465789795, "learning_rate": 9.55700071149122e-05, "loss": 0.95069580078125, "memory(GiB)": 91.52, "step": 27475, "token_acc": 0.7460154593337438, "train_speed(iter/s)": 0.189343 }, { "epoch": 0.3565716954997097, "grad_norm": 0.8157508373260498, "learning_rate": 9.556779955437189e-05, "loss": 0.939570140838623, "memory(GiB)": 91.52, "step": 27480, "token_acc": 0.7308217408951699, "train_speed(iter/s)": 0.189326 }, { "epoch": 0.3566365739013654, "grad_norm": 0.7750990986824036, "learning_rate": 9.556559146944029e-05, "loss": 0.9059293746948243, "memory(GiB)": 91.52, "step": 27485, "token_acc": 0.7589141764463275, "train_speed(iter/s)": 0.189306 }, { "epoch": 0.3567014523030211, "grad_norm": 0.8077378273010254, "learning_rate": 9.556338286014279e-05, "loss": 1.0235223770141602, "memory(GiB)": 91.52, "step": 27490, "token_acc": 0.7359625413986067, "train_speed(iter/s)": 0.189289 }, { "epoch": 0.3567663307046768, "grad_norm": 0.7421767115592957, "learning_rate": 9.556117372650481e-05, "loss": 0.9176414489746094, "memory(GiB)": 91.52, "step": 27495, "token_acc": 0.7350650241910579, "train_speed(iter/s)": 0.189269 }, { "epoch": 0.3568312091063325, "grad_norm": 0.7726659178733826, "learning_rate": 9.555896406855178e-05, "loss": 1.0228584289550782, "memory(GiB)": 91.52, "step": 27500, "token_acc": 0.7341507286810323, "train_speed(iter/s)": 0.189249 }, { "epoch": 0.3568960875079882, "grad_norm": 0.8585420846939087, "learning_rate": 9.555675388630911e-05, "loss": 0.931201171875, "memory(GiB)": 91.52, "step": 27505, "token_acc": 0.7619120989176071, "train_speed(iter/s)": 0.189231 }, { "epoch": 0.3569609659096438, "grad_norm": 0.8169578909873962, "learning_rate": 9.555454317980227e-05, "loss": 0.9127185821533204, "memory(GiB)": 91.52, "step": 27510, "token_acc": 0.7369473777097979, "train_speed(iter/s)": 0.189211 }, { "epoch": 0.3570258443112995, "grad_norm": 0.8304630517959595, "learning_rate": 9.555233194905666e-05, "loss": 0.9859523773193359, "memory(GiB)": 91.52, "step": 27515, "token_acc": 0.725504439063761, "train_speed(iter/s)": 0.189194 }, { "epoch": 0.3570907227129552, "grad_norm": 0.7231667637825012, "learning_rate": 9.555012019409776e-05, "loss": 0.9652267456054687, "memory(GiB)": 91.52, "step": 27520, "token_acc": 0.7550051247010591, "train_speed(iter/s)": 0.189177 }, { "epoch": 0.3571556011146109, "grad_norm": 0.8506848216056824, "learning_rate": 9.554790791495101e-05, "loss": 0.9448649406433105, "memory(GiB)": 91.52, "step": 27525, "token_acc": 0.7572134651349185, "train_speed(iter/s)": 0.18916 }, { "epoch": 0.3572204795162666, "grad_norm": 0.7665948867797852, "learning_rate": 9.554569511164185e-05, "loss": 0.9435842514038086, "memory(GiB)": 91.52, "step": 27530, "token_acc": 0.7400812936290536, "train_speed(iter/s)": 0.189142 }, { "epoch": 0.3572853579179223, "grad_norm": 2.4450299739837646, "learning_rate": 9.554348178419578e-05, "loss": 0.9120098114013672, "memory(GiB)": 91.52, "step": 27535, "token_acc": 0.7550537311189599, "train_speed(iter/s)": 0.189121 }, { "epoch": 0.357350236319578, "grad_norm": 0.8262280821800232, "learning_rate": 9.554126793263825e-05, "loss": 0.9527225494384766, "memory(GiB)": 91.52, "step": 27540, "token_acc": 0.7359920896506262, "train_speed(iter/s)": 0.189104 }, { "epoch": 0.3574151147212337, "grad_norm": 0.8480983376502991, "learning_rate": 9.553905355699473e-05, "loss": 0.9675064086914062, "memory(GiB)": 91.52, "step": 27545, "token_acc": 0.7321226764350571, "train_speed(iter/s)": 0.189085 }, { "epoch": 0.3574799931228894, "grad_norm": 0.8356218338012695, "learning_rate": 9.553683865729071e-05, "loss": 0.9323188781738281, "memory(GiB)": 91.52, "step": 27550, "token_acc": 0.7396429647484182, "train_speed(iter/s)": 0.189069 }, { "epoch": 0.3575448715245451, "grad_norm": 0.7783671021461487, "learning_rate": 9.553462323355169e-05, "loss": 0.9207768440246582, "memory(GiB)": 91.52, "step": 27555, "token_acc": 0.763368276977634, "train_speed(iter/s)": 0.189049 }, { "epoch": 0.3576097499262008, "grad_norm": 0.8265756964683533, "learning_rate": 9.553240728580315e-05, "loss": 0.9729703903198242, "memory(GiB)": 91.52, "step": 27560, "token_acc": 0.7348932619428412, "train_speed(iter/s)": 0.189029 }, { "epoch": 0.3576746283278565, "grad_norm": 0.7609823346138, "learning_rate": 9.553019081407061e-05, "loss": 0.9659381866455078, "memory(GiB)": 91.52, "step": 27565, "token_acc": 0.7527029039463887, "train_speed(iter/s)": 0.189009 }, { "epoch": 0.3577395067295122, "grad_norm": 0.8177542090415955, "learning_rate": 9.552797381837954e-05, "loss": 0.9171932220458985, "memory(GiB)": 91.52, "step": 27570, "token_acc": 0.7553202904252656, "train_speed(iter/s)": 0.188991 }, { "epoch": 0.3578043851311679, "grad_norm": 0.7356248497962952, "learning_rate": 9.55257562987555e-05, "loss": 1.0143471717834474, "memory(GiB)": 91.52, "step": 27575, "token_acc": 0.7390686852204393, "train_speed(iter/s)": 0.188972 }, { "epoch": 0.3578692635328236, "grad_norm": 0.7745472192764282, "learning_rate": 9.552353825522399e-05, "loss": 0.9423822402954102, "memory(GiB)": 91.52, "step": 27580, "token_acc": 0.7427362928410484, "train_speed(iter/s)": 0.188955 }, { "epoch": 0.3579341419344793, "grad_norm": 0.841944694519043, "learning_rate": 9.552131968781052e-05, "loss": 0.9510408401489258, "memory(GiB)": 91.52, "step": 27585, "token_acc": 0.7546148414646447, "train_speed(iter/s)": 0.188937 }, { "epoch": 0.357999020336135, "grad_norm": 0.793538510799408, "learning_rate": 9.551910059654064e-05, "loss": 0.883000373840332, "memory(GiB)": 91.52, "step": 27590, "token_acc": 0.7810370172390753, "train_speed(iter/s)": 0.188917 }, { "epoch": 0.3580638987377907, "grad_norm": 0.7638513445854187, "learning_rate": 9.551688098143987e-05, "loss": 0.9131145477294922, "memory(GiB)": 91.52, "step": 27595, "token_acc": 0.7455756698044895, "train_speed(iter/s)": 0.188898 }, { "epoch": 0.3581287771394464, "grad_norm": 0.7722308039665222, "learning_rate": 9.551466084253377e-05, "loss": 0.9704385757446289, "memory(GiB)": 91.52, "step": 27600, "token_acc": 0.7417629076086957, "train_speed(iter/s)": 0.188879 }, { "epoch": 0.3581936555411021, "grad_norm": 0.749789834022522, "learning_rate": 9.551244017984788e-05, "loss": 0.9389432907104492, "memory(GiB)": 91.52, "step": 27605, "token_acc": 0.7329694401551334, "train_speed(iter/s)": 0.188862 }, { "epoch": 0.3582585339427578, "grad_norm": 0.7435270547866821, "learning_rate": 9.551021899340776e-05, "loss": 0.9735282897949219, "memory(GiB)": 91.52, "step": 27610, "token_acc": 0.7395135352452425, "train_speed(iter/s)": 0.188842 }, { "epoch": 0.3583234123444135, "grad_norm": 0.7315138578414917, "learning_rate": 9.550799728323896e-05, "loss": 0.9261170387268066, "memory(GiB)": 91.52, "step": 27615, "token_acc": 0.7472982771121776, "train_speed(iter/s)": 0.188824 }, { "epoch": 0.3583882907460692, "grad_norm": 0.8765550255775452, "learning_rate": 9.550577504936707e-05, "loss": 0.9941320419311523, "memory(GiB)": 91.52, "step": 27620, "token_acc": 0.7371874888548093, "train_speed(iter/s)": 0.188807 }, { "epoch": 0.3584531691477249, "grad_norm": 0.764388918876648, "learning_rate": 9.550355229181765e-05, "loss": 0.9906654357910156, "memory(GiB)": 91.52, "step": 27625, "token_acc": 0.7416515822982336, "train_speed(iter/s)": 0.188787 }, { "epoch": 0.35851804754938055, "grad_norm": 0.6977697610855103, "learning_rate": 9.550132901061626e-05, "loss": 0.9580162048339844, "memory(GiB)": 91.52, "step": 27630, "token_acc": 0.7629700584000247, "train_speed(iter/s)": 0.188767 }, { "epoch": 0.35858292595103625, "grad_norm": 0.7984714508056641, "learning_rate": 9.549910520578851e-05, "loss": 0.9322273254394531, "memory(GiB)": 91.52, "step": 27635, "token_acc": 0.7634912682507873, "train_speed(iter/s)": 0.188748 }, { "epoch": 0.35864780435269195, "grad_norm": 0.6838639378547668, "learning_rate": 9.549688087736e-05, "loss": 0.9184390068054199, "memory(GiB)": 91.52, "step": 27640, "token_acc": 0.7657280022929206, "train_speed(iter/s)": 0.18873 }, { "epoch": 0.35871268275434764, "grad_norm": 0.7285974621772766, "learning_rate": 9.54946560253563e-05, "loss": 1.0092796325683593, "memory(GiB)": 91.52, "step": 27645, "token_acc": 0.739032233366511, "train_speed(iter/s)": 0.188713 }, { "epoch": 0.35877756115600334, "grad_norm": 0.7630103230476379, "learning_rate": 9.549243064980302e-05, "loss": 0.9057506561279297, "memory(GiB)": 91.52, "step": 27650, "token_acc": 0.747447803464401, "train_speed(iter/s)": 0.188693 }, { "epoch": 0.35884243955765904, "grad_norm": 0.7797348499298096, "learning_rate": 9.549020475072578e-05, "loss": 0.9394162178039551, "memory(GiB)": 91.52, "step": 27655, "token_acc": 0.7539962069899756, "train_speed(iter/s)": 0.188674 }, { "epoch": 0.35890731795931474, "grad_norm": 0.7715123295783997, "learning_rate": 9.548797832815022e-05, "loss": 0.9897037506103515, "memory(GiB)": 91.52, "step": 27660, "token_acc": 0.7278723475132416, "train_speed(iter/s)": 0.188655 }, { "epoch": 0.35897219636097044, "grad_norm": 0.7815625667572021, "learning_rate": 9.54857513821019e-05, "loss": 0.9320655822753906, "memory(GiB)": 91.52, "step": 27665, "token_acc": 0.7565479861088572, "train_speed(iter/s)": 0.188635 }, { "epoch": 0.35903707476262614, "grad_norm": 0.8178109526634216, "learning_rate": 9.548352391260649e-05, "loss": 0.9561649322509765, "memory(GiB)": 91.52, "step": 27670, "token_acc": 0.7578684948582113, "train_speed(iter/s)": 0.188618 }, { "epoch": 0.35910195316428184, "grad_norm": 0.7460406422615051, "learning_rate": 9.548129591968962e-05, "loss": 0.9613234519958496, "memory(GiB)": 91.52, "step": 27675, "token_acc": 0.7556957226071072, "train_speed(iter/s)": 0.188597 }, { "epoch": 0.35916683156593754, "grad_norm": 0.9468928575515747, "learning_rate": 9.54790674033769e-05, "loss": 0.9850525856018066, "memory(GiB)": 91.52, "step": 27680, "token_acc": 0.7346952884648622, "train_speed(iter/s)": 0.188577 }, { "epoch": 0.35923170996759324, "grad_norm": 0.8292118310928345, "learning_rate": 9.547683836369404e-05, "loss": 0.9642829895019531, "memory(GiB)": 91.52, "step": 27685, "token_acc": 0.7440668611527241, "train_speed(iter/s)": 0.188555 }, { "epoch": 0.35929658836924894, "grad_norm": 0.7596157789230347, "learning_rate": 9.547460880066663e-05, "loss": 0.945427131652832, "memory(GiB)": 91.52, "step": 27690, "token_acc": 0.7489990982867448, "train_speed(iter/s)": 0.188534 }, { "epoch": 0.35936146677090464, "grad_norm": 0.8504999279975891, "learning_rate": 9.547237871432034e-05, "loss": 0.9618513107299804, "memory(GiB)": 91.52, "step": 27695, "token_acc": 0.730239110569601, "train_speed(iter/s)": 0.188515 }, { "epoch": 0.35942634517256034, "grad_norm": 1.4181100130081177, "learning_rate": 9.547014810468084e-05, "loss": 0.95435791015625, "memory(GiB)": 91.52, "step": 27700, "token_acc": 0.7586839089867409, "train_speed(iter/s)": 0.188493 }, { "epoch": 0.35949122357421603, "grad_norm": 0.7073531150817871, "learning_rate": 9.546791697177382e-05, "loss": 0.9374525070190429, "memory(GiB)": 91.52, "step": 27705, "token_acc": 0.7486706056129985, "train_speed(iter/s)": 0.188474 }, { "epoch": 0.35955610197587173, "grad_norm": 0.8070125579833984, "learning_rate": 9.546568531562494e-05, "loss": 0.9529640197753906, "memory(GiB)": 91.52, "step": 27710, "token_acc": 0.7550073277967758, "train_speed(iter/s)": 0.188456 }, { "epoch": 0.35962098037752743, "grad_norm": 0.7637696862220764, "learning_rate": 9.546345313625986e-05, "loss": 0.9154956817626954, "memory(GiB)": 91.52, "step": 27715, "token_acc": 0.7634946191000762, "train_speed(iter/s)": 0.188436 }, { "epoch": 0.35968585877918313, "grad_norm": 0.8014907836914062, "learning_rate": 9.54612204337043e-05, "loss": 0.9656970024108886, "memory(GiB)": 91.52, "step": 27720, "token_acc": 0.7433768560305922, "train_speed(iter/s)": 0.188417 }, { "epoch": 0.35975073718083883, "grad_norm": 0.7958313226699829, "learning_rate": 9.545898720798395e-05, "loss": 0.9314988136291504, "memory(GiB)": 91.52, "step": 27725, "token_acc": 0.754596981480261, "train_speed(iter/s)": 0.188398 }, { "epoch": 0.35981561558249453, "grad_norm": 0.7924172282218933, "learning_rate": 9.545675345912448e-05, "loss": 0.9272882461547851, "memory(GiB)": 91.52, "step": 27730, "token_acc": 0.7608605566682782, "train_speed(iter/s)": 0.18838 }, { "epoch": 0.35988049398415023, "grad_norm": 0.7921887636184692, "learning_rate": 9.545451918715163e-05, "loss": 0.9610729217529297, "memory(GiB)": 91.52, "step": 27735, "token_acc": 0.7403024574669187, "train_speed(iter/s)": 0.188361 }, { "epoch": 0.35994537238580593, "grad_norm": 0.8078672885894775, "learning_rate": 9.545228439209109e-05, "loss": 0.9758553504943848, "memory(GiB)": 91.52, "step": 27740, "token_acc": 0.7267651888341543, "train_speed(iter/s)": 0.188342 }, { "epoch": 0.3600102507874616, "grad_norm": 0.7830342650413513, "learning_rate": 9.545004907396859e-05, "loss": 0.9408992767333985, "memory(GiB)": 91.52, "step": 27745, "token_acc": 0.7550563532499615, "train_speed(iter/s)": 0.188324 }, { "epoch": 0.36007512918911727, "grad_norm": 0.7940805554389954, "learning_rate": 9.544781323280985e-05, "loss": 0.93812255859375, "memory(GiB)": 91.52, "step": 27750, "token_acc": 0.7151790093182933, "train_speed(iter/s)": 0.188306 }, { "epoch": 0.36014000759077297, "grad_norm": 0.873288094997406, "learning_rate": 9.54455768686406e-05, "loss": 0.9335803031921387, "memory(GiB)": 91.52, "step": 27755, "token_acc": 0.7642198581560283, "train_speed(iter/s)": 0.188286 }, { "epoch": 0.36020488599242867, "grad_norm": 0.8112235069274902, "learning_rate": 9.544333998148658e-05, "loss": 0.9986127853393555, "memory(GiB)": 91.52, "step": 27760, "token_acc": 0.7283226659992134, "train_speed(iter/s)": 0.188268 }, { "epoch": 0.36026976439408437, "grad_norm": 0.8039928674697876, "learning_rate": 9.544110257137354e-05, "loss": 0.9680112838745117, "memory(GiB)": 91.52, "step": 27765, "token_acc": 0.7166352440932248, "train_speed(iter/s)": 0.188248 }, { "epoch": 0.36033464279574007, "grad_norm": 0.7871840000152588, "learning_rate": 9.543886463832718e-05, "loss": 0.9775851249694825, "memory(GiB)": 91.52, "step": 27770, "token_acc": 0.7446615711921835, "train_speed(iter/s)": 0.188229 }, { "epoch": 0.36039952119739577, "grad_norm": 0.7362667322158813, "learning_rate": 9.543662618237333e-05, "loss": 0.9311145782470703, "memory(GiB)": 91.52, "step": 27775, "token_acc": 0.7417788261412516, "train_speed(iter/s)": 0.18821 }, { "epoch": 0.36046439959905147, "grad_norm": 0.8345035910606384, "learning_rate": 9.543438720353767e-05, "loss": 0.9670232772827149, "memory(GiB)": 91.52, "step": 27780, "token_acc": 0.738247132983975, "train_speed(iter/s)": 0.188192 }, { "epoch": 0.36052927800070717, "grad_norm": 0.7280612587928772, "learning_rate": 9.543214770184605e-05, "loss": 0.8881866455078125, "memory(GiB)": 91.52, "step": 27785, "token_acc": 0.7554885618017378, "train_speed(iter/s)": 0.188172 }, { "epoch": 0.36059415640236286, "grad_norm": 0.7482603192329407, "learning_rate": 9.542990767732417e-05, "loss": 0.955177116394043, "memory(GiB)": 91.52, "step": 27790, "token_acc": 0.744602073427391, "train_speed(iter/s)": 0.188152 }, { "epoch": 0.36065903480401856, "grad_norm": 0.8157876133918762, "learning_rate": 9.542766712999784e-05, "loss": 1.0080326080322266, "memory(GiB)": 91.52, "step": 27795, "token_acc": 0.742320819112628, "train_speed(iter/s)": 0.188137 }, { "epoch": 0.36072391320567426, "grad_norm": 0.8641220331192017, "learning_rate": 9.542542605989284e-05, "loss": 0.9788128852844238, "memory(GiB)": 91.52, "step": 27800, "token_acc": 0.7220444185820968, "train_speed(iter/s)": 0.188121 }, { "epoch": 0.36078879160732996, "grad_norm": 0.7886224389076233, "learning_rate": 9.542318446703497e-05, "loss": 0.9292998313903809, "memory(GiB)": 91.52, "step": 27805, "token_acc": 0.7406281996234766, "train_speed(iter/s)": 0.1881 }, { "epoch": 0.36085367000898566, "grad_norm": 0.795903742313385, "learning_rate": 9.542094235145001e-05, "loss": 0.8972824096679688, "memory(GiB)": 91.52, "step": 27810, "token_acc": 0.7452019532876557, "train_speed(iter/s)": 0.188083 }, { "epoch": 0.36091854841064136, "grad_norm": 0.7715718150138855, "learning_rate": 9.541869971316377e-05, "loss": 0.94796781539917, "memory(GiB)": 91.52, "step": 27815, "token_acc": 0.7434861123054433, "train_speed(iter/s)": 0.188064 }, { "epoch": 0.36098342681229706, "grad_norm": 0.7505821585655212, "learning_rate": 9.541645655220206e-05, "loss": 0.9243068695068359, "memory(GiB)": 91.52, "step": 27820, "token_acc": 0.7789810576457892, "train_speed(iter/s)": 0.188046 }, { "epoch": 0.36104830521395276, "grad_norm": 0.7550058364868164, "learning_rate": 9.541421286859069e-05, "loss": 0.9171161651611328, "memory(GiB)": 91.52, "step": 27825, "token_acc": 0.7474339888740892, "train_speed(iter/s)": 0.188027 }, { "epoch": 0.36111318361560846, "grad_norm": 0.7349224090576172, "learning_rate": 9.541196866235547e-05, "loss": 0.9406947135925293, "memory(GiB)": 91.52, "step": 27830, "token_acc": 0.7303827164816695, "train_speed(iter/s)": 0.188009 }, { "epoch": 0.36117806201726416, "grad_norm": 0.8509812951087952, "learning_rate": 9.540972393352225e-05, "loss": 0.9116569519042969, "memory(GiB)": 91.52, "step": 27835, "token_acc": 0.7631681347058615, "train_speed(iter/s)": 0.18799 }, { "epoch": 0.36124294041891986, "grad_norm": 0.8284879922866821, "learning_rate": 9.540747868211684e-05, "loss": 0.9738246917724609, "memory(GiB)": 91.52, "step": 27840, "token_acc": 0.7242332135949158, "train_speed(iter/s)": 0.187972 }, { "epoch": 0.36130781882057555, "grad_norm": 0.7636933922767639, "learning_rate": 9.54052329081651e-05, "loss": 0.9609828948974609, "memory(GiB)": 91.52, "step": 27845, "token_acc": 0.7473822467072859, "train_speed(iter/s)": 0.187954 }, { "epoch": 0.36137269722223125, "grad_norm": 0.8299164175987244, "learning_rate": 9.540298661169286e-05, "loss": 0.9714042663574218, "memory(GiB)": 91.52, "step": 27850, "token_acc": 0.7359279302634564, "train_speed(iter/s)": 0.187936 }, { "epoch": 0.36143757562388695, "grad_norm": 0.7173258066177368, "learning_rate": 9.540073979272597e-05, "loss": 0.958642578125, "memory(GiB)": 91.52, "step": 27855, "token_acc": 0.7188230336002452, "train_speed(iter/s)": 0.187919 }, { "epoch": 0.36150245402554265, "grad_norm": 0.7372323870658875, "learning_rate": 9.539849245129027e-05, "loss": 0.9448923110961914, "memory(GiB)": 91.52, "step": 27860, "token_acc": 0.7408149890126836, "train_speed(iter/s)": 0.187901 }, { "epoch": 0.3615673324271983, "grad_norm": 0.8337293863296509, "learning_rate": 9.539624458741166e-05, "loss": 0.9581514358520508, "memory(GiB)": 91.52, "step": 27865, "token_acc": 0.7449054237642387, "train_speed(iter/s)": 0.187883 }, { "epoch": 0.361632210828854, "grad_norm": 0.7821131348609924, "learning_rate": 9.539399620111598e-05, "loss": 0.9884844779968261, "memory(GiB)": 91.52, "step": 27870, "token_acc": 0.7393082495910259, "train_speed(iter/s)": 0.187864 }, { "epoch": 0.3616970892305097, "grad_norm": 0.743316650390625, "learning_rate": 9.539174729242912e-05, "loss": 0.9346364974975586, "memory(GiB)": 91.52, "step": 27875, "token_acc": 0.768404664315445, "train_speed(iter/s)": 0.187845 }, { "epoch": 0.3617619676321654, "grad_norm": 0.758196234703064, "learning_rate": 9.538949786137697e-05, "loss": 0.9591527938842773, "memory(GiB)": 91.52, "step": 27880, "token_acc": 0.7336233641193881, "train_speed(iter/s)": 0.187827 }, { "epoch": 0.3618268460338211, "grad_norm": 0.7575712203979492, "learning_rate": 9.538724790798537e-05, "loss": 0.9269437789916992, "memory(GiB)": 91.52, "step": 27885, "token_acc": 0.7667751265365148, "train_speed(iter/s)": 0.187809 }, { "epoch": 0.3618917244354768, "grad_norm": 0.8493726849555969, "learning_rate": 9.538499743228025e-05, "loss": 0.902127456665039, "memory(GiB)": 91.52, "step": 27890, "token_acc": 0.7554254567477541, "train_speed(iter/s)": 0.187791 }, { "epoch": 0.3619566028371325, "grad_norm": 0.8523567318916321, "learning_rate": 9.538274643428751e-05, "loss": 0.9487008094787598, "memory(GiB)": 91.52, "step": 27895, "token_acc": 0.7363067292644757, "train_speed(iter/s)": 0.187771 }, { "epoch": 0.3620214812387882, "grad_norm": 0.7889786958694458, "learning_rate": 9.538049491403304e-05, "loss": 0.9075088500976562, "memory(GiB)": 91.52, "step": 27900, "token_acc": 0.7323472252213344, "train_speed(iter/s)": 0.187752 }, { "epoch": 0.3620863596404439, "grad_norm": 0.8224388360977173, "learning_rate": 9.537824287154275e-05, "loss": 0.9208751678466797, "memory(GiB)": 91.52, "step": 27905, "token_acc": 0.7393010464180306, "train_speed(iter/s)": 0.187735 }, { "epoch": 0.3621512380420996, "grad_norm": 0.742867112159729, "learning_rate": 9.537599030684257e-05, "loss": 0.9499755859375, "memory(GiB)": 91.52, "step": 27910, "token_acc": 0.7530403761958813, "train_speed(iter/s)": 0.187718 }, { "epoch": 0.3622161164437553, "grad_norm": 0.74541836977005, "learning_rate": 9.537373721995842e-05, "loss": 0.9131392478942871, "memory(GiB)": 91.52, "step": 27915, "token_acc": 0.7623282718727404, "train_speed(iter/s)": 0.1877 }, { "epoch": 0.362280994845411, "grad_norm": 0.8130046129226685, "learning_rate": 9.53714836109162e-05, "loss": 0.9584520339965821, "memory(GiB)": 91.52, "step": 27920, "token_acc": 0.7372979921645446, "train_speed(iter/s)": 0.187681 }, { "epoch": 0.3623458732470667, "grad_norm": 0.8395792245864868, "learning_rate": 9.536922947974188e-05, "loss": 0.9375776290893555, "memory(GiB)": 91.52, "step": 27925, "token_acc": 0.7644961657473429, "train_speed(iter/s)": 0.187662 }, { "epoch": 0.3624107516487224, "grad_norm": 0.6997215747833252, "learning_rate": 9.536697482646138e-05, "loss": 0.9618489265441894, "memory(GiB)": 91.52, "step": 27930, "token_acc": 0.7547913084258246, "train_speed(iter/s)": 0.187645 }, { "epoch": 0.3624756300503781, "grad_norm": 0.8633074760437012, "learning_rate": 9.536471965110066e-05, "loss": 0.9820750236511231, "memory(GiB)": 91.52, "step": 27935, "token_acc": 0.7423452190881674, "train_speed(iter/s)": 0.187628 }, { "epoch": 0.3625405084520338, "grad_norm": 0.7382888197898865, "learning_rate": 9.536246395368566e-05, "loss": 0.9185100555419922, "memory(GiB)": 91.52, "step": 27940, "token_acc": 0.743321718931475, "train_speed(iter/s)": 0.18761 }, { "epoch": 0.3626053868536895, "grad_norm": 0.8389622569084167, "learning_rate": 9.536020773424237e-05, "loss": 0.9844951629638672, "memory(GiB)": 91.52, "step": 27945, "token_acc": 0.7698341395991707, "train_speed(iter/s)": 0.187592 }, { "epoch": 0.3626702652553452, "grad_norm": 0.7727460265159607, "learning_rate": 9.535795099279669e-05, "loss": 0.9575870513916016, "memory(GiB)": 91.52, "step": 27950, "token_acc": 0.736471508671274, "train_speed(iter/s)": 0.187574 }, { "epoch": 0.3627351436570009, "grad_norm": 0.8077437281608582, "learning_rate": 9.535569372937466e-05, "loss": 0.926845645904541, "memory(GiB)": 91.52, "step": 27955, "token_acc": 0.7477451985993704, "train_speed(iter/s)": 0.187556 }, { "epoch": 0.3628000220586566, "grad_norm": 0.7570321559906006, "learning_rate": 9.535343594400222e-05, "loss": 0.9362033843994141, "memory(GiB)": 91.52, "step": 27960, "token_acc": 0.7237664159122784, "train_speed(iter/s)": 0.187537 }, { "epoch": 0.3628649004603123, "grad_norm": 0.715092122554779, "learning_rate": 9.535117763670534e-05, "loss": 0.9934881210327149, "memory(GiB)": 91.52, "step": 27965, "token_acc": 0.7299570815450643, "train_speed(iter/s)": 0.187518 }, { "epoch": 0.362929778861968, "grad_norm": 0.8746072053909302, "learning_rate": 9.534891880751005e-05, "loss": 0.9603775024414063, "memory(GiB)": 91.52, "step": 27970, "token_acc": 0.7473795219225361, "train_speed(iter/s)": 0.187499 }, { "epoch": 0.3629946572636237, "grad_norm": 0.8077362775802612, "learning_rate": 9.53466594564423e-05, "loss": 0.9541784286499023, "memory(GiB)": 91.52, "step": 27975, "token_acc": 0.7478663128451553, "train_speed(iter/s)": 0.187482 }, { "epoch": 0.3630595356652794, "grad_norm": 0.8161594271659851, "learning_rate": 9.534439958352812e-05, "loss": 0.9397268295288086, "memory(GiB)": 91.52, "step": 27980, "token_acc": 0.7603658316162383, "train_speed(iter/s)": 0.187466 }, { "epoch": 0.363124414066935, "grad_norm": 0.9323325157165527, "learning_rate": 9.534213918879352e-05, "loss": 0.9767246246337891, "memory(GiB)": 91.52, "step": 27985, "token_acc": 0.7248947562022315, "train_speed(iter/s)": 0.187448 }, { "epoch": 0.3631892924685907, "grad_norm": 0.7264406085014343, "learning_rate": 9.533987827226448e-05, "loss": 0.9647924423217773, "memory(GiB)": 91.52, "step": 27990, "token_acc": 0.7348683808709461, "train_speed(iter/s)": 0.187432 }, { "epoch": 0.3632541708702464, "grad_norm": 0.7788143754005432, "learning_rate": 9.533761683396704e-05, "loss": 0.9362838745117188, "memory(GiB)": 91.52, "step": 27995, "token_acc": 0.7431244576904213, "train_speed(iter/s)": 0.187416 }, { "epoch": 0.3633190492719021, "grad_norm": 0.7388318181037903, "learning_rate": 9.533535487392724e-05, "loss": 0.9176271438598633, "memory(GiB)": 91.52, "step": 28000, "token_acc": 0.755716557695706, "train_speed(iter/s)": 0.187397 }, { "epoch": 0.3633839276735578, "grad_norm": 0.7857837080955505, "learning_rate": 9.533309239217108e-05, "loss": 0.909893798828125, "memory(GiB)": 91.52, "step": 28005, "token_acc": 0.759828699803426, "train_speed(iter/s)": 0.187379 }, { "epoch": 0.3634488060752135, "grad_norm": 0.7547706961631775, "learning_rate": 9.533082938872462e-05, "loss": 0.9696357727050782, "memory(GiB)": 91.52, "step": 28010, "token_acc": 0.738485142624542, "train_speed(iter/s)": 0.187361 }, { "epoch": 0.3635136844768692, "grad_norm": 0.7708939909934998, "learning_rate": 9.53285658636139e-05, "loss": 0.9405368804931641, "memory(GiB)": 91.52, "step": 28015, "token_acc": 0.7529620153513088, "train_speed(iter/s)": 0.187343 }, { "epoch": 0.3635785628785249, "grad_norm": 0.7902277112007141, "learning_rate": 9.532630181686493e-05, "loss": 0.9905908584594727, "memory(GiB)": 91.52, "step": 28020, "token_acc": 0.7253233377232972, "train_speed(iter/s)": 0.187326 }, { "epoch": 0.3636434412801806, "grad_norm": 0.7533190250396729, "learning_rate": 9.532403724850383e-05, "loss": 0.9744152069091797, "memory(GiB)": 91.52, "step": 28025, "token_acc": 0.7602099952273812, "train_speed(iter/s)": 0.187309 }, { "epoch": 0.3637083196818363, "grad_norm": 0.8155835866928101, "learning_rate": 9.53217721585566e-05, "loss": 0.9935551643371582, "memory(GiB)": 91.52, "step": 28030, "token_acc": 0.739618708180626, "train_speed(iter/s)": 0.187291 }, { "epoch": 0.363773198083492, "grad_norm": 0.7944484353065491, "learning_rate": 9.531950654704936e-05, "loss": 0.9497754096984863, "memory(GiB)": 91.52, "step": 28035, "token_acc": 0.7427100569957713, "train_speed(iter/s)": 0.187273 }, { "epoch": 0.3638380764851477, "grad_norm": 0.7837541699409485, "learning_rate": 9.531724041400814e-05, "loss": 0.9468513488769531, "memory(GiB)": 91.52, "step": 28040, "token_acc": 0.741270454838294, "train_speed(iter/s)": 0.187257 }, { "epoch": 0.3639029548868034, "grad_norm": 0.7846401929855347, "learning_rate": 9.531497375945903e-05, "loss": 0.9581629753112793, "memory(GiB)": 91.52, "step": 28045, "token_acc": 0.7242889812220182, "train_speed(iter/s)": 0.187237 }, { "epoch": 0.3639678332884591, "grad_norm": 0.8145828247070312, "learning_rate": 9.531270658342812e-05, "loss": 0.9992826461791993, "memory(GiB)": 91.52, "step": 28050, "token_acc": 0.7449297971918877, "train_speed(iter/s)": 0.187219 }, { "epoch": 0.3640327116901148, "grad_norm": 0.8125447034835815, "learning_rate": 9.53104388859415e-05, "loss": 0.9237077713012696, "memory(GiB)": 91.52, "step": 28055, "token_acc": 0.7526569697849034, "train_speed(iter/s)": 0.187203 }, { "epoch": 0.3640975900917705, "grad_norm": 0.7878735065460205, "learning_rate": 9.530817066702526e-05, "loss": 0.932620620727539, "memory(GiB)": 91.52, "step": 28060, "token_acc": 0.7596408349247831, "train_speed(iter/s)": 0.187186 }, { "epoch": 0.3641624684934262, "grad_norm": 0.7601243257522583, "learning_rate": 9.530590192670552e-05, "loss": 0.9525653839111328, "memory(GiB)": 91.52, "step": 28065, "token_acc": 0.7341377814582594, "train_speed(iter/s)": 0.187168 }, { "epoch": 0.3642273468950819, "grad_norm": 0.8541954755783081, "learning_rate": 9.530363266500837e-05, "loss": 0.9364777565002441, "memory(GiB)": 91.52, "step": 28070, "token_acc": 0.7743706375970512, "train_speed(iter/s)": 0.187152 }, { "epoch": 0.3642922252967376, "grad_norm": 0.734042227268219, "learning_rate": 9.530136288195993e-05, "loss": 0.9645322799682617, "memory(GiB)": 91.52, "step": 28075, "token_acc": 0.7403081332855608, "train_speed(iter/s)": 0.187136 }, { "epoch": 0.3643571036983933, "grad_norm": 0.8145620822906494, "learning_rate": 9.529909257758633e-05, "loss": 0.9258184432983398, "memory(GiB)": 91.52, "step": 28080, "token_acc": 0.7610929979816798, "train_speed(iter/s)": 0.187117 }, { "epoch": 0.364421982100049, "grad_norm": 0.7676918506622314, "learning_rate": 9.529682175191368e-05, "loss": 0.9526018142700196, "memory(GiB)": 91.52, "step": 28085, "token_acc": 0.767506248798308, "train_speed(iter/s)": 0.1871 }, { "epoch": 0.3644868605017047, "grad_norm": 0.7574359178543091, "learning_rate": 9.529455040496812e-05, "loss": 0.967136001586914, "memory(GiB)": 91.52, "step": 28090, "token_acc": 0.7340860628517496, "train_speed(iter/s)": 0.187084 }, { "epoch": 0.3645517389033604, "grad_norm": 0.768022894859314, "learning_rate": 9.529227853677579e-05, "loss": 0.9646997451782227, "memory(GiB)": 91.52, "step": 28095, "token_acc": 0.731912013536379, "train_speed(iter/s)": 0.187064 }, { "epoch": 0.3646166173050161, "grad_norm": 0.8031719923019409, "learning_rate": 9.529000614736283e-05, "loss": 0.9833242416381835, "memory(GiB)": 91.52, "step": 28100, "token_acc": 0.7527138157894737, "train_speed(iter/s)": 0.187048 }, { "epoch": 0.36468149570667174, "grad_norm": 0.7966282963752747, "learning_rate": 9.528773323675541e-05, "loss": 0.9427979469299317, "memory(GiB)": 91.52, "step": 28105, "token_acc": 0.7441845290528857, "train_speed(iter/s)": 0.187031 }, { "epoch": 0.36474637410832744, "grad_norm": 0.6981242895126343, "learning_rate": 9.528545980497966e-05, "loss": 0.9741710662841797, "memory(GiB)": 91.52, "step": 28110, "token_acc": 0.7393194782400574, "train_speed(iter/s)": 0.187015 }, { "epoch": 0.36481125250998314, "grad_norm": 0.7330872416496277, "learning_rate": 9.528318585206176e-05, "loss": 0.913899040222168, "memory(GiB)": 91.52, "step": 28115, "token_acc": 0.739190092530697, "train_speed(iter/s)": 0.186997 }, { "epoch": 0.36487613091163884, "grad_norm": 0.8755386471748352, "learning_rate": 9.528091137802786e-05, "loss": 0.9550397872924805, "memory(GiB)": 91.52, "step": 28120, "token_acc": 0.7215160310946519, "train_speed(iter/s)": 0.18698 }, { "epoch": 0.36494100931329454, "grad_norm": 0.8238516449928284, "learning_rate": 9.527863638290416e-05, "loss": 0.9295862197875977, "memory(GiB)": 91.52, "step": 28125, "token_acc": 0.7363418083741736, "train_speed(iter/s)": 0.18696 }, { "epoch": 0.36500588771495024, "grad_norm": 0.7745876312255859, "learning_rate": 9.527636086671681e-05, "loss": 0.9142444610595704, "memory(GiB)": 91.52, "step": 28130, "token_acc": 0.7422067131937521, "train_speed(iter/s)": 0.186942 }, { "epoch": 0.36507076611660594, "grad_norm": 0.8184450268745422, "learning_rate": 9.5274084829492e-05, "loss": 0.9546044349670411, "memory(GiB)": 91.52, "step": 28135, "token_acc": 0.7300054654764073, "train_speed(iter/s)": 0.186925 }, { "epoch": 0.36513564451826164, "grad_norm": 0.8141911625862122, "learning_rate": 9.527180827125597e-05, "loss": 0.9738482475280762, "memory(GiB)": 91.52, "step": 28140, "token_acc": 0.7710481238486151, "train_speed(iter/s)": 0.186907 }, { "epoch": 0.36520052291991734, "grad_norm": 0.8187702894210815, "learning_rate": 9.526953119203488e-05, "loss": 0.9647414207458496, "memory(GiB)": 91.52, "step": 28145, "token_acc": 0.7560009169204571, "train_speed(iter/s)": 0.186889 }, { "epoch": 0.36526540132157304, "grad_norm": 0.7799043655395508, "learning_rate": 9.526725359185491e-05, "loss": 0.9288487434387207, "memory(GiB)": 91.52, "step": 28150, "token_acc": 0.7522932440302854, "train_speed(iter/s)": 0.186869 }, { "epoch": 0.36533027972322873, "grad_norm": 0.7847080826759338, "learning_rate": 9.526497547074234e-05, "loss": 0.9774642944335937, "memory(GiB)": 91.52, "step": 28155, "token_acc": 0.73276955602537, "train_speed(iter/s)": 0.186853 }, { "epoch": 0.36539515812488443, "grad_norm": 0.718468964099884, "learning_rate": 9.526269682872331e-05, "loss": 0.970762825012207, "memory(GiB)": 91.52, "step": 28160, "token_acc": 0.7450911176808013, "train_speed(iter/s)": 0.186838 }, { "epoch": 0.36546003652654013, "grad_norm": 0.8105126619338989, "learning_rate": 9.526041766582409e-05, "loss": 0.9616854667663575, "memory(GiB)": 91.52, "step": 28165, "token_acc": 0.7300784929356358, "train_speed(iter/s)": 0.186819 }, { "epoch": 0.36552491492819583, "grad_norm": 0.7955342531204224, "learning_rate": 9.525813798207091e-05, "loss": 0.988017463684082, "memory(GiB)": 91.52, "step": 28170, "token_acc": 0.7390316515198997, "train_speed(iter/s)": 0.186798 }, { "epoch": 0.36558979332985153, "grad_norm": 0.8872389197349548, "learning_rate": 9.525585777748997e-05, "loss": 0.9493809700012207, "memory(GiB)": 91.52, "step": 28175, "token_acc": 0.7538627380524614, "train_speed(iter/s)": 0.186779 }, { "epoch": 0.36565467173150723, "grad_norm": 0.7325091361999512, "learning_rate": 9.525357705210753e-05, "loss": 0.9319643020629883, "memory(GiB)": 91.52, "step": 28180, "token_acc": 0.7474084450670402, "train_speed(iter/s)": 0.186761 }, { "epoch": 0.36571955013316293, "grad_norm": 0.7825198173522949, "learning_rate": 9.525129580594984e-05, "loss": 1.0023682594299317, "memory(GiB)": 91.52, "step": 28185, "token_acc": 0.7303778005564504, "train_speed(iter/s)": 0.186743 }, { "epoch": 0.36578442853481863, "grad_norm": 0.7127513885498047, "learning_rate": 9.524901403904317e-05, "loss": 0.9367166519165039, "memory(GiB)": 91.52, "step": 28190, "token_acc": 0.7668596513551342, "train_speed(iter/s)": 0.186723 }, { "epoch": 0.36584930693647433, "grad_norm": 0.8409751653671265, "learning_rate": 9.524673175141374e-05, "loss": 0.9848272323608398, "memory(GiB)": 91.52, "step": 28195, "token_acc": 0.7316253817441466, "train_speed(iter/s)": 0.186708 }, { "epoch": 0.36591418533813, "grad_norm": 0.8255550861358643, "learning_rate": 9.524444894308785e-05, "loss": 0.981018352508545, "memory(GiB)": 91.52, "step": 28200, "token_acc": 0.7466810041771785, "train_speed(iter/s)": 0.186693 }, { "epoch": 0.3659790637397857, "grad_norm": 0.7145212292671204, "learning_rate": 9.524216561409174e-05, "loss": 0.9699649810791016, "memory(GiB)": 91.52, "step": 28205, "token_acc": 0.757367026696556, "train_speed(iter/s)": 0.186677 }, { "epoch": 0.3660439421414414, "grad_norm": 0.8630922436714172, "learning_rate": 9.523988176445171e-05, "loss": 0.9750583648681641, "memory(GiB)": 91.52, "step": 28210, "token_acc": 0.7276768457672981, "train_speed(iter/s)": 0.186661 }, { "epoch": 0.3661088205430971, "grad_norm": 0.8423701524734497, "learning_rate": 9.523759739419402e-05, "loss": 0.9946447372436523, "memory(GiB)": 91.52, "step": 28215, "token_acc": 0.749222972972973, "train_speed(iter/s)": 0.186642 }, { "epoch": 0.3661736989447528, "grad_norm": 0.8496942520141602, "learning_rate": 9.523531250334497e-05, "loss": 0.9452965736389161, "memory(GiB)": 91.52, "step": 28220, "token_acc": 0.7357877105980491, "train_speed(iter/s)": 0.186623 }, { "epoch": 0.36623857734640847, "grad_norm": 0.8053585290908813, "learning_rate": 9.523302709193087e-05, "loss": 0.9386281967163086, "memory(GiB)": 91.52, "step": 28225, "token_acc": 0.7466701833047606, "train_speed(iter/s)": 0.186605 }, { "epoch": 0.36630345574806417, "grad_norm": 0.771979570388794, "learning_rate": 9.523074115997799e-05, "loss": 0.9434339523315429, "memory(GiB)": 91.52, "step": 28230, "token_acc": 0.7487268057216244, "train_speed(iter/s)": 0.186589 }, { "epoch": 0.36636833414971987, "grad_norm": 0.839842677116394, "learning_rate": 9.522845470751265e-05, "loss": 0.9745518684387207, "memory(GiB)": 91.52, "step": 28235, "token_acc": 0.7383311899538543, "train_speed(iter/s)": 0.186573 }, { "epoch": 0.36643321255137556, "grad_norm": 0.7796629667282104, "learning_rate": 9.522616773456118e-05, "loss": 1.0113492012023926, "memory(GiB)": 91.52, "step": 28240, "token_acc": 0.7123486682808716, "train_speed(iter/s)": 0.186556 }, { "epoch": 0.36649809095303126, "grad_norm": 0.8254779577255249, "learning_rate": 9.522388024114987e-05, "loss": 0.9833772659301758, "memory(GiB)": 91.52, "step": 28245, "token_acc": 0.7425183300912763, "train_speed(iter/s)": 0.186538 }, { "epoch": 0.36656296935468696, "grad_norm": 0.7787219285964966, "learning_rate": 9.522159222730507e-05, "loss": 0.9796287536621093, "memory(GiB)": 91.52, "step": 28250, "token_acc": 0.7324965453707969, "train_speed(iter/s)": 0.186519 }, { "epoch": 0.36662784775634266, "grad_norm": 0.8042829632759094, "learning_rate": 9.521930369305308e-05, "loss": 1.0050874710083009, "memory(GiB)": 91.52, "step": 28255, "token_acc": 0.7550616337740308, "train_speed(iter/s)": 0.186501 }, { "epoch": 0.36669272615799836, "grad_norm": 0.7925153374671936, "learning_rate": 9.521701463842027e-05, "loss": 0.9548423767089844, "memory(GiB)": 91.52, "step": 28260, "token_acc": 0.7469460227272727, "train_speed(iter/s)": 0.186485 }, { "epoch": 0.36675760455965406, "grad_norm": 0.7472002506256104, "learning_rate": 9.521472506343296e-05, "loss": 0.9177999496459961, "memory(GiB)": 91.52, "step": 28265, "token_acc": 0.7445015052240127, "train_speed(iter/s)": 0.186469 }, { "epoch": 0.36682248296130976, "grad_norm": 0.8106189966201782, "learning_rate": 9.52124349681175e-05, "loss": 0.9827056884765625, "memory(GiB)": 91.52, "step": 28270, "token_acc": 0.7614328136599852, "train_speed(iter/s)": 0.18645 }, { "epoch": 0.36688736136296546, "grad_norm": 0.7980762720108032, "learning_rate": 9.521014435250024e-05, "loss": 0.9380435943603516, "memory(GiB)": 91.52, "step": 28275, "token_acc": 0.7696619311171432, "train_speed(iter/s)": 0.186433 }, { "epoch": 0.36695223976462116, "grad_norm": 0.8018049597740173, "learning_rate": 9.520785321660757e-05, "loss": 0.9043771743774414, "memory(GiB)": 91.52, "step": 28280, "token_acc": 0.7609066767830045, "train_speed(iter/s)": 0.186412 }, { "epoch": 0.36701711816627686, "grad_norm": 0.7705551385879517, "learning_rate": 9.520556156046582e-05, "loss": 0.9322713851928711, "memory(GiB)": 91.52, "step": 28285, "token_acc": 0.7509529039343626, "train_speed(iter/s)": 0.186396 }, { "epoch": 0.36708199656793256, "grad_norm": 0.7873970866203308, "learning_rate": 9.520326938410137e-05, "loss": 0.9160932540893555, "memory(GiB)": 91.52, "step": 28290, "token_acc": 0.7466360856269113, "train_speed(iter/s)": 0.186379 }, { "epoch": 0.36714687496958825, "grad_norm": 0.8108974695205688, "learning_rate": 9.520097668754063e-05, "loss": 0.9349181175231933, "memory(GiB)": 91.52, "step": 28295, "token_acc": 0.7647908595442905, "train_speed(iter/s)": 0.186361 }, { "epoch": 0.36721175337124395, "grad_norm": 0.7354395985603333, "learning_rate": 9.519868347080993e-05, "loss": 0.8975071907043457, "memory(GiB)": 91.52, "step": 28300, "token_acc": 0.7536008949797232, "train_speed(iter/s)": 0.186344 }, { "epoch": 0.36727663177289965, "grad_norm": 0.834016740322113, "learning_rate": 9.519638973393572e-05, "loss": 0.9030679702758789, "memory(GiB)": 91.52, "step": 28305, "token_acc": 0.7527916705985971, "train_speed(iter/s)": 0.186325 }, { "epoch": 0.36734151017455535, "grad_norm": 0.8198798894882202, "learning_rate": 9.519409547694434e-05, "loss": 0.9773329734802246, "memory(GiB)": 91.52, "step": 28310, "token_acc": 0.7444954855060986, "train_speed(iter/s)": 0.186308 }, { "epoch": 0.36740638857621105, "grad_norm": 0.7882913947105408, "learning_rate": 9.519180069986224e-05, "loss": 0.9707394599914551, "memory(GiB)": 91.52, "step": 28315, "token_acc": 0.7517901234567901, "train_speed(iter/s)": 0.186289 }, { "epoch": 0.36747126697786675, "grad_norm": 0.8876964449882507, "learning_rate": 9.518950540271578e-05, "loss": 0.974697208404541, "memory(GiB)": 91.52, "step": 28320, "token_acc": 0.7416934964584675, "train_speed(iter/s)": 0.186271 }, { "epoch": 0.36753614537952245, "grad_norm": 0.8251147270202637, "learning_rate": 9.518720958553142e-05, "loss": 0.9321785926818847, "memory(GiB)": 91.52, "step": 28325, "token_acc": 0.7488145097089058, "train_speed(iter/s)": 0.186252 }, { "epoch": 0.36760102378117815, "grad_norm": 0.7548322677612305, "learning_rate": 9.518491324833556e-05, "loss": 0.9651305198669433, "memory(GiB)": 91.52, "step": 28330, "token_acc": 0.7291377030280425, "train_speed(iter/s)": 0.186236 }, { "epoch": 0.36766590218283385, "grad_norm": 0.7878726720809937, "learning_rate": 9.518261639115463e-05, "loss": 0.9493370056152344, "memory(GiB)": 91.52, "step": 28335, "token_acc": 0.7343452909113957, "train_speed(iter/s)": 0.186215 }, { "epoch": 0.36773078058448955, "grad_norm": 0.7101555466651917, "learning_rate": 9.518031901401503e-05, "loss": 0.9049412727355957, "memory(GiB)": 91.52, "step": 28340, "token_acc": 0.7590599604656271, "train_speed(iter/s)": 0.186196 }, { "epoch": 0.3677956589861452, "grad_norm": 0.7742006182670593, "learning_rate": 9.517802111694325e-05, "loss": 0.9896867752075196, "memory(GiB)": 91.52, "step": 28345, "token_acc": 0.7441068103482429, "train_speed(iter/s)": 0.186177 }, { "epoch": 0.3678605373878009, "grad_norm": 0.7499767541885376, "learning_rate": 9.51757226999657e-05, "loss": 0.8923656463623046, "memory(GiB)": 91.52, "step": 28350, "token_acc": 0.7388952892985151, "train_speed(iter/s)": 0.186159 }, { "epoch": 0.3679254157894566, "grad_norm": 0.8253175020217896, "learning_rate": 9.517342376310885e-05, "loss": 0.8733087539672851, "memory(GiB)": 91.52, "step": 28355, "token_acc": 0.7515468059461631, "train_speed(iter/s)": 0.186142 }, { "epoch": 0.3679902941911123, "grad_norm": 0.8909127116203308, "learning_rate": 9.517112430639915e-05, "loss": 0.9330781936645508, "memory(GiB)": 91.52, "step": 28360, "token_acc": 0.7270862753009213, "train_speed(iter/s)": 0.186126 }, { "epoch": 0.368055172592768, "grad_norm": 0.7331744432449341, "learning_rate": 9.516882432986304e-05, "loss": 0.9590845108032227, "memory(GiB)": 91.52, "step": 28365, "token_acc": 0.7482096736323081, "train_speed(iter/s)": 0.186107 }, { "epoch": 0.3681200509944237, "grad_norm": 0.7996032238006592, "learning_rate": 9.516652383352703e-05, "loss": 0.9237456321716309, "memory(GiB)": 91.52, "step": 28370, "token_acc": 0.728453551265348, "train_speed(iter/s)": 0.18609 }, { "epoch": 0.3681849293960794, "grad_norm": 0.8336453437805176, "learning_rate": 9.516422281741756e-05, "loss": 0.9995174407958984, "memory(GiB)": 91.52, "step": 28375, "token_acc": 0.7372237015174361, "train_speed(iter/s)": 0.186073 }, { "epoch": 0.3682498077977351, "grad_norm": 0.8752080798149109, "learning_rate": 9.516192128156112e-05, "loss": 0.8810957908630371, "memory(GiB)": 91.52, "step": 28380, "token_acc": 0.7586993276451854, "train_speed(iter/s)": 0.186055 }, { "epoch": 0.3683146861993908, "grad_norm": 0.7852591276168823, "learning_rate": 9.51596192259842e-05, "loss": 0.9444402694702149, "memory(GiB)": 91.52, "step": 28385, "token_acc": 0.7421477388396103, "train_speed(iter/s)": 0.18604 }, { "epoch": 0.3683795646010465, "grad_norm": 0.7476568222045898, "learning_rate": 9.515731665071328e-05, "loss": 0.9604318618774415, "memory(GiB)": 91.52, "step": 28390, "token_acc": 0.7610239357095847, "train_speed(iter/s)": 0.186021 }, { "epoch": 0.3684444430027022, "grad_norm": 0.8053017258644104, "learning_rate": 9.515501355577488e-05, "loss": 0.965091323852539, "memory(GiB)": 91.52, "step": 28395, "token_acc": 0.7394289844597036, "train_speed(iter/s)": 0.186005 }, { "epoch": 0.3685093214043579, "grad_norm": 0.8958921432495117, "learning_rate": 9.515270994119547e-05, "loss": 0.9832603454589843, "memory(GiB)": 91.52, "step": 28400, "token_acc": 0.7436614853195164, "train_speed(iter/s)": 0.18599 }, { "epoch": 0.3685741998060136, "grad_norm": 0.8967288732528687, "learning_rate": 9.515040580700159e-05, "loss": 0.9741361618041993, "memory(GiB)": 91.52, "step": 28405, "token_acc": 0.7400132177999706, "train_speed(iter/s)": 0.185973 }, { "epoch": 0.3686390782076693, "grad_norm": 0.7595657706260681, "learning_rate": 9.514810115321974e-05, "loss": 0.9381052017211914, "memory(GiB)": 91.52, "step": 28410, "token_acc": 0.73810023799524, "train_speed(iter/s)": 0.185958 }, { "epoch": 0.368703956609325, "grad_norm": 0.790465235710144, "learning_rate": 9.514579597987648e-05, "loss": 0.9616599082946777, "memory(GiB)": 91.52, "step": 28415, "token_acc": 0.7876816860465117, "train_speed(iter/s)": 0.185941 }, { "epoch": 0.3687688350109807, "grad_norm": 0.761871337890625, "learning_rate": 9.514349028699828e-05, "loss": 0.9176271438598633, "memory(GiB)": 91.52, "step": 28420, "token_acc": 0.7514681700728212, "train_speed(iter/s)": 0.185924 }, { "epoch": 0.3688337134126364, "grad_norm": 0.8740987777709961, "learning_rate": 9.514118407461169e-05, "loss": 0.9306413650512695, "memory(GiB)": 91.52, "step": 28425, "token_acc": 0.7704918032786885, "train_speed(iter/s)": 0.185908 }, { "epoch": 0.3688985918142921, "grad_norm": 0.7823433876037598, "learning_rate": 9.513887734274326e-05, "loss": 0.9357797622680664, "memory(GiB)": 91.52, "step": 28430, "token_acc": 0.7490936058009229, "train_speed(iter/s)": 0.18589 }, { "epoch": 0.3689634702159478, "grad_norm": 0.844258725643158, "learning_rate": 9.513657009141955e-05, "loss": 0.9599185943603515, "memory(GiB)": 91.52, "step": 28435, "token_acc": 0.7318199431671403, "train_speed(iter/s)": 0.18587 }, { "epoch": 0.3690283486176035, "grad_norm": 0.8519304990768433, "learning_rate": 9.513426232066709e-05, "loss": 0.9485612869262695, "memory(GiB)": 91.52, "step": 28440, "token_acc": 0.732998179083208, "train_speed(iter/s)": 0.185854 }, { "epoch": 0.3690932270192592, "grad_norm": 0.8840778470039368, "learning_rate": 9.513195403051245e-05, "loss": 0.9822742462158203, "memory(GiB)": 91.52, "step": 28445, "token_acc": 0.7397919577705325, "train_speed(iter/s)": 0.185839 }, { "epoch": 0.3691581054209149, "grad_norm": 0.8826653361320496, "learning_rate": 9.512964522098218e-05, "loss": 0.9703044891357422, "memory(GiB)": 91.52, "step": 28450, "token_acc": 0.7601561000575779, "train_speed(iter/s)": 0.185821 }, { "epoch": 0.36922298382257057, "grad_norm": 0.8427175283432007, "learning_rate": 9.512733589210285e-05, "loss": 0.949644660949707, "memory(GiB)": 91.52, "step": 28455, "token_acc": 0.7432955303535691, "train_speed(iter/s)": 0.185805 }, { "epoch": 0.36928786222422627, "grad_norm": 0.7280872464179993, "learning_rate": 9.512502604390105e-05, "loss": 0.9128818511962891, "memory(GiB)": 91.52, "step": 28460, "token_acc": 0.7529575967034428, "train_speed(iter/s)": 0.185786 }, { "epoch": 0.3693527406258819, "grad_norm": 0.7992732524871826, "learning_rate": 9.512271567640337e-05, "loss": 0.9641425132751464, "memory(GiB)": 91.52, "step": 28465, "token_acc": 0.7455410225921522, "train_speed(iter/s)": 0.185769 }, { "epoch": 0.3694176190275376, "grad_norm": 0.7624853253364563, "learning_rate": 9.512040478963636e-05, "loss": 0.9209066390991211, "memory(GiB)": 91.52, "step": 28470, "token_acc": 0.7472404730617609, "train_speed(iter/s)": 0.185753 }, { "epoch": 0.3694824974291933, "grad_norm": 0.8070055246353149, "learning_rate": 9.511809338362664e-05, "loss": 0.9687760353088379, "memory(GiB)": 91.52, "step": 28475, "token_acc": 0.7402726048022823, "train_speed(iter/s)": 0.185734 }, { "epoch": 0.369547375830849, "grad_norm": 0.7775603532791138, "learning_rate": 9.511578145840081e-05, "loss": 0.9511187553405762, "memory(GiB)": 91.52, "step": 28480, "token_acc": 0.7532064401691909, "train_speed(iter/s)": 0.185717 }, { "epoch": 0.3696122542325047, "grad_norm": 0.8173046112060547, "learning_rate": 9.511346901398548e-05, "loss": 0.9372117996215821, "memory(GiB)": 91.52, "step": 28485, "token_acc": 0.7532560603228018, "train_speed(iter/s)": 0.185701 }, { "epoch": 0.3696771326341604, "grad_norm": 0.7149575352668762, "learning_rate": 9.511115605040723e-05, "loss": 0.9403656959533692, "memory(GiB)": 91.52, "step": 28490, "token_acc": 0.7493520207353365, "train_speed(iter/s)": 0.185683 }, { "epoch": 0.3697420110358161, "grad_norm": 0.8601506352424622, "learning_rate": 9.510884256769271e-05, "loss": 0.9791330337524414, "memory(GiB)": 91.52, "step": 28495, "token_acc": 0.7277866626994458, "train_speed(iter/s)": 0.185668 }, { "epoch": 0.3698068894374718, "grad_norm": 0.8132993578910828, "learning_rate": 9.510652856586854e-05, "loss": 0.9572551727294922, "memory(GiB)": 91.52, "step": 28500, "token_acc": 0.7485288873038516, "train_speed(iter/s)": 0.185653 }, { "epoch": 0.3698717678391275, "grad_norm": 0.7803130149841309, "learning_rate": 9.510421404496134e-05, "loss": 0.9419948577880859, "memory(GiB)": 91.52, "step": 28505, "token_acc": 0.7636650767049117, "train_speed(iter/s)": 0.185637 }, { "epoch": 0.3699366462407832, "grad_norm": 0.8474149107933044, "learning_rate": 9.510189900499775e-05, "loss": 0.9793235778808593, "memory(GiB)": 91.52, "step": 28510, "token_acc": 0.7534871346585078, "train_speed(iter/s)": 0.18562 }, { "epoch": 0.3700015246424389, "grad_norm": 0.8489887118339539, "learning_rate": 9.50995834460044e-05, "loss": 0.9866827964782715, "memory(GiB)": 91.52, "step": 28515, "token_acc": 0.7389696364510522, "train_speed(iter/s)": 0.185604 }, { "epoch": 0.3700664030440946, "grad_norm": 0.7113795280456543, "learning_rate": 9.509726736800794e-05, "loss": 0.8988471984863281, "memory(GiB)": 91.52, "step": 28520, "token_acc": 0.7806017360600265, "train_speed(iter/s)": 0.185584 }, { "epoch": 0.3701312814457503, "grad_norm": 0.7690566182136536, "learning_rate": 9.509495077103504e-05, "loss": 0.9848505973815918, "memory(GiB)": 91.52, "step": 28525, "token_acc": 0.7595176628214764, "train_speed(iter/s)": 0.185566 }, { "epoch": 0.370196159847406, "grad_norm": 0.7834473848342896, "learning_rate": 9.509263365511235e-05, "loss": 0.9176666259765625, "memory(GiB)": 91.52, "step": 28530, "token_acc": 0.7577973841033976, "train_speed(iter/s)": 0.185549 }, { "epoch": 0.3702610382490617, "grad_norm": 0.812309741973877, "learning_rate": 9.509031602026654e-05, "loss": 0.947146987915039, "memory(GiB)": 91.52, "step": 28535, "token_acc": 0.7336980866449094, "train_speed(iter/s)": 0.185531 }, { "epoch": 0.3703259166507174, "grad_norm": 0.7539152503013611, "learning_rate": 9.508799786652425e-05, "loss": 0.9233874320983887, "memory(GiB)": 91.52, "step": 28540, "token_acc": 0.7573962144106887, "train_speed(iter/s)": 0.185515 }, { "epoch": 0.3703907950523731, "grad_norm": 0.7452343702316284, "learning_rate": 9.50856791939122e-05, "loss": 0.958305549621582, "memory(GiB)": 91.52, "step": 28545, "token_acc": 0.7492478074387043, "train_speed(iter/s)": 0.185499 }, { "epoch": 0.3704556734540288, "grad_norm": 0.6934848427772522, "learning_rate": 9.508336000245706e-05, "loss": 0.9614630699157715, "memory(GiB)": 91.52, "step": 28550, "token_acc": 0.7332990915219415, "train_speed(iter/s)": 0.185481 }, { "epoch": 0.3705205518556845, "grad_norm": 0.7732114791870117, "learning_rate": 9.508104029218552e-05, "loss": 0.892985725402832, "memory(GiB)": 91.52, "step": 28555, "token_acc": 0.760679265131307, "train_speed(iter/s)": 0.185462 }, { "epoch": 0.3705854302573402, "grad_norm": 0.7946586608886719, "learning_rate": 9.507872006312427e-05, "loss": 0.9417732238769532, "memory(GiB)": 91.52, "step": 28560, "token_acc": 0.7419298091300905, "train_speed(iter/s)": 0.185448 }, { "epoch": 0.3706503086589959, "grad_norm": 0.7597106099128723, "learning_rate": 9.507639931529998e-05, "loss": 0.9618368148803711, "memory(GiB)": 91.52, "step": 28565, "token_acc": 0.7563796397379913, "train_speed(iter/s)": 0.185431 }, { "epoch": 0.3707151870606516, "grad_norm": 0.7741093039512634, "learning_rate": 9.50740780487394e-05, "loss": 1.0268815994262694, "memory(GiB)": 91.52, "step": 28570, "token_acc": 0.7360748323332157, "train_speed(iter/s)": 0.185415 }, { "epoch": 0.3707800654623073, "grad_norm": 0.7508167624473572, "learning_rate": 9.507175626346925e-05, "loss": 0.9160388946533203, "memory(GiB)": 91.52, "step": 28575, "token_acc": 0.7634135405768944, "train_speed(iter/s)": 0.185398 }, { "epoch": 0.370844943863963, "grad_norm": 0.7634274959564209, "learning_rate": 9.506943395951622e-05, "loss": 0.9269804000854492, "memory(GiB)": 91.52, "step": 28580, "token_acc": 0.7469551979121357, "train_speed(iter/s)": 0.18538 }, { "epoch": 0.37090982226561864, "grad_norm": 0.6400617361068726, "learning_rate": 9.506711113690704e-05, "loss": 0.9320045471191406, "memory(GiB)": 91.52, "step": 28585, "token_acc": 0.7588979344681424, "train_speed(iter/s)": 0.185364 }, { "epoch": 0.37097470066727434, "grad_norm": 0.7551462650299072, "learning_rate": 9.506478779566846e-05, "loss": 0.8923530578613281, "memory(GiB)": 91.52, "step": 28590, "token_acc": 0.7673778389538886, "train_speed(iter/s)": 0.185347 }, { "epoch": 0.37103957906893004, "grad_norm": 0.7850807309150696, "learning_rate": 9.506246393582718e-05, "loss": 0.9261035919189453, "memory(GiB)": 91.52, "step": 28595, "token_acc": 0.7612760618913914, "train_speed(iter/s)": 0.185329 }, { "epoch": 0.37110445747058574, "grad_norm": 0.8294679522514343, "learning_rate": 9.506013955740998e-05, "loss": 0.9199551582336426, "memory(GiB)": 91.52, "step": 28600, "token_acc": 0.75894942379111, "train_speed(iter/s)": 0.18531 }, { "epoch": 0.37116933587224143, "grad_norm": 0.7367417216300964, "learning_rate": 9.505781466044359e-05, "loss": 0.9531612396240234, "memory(GiB)": 91.52, "step": 28605, "token_acc": 0.7341582769691114, "train_speed(iter/s)": 0.185294 }, { "epoch": 0.37123421427389713, "grad_norm": 0.7581084370613098, "learning_rate": 9.505548924495476e-05, "loss": 0.955206298828125, "memory(GiB)": 91.52, "step": 28610, "token_acc": 0.7419924086035826, "train_speed(iter/s)": 0.185278 }, { "epoch": 0.37129909267555283, "grad_norm": 0.7499463558197021, "learning_rate": 9.505316331097027e-05, "loss": 0.8935517311096192, "memory(GiB)": 91.52, "step": 28615, "token_acc": 0.751644967100658, "train_speed(iter/s)": 0.185263 }, { "epoch": 0.37136397107720853, "grad_norm": 0.689399778842926, "learning_rate": 9.505083685851688e-05, "loss": 0.9620341300964356, "memory(GiB)": 91.52, "step": 28620, "token_acc": 0.7441537392915026, "train_speed(iter/s)": 0.185245 }, { "epoch": 0.37142884947886423, "grad_norm": 0.8617850542068481, "learning_rate": 9.504850988762134e-05, "loss": 0.9902288436889648, "memory(GiB)": 91.52, "step": 28625, "token_acc": 0.7338476726754177, "train_speed(iter/s)": 0.185231 }, { "epoch": 0.37149372788051993, "grad_norm": 0.8200909495353699, "learning_rate": 9.504618239831047e-05, "loss": 0.9398902893066406, "memory(GiB)": 91.52, "step": 28630, "token_acc": 0.7392639171444424, "train_speed(iter/s)": 0.185214 }, { "epoch": 0.37155860628217563, "grad_norm": 0.8170406818389893, "learning_rate": 9.504385439061103e-05, "loss": 0.9055305480957031, "memory(GiB)": 91.52, "step": 28635, "token_acc": 0.7563247662768489, "train_speed(iter/s)": 0.185196 }, { "epoch": 0.37162348468383133, "grad_norm": 0.7906733155250549, "learning_rate": 9.50415258645498e-05, "loss": 0.9270606994628906, "memory(GiB)": 91.52, "step": 28640, "token_acc": 0.7563691949337604, "train_speed(iter/s)": 0.18518 }, { "epoch": 0.37168836308548703, "grad_norm": 0.7748280167579651, "learning_rate": 9.503919682015358e-05, "loss": 0.94493408203125, "memory(GiB)": 91.52, "step": 28645, "token_acc": 0.7620851191069471, "train_speed(iter/s)": 0.185162 }, { "epoch": 0.3717532414871427, "grad_norm": 0.8559218049049377, "learning_rate": 9.503686725744921e-05, "loss": 0.9320576667785645, "memory(GiB)": 91.52, "step": 28650, "token_acc": 0.7773414018354506, "train_speed(iter/s)": 0.185145 }, { "epoch": 0.3718181198887984, "grad_norm": 0.8306902050971985, "learning_rate": 9.503453717646345e-05, "loss": 0.9434032440185547, "memory(GiB)": 91.52, "step": 28655, "token_acc": 0.7407591452926111, "train_speed(iter/s)": 0.185128 }, { "epoch": 0.3718829982904541, "grad_norm": 0.7751948237419128, "learning_rate": 9.503220657722315e-05, "loss": 0.9978979110717774, "memory(GiB)": 91.52, "step": 28660, "token_acc": 0.7312966323855616, "train_speed(iter/s)": 0.185111 }, { "epoch": 0.3719478766921098, "grad_norm": 0.7758434414863586, "learning_rate": 9.50298754597551e-05, "loss": 0.9553817749023438, "memory(GiB)": 91.52, "step": 28665, "token_acc": 0.747378355704698, "train_speed(iter/s)": 0.185093 }, { "epoch": 0.3720127550937655, "grad_norm": 0.809360682964325, "learning_rate": 9.502754382408616e-05, "loss": 0.9514974594116211, "memory(GiB)": 91.52, "step": 28670, "token_acc": 0.747735093756584, "train_speed(iter/s)": 0.185076 }, { "epoch": 0.3720776334954212, "grad_norm": 0.809985876083374, "learning_rate": 9.502521167024312e-05, "loss": 0.9428499221801758, "memory(GiB)": 91.52, "step": 28675, "token_acc": 0.747472246296067, "train_speed(iter/s)": 0.185062 }, { "epoch": 0.3721425118970769, "grad_norm": 0.8060658574104309, "learning_rate": 9.502287899825286e-05, "loss": 0.9481452941894531, "memory(GiB)": 91.52, "step": 28680, "token_acc": 0.7409109657651287, "train_speed(iter/s)": 0.185046 }, { "epoch": 0.3722073902987326, "grad_norm": 0.8157952427864075, "learning_rate": 9.50205458081422e-05, "loss": 0.9676778793334961, "memory(GiB)": 91.52, "step": 28685, "token_acc": 0.7591434823382307, "train_speed(iter/s)": 0.185028 }, { "epoch": 0.3722722687003883, "grad_norm": 0.8713919520378113, "learning_rate": 9.5018212099938e-05, "loss": 0.9726329803466797, "memory(GiB)": 91.52, "step": 28690, "token_acc": 0.7545978589074939, "train_speed(iter/s)": 0.18501 }, { "epoch": 0.372337147102044, "grad_norm": 0.8196465373039246, "learning_rate": 9.501587787366711e-05, "loss": 0.9532122611999512, "memory(GiB)": 91.52, "step": 28695, "token_acc": 0.7478118487645132, "train_speed(iter/s)": 0.184992 }, { "epoch": 0.3724020255036997, "grad_norm": 0.875340461730957, "learning_rate": 9.501354312935637e-05, "loss": 1.0156014442443848, "memory(GiB)": 91.52, "step": 28700, "token_acc": 0.7175310321373916, "train_speed(iter/s)": 0.184977 }, { "epoch": 0.37246690390535536, "grad_norm": 0.7015575170516968, "learning_rate": 9.50112078670327e-05, "loss": 0.962800407409668, "memory(GiB)": 91.52, "step": 28705, "token_acc": 0.7560030251465305, "train_speed(iter/s)": 0.184959 }, { "epoch": 0.37253178230701106, "grad_norm": 0.8012177348136902, "learning_rate": 9.500887208672295e-05, "loss": 0.9390920639038086, "memory(GiB)": 91.52, "step": 28710, "token_acc": 0.7392110586648685, "train_speed(iter/s)": 0.184942 }, { "epoch": 0.37259666070866676, "grad_norm": 0.8373680114746094, "learning_rate": 9.5006535788454e-05, "loss": 0.947810173034668, "memory(GiB)": 91.52, "step": 28715, "token_acc": 0.7463099286274802, "train_speed(iter/s)": 0.184923 }, { "epoch": 0.37266153911032246, "grad_norm": 0.8598152995109558, "learning_rate": 9.500419897225271e-05, "loss": 0.9505752563476563, "memory(GiB)": 91.52, "step": 28720, "token_acc": 0.7473579778285813, "train_speed(iter/s)": 0.184907 }, { "epoch": 0.37272641751197816, "grad_norm": 0.7520130276679993, "learning_rate": 9.500186163814601e-05, "loss": 0.943393325805664, "memory(GiB)": 91.52, "step": 28725, "token_acc": 0.7319513294276702, "train_speed(iter/s)": 0.184891 }, { "epoch": 0.37279129591363386, "grad_norm": 0.8451444506645203, "learning_rate": 9.499952378616077e-05, "loss": 1.0038379669189452, "memory(GiB)": 91.52, "step": 28730, "token_acc": 0.7502556610664719, "train_speed(iter/s)": 0.184874 }, { "epoch": 0.37285617431528956, "grad_norm": 0.875876784324646, "learning_rate": 9.499718541632392e-05, "loss": 0.9556549072265625, "memory(GiB)": 91.52, "step": 28735, "token_acc": 0.7410464147800409, "train_speed(iter/s)": 0.18486 }, { "epoch": 0.37292105271694526, "grad_norm": 0.8499491214752197, "learning_rate": 9.499484652866235e-05, "loss": 0.9643815994262696, "memory(GiB)": 91.52, "step": 28740, "token_acc": 0.757062534284147, "train_speed(iter/s)": 0.18484 }, { "epoch": 0.37298593111860096, "grad_norm": 0.7203895449638367, "learning_rate": 9.499250712320298e-05, "loss": 0.9198145866394043, "memory(GiB)": 91.52, "step": 28745, "token_acc": 0.752110459293175, "train_speed(iter/s)": 0.184825 }, { "epoch": 0.37305080952025665, "grad_norm": 0.8006405234336853, "learning_rate": 9.499016719997275e-05, "loss": 0.9441913604736328, "memory(GiB)": 91.52, "step": 28750, "token_acc": 0.7619857834240774, "train_speed(iter/s)": 0.184807 }, { "epoch": 0.37311568792191235, "grad_norm": 0.7351105809211731, "learning_rate": 9.498782675899856e-05, "loss": 0.985072135925293, "memory(GiB)": 91.52, "step": 28755, "token_acc": 0.7529864270128334, "train_speed(iter/s)": 0.18479 }, { "epoch": 0.37318056632356805, "grad_norm": 0.7271997928619385, "learning_rate": 9.498548580030736e-05, "loss": 0.9743685722351074, "memory(GiB)": 91.52, "step": 28760, "token_acc": 0.7435879733508233, "train_speed(iter/s)": 0.184774 }, { "epoch": 0.37324544472522375, "grad_norm": 0.79710453748703, "learning_rate": 9.498314432392607e-05, "loss": 0.9112997055053711, "memory(GiB)": 91.52, "step": 28765, "token_acc": 0.7591084093211753, "train_speed(iter/s)": 0.18476 }, { "epoch": 0.37331032312687945, "grad_norm": 0.7674022912979126, "learning_rate": 9.498080232988165e-05, "loss": 0.9090693473815918, "memory(GiB)": 91.52, "step": 28770, "token_acc": 0.7508534825607996, "train_speed(iter/s)": 0.184744 }, { "epoch": 0.37337520152853515, "grad_norm": 0.8215175867080688, "learning_rate": 9.497845981820106e-05, "loss": 0.9379743576049805, "memory(GiB)": 91.52, "step": 28775, "token_acc": 0.744921137665163, "train_speed(iter/s)": 0.184729 }, { "epoch": 0.37344007993019085, "grad_norm": 0.7099896669387817, "learning_rate": 9.497611678891124e-05, "loss": 0.9313164710998535, "memory(GiB)": 91.52, "step": 28780, "token_acc": 0.743946562760924, "train_speed(iter/s)": 0.18471 }, { "epoch": 0.37350495833184655, "grad_norm": 0.7501224875450134, "learning_rate": 9.497377324203919e-05, "loss": 0.8979633331298829, "memory(GiB)": 91.52, "step": 28785, "token_acc": 0.7584441289101157, "train_speed(iter/s)": 0.184694 }, { "epoch": 0.37356983673350225, "grad_norm": 0.753591775894165, "learning_rate": 9.497142917761183e-05, "loss": 0.9471622467041015, "memory(GiB)": 91.52, "step": 28790, "token_acc": 0.7569021002710027, "train_speed(iter/s)": 0.184676 }, { "epoch": 0.37363471513515795, "grad_norm": 0.7284125685691833, "learning_rate": 9.496908459565617e-05, "loss": 0.9002227783203125, "memory(GiB)": 91.52, "step": 28795, "token_acc": 0.7771075837742505, "train_speed(iter/s)": 0.184657 }, { "epoch": 0.37369959353681365, "grad_norm": 0.7540770173072815, "learning_rate": 9.496673949619916e-05, "loss": 0.9459749221801758, "memory(GiB)": 91.52, "step": 28800, "token_acc": 0.7525593552602918, "train_speed(iter/s)": 0.18464 }, { "epoch": 0.37376447193846934, "grad_norm": 0.7437392473220825, "learning_rate": 9.496439387926782e-05, "loss": 0.9426191329956055, "memory(GiB)": 91.52, "step": 28805, "token_acc": 0.733868178988865, "train_speed(iter/s)": 0.184623 }, { "epoch": 0.37382935034012504, "grad_norm": 0.6951450705528259, "learning_rate": 9.496204774488913e-05, "loss": 0.9496246337890625, "memory(GiB)": 91.52, "step": 28810, "token_acc": 0.7432760978574685, "train_speed(iter/s)": 0.184605 }, { "epoch": 0.37389422874178074, "grad_norm": 0.7936433553695679, "learning_rate": 9.495970109309009e-05, "loss": 0.9769498825073242, "memory(GiB)": 91.52, "step": 28815, "token_acc": 0.7307247566115856, "train_speed(iter/s)": 0.184589 }, { "epoch": 0.3739591071434364, "grad_norm": 0.819619357585907, "learning_rate": 9.495735392389768e-05, "loss": 0.9742738723754882, "memory(GiB)": 91.52, "step": 28820, "token_acc": 0.7319897777634005, "train_speed(iter/s)": 0.184572 }, { "epoch": 0.3740239855450921, "grad_norm": 0.7214512825012207, "learning_rate": 9.495500623733895e-05, "loss": 0.961390209197998, "memory(GiB)": 91.52, "step": 28825, "token_acc": 0.7413730951990825, "train_speed(iter/s)": 0.184554 }, { "epoch": 0.3740888639467478, "grad_norm": 0.8001119494438171, "learning_rate": 9.495265803344091e-05, "loss": 0.9343635559082031, "memory(GiB)": 91.52, "step": 28830, "token_acc": 0.7609321288672156, "train_speed(iter/s)": 0.184538 }, { "epoch": 0.3741537423484035, "grad_norm": 0.7715861797332764, "learning_rate": 9.495030931223056e-05, "loss": 0.9335280418395996, "memory(GiB)": 91.52, "step": 28835, "token_acc": 0.7702377870640879, "train_speed(iter/s)": 0.18452 }, { "epoch": 0.3742186207500592, "grad_norm": 0.799187958240509, "learning_rate": 9.494796007373496e-05, "loss": 0.9548238754272461, "memory(GiB)": 91.52, "step": 28840, "token_acc": 0.7475335974520314, "train_speed(iter/s)": 0.184504 }, { "epoch": 0.3742834991517149, "grad_norm": 0.8393515944480896, "learning_rate": 9.494561031798111e-05, "loss": 0.9595874786376953, "memory(GiB)": 91.52, "step": 28845, "token_acc": 0.7409518058648149, "train_speed(iter/s)": 0.184489 }, { "epoch": 0.3743483775533706, "grad_norm": 0.8219572305679321, "learning_rate": 9.494326004499609e-05, "loss": 0.9257461547851562, "memory(GiB)": 91.52, "step": 28850, "token_acc": 0.7662691652470187, "train_speed(iter/s)": 0.184471 }, { "epoch": 0.3744132559550263, "grad_norm": 0.8428144454956055, "learning_rate": 9.494090925480691e-05, "loss": 0.9690038681030273, "memory(GiB)": 91.52, "step": 28855, "token_acc": 0.7356281836302085, "train_speed(iter/s)": 0.184453 }, { "epoch": 0.374478134356682, "grad_norm": 0.706716775894165, "learning_rate": 9.493855794744064e-05, "loss": 0.9335414886474609, "memory(GiB)": 91.52, "step": 28860, "token_acc": 0.7491903677814474, "train_speed(iter/s)": 0.184436 }, { "epoch": 0.3745430127583377, "grad_norm": 0.7765167355537415, "learning_rate": 9.493620612292434e-05, "loss": 0.9114834785461425, "memory(GiB)": 91.52, "step": 28865, "token_acc": 0.766776500843848, "train_speed(iter/s)": 0.184419 }, { "epoch": 0.3746078911599934, "grad_norm": 0.8806355595588684, "learning_rate": 9.493385378128507e-05, "loss": 0.9424680709838867, "memory(GiB)": 91.52, "step": 28870, "token_acc": 0.7556523385722401, "train_speed(iter/s)": 0.184405 }, { "epoch": 0.3746727695616491, "grad_norm": 0.8254386186599731, "learning_rate": 9.493150092254992e-05, "loss": 0.9282135009765625, "memory(GiB)": 91.52, "step": 28875, "token_acc": 0.7323504173492472, "train_speed(iter/s)": 0.184387 }, { "epoch": 0.3747376479633048, "grad_norm": 0.8124104142189026, "learning_rate": 9.492914754674592e-05, "loss": 0.9351565361022949, "memory(GiB)": 91.52, "step": 28880, "token_acc": 0.7455995866033653, "train_speed(iter/s)": 0.18437 }, { "epoch": 0.3748025263649605, "grad_norm": 0.9257842302322388, "learning_rate": 9.49267936539002e-05, "loss": 1.0282081604003905, "memory(GiB)": 91.52, "step": 28885, "token_acc": 0.7421215242018537, "train_speed(iter/s)": 0.184355 }, { "epoch": 0.3748674047666162, "grad_norm": 0.7590802311897278, "learning_rate": 9.492443924403982e-05, "loss": 0.9112110137939453, "memory(GiB)": 91.52, "step": 28890, "token_acc": 0.7464133260583496, "train_speed(iter/s)": 0.18434 }, { "epoch": 0.3749322831682719, "grad_norm": 0.74863201379776, "learning_rate": 9.49220843171919e-05, "loss": 0.916157341003418, "memory(GiB)": 91.52, "step": 28895, "token_acc": 0.7629534628068889, "train_speed(iter/s)": 0.184324 }, { "epoch": 0.3749971615699276, "grad_norm": 0.7288339138031006, "learning_rate": 9.491972887338351e-05, "loss": 0.962076473236084, "memory(GiB)": 91.52, "step": 28900, "token_acc": 0.7346010136509933, "train_speed(iter/s)": 0.184307 }, { "epoch": 0.37506203997158327, "grad_norm": 0.7716361880302429, "learning_rate": 9.491737291264178e-05, "loss": 0.9565386772155762, "memory(GiB)": 91.52, "step": 28905, "token_acc": 0.7616822429906542, "train_speed(iter/s)": 0.184291 }, { "epoch": 0.37512691837323897, "grad_norm": 0.7229257225990295, "learning_rate": 9.491501643499381e-05, "loss": 0.9712820053100586, "memory(GiB)": 91.52, "step": 28910, "token_acc": 0.7328114252022667, "train_speed(iter/s)": 0.184275 }, { "epoch": 0.37519179677489467, "grad_norm": 0.8139042258262634, "learning_rate": 9.491265944046673e-05, "loss": 0.9545651435852051, "memory(GiB)": 91.52, "step": 28915, "token_acc": 0.7382054942376658, "train_speed(iter/s)": 0.184259 }, { "epoch": 0.37525667517655037, "grad_norm": 0.80226731300354, "learning_rate": 9.491030192908763e-05, "loss": 0.9862434387207031, "memory(GiB)": 91.52, "step": 28920, "token_acc": 0.7226830708049834, "train_speed(iter/s)": 0.184242 }, { "epoch": 0.37532155357820607, "grad_norm": 0.8255354166030884, "learning_rate": 9.490794390088369e-05, "loss": 0.992336368560791, "memory(GiB)": 91.52, "step": 28925, "token_acc": 0.76068027714995, "train_speed(iter/s)": 0.184226 }, { "epoch": 0.37538643197986177, "grad_norm": 0.6783868074417114, "learning_rate": 9.490558535588201e-05, "loss": 0.9315333366394043, "memory(GiB)": 91.52, "step": 28930, "token_acc": 0.757087252675965, "train_speed(iter/s)": 0.184209 }, { "epoch": 0.37545131038151747, "grad_norm": 0.8055264949798584, "learning_rate": 9.490322629410975e-05, "loss": 0.9319606781005859, "memory(GiB)": 91.52, "step": 28935, "token_acc": 0.7628357334484567, "train_speed(iter/s)": 0.184192 }, { "epoch": 0.3755161887831731, "grad_norm": 0.746354877948761, "learning_rate": 9.490086671559406e-05, "loss": 0.9234788894653321, "memory(GiB)": 91.52, "step": 28940, "token_acc": 0.7678195710752649, "train_speed(iter/s)": 0.184177 }, { "epoch": 0.3755810671848288, "grad_norm": 0.782146692276001, "learning_rate": 9.489850662036207e-05, "loss": 0.9596182823181152, "memory(GiB)": 91.52, "step": 28945, "token_acc": 0.7240510028862664, "train_speed(iter/s)": 0.184162 }, { "epoch": 0.3756459455864845, "grad_norm": 0.7579538822174072, "learning_rate": 9.489614600844097e-05, "loss": 0.9438705444335938, "memory(GiB)": 91.52, "step": 28950, "token_acc": 0.7414291183738819, "train_speed(iter/s)": 0.184147 }, { "epoch": 0.3757108239881402, "grad_norm": 0.7916929125785828, "learning_rate": 9.48937848798579e-05, "loss": 0.9521236419677734, "memory(GiB)": 91.52, "step": 28955, "token_acc": 0.7429425924575097, "train_speed(iter/s)": 0.184131 }, { "epoch": 0.3757757023897959, "grad_norm": 0.8084967732429504, "learning_rate": 9.489142323464004e-05, "loss": 0.9492198944091796, "memory(GiB)": 91.52, "step": 28960, "token_acc": 0.7710024542193694, "train_speed(iter/s)": 0.184113 }, { "epoch": 0.3758405807914516, "grad_norm": 0.8714473843574524, "learning_rate": 9.488906107281458e-05, "loss": 0.9468053817749024, "memory(GiB)": 91.52, "step": 28965, "token_acc": 0.7380130958248258, "train_speed(iter/s)": 0.184096 }, { "epoch": 0.3759054591931073, "grad_norm": 0.7327938079833984, "learning_rate": 9.48866983944087e-05, "loss": 0.9494535446166992, "memory(GiB)": 91.52, "step": 28970, "token_acc": 0.7399311120963067, "train_speed(iter/s)": 0.184081 }, { "epoch": 0.375970337594763, "grad_norm": 0.7653911709785461, "learning_rate": 9.488433519944956e-05, "loss": 0.9239004135131836, "memory(GiB)": 91.52, "step": 28975, "token_acc": 0.7527749379290201, "train_speed(iter/s)": 0.184066 }, { "epoch": 0.3760352159964187, "grad_norm": 0.8446093797683716, "learning_rate": 9.488197148796439e-05, "loss": 0.9777402877807617, "memory(GiB)": 91.52, "step": 28980, "token_acc": 0.7538122056450285, "train_speed(iter/s)": 0.184049 }, { "epoch": 0.3761000943980744, "grad_norm": 0.6705333590507507, "learning_rate": 9.487960725998039e-05, "loss": 0.9280298233032227, "memory(GiB)": 91.52, "step": 28985, "token_acc": 0.7522754688014036, "train_speed(iter/s)": 0.184033 }, { "epoch": 0.3761649727997301, "grad_norm": 0.6910062432289124, "learning_rate": 9.487724251552474e-05, "loss": 0.9764804840087891, "memory(GiB)": 91.52, "step": 28990, "token_acc": 0.7354881266490765, "train_speed(iter/s)": 0.184016 }, { "epoch": 0.3762298512013858, "grad_norm": 0.7747626304626465, "learning_rate": 9.48748772546247e-05, "loss": 0.9451152801513671, "memory(GiB)": 91.52, "step": 28995, "token_acc": 0.7369531056157472, "train_speed(iter/s)": 0.184001 }, { "epoch": 0.3762947296030415, "grad_norm": 0.7348217964172363, "learning_rate": 9.487251147730743e-05, "loss": 0.9340812683105468, "memory(GiB)": 91.52, "step": 29000, "token_acc": 0.7668270957546154, "train_speed(iter/s)": 0.183985 }, { "epoch": 0.3763596080046972, "grad_norm": 0.8046427965164185, "learning_rate": 9.48701451836002e-05, "loss": 0.9644765853881836, "memory(GiB)": 91.52, "step": 29005, "token_acc": 0.7645340939961967, "train_speed(iter/s)": 0.18397 }, { "epoch": 0.3764244864063529, "grad_norm": 0.7747892141342163, "learning_rate": 9.486777837353023e-05, "loss": 1.0281315803527833, "memory(GiB)": 91.52, "step": 29010, "token_acc": 0.7408738347683769, "train_speed(iter/s)": 0.183957 }, { "epoch": 0.3764893648080086, "grad_norm": 0.7454500794410706, "learning_rate": 9.486541104712476e-05, "loss": 0.9376877784729004, "memory(GiB)": 91.52, "step": 29015, "token_acc": 0.7458934713426982, "train_speed(iter/s)": 0.183941 }, { "epoch": 0.3765542432096643, "grad_norm": 0.8219878077507019, "learning_rate": 9.486304320441101e-05, "loss": 0.9823428153991699, "memory(GiB)": 91.52, "step": 29020, "token_acc": 0.7388108182035764, "train_speed(iter/s)": 0.183925 }, { "epoch": 0.37661912161132, "grad_norm": 0.8584590554237366, "learning_rate": 9.486067484541625e-05, "loss": 0.9371524810791015, "memory(GiB)": 91.52, "step": 29025, "token_acc": 0.7569626713327501, "train_speed(iter/s)": 0.183908 }, { "epoch": 0.3766840000129757, "grad_norm": 0.7688146233558655, "learning_rate": 9.485830597016773e-05, "loss": 0.9418889999389648, "memory(GiB)": 91.52, "step": 29030, "token_acc": 0.7529735770735212, "train_speed(iter/s)": 0.183893 }, { "epoch": 0.3767488784146314, "grad_norm": 0.7010014057159424, "learning_rate": 9.485593657869273e-05, "loss": 0.9438358306884765, "memory(GiB)": 91.52, "step": 29035, "token_acc": 0.76283835811171, "train_speed(iter/s)": 0.183877 }, { "epoch": 0.3768137568162871, "grad_norm": 0.7425881624221802, "learning_rate": 9.485356667101848e-05, "loss": 0.9090456008911133, "memory(GiB)": 91.52, "step": 29040, "token_acc": 0.7673958107731815, "train_speed(iter/s)": 0.183859 }, { "epoch": 0.3768786352179428, "grad_norm": 0.8633785247802734, "learning_rate": 9.48511962471723e-05, "loss": 0.995921516418457, "memory(GiB)": 91.52, "step": 29045, "token_acc": 0.7470884949187746, "train_speed(iter/s)": 0.183841 }, { "epoch": 0.3769435136195985, "grad_norm": 0.686268150806427, "learning_rate": 9.48488253071814e-05, "loss": 0.9231742858886719, "memory(GiB)": 91.52, "step": 29050, "token_acc": 0.7354146341463415, "train_speed(iter/s)": 0.183823 }, { "epoch": 0.3770083920212542, "grad_norm": 0.7110826969146729, "learning_rate": 9.484645385107313e-05, "loss": 0.9281765937805175, "memory(GiB)": 91.52, "step": 29055, "token_acc": 0.7701967170959585, "train_speed(iter/s)": 0.183807 }, { "epoch": 0.37707327042290983, "grad_norm": 0.8357815742492676, "learning_rate": 9.484408187887475e-05, "loss": 0.9552730560302735, "memory(GiB)": 91.52, "step": 29060, "token_acc": 0.7411465090577098, "train_speed(iter/s)": 0.183793 }, { "epoch": 0.37713814882456553, "grad_norm": 0.7065703272819519, "learning_rate": 9.484170939061355e-05, "loss": 0.9488509178161622, "memory(GiB)": 91.52, "step": 29065, "token_acc": 0.7378673332520425, "train_speed(iter/s)": 0.183776 }, { "epoch": 0.37720302722622123, "grad_norm": 0.7756434679031372, "learning_rate": 9.483933638631686e-05, "loss": 0.9233917236328125, "memory(GiB)": 91.52, "step": 29070, "token_acc": 0.7723306858640792, "train_speed(iter/s)": 0.18376 }, { "epoch": 0.37726790562787693, "grad_norm": 0.7251488566398621, "learning_rate": 9.483696286601196e-05, "loss": 0.9227480888366699, "memory(GiB)": 91.52, "step": 29075, "token_acc": 0.7485765657776446, "train_speed(iter/s)": 0.183743 }, { "epoch": 0.37733278402953263, "grad_norm": 0.7742865681648254, "learning_rate": 9.48345888297262e-05, "loss": 0.9730146408081055, "memory(GiB)": 91.52, "step": 29080, "token_acc": 0.7481259370314842, "train_speed(iter/s)": 0.183726 }, { "epoch": 0.37739766243118833, "grad_norm": 0.8499398231506348, "learning_rate": 9.483221427748686e-05, "loss": 0.9820785522460938, "memory(GiB)": 91.52, "step": 29085, "token_acc": 0.7656869757461473, "train_speed(iter/s)": 0.18371 }, { "epoch": 0.37746254083284403, "grad_norm": 0.8163697719573975, "learning_rate": 9.482983920932128e-05, "loss": 0.9398229598999024, "memory(GiB)": 91.52, "step": 29090, "token_acc": 0.7389228206551915, "train_speed(iter/s)": 0.183694 }, { "epoch": 0.37752741923449973, "grad_norm": 0.7636057138442993, "learning_rate": 9.48274636252568e-05, "loss": 1.041097640991211, "memory(GiB)": 91.52, "step": 29095, "token_acc": 0.7546613961813843, "train_speed(iter/s)": 0.183679 }, { "epoch": 0.3775922976361554, "grad_norm": 0.7263864874839783, "learning_rate": 9.482508752532076e-05, "loss": 0.8915014266967773, "memory(GiB)": 91.52, "step": 29100, "token_acc": 0.7591089235489644, "train_speed(iter/s)": 0.183658 }, { "epoch": 0.3776571760378111, "grad_norm": 0.8483531475067139, "learning_rate": 9.482271090954049e-05, "loss": 0.9584510803222657, "memory(GiB)": 91.52, "step": 29105, "token_acc": 0.744483965183193, "train_speed(iter/s)": 0.183643 }, { "epoch": 0.3777220544394668, "grad_norm": 0.8701364398002625, "learning_rate": 9.482033377794334e-05, "loss": 0.9527748107910157, "memory(GiB)": 91.52, "step": 29110, "token_acc": 0.7540793336519424, "train_speed(iter/s)": 0.183626 }, { "epoch": 0.3777869328411225, "grad_norm": 0.7701944708824158, "learning_rate": 9.481795613055668e-05, "loss": 0.9676499366760254, "memory(GiB)": 91.52, "step": 29115, "token_acc": 0.765961162335848, "train_speed(iter/s)": 0.183611 }, { "epoch": 0.3778518112427782, "grad_norm": 0.7883363366127014, "learning_rate": 9.481557796740788e-05, "loss": 1.0157648086547852, "memory(GiB)": 91.52, "step": 29120, "token_acc": 0.7307351638618246, "train_speed(iter/s)": 0.183596 }, { "epoch": 0.3779166896444339, "grad_norm": 0.6841316223144531, "learning_rate": 9.481319928852428e-05, "loss": 0.9167215347290039, "memory(GiB)": 91.52, "step": 29125, "token_acc": 0.7506215337540639, "train_speed(iter/s)": 0.183579 }, { "epoch": 0.3779815680460896, "grad_norm": 0.881395697593689, "learning_rate": 9.481082009393327e-05, "loss": 0.9751447677612305, "memory(GiB)": 91.52, "step": 29130, "token_acc": 0.74951171875, "train_speed(iter/s)": 0.183565 }, { "epoch": 0.3780464464477453, "grad_norm": 0.7161197662353516, "learning_rate": 9.480844038366221e-05, "loss": 0.9190157890319824, "memory(GiB)": 91.52, "step": 29135, "token_acc": 0.757254483494544, "train_speed(iter/s)": 0.183548 }, { "epoch": 0.378111324849401, "grad_norm": 0.7494566440582275, "learning_rate": 9.480606015773852e-05, "loss": 1.0135672569274903, "memory(GiB)": 91.52, "step": 29140, "token_acc": 0.7267153197344021, "train_speed(iter/s)": 0.183534 }, { "epoch": 0.3781762032510567, "grad_norm": 0.8330870270729065, "learning_rate": 9.480367941618958e-05, "loss": 0.9878274917602539, "memory(GiB)": 91.52, "step": 29145, "token_acc": 0.7386554321825127, "train_speed(iter/s)": 0.183519 }, { "epoch": 0.3782410816527124, "grad_norm": 0.761507511138916, "learning_rate": 9.480129815904277e-05, "loss": 0.961122703552246, "memory(GiB)": 91.52, "step": 29150, "token_acc": 0.751389590702375, "train_speed(iter/s)": 0.183505 }, { "epoch": 0.3783059600543681, "grad_norm": 0.791897714138031, "learning_rate": 9.47989163863255e-05, "loss": 0.9168639183044434, "memory(GiB)": 91.52, "step": 29155, "token_acc": 0.7560570987654321, "train_speed(iter/s)": 0.183488 }, { "epoch": 0.3783708384560238, "grad_norm": 0.8082398176193237, "learning_rate": 9.479653409806519e-05, "loss": 0.9507776260375976, "memory(GiB)": 91.52, "step": 29160, "token_acc": 0.7274128440366973, "train_speed(iter/s)": 0.183472 }, { "epoch": 0.3784357168576795, "grad_norm": 0.7673718929290771, "learning_rate": 9.479415129428924e-05, "loss": 0.9744953155517578, "memory(GiB)": 91.52, "step": 29165, "token_acc": 0.7594605475040258, "train_speed(iter/s)": 0.183454 }, { "epoch": 0.3785005952593352, "grad_norm": 0.8009763360023499, "learning_rate": 9.479176797502509e-05, "loss": 0.8600917816162109, "memory(GiB)": 91.52, "step": 29170, "token_acc": 0.7783423629572299, "train_speed(iter/s)": 0.183439 }, { "epoch": 0.3785654736609909, "grad_norm": 0.7665990591049194, "learning_rate": 9.478938414030017e-05, "loss": 0.9584840774536133, "memory(GiB)": 91.52, "step": 29175, "token_acc": 0.7539531272059862, "train_speed(iter/s)": 0.183422 }, { "epoch": 0.37863035206264656, "grad_norm": 0.7517569661140442, "learning_rate": 9.478699979014187e-05, "loss": 0.9193716049194336, "memory(GiB)": 91.52, "step": 29180, "token_acc": 0.7551097336237226, "train_speed(iter/s)": 0.183406 }, { "epoch": 0.37869523046430226, "grad_norm": 0.8431653380393982, "learning_rate": 9.478461492457768e-05, "loss": 0.9319933891296387, "memory(GiB)": 91.52, "step": 29185, "token_acc": 0.7465001891789633, "train_speed(iter/s)": 0.18339 }, { "epoch": 0.37876010886595796, "grad_norm": 0.7325209379196167, "learning_rate": 9.478222954363501e-05, "loss": 0.9298042297363281, "memory(GiB)": 91.52, "step": 29190, "token_acc": 0.7533798300994943, "train_speed(iter/s)": 0.183376 }, { "epoch": 0.37882498726761366, "grad_norm": 0.7142398953437805, "learning_rate": 9.477984364734134e-05, "loss": 0.9411770820617675, "memory(GiB)": 91.52, "step": 29195, "token_acc": 0.7586330242412569, "train_speed(iter/s)": 0.183358 }, { "epoch": 0.37888986566926935, "grad_norm": 0.7121292948722839, "learning_rate": 9.477745723572409e-05, "loss": 0.9517118453979492, "memory(GiB)": 91.52, "step": 29200, "token_acc": 0.7598598670904363, "train_speed(iter/s)": 0.183343 }, { "epoch": 0.37895474407092505, "grad_norm": 0.7735239267349243, "learning_rate": 9.477507030881076e-05, "loss": 0.9827864646911622, "memory(GiB)": 91.52, "step": 29205, "token_acc": 0.7278688524590164, "train_speed(iter/s)": 0.183328 }, { "epoch": 0.37901962247258075, "grad_norm": 0.7955101728439331, "learning_rate": 9.47726828666288e-05, "loss": 0.9410181045532227, "memory(GiB)": 91.52, "step": 29210, "token_acc": 0.7430962968498935, "train_speed(iter/s)": 0.183314 }, { "epoch": 0.37908450087423645, "grad_norm": 0.8094510436058044, "learning_rate": 9.477029490920569e-05, "loss": 0.995123291015625, "memory(GiB)": 91.52, "step": 29215, "token_acc": 0.7140825035561877, "train_speed(iter/s)": 0.183298 }, { "epoch": 0.37914937927589215, "grad_norm": 0.8292470574378967, "learning_rate": 9.476790643656889e-05, "loss": 0.946002197265625, "memory(GiB)": 91.52, "step": 29220, "token_acc": 0.7506190177020584, "train_speed(iter/s)": 0.183283 }, { "epoch": 0.37921425767754785, "grad_norm": 0.826452374458313, "learning_rate": 9.476551744874594e-05, "loss": 0.9433778762817383, "memory(GiB)": 91.52, "step": 29225, "token_acc": 0.7414396268419379, "train_speed(iter/s)": 0.183269 }, { "epoch": 0.37927913607920355, "grad_norm": 0.7441685199737549, "learning_rate": 9.476312794576425e-05, "loss": 0.9912550926208497, "memory(GiB)": 91.52, "step": 29230, "token_acc": 0.74659664155898, "train_speed(iter/s)": 0.183251 }, { "epoch": 0.37934401448085925, "grad_norm": 0.8346419930458069, "learning_rate": 9.47607379276514e-05, "loss": 0.9236283302307129, "memory(GiB)": 91.52, "step": 29235, "token_acc": 0.7375861936130051, "train_speed(iter/s)": 0.183236 }, { "epoch": 0.37940889288251495, "grad_norm": 0.7712430953979492, "learning_rate": 9.475834739443485e-05, "loss": 0.9967848777770996, "memory(GiB)": 91.52, "step": 29240, "token_acc": 0.731110142400465, "train_speed(iter/s)": 0.183221 }, { "epoch": 0.37947377128417065, "grad_norm": 0.8492328524589539, "learning_rate": 9.475595634614211e-05, "loss": 0.9504640579223633, "memory(GiB)": 91.52, "step": 29245, "token_acc": 0.7327188940092166, "train_speed(iter/s)": 0.183206 }, { "epoch": 0.37953864968582635, "grad_norm": 0.8107526302337646, "learning_rate": 9.47535647828007e-05, "loss": 0.9249591827392578, "memory(GiB)": 91.52, "step": 29250, "token_acc": 0.7446686268545744, "train_speed(iter/s)": 0.18319 }, { "epoch": 0.37960352808748205, "grad_norm": 0.850631594657898, "learning_rate": 9.475117270443817e-05, "loss": 0.9458463668823243, "memory(GiB)": 91.52, "step": 29255, "token_acc": 0.7661568090070738, "train_speed(iter/s)": 0.183176 }, { "epoch": 0.37966840648913774, "grad_norm": 0.7157258987426758, "learning_rate": 9.474878011108201e-05, "loss": 0.9146360397338867, "memory(GiB)": 91.52, "step": 29260, "token_acc": 0.7549931129476584, "train_speed(iter/s)": 0.183159 }, { "epoch": 0.37973328489079344, "grad_norm": 0.7933359146118164, "learning_rate": 9.474638700275977e-05, "loss": 0.953946304321289, "memory(GiB)": 91.52, "step": 29265, "token_acc": 0.7592233747482807, "train_speed(iter/s)": 0.183143 }, { "epoch": 0.37979816329244914, "grad_norm": 0.7843176126480103, "learning_rate": 9.474399337949899e-05, "loss": 0.9301487922668457, "memory(GiB)": 91.52, "step": 29270, "token_acc": 0.754572106659221, "train_speed(iter/s)": 0.183127 }, { "epoch": 0.37986304169410484, "grad_norm": 0.7687445282936096, "learning_rate": 9.474159924132722e-05, "loss": 0.9500977516174316, "memory(GiB)": 91.52, "step": 29275, "token_acc": 0.7512196786189486, "train_speed(iter/s)": 0.183113 }, { "epoch": 0.37992792009576054, "grad_norm": 0.8083987832069397, "learning_rate": 9.473920458827198e-05, "loss": 0.9333341598510743, "memory(GiB)": 91.52, "step": 29280, "token_acc": 0.7358384929521802, "train_speed(iter/s)": 0.183099 }, { "epoch": 0.37999279849741624, "grad_norm": 0.728131115436554, "learning_rate": 9.473680942036088e-05, "loss": 0.8819353103637695, "memory(GiB)": 91.52, "step": 29285, "token_acc": 0.747611817549187, "train_speed(iter/s)": 0.183084 }, { "epoch": 0.38005767689907194, "grad_norm": 0.7720640301704407, "learning_rate": 9.473441373762144e-05, "loss": 0.9187593460083008, "memory(GiB)": 91.52, "step": 29290, "token_acc": 0.7638444767441861, "train_speed(iter/s)": 0.183068 }, { "epoch": 0.38012255530072764, "grad_norm": 0.7916444540023804, "learning_rate": 9.473201754008126e-05, "loss": 0.932147216796875, "memory(GiB)": 91.52, "step": 29295, "token_acc": 0.7685175392208707, "train_speed(iter/s)": 0.183052 }, { "epoch": 0.3801874337023833, "grad_norm": 0.7439457774162292, "learning_rate": 9.472962082776789e-05, "loss": 0.9069394111633301, "memory(GiB)": 91.52, "step": 29300, "token_acc": 0.7574013628815209, "train_speed(iter/s)": 0.183035 }, { "epoch": 0.380252312104039, "grad_norm": 0.7551634311676025, "learning_rate": 9.472722360070894e-05, "loss": 0.9512507438659668, "memory(GiB)": 91.52, "step": 29305, "token_acc": 0.7549613826065875, "train_speed(iter/s)": 0.183019 }, { "epoch": 0.3803171905056947, "grad_norm": 0.7404981851577759, "learning_rate": 9.472482585893196e-05, "loss": 0.8920130729675293, "memory(GiB)": 91.52, "step": 29310, "token_acc": 0.7641668832856772, "train_speed(iter/s)": 0.183002 }, { "epoch": 0.3803820689073504, "grad_norm": 0.8614546060562134, "learning_rate": 9.472242760246456e-05, "loss": 0.9762876510620118, "memory(GiB)": 91.52, "step": 29315, "token_acc": 0.7490074352125893, "train_speed(iter/s)": 0.182983 }, { "epoch": 0.3804469473090061, "grad_norm": 0.8086790442466736, "learning_rate": 9.472002883133435e-05, "loss": 0.9500862121582031, "memory(GiB)": 91.52, "step": 29320, "token_acc": 0.7416535809380761, "train_speed(iter/s)": 0.182967 }, { "epoch": 0.3805118257106618, "grad_norm": 0.7501413822174072, "learning_rate": 9.471762954556894e-05, "loss": 0.9264741897583008, "memory(GiB)": 91.52, "step": 29325, "token_acc": 0.7607090573483547, "train_speed(iter/s)": 0.18295 }, { "epoch": 0.3805767041123175, "grad_norm": 0.912013053894043, "learning_rate": 9.471522974519591e-05, "loss": 0.9762985229492187, "memory(GiB)": 91.52, "step": 29330, "token_acc": 0.7568306010928961, "train_speed(iter/s)": 0.182934 }, { "epoch": 0.3806415825139732, "grad_norm": 0.7839206457138062, "learning_rate": 9.471282943024288e-05, "loss": 0.962161636352539, "memory(GiB)": 91.52, "step": 29335, "token_acc": 0.7536986941856193, "train_speed(iter/s)": 0.182917 }, { "epoch": 0.3807064609156289, "grad_norm": 0.8213853240013123, "learning_rate": 9.471042860073752e-05, "loss": 0.9577720642089844, "memory(GiB)": 91.52, "step": 29340, "token_acc": 0.7418290965685911, "train_speed(iter/s)": 0.182903 }, { "epoch": 0.3807713393172846, "grad_norm": 0.7635225653648376, "learning_rate": 9.47080272567074e-05, "loss": 0.9761148452758789, "memory(GiB)": 91.52, "step": 29345, "token_acc": 0.7476785470435614, "train_speed(iter/s)": 0.182889 }, { "epoch": 0.3808362177189403, "grad_norm": 0.7697716951370239, "learning_rate": 9.470562539818019e-05, "loss": 0.9589820861816406, "memory(GiB)": 91.52, "step": 29350, "token_acc": 0.7318558935645414, "train_speed(iter/s)": 0.182873 }, { "epoch": 0.38090109612059597, "grad_norm": 0.749758780002594, "learning_rate": 9.470322302518353e-05, "loss": 0.9417182922363281, "memory(GiB)": 91.52, "step": 29355, "token_acc": 0.7392521184793595, "train_speed(iter/s)": 0.182858 }, { "epoch": 0.38096597452225167, "grad_norm": 0.7738716006278992, "learning_rate": 9.470082013774504e-05, "loss": 0.9126714706420899, "memory(GiB)": 91.52, "step": 29360, "token_acc": 0.7368732874176109, "train_speed(iter/s)": 0.182843 }, { "epoch": 0.38103085292390737, "grad_norm": 0.7336829900741577, "learning_rate": 9.46984167358924e-05, "loss": 0.9231822967529297, "memory(GiB)": 91.52, "step": 29365, "token_acc": 0.7670213482389386, "train_speed(iter/s)": 0.182826 }, { "epoch": 0.38109573132556307, "grad_norm": 0.7888122797012329, "learning_rate": 9.469601281965326e-05, "loss": 0.9198203086853027, "memory(GiB)": 91.52, "step": 29370, "token_acc": 0.7519320779362142, "train_speed(iter/s)": 0.182811 }, { "epoch": 0.38116060972721877, "grad_norm": 0.7408748865127563, "learning_rate": 9.469360838905525e-05, "loss": 0.9598278045654297, "memory(GiB)": 91.52, "step": 29375, "token_acc": 0.7376134477522336, "train_speed(iter/s)": 0.182795 }, { "epoch": 0.38122548812887447, "grad_norm": 2.839716911315918, "learning_rate": 9.469120344412611e-05, "loss": 0.9435161590576172, "memory(GiB)": 91.52, "step": 29380, "token_acc": 0.7487463076183279, "train_speed(iter/s)": 0.182779 }, { "epoch": 0.38129036653053017, "grad_norm": 0.723235011100769, "learning_rate": 9.468879798489345e-05, "loss": 0.9206424713134765, "memory(GiB)": 91.52, "step": 29385, "token_acc": 0.7631348766691936, "train_speed(iter/s)": 0.182764 }, { "epoch": 0.38135524493218587, "grad_norm": 0.7653849720954895, "learning_rate": 9.4686392011385e-05, "loss": 0.9377463340759278, "memory(GiB)": 91.52, "step": 29390, "token_acc": 0.740627029580209, "train_speed(iter/s)": 0.182747 }, { "epoch": 0.38142012333384157, "grad_norm": 0.7527067065238953, "learning_rate": 9.46839855236284e-05, "loss": 0.9417582511901855, "memory(GiB)": 91.52, "step": 29395, "token_acc": 0.7399140606349964, "train_speed(iter/s)": 0.182731 }, { "epoch": 0.38148500173549726, "grad_norm": 0.786146879196167, "learning_rate": 9.46815785216514e-05, "loss": 0.9154035568237304, "memory(GiB)": 91.52, "step": 29400, "token_acc": 0.7675922671353251, "train_speed(iter/s)": 0.182715 }, { "epoch": 0.38154988013715296, "grad_norm": 0.8244863748550415, "learning_rate": 9.467917100548166e-05, "loss": 0.9394394874572753, "memory(GiB)": 91.52, "step": 29405, "token_acc": 0.7601614889831668, "train_speed(iter/s)": 0.1827 }, { "epoch": 0.38161475853880866, "grad_norm": 0.8139168620109558, "learning_rate": 9.467676297514689e-05, "loss": 0.9403894424438477, "memory(GiB)": 91.52, "step": 29410, "token_acc": 0.7503014359311629, "train_speed(iter/s)": 0.182685 }, { "epoch": 0.38167963694046436, "grad_norm": 0.7666898965835571, "learning_rate": 9.46743544306748e-05, "loss": 0.9804423332214356, "memory(GiB)": 91.52, "step": 29415, "token_acc": 0.7534401007990981, "train_speed(iter/s)": 0.182669 }, { "epoch": 0.38174451534212, "grad_norm": 0.7322013974189758, "learning_rate": 9.467194537209314e-05, "loss": 0.8929375648498535, "memory(GiB)": 91.52, "step": 29420, "token_acc": 0.7616405667412379, "train_speed(iter/s)": 0.182654 }, { "epoch": 0.3818093937437757, "grad_norm": 0.8023943901062012, "learning_rate": 9.466953579942959e-05, "loss": 0.9424993515014648, "memory(GiB)": 91.52, "step": 29425, "token_acc": 0.7434455612567653, "train_speed(iter/s)": 0.182638 }, { "epoch": 0.3818742721454314, "grad_norm": 0.7730353474617004, "learning_rate": 9.466712571271191e-05, "loss": 1.019547462463379, "memory(GiB)": 91.52, "step": 29430, "token_acc": 0.7448933113766161, "train_speed(iter/s)": 0.182621 }, { "epoch": 0.3819391505470871, "grad_norm": 0.7988215088844299, "learning_rate": 9.46647151119678e-05, "loss": 0.9203204154968262, "memory(GiB)": 91.52, "step": 29435, "token_acc": 0.7558374171230903, "train_speed(iter/s)": 0.182605 }, { "epoch": 0.3820040289487428, "grad_norm": 0.8260475993156433, "learning_rate": 9.466230399722503e-05, "loss": 0.9484958648681641, "memory(GiB)": 91.52, "step": 29440, "token_acc": 0.7556415449153768, "train_speed(iter/s)": 0.182591 }, { "epoch": 0.3820689073503985, "grad_norm": 0.8556408286094666, "learning_rate": 9.465989236851134e-05, "loss": 0.9290637969970703, "memory(GiB)": 91.52, "step": 29445, "token_acc": 0.7508471704506947, "train_speed(iter/s)": 0.182576 }, { "epoch": 0.3821337857520542, "grad_norm": 0.8393327593803406, "learning_rate": 9.465748022585448e-05, "loss": 0.9522022247314453, "memory(GiB)": 91.52, "step": 29450, "token_acc": 0.749869337979094, "train_speed(iter/s)": 0.182558 }, { "epoch": 0.3821986641537099, "grad_norm": 0.7956961989402771, "learning_rate": 9.465506756928224e-05, "loss": 1.0001312255859376, "memory(GiB)": 91.52, "step": 29455, "token_acc": 0.7297651556526489, "train_speed(iter/s)": 0.182544 }, { "epoch": 0.3822635425553656, "grad_norm": 0.7268287539482117, "learning_rate": 9.465265439882232e-05, "loss": 0.9501717567443848, "memory(GiB)": 91.52, "step": 29460, "token_acc": 0.7495208433157643, "train_speed(iter/s)": 0.18253 }, { "epoch": 0.3823284209570213, "grad_norm": 0.8445721864700317, "learning_rate": 9.465024071450255e-05, "loss": 0.9389178276062011, "memory(GiB)": 91.52, "step": 29465, "token_acc": 0.7374730515450447, "train_speed(iter/s)": 0.182516 }, { "epoch": 0.382393299358677, "grad_norm": 0.7114868760108948, "learning_rate": 9.464782651635067e-05, "loss": 0.9646910667419434, "memory(GiB)": 91.52, "step": 29470, "token_acc": 0.7612427950559443, "train_speed(iter/s)": 0.1825 }, { "epoch": 0.3824581777603327, "grad_norm": 0.7596234679222107, "learning_rate": 9.464541180439448e-05, "loss": 0.9195403099060059, "memory(GiB)": 91.52, "step": 29475, "token_acc": 0.754957056515746, "train_speed(iter/s)": 0.182484 }, { "epoch": 0.3825230561619884, "grad_norm": 0.7219467759132385, "learning_rate": 9.464299657866176e-05, "loss": 0.9649204254150391, "memory(GiB)": 91.52, "step": 29480, "token_acc": 0.752436203693171, "train_speed(iter/s)": 0.182469 }, { "epoch": 0.3825879345636441, "grad_norm": 0.784739077091217, "learning_rate": 9.464058083918033e-05, "loss": 0.9586694717407227, "memory(GiB)": 91.52, "step": 29485, "token_acc": 0.7348668280871671, "train_speed(iter/s)": 0.182456 }, { "epoch": 0.3826528129652998, "grad_norm": 0.7849181294441223, "learning_rate": 9.463816458597794e-05, "loss": 0.9466163635253906, "memory(GiB)": 91.52, "step": 29490, "token_acc": 0.732368841381005, "train_speed(iter/s)": 0.182441 }, { "epoch": 0.3827176913669555, "grad_norm": 0.7588423490524292, "learning_rate": 9.463574781908245e-05, "loss": 0.9387908935546875, "memory(GiB)": 91.52, "step": 29495, "token_acc": 0.7562952627732377, "train_speed(iter/s)": 0.182427 }, { "epoch": 0.3827825697686112, "grad_norm": 0.8197699785232544, "learning_rate": 9.463333053852163e-05, "loss": 0.9554962158203125, "memory(GiB)": 91.52, "step": 29500, "token_acc": 0.755581112163442, "train_speed(iter/s)": 0.182414 }, { "epoch": 0.3828474481702669, "grad_norm": 0.8472199440002441, "learning_rate": 9.463091274432331e-05, "loss": 0.9910726547241211, "memory(GiB)": 91.52, "step": 29505, "token_acc": 0.7249740932642487, "train_speed(iter/s)": 0.182399 }, { "epoch": 0.3829123265719226, "grad_norm": 0.8306491374969482, "learning_rate": 9.462849443651533e-05, "loss": 0.9918307304382324, "memory(GiB)": 91.52, "step": 29510, "token_acc": 0.7459499263622975, "train_speed(iter/s)": 0.182383 }, { "epoch": 0.3829772049735783, "grad_norm": 0.7718955278396606, "learning_rate": 9.462607561512551e-05, "loss": 0.9209522247314453, "memory(GiB)": 91.52, "step": 29515, "token_acc": 0.7580188852555388, "train_speed(iter/s)": 0.182368 }, { "epoch": 0.383042083375234, "grad_norm": 0.7923986911773682, "learning_rate": 9.462365628018168e-05, "loss": 0.9684274673461915, "memory(GiB)": 91.52, "step": 29520, "token_acc": 0.7319049689686524, "train_speed(iter/s)": 0.182354 }, { "epoch": 0.3831069617768897, "grad_norm": 0.8427495956420898, "learning_rate": 9.46212364317117e-05, "loss": 0.919710350036621, "memory(GiB)": 91.52, "step": 29525, "token_acc": 0.7470781352593565, "train_speed(iter/s)": 0.182341 }, { "epoch": 0.3831718401785454, "grad_norm": 0.7062879204750061, "learning_rate": 9.461881606974338e-05, "loss": 0.9444646835327148, "memory(GiB)": 91.52, "step": 29530, "token_acc": 0.7483058740405305, "train_speed(iter/s)": 0.182324 }, { "epoch": 0.3832367185802011, "grad_norm": 0.805936336517334, "learning_rate": 9.461639519430461e-05, "loss": 0.9510909080505371, "memory(GiB)": 91.52, "step": 29535, "token_acc": 0.7736065629114479, "train_speed(iter/s)": 0.18231 }, { "epoch": 0.38330159698185673, "grad_norm": 0.7025217413902283, "learning_rate": 9.461397380542323e-05, "loss": 0.9405929565429687, "memory(GiB)": 91.52, "step": 29540, "token_acc": 0.7575275278526651, "train_speed(iter/s)": 0.182294 }, { "epoch": 0.38336647538351243, "grad_norm": 0.8789684772491455, "learning_rate": 9.46115519031271e-05, "loss": 0.9837116241455078, "memory(GiB)": 91.52, "step": 29545, "token_acc": 0.714829600059141, "train_speed(iter/s)": 0.182281 }, { "epoch": 0.3834313537851681, "grad_norm": 0.8994116187095642, "learning_rate": 9.460912948744411e-05, "loss": 0.939870548248291, "memory(GiB)": 91.52, "step": 29550, "token_acc": 0.7210636583400484, "train_speed(iter/s)": 0.182266 }, { "epoch": 0.3834962321868238, "grad_norm": 0.8430866003036499, "learning_rate": 9.460670655840214e-05, "loss": 0.9636093139648437, "memory(GiB)": 91.52, "step": 29555, "token_acc": 0.7363573883161512, "train_speed(iter/s)": 0.182252 }, { "epoch": 0.3835611105884795, "grad_norm": 0.7508744597434998, "learning_rate": 9.460428311602905e-05, "loss": 0.9866680145263672, "memory(GiB)": 91.52, "step": 29560, "token_acc": 0.7263250039789909, "train_speed(iter/s)": 0.182237 }, { "epoch": 0.3836259889901352, "grad_norm": 0.7115780711174011, "learning_rate": 9.460185916035275e-05, "loss": 0.9355268478393555, "memory(GiB)": 91.52, "step": 29565, "token_acc": 0.7351961574869403, "train_speed(iter/s)": 0.18222 }, { "epoch": 0.3836908673917909, "grad_norm": 0.7662344574928284, "learning_rate": 9.459943469140111e-05, "loss": 0.9722648620605469, "memory(GiB)": 91.52, "step": 29570, "token_acc": 0.7366848532086256, "train_speed(iter/s)": 0.182205 }, { "epoch": 0.3837557457934466, "grad_norm": 0.7655537128448486, "learning_rate": 9.459700970920207e-05, "loss": 0.9316341400146484, "memory(GiB)": 91.52, "step": 29575, "token_acc": 0.7679112813148237, "train_speed(iter/s)": 0.182189 }, { "epoch": 0.3838206241951023, "grad_norm": 0.7845095992088318, "learning_rate": 9.459458421378349e-05, "loss": 1.002608299255371, "memory(GiB)": 91.52, "step": 29580, "token_acc": 0.7590582680263632, "train_speed(iter/s)": 0.182172 }, { "epoch": 0.383885502596758, "grad_norm": 0.7789335250854492, "learning_rate": 9.45921582051733e-05, "loss": 0.9663227081298829, "memory(GiB)": 91.52, "step": 29585, "token_acc": 0.7517273764003488, "train_speed(iter/s)": 0.182156 }, { "epoch": 0.3839503809984137, "grad_norm": 0.8462551832199097, "learning_rate": 9.458973168339945e-05, "loss": 0.9651735305786133, "memory(GiB)": 91.52, "step": 29590, "token_acc": 0.7426498796777457, "train_speed(iter/s)": 0.182142 }, { "epoch": 0.3840152594000694, "grad_norm": 0.8613731861114502, "learning_rate": 9.458730464848983e-05, "loss": 0.9766911506652832, "memory(GiB)": 91.52, "step": 29595, "token_acc": 0.7512632532055963, "train_speed(iter/s)": 0.182125 }, { "epoch": 0.3840801378017251, "grad_norm": 0.7310112714767456, "learning_rate": 9.458487710047237e-05, "loss": 0.9332450866699219, "memory(GiB)": 91.52, "step": 29600, "token_acc": 0.751384739333311, "train_speed(iter/s)": 0.182108 }, { "epoch": 0.3841450162033808, "grad_norm": 0.7656012773513794, "learning_rate": 9.458244903937502e-05, "loss": 0.9238901138305664, "memory(GiB)": 91.52, "step": 29605, "token_acc": 0.7512258683948839, "train_speed(iter/s)": 0.182092 }, { "epoch": 0.3842098946050365, "grad_norm": 0.7722537517547607, "learning_rate": 9.458002046522571e-05, "loss": 0.9604939460754395, "memory(GiB)": 91.52, "step": 29610, "token_acc": 0.7531450094161959, "train_speed(iter/s)": 0.182078 }, { "epoch": 0.3842747730066922, "grad_norm": 0.8110567331314087, "learning_rate": 9.457759137805239e-05, "loss": 0.9036028861999512, "memory(GiB)": 91.52, "step": 29615, "token_acc": 0.7477584757635192, "train_speed(iter/s)": 0.182062 }, { "epoch": 0.3843396514083479, "grad_norm": 0.7878473997116089, "learning_rate": 9.457516177788301e-05, "loss": 0.9326549530029297, "memory(GiB)": 91.52, "step": 29620, "token_acc": 0.7496729021535415, "train_speed(iter/s)": 0.182047 }, { "epoch": 0.3844045298100036, "grad_norm": 0.8431857228279114, "learning_rate": 9.457273166474556e-05, "loss": 0.9511293411254883, "memory(GiB)": 91.52, "step": 29625, "token_acc": 0.7397498262682418, "train_speed(iter/s)": 0.182034 }, { "epoch": 0.3844694082116593, "grad_norm": 0.7420701384544373, "learning_rate": 9.457030103866798e-05, "loss": 0.9270978927612304, "memory(GiB)": 91.52, "step": 29630, "token_acc": 0.7219620039719714, "train_speed(iter/s)": 0.182018 }, { "epoch": 0.384534286613315, "grad_norm": 0.7905776500701904, "learning_rate": 9.456786989967823e-05, "loss": 0.9352394104003906, "memory(GiB)": 91.52, "step": 29635, "token_acc": 0.7464351467456048, "train_speed(iter/s)": 0.182004 }, { "epoch": 0.3845991650149707, "grad_norm": 0.7984187006950378, "learning_rate": 9.456543824780431e-05, "loss": 0.9888298034667968, "memory(GiB)": 91.52, "step": 29640, "token_acc": 0.7405881952481449, "train_speed(iter/s)": 0.18199 }, { "epoch": 0.3846640434166264, "grad_norm": 0.8511672616004944, "learning_rate": 9.456300608307418e-05, "loss": 0.9707904815673828, "memory(GiB)": 91.52, "step": 29645, "token_acc": 0.7432502427970217, "train_speed(iter/s)": 0.181976 }, { "epoch": 0.3847289218182821, "grad_norm": 0.9421770572662354, "learning_rate": 9.456057340551587e-05, "loss": 0.9147563934326172, "memory(GiB)": 91.52, "step": 29650, "token_acc": 0.7540115222518656, "train_speed(iter/s)": 0.181962 }, { "epoch": 0.3847938002199378, "grad_norm": 0.7428297996520996, "learning_rate": 9.455814021515733e-05, "loss": 0.9499689102172851, "memory(GiB)": 91.52, "step": 29655, "token_acc": 0.7443045368988123, "train_speed(iter/s)": 0.181945 }, { "epoch": 0.38485867862159345, "grad_norm": 0.6965286731719971, "learning_rate": 9.455570651202658e-05, "loss": 0.9122236251831055, "memory(GiB)": 91.52, "step": 29660, "token_acc": 0.751892346509672, "train_speed(iter/s)": 0.18193 }, { "epoch": 0.38492355702324915, "grad_norm": 0.754448413848877, "learning_rate": 9.455327229615164e-05, "loss": 0.9523536682128906, "memory(GiB)": 91.52, "step": 29665, "token_acc": 0.7422122190690543, "train_speed(iter/s)": 0.181914 }, { "epoch": 0.38498843542490485, "grad_norm": 0.8167515993118286, "learning_rate": 9.455083756756051e-05, "loss": 0.9557939529418945, "memory(GiB)": 91.52, "step": 29670, "token_acc": 0.7590053917632212, "train_speed(iter/s)": 0.1819 }, { "epoch": 0.38505331382656055, "grad_norm": 0.702680230140686, "learning_rate": 9.45484023262812e-05, "loss": 0.9454082489013672, "memory(GiB)": 91.52, "step": 29675, "token_acc": 0.7363396435437098, "train_speed(iter/s)": 0.181886 }, { "epoch": 0.38511819222821625, "grad_norm": 0.8442196846008301, "learning_rate": 9.454596657234173e-05, "loss": 0.9563450813293457, "memory(GiB)": 91.52, "step": 29680, "token_acc": 0.7432357594936709, "train_speed(iter/s)": 0.18187 }, { "epoch": 0.38518307062987195, "grad_norm": 0.779114842414856, "learning_rate": 9.454353030577017e-05, "loss": 0.9631040573120118, "memory(GiB)": 91.52, "step": 29685, "token_acc": 0.7570356472795498, "train_speed(iter/s)": 0.181857 }, { "epoch": 0.38524794903152765, "grad_norm": 0.7511851191520691, "learning_rate": 9.454109352659452e-05, "loss": 0.969449234008789, "memory(GiB)": 91.52, "step": 29690, "token_acc": 0.7359602986087547, "train_speed(iter/s)": 0.181844 }, { "epoch": 0.38531282743318335, "grad_norm": 0.6538289189338684, "learning_rate": 9.453865623484284e-05, "loss": 0.951841163635254, "memory(GiB)": 91.52, "step": 29695, "token_acc": 0.760097467326294, "train_speed(iter/s)": 0.181826 }, { "epoch": 0.38537770583483905, "grad_norm": 0.8614253401756287, "learning_rate": 9.453621843054315e-05, "loss": 0.9419999122619629, "memory(GiB)": 91.52, "step": 29700, "token_acc": 0.7275404484814079, "train_speed(iter/s)": 0.18181 }, { "epoch": 0.38544258423649475, "grad_norm": 0.8008512258529663, "learning_rate": 9.453378011372354e-05, "loss": 0.9469413757324219, "memory(GiB)": 91.52, "step": 29705, "token_acc": 0.7459652496394479, "train_speed(iter/s)": 0.181796 }, { "epoch": 0.38550746263815044, "grad_norm": 0.8590803146362305, "learning_rate": 9.453134128441205e-05, "loss": 0.913114356994629, "memory(GiB)": 91.52, "step": 29710, "token_acc": 0.7710013887657714, "train_speed(iter/s)": 0.181782 }, { "epoch": 0.38557234103980614, "grad_norm": 0.7396215796470642, "learning_rate": 9.452890194263675e-05, "loss": 0.9434070587158203, "memory(GiB)": 91.52, "step": 29715, "token_acc": 0.7595898673100121, "train_speed(iter/s)": 0.181767 }, { "epoch": 0.38563721944146184, "grad_norm": 0.7574953436851501, "learning_rate": 9.452646208842572e-05, "loss": 0.9558731079101562, "memory(GiB)": 91.52, "step": 29720, "token_acc": 0.7350606264070461, "train_speed(iter/s)": 0.181752 }, { "epoch": 0.38570209784311754, "grad_norm": 0.8733079433441162, "learning_rate": 9.452402172180702e-05, "loss": 0.9773347854614258, "memory(GiB)": 91.52, "step": 29725, "token_acc": 0.7284688995215312, "train_speed(iter/s)": 0.181739 }, { "epoch": 0.38576697624477324, "grad_norm": 0.7414464354515076, "learning_rate": 9.452158084280874e-05, "loss": 0.9626184463500976, "memory(GiB)": 91.52, "step": 29730, "token_acc": 0.7340190746433357, "train_speed(iter/s)": 0.181725 }, { "epoch": 0.38583185464642894, "grad_norm": 0.7665583491325378, "learning_rate": 9.451913945145899e-05, "loss": 0.9203171730041504, "memory(GiB)": 91.52, "step": 29735, "token_acc": 0.7668243243243243, "train_speed(iter/s)": 0.18171 }, { "epoch": 0.38589673304808464, "grad_norm": 0.7706335186958313, "learning_rate": 9.451669754778582e-05, "loss": 0.9592809677124023, "memory(GiB)": 91.52, "step": 29740, "token_acc": 0.7325183919149885, "train_speed(iter/s)": 0.181693 }, { "epoch": 0.38596161144974034, "grad_norm": 0.7622880935668945, "learning_rate": 9.451425513181739e-05, "loss": 0.9301303863525391, "memory(GiB)": 91.52, "step": 29745, "token_acc": 0.7638450615336068, "train_speed(iter/s)": 0.181679 }, { "epoch": 0.38602648985139604, "grad_norm": 0.7967594861984253, "learning_rate": 9.451181220358176e-05, "loss": 0.8636144638061524, "memory(GiB)": 91.52, "step": 29750, "token_acc": 0.7632259733540292, "train_speed(iter/s)": 0.181661 }, { "epoch": 0.38609136825305174, "grad_norm": 0.7107325792312622, "learning_rate": 9.450936876310707e-05, "loss": 0.9115863800048828, "memory(GiB)": 91.52, "step": 29755, "token_acc": 0.7521780420331144, "train_speed(iter/s)": 0.181647 }, { "epoch": 0.38615624665470744, "grad_norm": 0.7052758932113647, "learning_rate": 9.450692481042143e-05, "loss": 0.9596631050109863, "memory(GiB)": 91.52, "step": 29760, "token_acc": 0.7467962586958693, "train_speed(iter/s)": 0.181632 }, { "epoch": 0.38622112505636313, "grad_norm": 0.742453932762146, "learning_rate": 9.450448034555295e-05, "loss": 0.973331069946289, "memory(GiB)": 91.52, "step": 29765, "token_acc": 0.7601003160643122, "train_speed(iter/s)": 0.181619 }, { "epoch": 0.38628600345801883, "grad_norm": 0.7854361534118652, "learning_rate": 9.450203536852978e-05, "loss": 0.9939075469970703, "memory(GiB)": 91.52, "step": 29770, "token_acc": 0.7241266613406928, "train_speed(iter/s)": 0.181605 }, { "epoch": 0.3863508818596745, "grad_norm": 0.8777285218238831, "learning_rate": 9.449958987938006e-05, "loss": 1.0141307830810546, "memory(GiB)": 91.52, "step": 29775, "token_acc": 0.7473419749490242, "train_speed(iter/s)": 0.181591 }, { "epoch": 0.3864157602613302, "grad_norm": 0.7609989643096924, "learning_rate": 9.449714387813192e-05, "loss": 0.958011245727539, "memory(GiB)": 91.52, "step": 29780, "token_acc": 0.7516992353440951, "train_speed(iter/s)": 0.181576 }, { "epoch": 0.3864806386629859, "grad_norm": 0.7413404583930969, "learning_rate": 9.44946973648135e-05, "loss": 0.9634052276611328, "memory(GiB)": 91.52, "step": 29785, "token_acc": 0.7303232081400569, "train_speed(iter/s)": 0.181563 }, { "epoch": 0.3865455170646416, "grad_norm": 0.8662523031234741, "learning_rate": 9.449225033945297e-05, "loss": 0.9082571029663086, "memory(GiB)": 91.52, "step": 29790, "token_acc": 0.7526877814906291, "train_speed(iter/s)": 0.181548 }, { "epoch": 0.3866103954662973, "grad_norm": 0.8061839938163757, "learning_rate": 9.44898028020785e-05, "loss": 0.9343624114990234, "memory(GiB)": 91.52, "step": 29795, "token_acc": 0.7325697051368435, "train_speed(iter/s)": 0.181532 }, { "epoch": 0.386675273867953, "grad_norm": 0.7616939544677734, "learning_rate": 9.448735475271824e-05, "loss": 0.9295162200927735, "memory(GiB)": 91.52, "step": 29800, "token_acc": 0.7428967663374374, "train_speed(iter/s)": 0.181516 }, { "epoch": 0.3867401522696087, "grad_norm": 0.7176750898361206, "learning_rate": 9.448490619140036e-05, "loss": 0.9065488815307617, "memory(GiB)": 91.52, "step": 29805, "token_acc": 0.7460103531718075, "train_speed(iter/s)": 0.181501 }, { "epoch": 0.38680503067126437, "grad_norm": 0.7215604782104492, "learning_rate": 9.448245711815303e-05, "loss": 0.9033981323242187, "memory(GiB)": 91.52, "step": 29810, "token_acc": 0.7684291779235817, "train_speed(iter/s)": 0.181488 }, { "epoch": 0.38686990907292007, "grad_norm": 0.7780045866966248, "learning_rate": 9.448000753300446e-05, "loss": 0.9471174240112304, "memory(GiB)": 91.52, "step": 29815, "token_acc": 0.7536075347792275, "train_speed(iter/s)": 0.181475 }, { "epoch": 0.38693478747457577, "grad_norm": 0.874634861946106, "learning_rate": 9.447755743598283e-05, "loss": 0.9594826698303223, "memory(GiB)": 91.52, "step": 29820, "token_acc": 0.7533673784145046, "train_speed(iter/s)": 0.181459 }, { "epoch": 0.38699966587623147, "grad_norm": 0.8022376894950867, "learning_rate": 9.447510682711632e-05, "loss": 0.9356538772583007, "memory(GiB)": 91.52, "step": 29825, "token_acc": 0.7335141624794344, "train_speed(iter/s)": 0.181445 }, { "epoch": 0.38706454427788717, "grad_norm": 0.8102121949195862, "learning_rate": 9.447265570643316e-05, "loss": 0.9268770217895508, "memory(GiB)": 91.52, "step": 29830, "token_acc": 0.7381276467029643, "train_speed(iter/s)": 0.181432 }, { "epoch": 0.38712942267954287, "grad_norm": 0.7549089789390564, "learning_rate": 9.447020407396153e-05, "loss": 0.9917486190795899, "memory(GiB)": 91.52, "step": 29835, "token_acc": 0.7340919901935673, "train_speed(iter/s)": 0.181417 }, { "epoch": 0.38719430108119857, "grad_norm": 0.7148298025131226, "learning_rate": 9.446775192972964e-05, "loss": 0.9216906547546386, "memory(GiB)": 91.52, "step": 29840, "token_acc": 0.7596476355460543, "train_speed(iter/s)": 0.181402 }, { "epoch": 0.38725917948285427, "grad_norm": 0.7647616267204285, "learning_rate": 9.446529927376573e-05, "loss": 0.9515569686889649, "memory(GiB)": 91.52, "step": 29845, "token_acc": 0.7257268239166209, "train_speed(iter/s)": 0.181387 }, { "epoch": 0.38732405788450996, "grad_norm": 0.8796681761741638, "learning_rate": 9.446284610609803e-05, "loss": 0.9056337356567383, "memory(GiB)": 91.52, "step": 29850, "token_acc": 0.7767803498851387, "train_speed(iter/s)": 0.181369 }, { "epoch": 0.38738893628616566, "grad_norm": 0.8002785444259644, "learning_rate": 9.446039242675475e-05, "loss": 0.9400306701660156, "memory(GiB)": 91.52, "step": 29855, "token_acc": 0.7452948557089084, "train_speed(iter/s)": 0.181356 }, { "epoch": 0.38745381468782136, "grad_norm": 0.8335821032524109, "learning_rate": 9.445793823576414e-05, "loss": 0.9679177284240723, "memory(GiB)": 91.52, "step": 29860, "token_acc": 0.7582983069489728, "train_speed(iter/s)": 0.181341 }, { "epoch": 0.38751869308947706, "grad_norm": 0.8550769686698914, "learning_rate": 9.445548353315444e-05, "loss": 0.9473589897155762, "memory(GiB)": 91.52, "step": 29865, "token_acc": 0.7597370983446933, "train_speed(iter/s)": 0.181327 }, { "epoch": 0.38758357149113276, "grad_norm": 0.8236908316612244, "learning_rate": 9.445302831895388e-05, "loss": 0.9763500213623046, "memory(GiB)": 91.52, "step": 29870, "token_acc": 0.7351957442742012, "train_speed(iter/s)": 0.181313 }, { "epoch": 0.38764844989278846, "grad_norm": 0.8822179436683655, "learning_rate": 9.445057259319075e-05, "loss": 0.9201866149902344, "memory(GiB)": 91.52, "step": 29875, "token_acc": 0.7548999165971643, "train_speed(iter/s)": 0.181297 }, { "epoch": 0.38771332829444416, "grad_norm": 0.7770354151725769, "learning_rate": 9.444811635589328e-05, "loss": 0.9275177001953125, "memory(GiB)": 91.52, "step": 29880, "token_acc": 0.7574744137319688, "train_speed(iter/s)": 0.181283 }, { "epoch": 0.38777820669609986, "grad_norm": 0.8612209558486938, "learning_rate": 9.444565960708975e-05, "loss": 0.9817996978759765, "memory(GiB)": 91.52, "step": 29885, "token_acc": 0.7332196888983593, "train_speed(iter/s)": 0.181271 }, { "epoch": 0.38784308509775556, "grad_norm": 0.7807677984237671, "learning_rate": 9.444320234680843e-05, "loss": 0.9127933502197265, "memory(GiB)": 91.52, "step": 29890, "token_acc": 0.7525848142164782, "train_speed(iter/s)": 0.181256 }, { "epoch": 0.3879079634994112, "grad_norm": 0.7508009672164917, "learning_rate": 9.444074457507762e-05, "loss": 0.9350584030151368, "memory(GiB)": 91.52, "step": 29895, "token_acc": 0.7490505093529266, "train_speed(iter/s)": 0.18124 }, { "epoch": 0.3879728419010669, "grad_norm": 0.7197054624557495, "learning_rate": 9.443828629192554e-05, "loss": 0.9079395294189453, "memory(GiB)": 91.52, "step": 29900, "token_acc": 0.7574950540252625, "train_speed(iter/s)": 0.181224 }, { "epoch": 0.3880377203027226, "grad_norm": 0.7480284571647644, "learning_rate": 9.443582749738054e-05, "loss": 0.9201744079589844, "memory(GiB)": 91.52, "step": 29905, "token_acc": 0.7689594356261023, "train_speed(iter/s)": 0.181209 }, { "epoch": 0.3881025987043783, "grad_norm": 0.6947422027587891, "learning_rate": 9.44333681914709e-05, "loss": 0.930308723449707, "memory(GiB)": 91.52, "step": 29910, "token_acc": 0.7470680485476613, "train_speed(iter/s)": 0.181194 }, { "epoch": 0.388167477106034, "grad_norm": 0.8032476305961609, "learning_rate": 9.443090837422492e-05, "loss": 0.9204440116882324, "memory(GiB)": 91.52, "step": 29915, "token_acc": 0.753946111217275, "train_speed(iter/s)": 0.181179 }, { "epoch": 0.3882323555076897, "grad_norm": 0.791803240776062, "learning_rate": 9.44284480456709e-05, "loss": 0.9654335975646973, "memory(GiB)": 91.52, "step": 29920, "token_acc": 0.7553159895462105, "train_speed(iter/s)": 0.181166 }, { "epoch": 0.3882972339093454, "grad_norm": 0.851265549659729, "learning_rate": 9.442598720583717e-05, "loss": 0.9302853584289551, "memory(GiB)": 91.52, "step": 29925, "token_acc": 0.7549019607843137, "train_speed(iter/s)": 0.181152 }, { "epoch": 0.3883621123110011, "grad_norm": 0.7808371186256409, "learning_rate": 9.442352585475204e-05, "loss": 0.9664952278137207, "memory(GiB)": 91.52, "step": 29930, "token_acc": 0.7508663214970035, "train_speed(iter/s)": 0.181138 }, { "epoch": 0.3884269907126568, "grad_norm": 0.7062735557556152, "learning_rate": 9.442106399244381e-05, "loss": 0.9390060424804687, "memory(GiB)": 91.52, "step": 29935, "token_acc": 0.7291467769880838, "train_speed(iter/s)": 0.18112 }, { "epoch": 0.3884918691143125, "grad_norm": 0.7517561912536621, "learning_rate": 9.441860161894087e-05, "loss": 0.9357702255249023, "memory(GiB)": 91.52, "step": 29940, "token_acc": 0.7474390288359885, "train_speed(iter/s)": 0.181105 }, { "epoch": 0.3885567475159682, "grad_norm": 0.7509251832962036, "learning_rate": 9.441613873427149e-05, "loss": 0.9342802047729493, "memory(GiB)": 91.52, "step": 29945, "token_acc": 0.7630995236536853, "train_speed(iter/s)": 0.181091 }, { "epoch": 0.3886216259176239, "grad_norm": 0.7793818712234497, "learning_rate": 9.441367533846406e-05, "loss": 0.9680597305297851, "memory(GiB)": 91.52, "step": 29950, "token_acc": 0.7578118659199741, "train_speed(iter/s)": 0.181076 }, { "epoch": 0.3886865043192796, "grad_norm": 0.7139471173286438, "learning_rate": 9.44112114315469e-05, "loss": 0.9002143859863281, "memory(GiB)": 91.52, "step": 29955, "token_acc": 0.7639101094983448, "train_speed(iter/s)": 0.18106 }, { "epoch": 0.3887513827209353, "grad_norm": 0.7638959288597107, "learning_rate": 9.440874701354838e-05, "loss": 0.9678525924682617, "memory(GiB)": 91.52, "step": 29960, "token_acc": 0.7303761125466551, "train_speed(iter/s)": 0.181046 }, { "epoch": 0.388816261122591, "grad_norm": 0.7364798188209534, "learning_rate": 9.440628208449686e-05, "loss": 0.9616512298583985, "memory(GiB)": 91.52, "step": 29965, "token_acc": 0.7478639694708048, "train_speed(iter/s)": 0.18103 }, { "epoch": 0.3888811395242467, "grad_norm": 0.8693231344223022, "learning_rate": 9.440381664442072e-05, "loss": 0.947754955291748, "memory(GiB)": 91.52, "step": 29970, "token_acc": 0.7568207218460324, "train_speed(iter/s)": 0.181016 }, { "epoch": 0.3889460179259024, "grad_norm": 0.7306767702102661, "learning_rate": 9.44013506933483e-05, "loss": 0.922230339050293, "memory(GiB)": 91.52, "step": 29975, "token_acc": 0.7525095496135737, "train_speed(iter/s)": 0.181003 }, { "epoch": 0.3890108963275581, "grad_norm": 0.8101060390472412, "learning_rate": 9.439888423130799e-05, "loss": 0.9917065620422363, "memory(GiB)": 91.52, "step": 29980, "token_acc": 0.743203865589613, "train_speed(iter/s)": 0.18099 }, { "epoch": 0.3890757747292138, "grad_norm": 0.831303596496582, "learning_rate": 9.43964172583282e-05, "loss": 0.9448977470397949, "memory(GiB)": 91.52, "step": 29985, "token_acc": 0.7586158489506563, "train_speed(iter/s)": 0.180974 }, { "epoch": 0.3891406531308695, "grad_norm": 0.7114154696464539, "learning_rate": 9.439394977443727e-05, "loss": 0.9411153793334961, "memory(GiB)": 91.52, "step": 29990, "token_acc": 0.7292154345692063, "train_speed(iter/s)": 0.180961 }, { "epoch": 0.3892055315325252, "grad_norm": 0.8767409920692444, "learning_rate": 9.439148177966363e-05, "loss": 0.9216073989868164, "memory(GiB)": 91.52, "step": 29995, "token_acc": 0.7653795479882436, "train_speed(iter/s)": 0.180947 }, { "epoch": 0.3892704099341809, "grad_norm": 0.7813697457313538, "learning_rate": 9.438901327403569e-05, "loss": 0.9645419120788574, "memory(GiB)": 91.52, "step": 30000, "token_acc": 0.72675050985724, "train_speed(iter/s)": 0.180934 }, { "epoch": 0.3892704099341809, "eval_loss": 0.9416898488998413, "eval_runtime": 1641.7675, "eval_samples_per_second": 30.346, "eval_steps_per_second": 1.897, "eval_token_acc": 0.7487496509686189, "step": 30000 }, { "epoch": 0.3893352883358366, "grad_norm": 0.8297749757766724, "learning_rate": 9.438654425758182e-05, "loss": 0.9738674163818359, "memory(GiB)": 91.52, "step": 30005, "token_acc": 0.7502963796635816, "train_speed(iter/s)": 0.179013 }, { "epoch": 0.3894001667374923, "grad_norm": 0.7709600329399109, "learning_rate": 9.438407473033047e-05, "loss": 0.9482213020324707, "memory(GiB)": 91.52, "step": 30010, "token_acc": 0.7447746883988495, "train_speed(iter/s)": 0.178997 }, { "epoch": 0.3894650451391479, "grad_norm": 0.7726654410362244, "learning_rate": 9.438160469231005e-05, "loss": 0.9286824226379394, "memory(GiB)": 91.52, "step": 30015, "token_acc": 0.7503218777854809, "train_speed(iter/s)": 0.178985 }, { "epoch": 0.3895299235408036, "grad_norm": 0.7331092357635498, "learning_rate": 9.437913414354897e-05, "loss": 0.9203021049499511, "memory(GiB)": 91.52, "step": 30020, "token_acc": 0.7766640104013757, "train_speed(iter/s)": 0.178971 }, { "epoch": 0.3895948019424593, "grad_norm": 0.7438106536865234, "learning_rate": 9.437666308407566e-05, "loss": 0.9608661651611328, "memory(GiB)": 91.52, "step": 30025, "token_acc": 0.7303576687718839, "train_speed(iter/s)": 0.178957 }, { "epoch": 0.389659680344115, "grad_norm": 0.8560203313827515, "learning_rate": 9.437419151391858e-05, "loss": 1.004410743713379, "memory(GiB)": 91.52, "step": 30030, "token_acc": 0.7295736623045483, "train_speed(iter/s)": 0.178942 }, { "epoch": 0.3897245587457707, "grad_norm": 0.846419095993042, "learning_rate": 9.437171943310617e-05, "loss": 0.9699277877807617, "memory(GiB)": 91.52, "step": 30035, "token_acc": 0.7417922344099898, "train_speed(iter/s)": 0.178929 }, { "epoch": 0.3897894371474264, "grad_norm": 0.7622472047805786, "learning_rate": 9.436924684166685e-05, "loss": 0.9102316856384277, "memory(GiB)": 91.52, "step": 30040, "token_acc": 0.7531900349384779, "train_speed(iter/s)": 0.178916 }, { "epoch": 0.3898543155490821, "grad_norm": 0.711487889289856, "learning_rate": 9.43667737396291e-05, "loss": 0.921656608581543, "memory(GiB)": 91.52, "step": 30045, "token_acc": 0.7517146356399265, "train_speed(iter/s)": 0.178903 }, { "epoch": 0.3899191939507378, "grad_norm": 0.7612297534942627, "learning_rate": 9.436430012702137e-05, "loss": 0.9212042808532714, "memory(GiB)": 91.52, "step": 30050, "token_acc": 0.7454693185431165, "train_speed(iter/s)": 0.178887 }, { "epoch": 0.3899840723523935, "grad_norm": 0.709975004196167, "learning_rate": 9.436182600387214e-05, "loss": 0.9065479278564453, "memory(GiB)": 91.52, "step": 30055, "token_acc": 0.7638719058792426, "train_speed(iter/s)": 0.178872 }, { "epoch": 0.3900489507540492, "grad_norm": 0.7717165350914001, "learning_rate": 9.435935137020986e-05, "loss": 1.0030319213867187, "memory(GiB)": 91.52, "step": 30060, "token_acc": 0.7400142146410803, "train_speed(iter/s)": 0.178859 }, { "epoch": 0.3901138291557049, "grad_norm": 0.8060060739517212, "learning_rate": 9.435687622606302e-05, "loss": 1.0172592163085938, "memory(GiB)": 91.52, "step": 30065, "token_acc": 0.7253009805138388, "train_speed(iter/s)": 0.178845 }, { "epoch": 0.3901787075573606, "grad_norm": 0.8145341873168945, "learning_rate": 9.43544005714601e-05, "loss": 0.9680936813354493, "memory(GiB)": 91.52, "step": 30070, "token_acc": 0.7510899980477647, "train_speed(iter/s)": 0.178832 }, { "epoch": 0.3902435859590163, "grad_norm": 0.6950053572654724, "learning_rate": 9.435192440642961e-05, "loss": 0.9351974487304687, "memory(GiB)": 91.52, "step": 30075, "token_acc": 0.748774565944386, "train_speed(iter/s)": 0.178817 }, { "epoch": 0.390308464360672, "grad_norm": 0.7338938117027283, "learning_rate": 9.434944773100001e-05, "loss": 0.9797938346862793, "memory(GiB)": 91.52, "step": 30080, "token_acc": 0.7452127478657189, "train_speed(iter/s)": 0.178804 }, { "epoch": 0.3903733427623277, "grad_norm": 0.864496111869812, "learning_rate": 9.434697054519983e-05, "loss": 1.0366859436035156, "memory(GiB)": 91.52, "step": 30085, "token_acc": 0.7248878035902852, "train_speed(iter/s)": 0.178791 }, { "epoch": 0.3904382211639834, "grad_norm": 0.7248215675354004, "learning_rate": 9.434449284905756e-05, "loss": 0.9178110122680664, "memory(GiB)": 91.52, "step": 30090, "token_acc": 0.7544388134812348, "train_speed(iter/s)": 0.178776 }, { "epoch": 0.3905030995656391, "grad_norm": 0.8515762090682983, "learning_rate": 9.434201464260173e-05, "loss": 0.9990595817565918, "memory(GiB)": 91.52, "step": 30095, "token_acc": 0.7371207569840793, "train_speed(iter/s)": 0.178763 }, { "epoch": 0.3905679779672948, "grad_norm": 0.8712682723999023, "learning_rate": 9.433953592586084e-05, "loss": 0.9412920951843262, "memory(GiB)": 91.52, "step": 30100, "token_acc": 0.7544293098403823, "train_speed(iter/s)": 0.178747 }, { "epoch": 0.3906328563689505, "grad_norm": 0.7382856607437134, "learning_rate": 9.433705669886343e-05, "loss": 0.9201057434082032, "memory(GiB)": 91.52, "step": 30105, "token_acc": 0.7448946041719943, "train_speed(iter/s)": 0.178734 }, { "epoch": 0.3906977347706062, "grad_norm": 0.7683481574058533, "learning_rate": 9.433457696163803e-05, "loss": 0.9058221817016602, "memory(GiB)": 91.52, "step": 30110, "token_acc": 0.7485551170002819, "train_speed(iter/s)": 0.178721 }, { "epoch": 0.3907626131722619, "grad_norm": 0.8508977890014648, "learning_rate": 9.433209671421315e-05, "loss": 0.9369909286499023, "memory(GiB)": 91.52, "step": 30115, "token_acc": 0.7551232252879722, "train_speed(iter/s)": 0.178708 }, { "epoch": 0.3908274915739176, "grad_norm": 0.8087891936302185, "learning_rate": 9.432961595661737e-05, "loss": 1.0000986099243163, "memory(GiB)": 91.52, "step": 30120, "token_acc": 0.734213074192276, "train_speed(iter/s)": 0.178695 }, { "epoch": 0.3908923699755733, "grad_norm": 0.9336360692977905, "learning_rate": 9.432713468887924e-05, "loss": 0.97308349609375, "memory(GiB)": 91.52, "step": 30125, "token_acc": 0.7463151587777112, "train_speed(iter/s)": 0.178681 }, { "epoch": 0.390957248377229, "grad_norm": 0.7305018305778503, "learning_rate": 9.432465291102727e-05, "loss": 0.9160133361816406, "memory(GiB)": 91.52, "step": 30130, "token_acc": 0.7866293160429322, "train_speed(iter/s)": 0.178667 }, { "epoch": 0.39102212677888465, "grad_norm": 0.7452860474586487, "learning_rate": 9.432217062309006e-05, "loss": 0.913994312286377, "memory(GiB)": 91.52, "step": 30135, "token_acc": 0.7335328398172941, "train_speed(iter/s)": 0.178652 }, { "epoch": 0.39108700518054035, "grad_norm": 0.7910617589950562, "learning_rate": 9.431968782509614e-05, "loss": 0.933992576599121, "memory(GiB)": 91.52, "step": 30140, "token_acc": 0.7463996919522526, "train_speed(iter/s)": 0.178639 }, { "epoch": 0.39115188358219605, "grad_norm": 0.838814377784729, "learning_rate": 9.431720451707415e-05, "loss": 1.028378391265869, "memory(GiB)": 91.52, "step": 30145, "token_acc": 0.7452015973206235, "train_speed(iter/s)": 0.178624 }, { "epoch": 0.39121676198385175, "grad_norm": 0.8596357107162476, "learning_rate": 9.431472069905258e-05, "loss": 0.974873161315918, "memory(GiB)": 91.52, "step": 30150, "token_acc": 0.7413810167925835, "train_speed(iter/s)": 0.178611 }, { "epoch": 0.39128164038550745, "grad_norm": 0.8410447239875793, "learning_rate": 9.431223637106008e-05, "loss": 0.9295426368713379, "memory(GiB)": 91.52, "step": 30155, "token_acc": 0.757666908688587, "train_speed(iter/s)": 0.178596 }, { "epoch": 0.39134651878716314, "grad_norm": 0.8249576091766357, "learning_rate": 9.43097515331252e-05, "loss": 0.9866754531860351, "memory(GiB)": 91.52, "step": 30160, "token_acc": 0.7338693861541479, "train_speed(iter/s)": 0.178582 }, { "epoch": 0.39141139718881884, "grad_norm": 0.8014799952507019, "learning_rate": 9.430726618527657e-05, "loss": 0.9685016632080078, "memory(GiB)": 91.52, "step": 30165, "token_acc": 0.75254730713246, "train_speed(iter/s)": 0.178568 }, { "epoch": 0.39147627559047454, "grad_norm": 0.8164180517196655, "learning_rate": 9.430478032754275e-05, "loss": 0.9981258392333985, "memory(GiB)": 91.52, "step": 30170, "token_acc": 0.7412157569668134, "train_speed(iter/s)": 0.178555 }, { "epoch": 0.39154115399213024, "grad_norm": 0.8713433146476746, "learning_rate": 9.430229395995239e-05, "loss": 0.9909355163574218, "memory(GiB)": 91.52, "step": 30175, "token_acc": 0.734164495720134, "train_speed(iter/s)": 0.178544 }, { "epoch": 0.39160603239378594, "grad_norm": 0.8648621439933777, "learning_rate": 9.429980708253408e-05, "loss": 0.9171504020690918, "memory(GiB)": 91.52, "step": 30180, "token_acc": 0.7600910947070834, "train_speed(iter/s)": 0.178532 }, { "epoch": 0.39167091079544164, "grad_norm": 0.83530592918396, "learning_rate": 9.429731969531644e-05, "loss": 0.9580921173095703, "memory(GiB)": 91.52, "step": 30185, "token_acc": 0.730971391887012, "train_speed(iter/s)": 0.178521 }, { "epoch": 0.39173578919709734, "grad_norm": 0.7215760946273804, "learning_rate": 9.429483179832808e-05, "loss": 0.9625627517700195, "memory(GiB)": 91.52, "step": 30190, "token_acc": 0.751738969723445, "train_speed(iter/s)": 0.178508 }, { "epoch": 0.39180066759875304, "grad_norm": 0.8055239915847778, "learning_rate": 9.429234339159765e-05, "loss": 0.9613343238830566, "memory(GiB)": 91.52, "step": 30195, "token_acc": 0.7369281681915515, "train_speed(iter/s)": 0.178494 }, { "epoch": 0.39186554600040874, "grad_norm": 0.7468801736831665, "learning_rate": 9.428985447515379e-05, "loss": 0.9324203491210937, "memory(GiB)": 91.52, "step": 30200, "token_acc": 0.7633802816901408, "train_speed(iter/s)": 0.178478 }, { "epoch": 0.39193042440206444, "grad_norm": 0.8100964426994324, "learning_rate": 9.428736504902513e-05, "loss": 0.916616153717041, "memory(GiB)": 91.52, "step": 30205, "token_acc": 0.758328340323547, "train_speed(iter/s)": 0.178463 }, { "epoch": 0.39199530280372014, "grad_norm": 0.789447009563446, "learning_rate": 9.428487511324033e-05, "loss": 0.9690383911132813, "memory(GiB)": 91.52, "step": 30210, "token_acc": 0.7461484320012512, "train_speed(iter/s)": 0.178452 }, { "epoch": 0.39206018120537584, "grad_norm": 0.6977142095565796, "learning_rate": 9.428238466782804e-05, "loss": 0.9368326187133789, "memory(GiB)": 91.52, "step": 30215, "token_acc": 0.7518167261978076, "train_speed(iter/s)": 0.178439 }, { "epoch": 0.39212505960703153, "grad_norm": 0.7261024713516235, "learning_rate": 9.427989371281691e-05, "loss": 0.9355926513671875, "memory(GiB)": 91.52, "step": 30220, "token_acc": 0.7429231658001155, "train_speed(iter/s)": 0.178426 }, { "epoch": 0.39218993800868723, "grad_norm": 0.883931040763855, "learning_rate": 9.427740224823562e-05, "loss": 0.9645378112792968, "memory(GiB)": 91.52, "step": 30225, "token_acc": 0.7263766372627639, "train_speed(iter/s)": 0.178413 }, { "epoch": 0.39225481641034293, "grad_norm": 0.820020854473114, "learning_rate": 9.427491027411284e-05, "loss": 0.9532516479492188, "memory(GiB)": 91.52, "step": 30230, "token_acc": 0.7505147899983657, "train_speed(iter/s)": 0.178399 }, { "epoch": 0.39231969481199863, "grad_norm": 0.766463577747345, "learning_rate": 9.427241779047723e-05, "loss": 0.9698028564453125, "memory(GiB)": 91.52, "step": 30235, "token_acc": 0.7464688257954059, "train_speed(iter/s)": 0.178386 }, { "epoch": 0.39238457321365433, "grad_norm": 0.7620070576667786, "learning_rate": 9.426992479735748e-05, "loss": 0.922282600402832, "memory(GiB)": 91.52, "step": 30240, "token_acc": 0.7606435734489219, "train_speed(iter/s)": 0.178372 }, { "epoch": 0.39244945161531003, "grad_norm": 0.8618238568305969, "learning_rate": 9.42674312947823e-05, "loss": 0.939658260345459, "memory(GiB)": 91.52, "step": 30245, "token_acc": 0.7467199490752201, "train_speed(iter/s)": 0.178359 }, { "epoch": 0.39251433001696573, "grad_norm": 0.8546565771102905, "learning_rate": 9.426493728278036e-05, "loss": 0.9492410659790039, "memory(GiB)": 91.52, "step": 30250, "token_acc": 0.7681525901307709, "train_speed(iter/s)": 0.178346 }, { "epoch": 0.3925792084186214, "grad_norm": 0.8227539658546448, "learning_rate": 9.426244276138037e-05, "loss": 0.9406885147094727, "memory(GiB)": 91.52, "step": 30255, "token_acc": 0.7495876758957083, "train_speed(iter/s)": 0.178332 }, { "epoch": 0.39264408682027707, "grad_norm": 0.7703717350959778, "learning_rate": 9.425994773061104e-05, "loss": 0.9142898559570313, "memory(GiB)": 91.52, "step": 30260, "token_acc": 0.7708340582483733, "train_speed(iter/s)": 0.178318 }, { "epoch": 0.39270896522193277, "grad_norm": 0.6934529542922974, "learning_rate": 9.425745219050108e-05, "loss": 0.9203983306884765, "memory(GiB)": 91.52, "step": 30265, "token_acc": 0.7448976149512533, "train_speed(iter/s)": 0.178304 }, { "epoch": 0.39277384362358847, "grad_norm": 0.758564293384552, "learning_rate": 9.425495614107922e-05, "loss": 0.9405557632446289, "memory(GiB)": 91.52, "step": 30270, "token_acc": 0.7673748223029537, "train_speed(iter/s)": 0.17829 }, { "epoch": 0.39283872202524417, "grad_norm": 0.8000543117523193, "learning_rate": 9.425245958237417e-05, "loss": 0.9319913864135743, "memory(GiB)": 91.52, "step": 30275, "token_acc": 0.7484765576420843, "train_speed(iter/s)": 0.178277 }, { "epoch": 0.39290360042689987, "grad_norm": 0.7328886389732361, "learning_rate": 9.424996251441465e-05, "loss": 0.930793285369873, "memory(GiB)": 91.52, "step": 30280, "token_acc": 0.7403099215133829, "train_speed(iter/s)": 0.178264 }, { "epoch": 0.39296847882855557, "grad_norm": 0.8616173267364502, "learning_rate": 9.42474649372294e-05, "loss": 0.9532916069030761, "memory(GiB)": 91.52, "step": 30285, "token_acc": 0.7621633362293657, "train_speed(iter/s)": 0.178248 }, { "epoch": 0.39303335723021127, "grad_norm": 0.9537596702575684, "learning_rate": 9.424496685084719e-05, "loss": 0.9279918670654297, "memory(GiB)": 91.52, "step": 30290, "token_acc": 0.7510886195127692, "train_speed(iter/s)": 0.178235 }, { "epoch": 0.39309823563186697, "grad_norm": 0.8289961814880371, "learning_rate": 9.424246825529674e-05, "loss": 0.914466381072998, "memory(GiB)": 91.52, "step": 30295, "token_acc": 0.7466173091651774, "train_speed(iter/s)": 0.17822 }, { "epoch": 0.39316311403352266, "grad_norm": 0.8116927742958069, "learning_rate": 9.423996915060682e-05, "loss": 0.9238981246948242, "memory(GiB)": 91.52, "step": 30300, "token_acc": 0.7641565736352466, "train_speed(iter/s)": 0.178206 }, { "epoch": 0.39322799243517836, "grad_norm": 0.7446622252464294, "learning_rate": 9.423746953680619e-05, "loss": 0.9331951141357422, "memory(GiB)": 91.52, "step": 30305, "token_acc": 0.7657859512308677, "train_speed(iter/s)": 0.178188 }, { "epoch": 0.39329287083683406, "grad_norm": 0.7988547086715698, "learning_rate": 9.42349694139236e-05, "loss": 0.9005505561828613, "memory(GiB)": 91.52, "step": 30310, "token_acc": 0.7618837055051957, "train_speed(iter/s)": 0.178175 }, { "epoch": 0.39335774923848976, "grad_norm": 0.8394191861152649, "learning_rate": 9.423246878198783e-05, "loss": 1.0042957305908202, "memory(GiB)": 91.52, "step": 30315, "token_acc": 0.7383195248363787, "train_speed(iter/s)": 0.178161 }, { "epoch": 0.39342262764014546, "grad_norm": 0.7893999814987183, "learning_rate": 9.422996764102764e-05, "loss": 0.9848066329956054, "memory(GiB)": 91.52, "step": 30320, "token_acc": 0.7522008571759528, "train_speed(iter/s)": 0.178147 }, { "epoch": 0.39348750604180116, "grad_norm": 0.7464848756790161, "learning_rate": 9.422746599107185e-05, "loss": 0.9829133033752442, "memory(GiB)": 91.52, "step": 30325, "token_acc": 0.7284673144876325, "train_speed(iter/s)": 0.178133 }, { "epoch": 0.39355238444345686, "grad_norm": 0.8198950886726379, "learning_rate": 9.422496383214922e-05, "loss": 0.887579345703125, "memory(GiB)": 91.52, "step": 30330, "token_acc": 0.7513162279240253, "train_speed(iter/s)": 0.178119 }, { "epoch": 0.39361726284511256, "grad_norm": 0.8446565270423889, "learning_rate": 9.422246116428854e-05, "loss": 1.0126588821411133, "memory(GiB)": 91.52, "step": 30335, "token_acc": 0.7200166574125486, "train_speed(iter/s)": 0.178105 }, { "epoch": 0.39368214124676826, "grad_norm": 0.7889739274978638, "learning_rate": 9.421995798751863e-05, "loss": 0.9518270492553711, "memory(GiB)": 91.52, "step": 30340, "token_acc": 0.7528761904761905, "train_speed(iter/s)": 0.178094 }, { "epoch": 0.39374701964842396, "grad_norm": 0.6422325372695923, "learning_rate": 9.42174543018683e-05, "loss": 0.9192249298095703, "memory(GiB)": 91.52, "step": 30345, "token_acc": 0.7573191462570044, "train_speed(iter/s)": 0.178079 }, { "epoch": 0.39381189805007966, "grad_norm": 0.754837691783905, "learning_rate": 9.421495010736636e-05, "loss": 0.9151592254638672, "memory(GiB)": 91.52, "step": 30350, "token_acc": 0.7708817861651936, "train_speed(iter/s)": 0.178067 }, { "epoch": 0.39387677645173536, "grad_norm": 0.8835062980651855, "learning_rate": 9.42124454040416e-05, "loss": 0.9297327041625977, "memory(GiB)": 91.52, "step": 30355, "token_acc": 0.7452166802943582, "train_speed(iter/s)": 0.178053 }, { "epoch": 0.39394165485339105, "grad_norm": 0.7393098473548889, "learning_rate": 9.420994019192287e-05, "loss": 0.8984302520751953, "memory(GiB)": 91.52, "step": 30360, "token_acc": 0.7684054753977062, "train_speed(iter/s)": 0.178037 }, { "epoch": 0.39400653325504675, "grad_norm": 0.7904713153839111, "learning_rate": 9.4207434471039e-05, "loss": 0.9185534477233886, "memory(GiB)": 91.52, "step": 30365, "token_acc": 0.7596031799051373, "train_speed(iter/s)": 0.178025 }, { "epoch": 0.39407141165670245, "grad_norm": 0.727323591709137, "learning_rate": 9.420492824141881e-05, "loss": 0.920648193359375, "memory(GiB)": 91.52, "step": 30370, "token_acc": 0.7508076540755467, "train_speed(iter/s)": 0.17801 }, { "epoch": 0.3941362900583581, "grad_norm": 0.789587676525116, "learning_rate": 9.420242150309116e-05, "loss": 0.9424440383911132, "memory(GiB)": 91.52, "step": 30375, "token_acc": 0.7466048249858187, "train_speed(iter/s)": 0.177996 }, { "epoch": 0.3942011684600138, "grad_norm": 0.6606378555297852, "learning_rate": 9.419991425608487e-05, "loss": 0.931204891204834, "memory(GiB)": 91.52, "step": 30380, "token_acc": 0.7496641448066181, "train_speed(iter/s)": 0.177982 }, { "epoch": 0.3942660468616695, "grad_norm": 0.7754583358764648, "learning_rate": 9.419740650042884e-05, "loss": 0.9647841453552246, "memory(GiB)": 91.52, "step": 30385, "token_acc": 0.7426143631601388, "train_speed(iter/s)": 0.17797 }, { "epoch": 0.3943309252633252, "grad_norm": 0.7452601194381714, "learning_rate": 9.419489823615187e-05, "loss": 0.999907398223877, "memory(GiB)": 91.52, "step": 30390, "token_acc": 0.7304665067729622, "train_speed(iter/s)": 0.177955 }, { "epoch": 0.3943958036649809, "grad_norm": 0.802968442440033, "learning_rate": 9.419238946328285e-05, "loss": 0.967683982849121, "memory(GiB)": 91.52, "step": 30395, "token_acc": 0.7285605868960283, "train_speed(iter/s)": 0.177944 }, { "epoch": 0.3944606820666366, "grad_norm": 0.739209771156311, "learning_rate": 9.418988018185069e-05, "loss": 0.9375910758972168, "memory(GiB)": 91.52, "step": 30400, "token_acc": 0.7652455098239078, "train_speed(iter/s)": 0.177931 }, { "epoch": 0.3945255604682923, "grad_norm": 0.6719823479652405, "learning_rate": 9.418737039188422e-05, "loss": 0.9130900382995606, "memory(GiB)": 91.52, "step": 30405, "token_acc": 0.7430221688034188, "train_speed(iter/s)": 0.177917 }, { "epoch": 0.394590438869948, "grad_norm": 0.7665043473243713, "learning_rate": 9.418486009341234e-05, "loss": 0.9587390899658204, "memory(GiB)": 91.52, "step": 30410, "token_acc": 0.7287559223597738, "train_speed(iter/s)": 0.177903 }, { "epoch": 0.3946553172716037, "grad_norm": 0.7402637004852295, "learning_rate": 9.418234928646393e-05, "loss": 0.8489581108093261, "memory(GiB)": 91.52, "step": 30415, "token_acc": 0.7819994004796164, "train_speed(iter/s)": 0.177889 }, { "epoch": 0.3947201956732594, "grad_norm": 0.768267810344696, "learning_rate": 9.417983797106788e-05, "loss": 1.0019168853759766, "memory(GiB)": 91.52, "step": 30420, "token_acc": 0.7347765313071216, "train_speed(iter/s)": 0.177875 }, { "epoch": 0.3947850740749151, "grad_norm": 0.8144748210906982, "learning_rate": 9.41773261472531e-05, "loss": 0.9273857116699219, "memory(GiB)": 91.52, "step": 30425, "token_acc": 0.7556171009517866, "train_speed(iter/s)": 0.177863 }, { "epoch": 0.3948499524765708, "grad_norm": 0.7511162757873535, "learning_rate": 9.41748138150485e-05, "loss": 0.9713700294494629, "memory(GiB)": 91.52, "step": 30430, "token_acc": 0.7283380544022351, "train_speed(iter/s)": 0.177851 }, { "epoch": 0.3949148308782265, "grad_norm": 0.8688735961914062, "learning_rate": 9.417230097448297e-05, "loss": 0.9671670913696289, "memory(GiB)": 91.52, "step": 30435, "token_acc": 0.740394717094402, "train_speed(iter/s)": 0.177839 }, { "epoch": 0.3949797092798822, "grad_norm": 0.8216278553009033, "learning_rate": 9.416978762558547e-05, "loss": 0.9701778411865234, "memory(GiB)": 91.52, "step": 30440, "token_acc": 0.7541862899005756, "train_speed(iter/s)": 0.177823 }, { "epoch": 0.3950445876815379, "grad_norm": 0.7875153422355652, "learning_rate": 9.416727376838488e-05, "loss": 0.9473691940307617, "memory(GiB)": 91.52, "step": 30445, "token_acc": 0.74057808191196, "train_speed(iter/s)": 0.17781 }, { "epoch": 0.3951094660831936, "grad_norm": 0.8819133043289185, "learning_rate": 9.416475940291015e-05, "loss": 0.9683415412902832, "memory(GiB)": 91.52, "step": 30450, "token_acc": 0.741594919942343, "train_speed(iter/s)": 0.177799 }, { "epoch": 0.3951743444848493, "grad_norm": 0.8720438480377197, "learning_rate": 9.41622445291902e-05, "loss": 0.9437600135803222, "memory(GiB)": 91.52, "step": 30455, "token_acc": 0.7551887971992999, "train_speed(iter/s)": 0.177785 }, { "epoch": 0.395239222886505, "grad_norm": 0.8003754615783691, "learning_rate": 9.4159729147254e-05, "loss": 0.948365306854248, "memory(GiB)": 91.52, "step": 30460, "token_acc": 0.758329126703685, "train_speed(iter/s)": 0.177772 }, { "epoch": 0.3953041012881607, "grad_norm": 0.8340771794319153, "learning_rate": 9.415721325713047e-05, "loss": 0.9602515220642089, "memory(GiB)": 91.52, "step": 30465, "token_acc": 0.7448061326519275, "train_speed(iter/s)": 0.177759 }, { "epoch": 0.3953689796898164, "grad_norm": 0.7330270409584045, "learning_rate": 9.415469685884857e-05, "loss": 0.9231382369995117, "memory(GiB)": 91.52, "step": 30470, "token_acc": 0.7519097734500508, "train_speed(iter/s)": 0.177745 }, { "epoch": 0.3954338580914721, "grad_norm": 0.8162025809288025, "learning_rate": 9.415217995243729e-05, "loss": 1.0181836128234862, "memory(GiB)": 91.52, "step": 30475, "token_acc": 0.7430314630936715, "train_speed(iter/s)": 0.177732 }, { "epoch": 0.3954987364931278, "grad_norm": 0.781146764755249, "learning_rate": 9.414966253792552e-05, "loss": 0.9476943016052246, "memory(GiB)": 91.52, "step": 30480, "token_acc": 0.7487748166696476, "train_speed(iter/s)": 0.17772 }, { "epoch": 0.3955636148947835, "grad_norm": 0.6859941482543945, "learning_rate": 9.414714461534229e-05, "loss": 0.9111531257629395, "memory(GiB)": 91.52, "step": 30485, "token_acc": 0.7450059658981564, "train_speed(iter/s)": 0.177705 }, { "epoch": 0.3956284932964392, "grad_norm": 0.8306434154510498, "learning_rate": 9.414462618471657e-05, "loss": 0.9287013053894043, "memory(GiB)": 91.52, "step": 30490, "token_acc": 0.7302491942457354, "train_speed(iter/s)": 0.177693 }, { "epoch": 0.3956933716980948, "grad_norm": 0.8201347589492798, "learning_rate": 9.414210724607733e-05, "loss": 0.9367942810058594, "memory(GiB)": 91.52, "step": 30495, "token_acc": 0.7675456253354804, "train_speed(iter/s)": 0.177679 }, { "epoch": 0.3957582500997505, "grad_norm": 0.765710711479187, "learning_rate": 9.413958779945355e-05, "loss": 0.9699192047119141, "memory(GiB)": 91.52, "step": 30500, "token_acc": 0.749489778269743, "train_speed(iter/s)": 0.177663 }, { "epoch": 0.3958231285014062, "grad_norm": 0.7697431445121765, "learning_rate": 9.413706784487425e-05, "loss": 0.9481828689575196, "memory(GiB)": 91.52, "step": 30505, "token_acc": 0.7371432440667637, "train_speed(iter/s)": 0.17765 }, { "epoch": 0.3958880069030619, "grad_norm": 0.7272100448608398, "learning_rate": 9.41345473823684e-05, "loss": 0.9410039901733398, "memory(GiB)": 91.52, "step": 30510, "token_acc": 0.7568315228392146, "train_speed(iter/s)": 0.177637 }, { "epoch": 0.3959528853047176, "grad_norm": 0.8500612378120422, "learning_rate": 9.413202641196504e-05, "loss": 0.9350221633911133, "memory(GiB)": 91.52, "step": 30515, "token_acc": 0.7495275076457854, "train_speed(iter/s)": 0.177622 }, { "epoch": 0.3960177637063733, "grad_norm": 0.8326915502548218, "learning_rate": 9.412950493369313e-05, "loss": 0.9775102615356446, "memory(GiB)": 91.52, "step": 30520, "token_acc": 0.7158617308163766, "train_speed(iter/s)": 0.17761 }, { "epoch": 0.396082642108029, "grad_norm": 0.8542982935905457, "learning_rate": 9.412698294758173e-05, "loss": 0.9847079277038574, "memory(GiB)": 91.52, "step": 30525, "token_acc": 0.7402952923753534, "train_speed(iter/s)": 0.177597 }, { "epoch": 0.3961475205096847, "grad_norm": 0.8625434041023254, "learning_rate": 9.412446045365986e-05, "loss": 0.9180051803588867, "memory(GiB)": 91.52, "step": 30530, "token_acc": 0.7375415282392026, "train_speed(iter/s)": 0.177584 }, { "epoch": 0.3962123989113404, "grad_norm": 0.7900070548057556, "learning_rate": 9.412193745195655e-05, "loss": 0.955527400970459, "memory(GiB)": 91.52, "step": 30535, "token_acc": 0.7609563034944333, "train_speed(iter/s)": 0.17757 }, { "epoch": 0.3962772773129961, "grad_norm": 0.8125698566436768, "learning_rate": 9.411941394250078e-05, "loss": 0.9422967910766602, "memory(GiB)": 91.52, "step": 30540, "token_acc": 0.7597911227154047, "train_speed(iter/s)": 0.177556 }, { "epoch": 0.3963421557146518, "grad_norm": 0.879607617855072, "learning_rate": 9.411688992532167e-05, "loss": 0.9381586074829101, "memory(GiB)": 91.52, "step": 30545, "token_acc": 0.7426500337154417, "train_speed(iter/s)": 0.177543 }, { "epoch": 0.3964070341163075, "grad_norm": 0.8905418515205383, "learning_rate": 9.411436540044822e-05, "loss": 0.9230961799621582, "memory(GiB)": 91.52, "step": 30550, "token_acc": 0.7371774322808142, "train_speed(iter/s)": 0.177532 }, { "epoch": 0.3964719125179632, "grad_norm": 0.8777288794517517, "learning_rate": 9.41118403679095e-05, "loss": 0.9245415687561035, "memory(GiB)": 91.52, "step": 30555, "token_acc": 0.7549828614954932, "train_speed(iter/s)": 0.17752 }, { "epoch": 0.3965367909196189, "grad_norm": 0.7021977305412292, "learning_rate": 9.410931482773455e-05, "loss": 0.9233568191528321, "memory(GiB)": 91.52, "step": 30560, "token_acc": 0.7530260707635009, "train_speed(iter/s)": 0.177507 }, { "epoch": 0.3966016693212746, "grad_norm": 0.8476443290710449, "learning_rate": 9.410678877995244e-05, "loss": 0.9389541625976563, "memory(GiB)": 91.52, "step": 30565, "token_acc": 0.7514627011214042, "train_speed(iter/s)": 0.177492 }, { "epoch": 0.3966665477229303, "grad_norm": 0.7710409760475159, "learning_rate": 9.410426222459225e-05, "loss": 0.928983497619629, "memory(GiB)": 91.52, "step": 30570, "token_acc": 0.7492031538332494, "train_speed(iter/s)": 0.17748 }, { "epoch": 0.396731426124586, "grad_norm": 0.7675293684005737, "learning_rate": 9.410173516168305e-05, "loss": 0.9944368362426758, "memory(GiB)": 91.52, "step": 30575, "token_acc": 0.7557149097262668, "train_speed(iter/s)": 0.177467 }, { "epoch": 0.3967963045262417, "grad_norm": 0.8554480075836182, "learning_rate": 9.409920759125393e-05, "loss": 0.9940617561340332, "memory(GiB)": 91.52, "step": 30580, "token_acc": 0.732706064770765, "train_speed(iter/s)": 0.177455 }, { "epoch": 0.3968611829278974, "grad_norm": 0.7679525017738342, "learning_rate": 9.409667951333394e-05, "loss": 0.9734947204589843, "memory(GiB)": 91.52, "step": 30585, "token_acc": 0.7481293081204615, "train_speed(iter/s)": 0.177443 }, { "epoch": 0.3969260613295531, "grad_norm": 0.7740088105201721, "learning_rate": 9.409415092795222e-05, "loss": 0.9387435913085938, "memory(GiB)": 91.52, "step": 30590, "token_acc": 0.7410426601366156, "train_speed(iter/s)": 0.177431 }, { "epoch": 0.3969909397312088, "grad_norm": 0.791222095489502, "learning_rate": 9.409162183513784e-05, "loss": 0.9635736465454101, "memory(GiB)": 91.52, "step": 30595, "token_acc": 0.7479064844503938, "train_speed(iter/s)": 0.177417 }, { "epoch": 0.3970558181328645, "grad_norm": 0.7836349606513977, "learning_rate": 9.408909223491992e-05, "loss": 0.9223669052124024, "memory(GiB)": 91.52, "step": 30600, "token_acc": 0.7463389121338913, "train_speed(iter/s)": 0.177403 }, { "epoch": 0.3971206965345202, "grad_norm": 0.8554094433784485, "learning_rate": 9.408656212732756e-05, "loss": 0.944399070739746, "memory(GiB)": 91.52, "step": 30605, "token_acc": 0.7681200960194905, "train_speed(iter/s)": 0.17739 }, { "epoch": 0.39718557493617584, "grad_norm": 0.7526324391365051, "learning_rate": 9.40840315123899e-05, "loss": 0.9024136543273926, "memory(GiB)": 91.52, "step": 30610, "token_acc": 0.7559827856654466, "train_speed(iter/s)": 0.177376 }, { "epoch": 0.39725045333783154, "grad_norm": 0.9861668944358826, "learning_rate": 9.4081500390136e-05, "loss": 0.9960210800170899, "memory(GiB)": 91.52, "step": 30615, "token_acc": 0.7356156040644933, "train_speed(iter/s)": 0.177362 }, { "epoch": 0.39731533173948724, "grad_norm": 0.7334877252578735, "learning_rate": 9.407896876059508e-05, "loss": 0.9332544326782226, "memory(GiB)": 91.52, "step": 30620, "token_acc": 0.7331283379187571, "train_speed(iter/s)": 0.177348 }, { "epoch": 0.39738021014114294, "grad_norm": 0.7802478075027466, "learning_rate": 9.40764366237962e-05, "loss": 1.0018375396728516, "memory(GiB)": 91.52, "step": 30625, "token_acc": 0.7372435831357592, "train_speed(iter/s)": 0.177336 }, { "epoch": 0.39744508854279864, "grad_norm": 0.7610138654708862, "learning_rate": 9.407390397976851e-05, "loss": 0.9084640502929687, "memory(GiB)": 91.52, "step": 30630, "token_acc": 0.7414447403462051, "train_speed(iter/s)": 0.177323 }, { "epoch": 0.39750996694445434, "grad_norm": 0.8421491980552673, "learning_rate": 9.407137082854119e-05, "loss": 0.9672874450683594, "memory(GiB)": 91.52, "step": 30635, "token_acc": 0.7355993491231242, "train_speed(iter/s)": 0.177311 }, { "epoch": 0.39757484534611004, "grad_norm": 0.6647448539733887, "learning_rate": 9.406883717014337e-05, "loss": 0.9271122932434082, "memory(GiB)": 91.52, "step": 30640, "token_acc": 0.7741063489240619, "train_speed(iter/s)": 0.177297 }, { "epoch": 0.39763972374776574, "grad_norm": 0.7714486122131348, "learning_rate": 9.40663030046042e-05, "loss": 0.9399341583251953, "memory(GiB)": 91.52, "step": 30645, "token_acc": 0.7533344941668118, "train_speed(iter/s)": 0.177284 }, { "epoch": 0.39770460214942144, "grad_norm": 0.8252086043357849, "learning_rate": 9.406376833195285e-05, "loss": 0.9555866241455078, "memory(GiB)": 91.52, "step": 30650, "token_acc": 0.7291970566306849, "train_speed(iter/s)": 0.17727 }, { "epoch": 0.39776948055107714, "grad_norm": 0.8121910691261292, "learning_rate": 9.40612331522185e-05, "loss": 0.9312868118286133, "memory(GiB)": 91.52, "step": 30655, "token_acc": 0.7511169098169455, "train_speed(iter/s)": 0.177255 }, { "epoch": 0.39783435895273284, "grad_norm": 0.8796672821044922, "learning_rate": 9.40586974654303e-05, "loss": 0.9591073036193848, "memory(GiB)": 91.52, "step": 30660, "token_acc": 0.7461364307510582, "train_speed(iter/s)": 0.17724 }, { "epoch": 0.39789923735438854, "grad_norm": 0.7101108431816101, "learning_rate": 9.405616127161746e-05, "loss": 0.9500320434570313, "memory(GiB)": 91.52, "step": 30665, "token_acc": 0.7416745416745417, "train_speed(iter/s)": 0.177226 }, { "epoch": 0.39796411575604423, "grad_norm": 0.7686423659324646, "learning_rate": 9.405362457080915e-05, "loss": 0.946535873413086, "memory(GiB)": 91.52, "step": 30670, "token_acc": 0.7494245723172628, "train_speed(iter/s)": 0.177213 }, { "epoch": 0.39802899415769993, "grad_norm": 0.8094658851623535, "learning_rate": 9.405108736303456e-05, "loss": 0.9892595291137696, "memory(GiB)": 91.52, "step": 30675, "token_acc": 0.7465706858628275, "train_speed(iter/s)": 0.177202 }, { "epoch": 0.39809387255935563, "grad_norm": 0.825911819934845, "learning_rate": 9.404854964832289e-05, "loss": 0.9898979187011718, "memory(GiB)": 91.52, "step": 30680, "token_acc": 0.7344401296804304, "train_speed(iter/s)": 0.177188 }, { "epoch": 0.39815875096101133, "grad_norm": 0.8097023367881775, "learning_rate": 9.404601142670335e-05, "loss": 0.9415339469909668, "memory(GiB)": 91.52, "step": 30685, "token_acc": 0.7552032075815565, "train_speed(iter/s)": 0.177174 }, { "epoch": 0.39822362936266703, "grad_norm": 0.7416654229164124, "learning_rate": 9.404347269820514e-05, "loss": 0.9557075500488281, "memory(GiB)": 91.52, "step": 30690, "token_acc": 0.7572012548721362, "train_speed(iter/s)": 0.177161 }, { "epoch": 0.39828850776432273, "grad_norm": 0.7701320052146912, "learning_rate": 9.404093346285747e-05, "loss": 0.8926808357238769, "memory(GiB)": 91.52, "step": 30695, "token_acc": 0.7772538051255367, "train_speed(iter/s)": 0.177148 }, { "epoch": 0.39835338616597843, "grad_norm": 0.8198584914207458, "learning_rate": 9.403839372068959e-05, "loss": 0.8993306159973145, "memory(GiB)": 91.52, "step": 30700, "token_acc": 0.7795631049389801, "train_speed(iter/s)": 0.177135 }, { "epoch": 0.39841826456763413, "grad_norm": 0.8548679351806641, "learning_rate": 9.403585347173071e-05, "loss": 0.9341683387756348, "memory(GiB)": 91.52, "step": 30705, "token_acc": 0.7461538461538462, "train_speed(iter/s)": 0.177122 }, { "epoch": 0.3984831429692898, "grad_norm": 0.9319796562194824, "learning_rate": 9.403331271601005e-05, "loss": 0.981229019165039, "memory(GiB)": 91.52, "step": 30710, "token_acc": 0.7512082853855006, "train_speed(iter/s)": 0.17711 }, { "epoch": 0.3985480213709455, "grad_norm": 0.7869051694869995, "learning_rate": 9.403077145355687e-05, "loss": 0.9728364944458008, "memory(GiB)": 91.52, "step": 30715, "token_acc": 0.7379006053972705, "train_speed(iter/s)": 0.177097 }, { "epoch": 0.3986128997726012, "grad_norm": 0.7797741293907166, "learning_rate": 9.402822968440042e-05, "loss": 0.9315386772155761, "memory(GiB)": 91.52, "step": 30720, "token_acc": 0.7430643571327982, "train_speed(iter/s)": 0.177085 }, { "epoch": 0.3986777781742569, "grad_norm": 0.7547988891601562, "learning_rate": 9.402568740856991e-05, "loss": 0.9165087699890136, "memory(GiB)": 91.52, "step": 30725, "token_acc": 0.774654454621149, "train_speed(iter/s)": 0.177071 }, { "epoch": 0.39874265657591257, "grad_norm": 0.7496618032455444, "learning_rate": 9.402314462609463e-05, "loss": 0.9712339401245117, "memory(GiB)": 91.52, "step": 30730, "token_acc": 0.7482830980541778, "train_speed(iter/s)": 0.177057 }, { "epoch": 0.39880753497756827, "grad_norm": 0.7637626528739929, "learning_rate": 9.402060133700383e-05, "loss": 0.9770730972290039, "memory(GiB)": 91.52, "step": 30735, "token_acc": 0.7267360477146857, "train_speed(iter/s)": 0.177043 }, { "epoch": 0.39887241337922397, "grad_norm": 0.8140337467193604, "learning_rate": 9.40180575413268e-05, "loss": 0.9539621353149415, "memory(GiB)": 91.52, "step": 30740, "token_acc": 0.7450717446694382, "train_speed(iter/s)": 0.177032 }, { "epoch": 0.39893729178087967, "grad_norm": 0.8989642858505249, "learning_rate": 9.401551323909279e-05, "loss": 0.9815258979797363, "memory(GiB)": 91.52, "step": 30745, "token_acc": 0.7401720365200332, "train_speed(iter/s)": 0.177021 }, { "epoch": 0.39900217018253537, "grad_norm": 0.6978453993797302, "learning_rate": 9.401296843033108e-05, "loss": 0.9143900871276855, "memory(GiB)": 91.52, "step": 30750, "token_acc": 0.7738549284439623, "train_speed(iter/s)": 0.177007 }, { "epoch": 0.39906704858419106, "grad_norm": 0.7294329404830933, "learning_rate": 9.401042311507096e-05, "loss": 0.9384281158447265, "memory(GiB)": 91.52, "step": 30755, "token_acc": 0.7487677691263662, "train_speed(iter/s)": 0.176994 }, { "epoch": 0.39913192698584676, "grad_norm": 0.7925856113433838, "learning_rate": 9.400787729334174e-05, "loss": 0.9592299461364746, "memory(GiB)": 91.52, "step": 30760, "token_acc": 0.7364417267234477, "train_speed(iter/s)": 0.17698 }, { "epoch": 0.39919680538750246, "grad_norm": 0.744215726852417, "learning_rate": 9.400533096517269e-05, "loss": 0.9645868301391601, "memory(GiB)": 91.52, "step": 30765, "token_acc": 0.7442678520034537, "train_speed(iter/s)": 0.176968 }, { "epoch": 0.39926168378915816, "grad_norm": 0.7541499733924866, "learning_rate": 9.400278413059311e-05, "loss": 0.9260910034179688, "memory(GiB)": 91.52, "step": 30770, "token_acc": 0.747979319886405, "train_speed(iter/s)": 0.176954 }, { "epoch": 0.39932656219081386, "grad_norm": 0.7761158347129822, "learning_rate": 9.400023678963234e-05, "loss": 0.9758371353149414, "memory(GiB)": 91.52, "step": 30775, "token_acc": 0.7453891954690677, "train_speed(iter/s)": 0.176942 }, { "epoch": 0.39939144059246956, "grad_norm": 0.8396448493003845, "learning_rate": 9.399768894231968e-05, "loss": 0.9596671104431153, "memory(GiB)": 91.52, "step": 30780, "token_acc": 0.7461621684844586, "train_speed(iter/s)": 0.176928 }, { "epoch": 0.39945631899412526, "grad_norm": 0.7487090826034546, "learning_rate": 9.399514058868444e-05, "loss": 0.9349469184875489, "memory(GiB)": 91.52, "step": 30785, "token_acc": 0.7551923731699013, "train_speed(iter/s)": 0.176915 }, { "epoch": 0.39952119739578096, "grad_norm": 0.715721070766449, "learning_rate": 9.399259172875595e-05, "loss": 0.9354207992553711, "memory(GiB)": 91.52, "step": 30790, "token_acc": 0.7433196054017814, "train_speed(iter/s)": 0.176902 }, { "epoch": 0.39958607579743666, "grad_norm": 0.792370080947876, "learning_rate": 9.399004236256354e-05, "loss": 0.8872955322265625, "memory(GiB)": 91.52, "step": 30795, "token_acc": 0.7806436019611905, "train_speed(iter/s)": 0.176888 }, { "epoch": 0.39965095419909236, "grad_norm": 0.7695406079292297, "learning_rate": 9.398749249013657e-05, "loss": 0.9909029006958008, "memory(GiB)": 91.52, "step": 30800, "token_acc": 0.7203371791051131, "train_speed(iter/s)": 0.176877 }, { "epoch": 0.39971583260074806, "grad_norm": 0.8311358690261841, "learning_rate": 9.398494211150437e-05, "loss": 0.9393892288208008, "memory(GiB)": 91.52, "step": 30805, "token_acc": 0.741106986162673, "train_speed(iter/s)": 0.176865 }, { "epoch": 0.39978071100240375, "grad_norm": 0.796950101852417, "learning_rate": 9.398239122669627e-05, "loss": 0.9464838027954101, "memory(GiB)": 91.52, "step": 30810, "token_acc": 0.7565587258219753, "train_speed(iter/s)": 0.176851 }, { "epoch": 0.39984558940405945, "grad_norm": 0.8707072138786316, "learning_rate": 9.397983983574166e-05, "loss": 0.9414796829223633, "memory(GiB)": 91.52, "step": 30815, "token_acc": 0.7504040780803183, "train_speed(iter/s)": 0.176838 }, { "epoch": 0.39991046780571515, "grad_norm": 0.7982200384140015, "learning_rate": 9.397728793866987e-05, "loss": 0.9695978164672852, "memory(GiB)": 91.52, "step": 30820, "token_acc": 0.7432745509553272, "train_speed(iter/s)": 0.176824 }, { "epoch": 0.39997534620737085, "grad_norm": 0.86129230260849, "learning_rate": 9.397473553551028e-05, "loss": 0.9507972717285156, "memory(GiB)": 91.52, "step": 30825, "token_acc": 0.7436403000077322, "train_speed(iter/s)": 0.176811 }, { "epoch": 0.40004022460902655, "grad_norm": 0.8062918186187744, "learning_rate": 9.397218262629226e-05, "loss": 0.924657154083252, "memory(GiB)": 91.52, "step": 30830, "token_acc": 0.7527176542692171, "train_speed(iter/s)": 0.176798 }, { "epoch": 0.40010510301068225, "grad_norm": 0.8138889670372009, "learning_rate": 9.396962921104521e-05, "loss": 1.0052808761596679, "memory(GiB)": 91.52, "step": 30835, "token_acc": 0.7330547462045699, "train_speed(iter/s)": 0.176784 }, { "epoch": 0.40016998141233795, "grad_norm": 0.8209106922149658, "learning_rate": 9.396707528979849e-05, "loss": 0.9902785301208497, "memory(GiB)": 91.52, "step": 30840, "token_acc": 0.7373337394659356, "train_speed(iter/s)": 0.17677 }, { "epoch": 0.40023485981399365, "grad_norm": 0.8203849792480469, "learning_rate": 9.396452086258148e-05, "loss": 0.9127294540405273, "memory(GiB)": 91.52, "step": 30845, "token_acc": 0.7697132616487455, "train_speed(iter/s)": 0.176755 }, { "epoch": 0.4002997382156493, "grad_norm": 0.8012884855270386, "learning_rate": 9.396196592942361e-05, "loss": 0.9561614036560059, "memory(GiB)": 91.52, "step": 30850, "token_acc": 0.721231189338399, "train_speed(iter/s)": 0.17674 }, { "epoch": 0.400364616617305, "grad_norm": 0.7837713956832886, "learning_rate": 9.395941049035427e-05, "loss": 0.9258711814880372, "memory(GiB)": 91.52, "step": 30855, "token_acc": 0.75457925435754, "train_speed(iter/s)": 0.176729 }, { "epoch": 0.4004294950189607, "grad_norm": 0.7156401872634888, "learning_rate": 9.395685454540286e-05, "loss": 0.9568336486816407, "memory(GiB)": 91.52, "step": 30860, "token_acc": 0.7467781016258735, "train_speed(iter/s)": 0.176715 }, { "epoch": 0.4004943734206164, "grad_norm": 0.6492774486541748, "learning_rate": 9.39542980945988e-05, "loss": 0.9319498062133789, "memory(GiB)": 91.52, "step": 30865, "token_acc": 0.7630622208372286, "train_speed(iter/s)": 0.176702 }, { "epoch": 0.4005592518222721, "grad_norm": 0.8131874203681946, "learning_rate": 9.395174113797149e-05, "loss": 0.9279932022094727, "memory(GiB)": 91.52, "step": 30870, "token_acc": 0.7408348208155979, "train_speed(iter/s)": 0.176689 }, { "epoch": 0.4006241302239278, "grad_norm": 0.7907511591911316, "learning_rate": 9.394918367555037e-05, "loss": 0.9040671348571777, "memory(GiB)": 91.52, "step": 30875, "token_acc": 0.7556080999151207, "train_speed(iter/s)": 0.176675 }, { "epoch": 0.4006890086255835, "grad_norm": 0.7692623734474182, "learning_rate": 9.394662570736489e-05, "loss": 0.9002440452575684, "memory(GiB)": 91.52, "step": 30880, "token_acc": 0.7409742965667293, "train_speed(iter/s)": 0.176662 }, { "epoch": 0.4007538870272392, "grad_norm": 0.6836897134780884, "learning_rate": 9.394406723344447e-05, "loss": 0.9578356742858887, "memory(GiB)": 91.52, "step": 30885, "token_acc": 0.7406017242390476, "train_speed(iter/s)": 0.176649 }, { "epoch": 0.4008187654288949, "grad_norm": 0.89786297082901, "learning_rate": 9.394150825381855e-05, "loss": 0.9720458984375, "memory(GiB)": 91.52, "step": 30890, "token_acc": 0.7396998123827392, "train_speed(iter/s)": 0.176636 }, { "epoch": 0.4008836438305506, "grad_norm": 0.7415603995323181, "learning_rate": 9.393894876851657e-05, "loss": 0.9615148544311524, "memory(GiB)": 91.52, "step": 30895, "token_acc": 0.7510895404120443, "train_speed(iter/s)": 0.176623 }, { "epoch": 0.4009485222322063, "grad_norm": 0.7400916218757629, "learning_rate": 9.3936388777568e-05, "loss": 0.8871911048889161, "memory(GiB)": 91.52, "step": 30900, "token_acc": 0.7677613553042875, "train_speed(iter/s)": 0.176609 }, { "epoch": 0.401013400633862, "grad_norm": 0.72299724817276, "learning_rate": 9.39338282810023e-05, "loss": 0.936185359954834, "memory(GiB)": 91.52, "step": 30905, "token_acc": 0.7751136952391917, "train_speed(iter/s)": 0.176595 }, { "epoch": 0.4010782790355177, "grad_norm": 0.7828817367553711, "learning_rate": 9.393126727884893e-05, "loss": 0.9366191864013672, "memory(GiB)": 91.52, "step": 30910, "token_acc": 0.746964002857409, "train_speed(iter/s)": 0.176582 }, { "epoch": 0.4011431574371734, "grad_norm": 0.716863214969635, "learning_rate": 9.392870577113737e-05, "loss": 0.9571820259094238, "memory(GiB)": 91.52, "step": 30915, "token_acc": 0.7591049281448914, "train_speed(iter/s)": 0.17657 }, { "epoch": 0.4012080358388291, "grad_norm": 0.717492938041687, "learning_rate": 9.392614375789707e-05, "loss": 0.9728357315063476, "memory(GiB)": 91.52, "step": 30920, "token_acc": 0.7458859295486088, "train_speed(iter/s)": 0.176555 }, { "epoch": 0.4012729142404848, "grad_norm": 0.6998801231384277, "learning_rate": 9.392358123915757e-05, "loss": 0.9429315567016602, "memory(GiB)": 91.52, "step": 30925, "token_acc": 0.7504795834475199, "train_speed(iter/s)": 0.176543 }, { "epoch": 0.4013377926421405, "grad_norm": 0.733304500579834, "learning_rate": 9.392101821494829e-05, "loss": 0.9153388977050781, "memory(GiB)": 91.52, "step": 30930, "token_acc": 0.7737463882081855, "train_speed(iter/s)": 0.17653 }, { "epoch": 0.4014026710437962, "grad_norm": 0.7648231983184814, "learning_rate": 9.391845468529879e-05, "loss": 0.94601469039917, "memory(GiB)": 91.52, "step": 30935, "token_acc": 0.7552702255251831, "train_speed(iter/s)": 0.176518 }, { "epoch": 0.4014675494454519, "grad_norm": 0.8752191662788391, "learning_rate": 9.391589065023851e-05, "loss": 0.9439638137817383, "memory(GiB)": 91.52, "step": 30940, "token_acc": 0.7659122401847575, "train_speed(iter/s)": 0.176505 }, { "epoch": 0.4015324278471076, "grad_norm": 0.8765441179275513, "learning_rate": 9.3913326109797e-05, "loss": 0.972221565246582, "memory(GiB)": 91.52, "step": 30945, "token_acc": 0.7441479989932042, "train_speed(iter/s)": 0.176493 }, { "epoch": 0.4015973062487633, "grad_norm": 0.7942385673522949, "learning_rate": 9.391076106400376e-05, "loss": 0.9057463645935059, "memory(GiB)": 91.52, "step": 30950, "token_acc": 0.741919617762788, "train_speed(iter/s)": 0.17648 }, { "epoch": 0.401662184650419, "grad_norm": 0.7834344506263733, "learning_rate": 9.39081955128883e-05, "loss": 0.8828664779663086, "memory(GiB)": 91.52, "step": 30955, "token_acc": 0.759847178550901, "train_speed(iter/s)": 0.176466 }, { "epoch": 0.4017270630520747, "grad_norm": 0.780005693435669, "learning_rate": 9.390562945648016e-05, "loss": 0.94912109375, "memory(GiB)": 91.52, "step": 30960, "token_acc": 0.746519356866972, "train_speed(iter/s)": 0.176453 }, { "epoch": 0.4017919414537304, "grad_norm": 0.7970722317695618, "learning_rate": 9.390306289480887e-05, "loss": 0.9460305213928223, "memory(GiB)": 91.52, "step": 30965, "token_acc": 0.7615454663717965, "train_speed(iter/s)": 0.176441 }, { "epoch": 0.401856819855386, "grad_norm": 0.8318986296653748, "learning_rate": 9.390049582790394e-05, "loss": 0.9712095260620117, "memory(GiB)": 91.52, "step": 30970, "token_acc": 0.7419740380465456, "train_speed(iter/s)": 0.176426 }, { "epoch": 0.4019216982570417, "grad_norm": 0.7806443572044373, "learning_rate": 9.389792825579496e-05, "loss": 0.9672733306884765, "memory(GiB)": 91.52, "step": 30975, "token_acc": 0.7631378290175984, "train_speed(iter/s)": 0.176412 }, { "epoch": 0.4019865766586974, "grad_norm": 0.8680114150047302, "learning_rate": 9.389536017851141e-05, "loss": 0.9476130485534668, "memory(GiB)": 91.52, "step": 30980, "token_acc": 0.7636783971230413, "train_speed(iter/s)": 0.176402 }, { "epoch": 0.4020514550603531, "grad_norm": 0.7978131771087646, "learning_rate": 9.38927915960829e-05, "loss": 0.9503168106079102, "memory(GiB)": 91.52, "step": 30985, "token_acc": 0.7715864818338316, "train_speed(iter/s)": 0.17639 }, { "epoch": 0.4021163334620088, "grad_norm": 0.7310302257537842, "learning_rate": 9.389022250853896e-05, "loss": 0.9337385177612305, "memory(GiB)": 91.52, "step": 30990, "token_acc": 0.7555764579414136, "train_speed(iter/s)": 0.176377 }, { "epoch": 0.4021812118636645, "grad_norm": 0.7855218648910522, "learning_rate": 9.388765291590917e-05, "loss": 0.8974869728088379, "memory(GiB)": 91.52, "step": 30995, "token_acc": 0.7705250942848854, "train_speed(iter/s)": 0.176366 }, { "epoch": 0.4022460902653202, "grad_norm": 0.8105162978172302, "learning_rate": 9.388508281822309e-05, "loss": 0.960352897644043, "memory(GiB)": 91.52, "step": 31000, "token_acc": 0.7194367632076845, "train_speed(iter/s)": 0.176355 }, { "epoch": 0.4023109686669759, "grad_norm": 0.7772133946418762, "learning_rate": 9.38825122155103e-05, "loss": 0.9442032814025879, "memory(GiB)": 91.52, "step": 31005, "token_acc": 0.7535622605007627, "train_speed(iter/s)": 0.176342 }, { "epoch": 0.4023758470686316, "grad_norm": 0.7752960920333862, "learning_rate": 9.38799411078004e-05, "loss": 0.9343367576599121, "memory(GiB)": 91.52, "step": 31010, "token_acc": 0.7638122948878439, "train_speed(iter/s)": 0.176329 }, { "epoch": 0.4024407254702873, "grad_norm": 0.771090030670166, "learning_rate": 9.387736949512293e-05, "loss": 0.9403154373168945, "memory(GiB)": 91.52, "step": 31015, "token_acc": 0.7630383370674633, "train_speed(iter/s)": 0.176316 }, { "epoch": 0.402505603871943, "grad_norm": 0.7189018130302429, "learning_rate": 9.387479737750754e-05, "loss": 0.9489608764648437, "memory(GiB)": 91.52, "step": 31020, "token_acc": 0.7205287813643072, "train_speed(iter/s)": 0.176304 }, { "epoch": 0.4025704822735987, "grad_norm": 0.7138764262199402, "learning_rate": 9.38722247549838e-05, "loss": 0.9119282722473144, "memory(GiB)": 91.52, "step": 31025, "token_acc": 0.7560507624465614, "train_speed(iter/s)": 0.176293 }, { "epoch": 0.4026353606752544, "grad_norm": 0.8025005459785461, "learning_rate": 9.386965162758134e-05, "loss": 0.943390941619873, "memory(GiB)": 91.52, "step": 31030, "token_acc": 0.7621995254307232, "train_speed(iter/s)": 0.17628 }, { "epoch": 0.4027002390769101, "grad_norm": 0.7775358557701111, "learning_rate": 9.386707799532973e-05, "loss": 0.9804325103759766, "memory(GiB)": 91.52, "step": 31035, "token_acc": 0.7249507407137123, "train_speed(iter/s)": 0.176267 }, { "epoch": 0.4027651174785658, "grad_norm": 0.8261006474494934, "learning_rate": 9.386450385825862e-05, "loss": 0.9727973937988281, "memory(GiB)": 91.52, "step": 31040, "token_acc": 0.7584096005818535, "train_speed(iter/s)": 0.176255 }, { "epoch": 0.4028299958802215, "grad_norm": 0.7789152264595032, "learning_rate": 9.386192921639761e-05, "loss": 0.9439565658569335, "memory(GiB)": 91.52, "step": 31045, "token_acc": 0.7530571333110592, "train_speed(iter/s)": 0.17624 }, { "epoch": 0.4028948742818772, "grad_norm": 0.883073627948761, "learning_rate": 9.385935406977636e-05, "loss": 0.9711016654968262, "memory(GiB)": 91.52, "step": 31050, "token_acc": 0.7415995040297582, "train_speed(iter/s)": 0.176229 }, { "epoch": 0.4029597526835329, "grad_norm": 0.7869230508804321, "learning_rate": 9.385677841842446e-05, "loss": 0.9568418502807617, "memory(GiB)": 91.52, "step": 31055, "token_acc": 0.7239120262049602, "train_speed(iter/s)": 0.176216 }, { "epoch": 0.4030246310851886, "grad_norm": 0.7983121275901794, "learning_rate": 9.38542022623716e-05, "loss": 0.9467740058898926, "memory(GiB)": 91.52, "step": 31060, "token_acc": 0.7508783930847921, "train_speed(iter/s)": 0.176203 }, { "epoch": 0.4030895094868443, "grad_norm": 0.8495720028877258, "learning_rate": 9.38516256016474e-05, "loss": 0.9463285446166992, "memory(GiB)": 91.52, "step": 31065, "token_acc": 0.747827442422353, "train_speed(iter/s)": 0.17619 }, { "epoch": 0.4031543878885, "grad_norm": 0.7676209807395935, "learning_rate": 9.38490484362815e-05, "loss": 0.9123534202575684, "memory(GiB)": 91.52, "step": 31070, "token_acc": 0.7562653364978479, "train_speed(iter/s)": 0.176178 }, { "epoch": 0.4032192662901557, "grad_norm": 0.6864010691642761, "learning_rate": 9.384647076630358e-05, "loss": 0.8803129196166992, "memory(GiB)": 91.52, "step": 31075, "token_acc": 0.7734089598154201, "train_speed(iter/s)": 0.176165 }, { "epoch": 0.4032841446918114, "grad_norm": 0.7595831155776978, "learning_rate": 9.38438925917433e-05, "loss": 0.9380151748657226, "memory(GiB)": 91.52, "step": 31080, "token_acc": 0.7728251447107184, "train_speed(iter/s)": 0.17615 }, { "epoch": 0.4033490230934671, "grad_norm": 0.8046871423721313, "learning_rate": 9.384131391263033e-05, "loss": 0.9191083908081055, "memory(GiB)": 91.52, "step": 31085, "token_acc": 0.7572621291985687, "train_speed(iter/s)": 0.176139 }, { "epoch": 0.40341390149512274, "grad_norm": 0.7503700852394104, "learning_rate": 9.383873472899434e-05, "loss": 0.9500776290893554, "memory(GiB)": 91.52, "step": 31090, "token_acc": 0.7643957263771334, "train_speed(iter/s)": 0.176126 }, { "epoch": 0.40347877989677844, "grad_norm": 0.8521972298622131, "learning_rate": 9.383615504086501e-05, "loss": 0.9057978630065918, "memory(GiB)": 91.52, "step": 31095, "token_acc": 0.7540763989895124, "train_speed(iter/s)": 0.176113 }, { "epoch": 0.40354365829843414, "grad_norm": 0.7742888331413269, "learning_rate": 9.383357484827202e-05, "loss": 0.9357898712158204, "memory(GiB)": 91.52, "step": 31100, "token_acc": 0.7750257289879932, "train_speed(iter/s)": 0.1761 }, { "epoch": 0.40360853670008984, "grad_norm": 0.784026563167572, "learning_rate": 9.383099415124508e-05, "loss": 0.9490615844726562, "memory(GiB)": 91.52, "step": 31105, "token_acc": 0.7276049382716049, "train_speed(iter/s)": 0.176087 }, { "epoch": 0.40367341510174554, "grad_norm": 0.7303404808044434, "learning_rate": 9.382841294981387e-05, "loss": 0.9315579414367676, "memory(GiB)": 91.52, "step": 31110, "token_acc": 0.7458616955445545, "train_speed(iter/s)": 0.176076 }, { "epoch": 0.40373829350340124, "grad_norm": 0.818168044090271, "learning_rate": 9.38258312440081e-05, "loss": 0.9474908828735351, "memory(GiB)": 91.52, "step": 31115, "token_acc": 0.7552604407733114, "train_speed(iter/s)": 0.176065 }, { "epoch": 0.40380317190505693, "grad_norm": 0.787489652633667, "learning_rate": 9.382324903385751e-05, "loss": 0.9677646636962891, "memory(GiB)": 91.52, "step": 31120, "token_acc": 0.7607472428539275, "train_speed(iter/s)": 0.176054 }, { "epoch": 0.40386805030671263, "grad_norm": 0.7769078016281128, "learning_rate": 9.382066631939179e-05, "loss": 0.9813800811767578, "memory(GiB)": 91.52, "step": 31125, "token_acc": 0.7463142218912882, "train_speed(iter/s)": 0.176042 }, { "epoch": 0.40393292870836833, "grad_norm": 0.8705115914344788, "learning_rate": 9.381808310064064e-05, "loss": 0.9587225914001465, "memory(GiB)": 91.52, "step": 31130, "token_acc": 0.7457223233493298, "train_speed(iter/s)": 0.17603 }, { "epoch": 0.40399780711002403, "grad_norm": 0.7634389996528625, "learning_rate": 9.381549937763383e-05, "loss": 0.8963766098022461, "memory(GiB)": 91.52, "step": 31135, "token_acc": 0.777077497665733, "train_speed(iter/s)": 0.176018 }, { "epoch": 0.40406268551167973, "grad_norm": 0.8301726579666138, "learning_rate": 9.381291515040105e-05, "loss": 0.9167649269104003, "memory(GiB)": 91.52, "step": 31140, "token_acc": 0.7682204124068619, "train_speed(iter/s)": 0.176006 }, { "epoch": 0.40412756391333543, "grad_norm": 0.9046823382377625, "learning_rate": 9.381033041897207e-05, "loss": 0.9690721511840821, "memory(GiB)": 91.52, "step": 31145, "token_acc": 0.7554039523933114, "train_speed(iter/s)": 0.175994 }, { "epoch": 0.40419244231499113, "grad_norm": 0.7261375188827515, "learning_rate": 9.380774518337665e-05, "loss": 0.9662343978881835, "memory(GiB)": 91.52, "step": 31150, "token_acc": 0.7411227998502026, "train_speed(iter/s)": 0.17598 }, { "epoch": 0.40425732071664683, "grad_norm": 0.7301857471466064, "learning_rate": 9.38051594436445e-05, "loss": 0.940888500213623, "memory(GiB)": 91.52, "step": 31155, "token_acc": 0.7379558579226327, "train_speed(iter/s)": 0.175969 }, { "epoch": 0.4043221991183025, "grad_norm": 0.7229544520378113, "learning_rate": 9.380257319980539e-05, "loss": 0.9233283996582031, "memory(GiB)": 91.52, "step": 31160, "token_acc": 0.7565384227001919, "train_speed(iter/s)": 0.175956 }, { "epoch": 0.4043870775199582, "grad_norm": 0.758267343044281, "learning_rate": 9.37999864518891e-05, "loss": 0.9196565628051758, "memory(GiB)": 91.52, "step": 31165, "token_acc": 0.769715260464506, "train_speed(iter/s)": 0.175942 }, { "epoch": 0.4044519559216139, "grad_norm": 0.7475209832191467, "learning_rate": 9.379739919992536e-05, "loss": 0.9137277603149414, "memory(GiB)": 91.52, "step": 31170, "token_acc": 0.7519932432432432, "train_speed(iter/s)": 0.175931 }, { "epoch": 0.4045168343232696, "grad_norm": 0.7749026417732239, "learning_rate": 9.3794811443944e-05, "loss": 0.9346601486206054, "memory(GiB)": 91.52, "step": 31175, "token_acc": 0.7310767847393943, "train_speed(iter/s)": 0.175919 }, { "epoch": 0.4045817127249253, "grad_norm": 0.9164412617683411, "learning_rate": 9.379222318397475e-05, "loss": 0.9906959533691406, "memory(GiB)": 91.52, "step": 31180, "token_acc": 0.7431163708086785, "train_speed(iter/s)": 0.17591 }, { "epoch": 0.404646591126581, "grad_norm": 0.6684076189994812, "learning_rate": 9.378963442004743e-05, "loss": 0.9310531616210938, "memory(GiB)": 91.52, "step": 31185, "token_acc": 0.754653130287648, "train_speed(iter/s)": 0.175897 }, { "epoch": 0.4047114695282367, "grad_norm": 0.7910862565040588, "learning_rate": 9.378704515219179e-05, "loss": 0.9734269142150879, "memory(GiB)": 91.52, "step": 31190, "token_acc": 0.7367609782832089, "train_speed(iter/s)": 0.175884 }, { "epoch": 0.4047763479298924, "grad_norm": 0.8672221899032593, "learning_rate": 9.378445538043766e-05, "loss": 0.9751958847045898, "memory(GiB)": 91.52, "step": 31195, "token_acc": 0.7554530724168017, "train_speed(iter/s)": 0.175873 }, { "epoch": 0.4048412263315481, "grad_norm": 0.8096677660942078, "learning_rate": 9.378186510481485e-05, "loss": 0.9029541015625, "memory(GiB)": 91.52, "step": 31200, "token_acc": 0.7451021624084394, "train_speed(iter/s)": 0.175862 }, { "epoch": 0.4049061047332038, "grad_norm": 0.792488694190979, "learning_rate": 9.377927432535313e-05, "loss": 0.9765661239624024, "memory(GiB)": 91.52, "step": 31205, "token_acc": 0.7326339002305456, "train_speed(iter/s)": 0.17585 }, { "epoch": 0.40497098313485946, "grad_norm": 0.7514455318450928, "learning_rate": 9.377668304208236e-05, "loss": 0.9166780471801758, "memory(GiB)": 91.52, "step": 31210, "token_acc": 0.7693124314377721, "train_speed(iter/s)": 0.175836 }, { "epoch": 0.40503586153651516, "grad_norm": 0.682257354259491, "learning_rate": 9.377409125503234e-05, "loss": 0.9244400978088378, "memory(GiB)": 91.52, "step": 31215, "token_acc": 0.7435535858178888, "train_speed(iter/s)": 0.175824 }, { "epoch": 0.40510073993817086, "grad_norm": 0.811083197593689, "learning_rate": 9.377149896423288e-05, "loss": 0.8619806289672851, "memory(GiB)": 91.52, "step": 31220, "token_acc": 0.7640791476407914, "train_speed(iter/s)": 0.17581 }, { "epoch": 0.40516561833982656, "grad_norm": 0.7823031544685364, "learning_rate": 9.376890616971384e-05, "loss": 0.907752799987793, "memory(GiB)": 91.52, "step": 31225, "token_acc": 0.7585888863269541, "train_speed(iter/s)": 0.175798 }, { "epoch": 0.40523049674148226, "grad_norm": 0.8267873525619507, "learning_rate": 9.376631287150504e-05, "loss": 0.9597522735595703, "memory(GiB)": 91.52, "step": 31230, "token_acc": 0.7454677521866772, "train_speed(iter/s)": 0.175784 }, { "epoch": 0.40529537514313796, "grad_norm": 0.8421109318733215, "learning_rate": 9.376371906963633e-05, "loss": 0.9968573570251464, "memory(GiB)": 91.52, "step": 31235, "token_acc": 0.7386961307946345, "train_speed(iter/s)": 0.175771 }, { "epoch": 0.40536025354479366, "grad_norm": 0.7214899659156799, "learning_rate": 9.376112476413756e-05, "loss": 0.920074462890625, "memory(GiB)": 91.52, "step": 31240, "token_acc": 0.7598873004278409, "train_speed(iter/s)": 0.175758 }, { "epoch": 0.40542513194644936, "grad_norm": 0.7216980457305908, "learning_rate": 9.375852995503857e-05, "loss": 0.9423067092895507, "memory(GiB)": 91.52, "step": 31245, "token_acc": 0.7448590494348359, "train_speed(iter/s)": 0.175746 }, { "epoch": 0.40549001034810506, "grad_norm": 0.8126335740089417, "learning_rate": 9.375593464236925e-05, "loss": 0.9369594573974609, "memory(GiB)": 91.52, "step": 31250, "token_acc": 0.7437341700374248, "train_speed(iter/s)": 0.175732 }, { "epoch": 0.40555488874976076, "grad_norm": 0.7535523176193237, "learning_rate": 9.375333882615944e-05, "loss": 0.9507102966308594, "memory(GiB)": 91.52, "step": 31255, "token_acc": 0.7524329106458272, "train_speed(iter/s)": 0.175719 }, { "epoch": 0.40561976715141645, "grad_norm": 0.6296020150184631, "learning_rate": 9.375074250643903e-05, "loss": 0.9419272422790528, "memory(GiB)": 91.52, "step": 31260, "token_acc": 0.7526157423237366, "train_speed(iter/s)": 0.175706 }, { "epoch": 0.40568464555307215, "grad_norm": 0.7577686309814453, "learning_rate": 9.374814568323789e-05, "loss": 0.9620210647583007, "memory(GiB)": 91.52, "step": 31265, "token_acc": 0.7158445247933884, "train_speed(iter/s)": 0.175694 }, { "epoch": 0.40574952395472785, "grad_norm": 0.7645248174667358, "learning_rate": 9.374554835658589e-05, "loss": 0.9588852882385254, "memory(GiB)": 91.52, "step": 31270, "token_acc": 0.7436574861568528, "train_speed(iter/s)": 0.175683 }, { "epoch": 0.40581440235638355, "grad_norm": 0.879701554775238, "learning_rate": 9.374295052651295e-05, "loss": 0.9590284347534179, "memory(GiB)": 91.52, "step": 31275, "token_acc": 0.7356136479375318, "train_speed(iter/s)": 0.175673 }, { "epoch": 0.40587928075803925, "grad_norm": 0.8443830609321594, "learning_rate": 9.374035219304896e-05, "loss": 0.9267230987548828, "memory(GiB)": 91.52, "step": 31280, "token_acc": 0.761771702601669, "train_speed(iter/s)": 0.175662 }, { "epoch": 0.40594415915969495, "grad_norm": 0.7492393255233765, "learning_rate": 9.373775335622379e-05, "loss": 0.9734546661376953, "memory(GiB)": 91.52, "step": 31285, "token_acc": 0.7470732465379701, "train_speed(iter/s)": 0.175648 }, { "epoch": 0.40600903756135065, "grad_norm": 0.7874253988265991, "learning_rate": 9.373515401606738e-05, "loss": 0.9507036209106445, "memory(GiB)": 91.52, "step": 31290, "token_acc": 0.7520488168304538, "train_speed(iter/s)": 0.175634 }, { "epoch": 0.40607391596300635, "grad_norm": 0.7719106674194336, "learning_rate": 9.373255417260962e-05, "loss": 0.9522899627685547, "memory(GiB)": 91.52, "step": 31295, "token_acc": 0.7552955835525405, "train_speed(iter/s)": 0.175623 }, { "epoch": 0.40613879436466205, "grad_norm": 0.7595130205154419, "learning_rate": 9.372995382588046e-05, "loss": 0.9620114326477051, "memory(GiB)": 91.52, "step": 31300, "token_acc": 0.7534934103628814, "train_speed(iter/s)": 0.17561 }, { "epoch": 0.40620367276631775, "grad_norm": 0.8410561084747314, "learning_rate": 9.37273529759098e-05, "loss": 0.9193634986877441, "memory(GiB)": 91.52, "step": 31305, "token_acc": 0.7577263124951099, "train_speed(iter/s)": 0.1756 }, { "epoch": 0.40626855116797345, "grad_norm": 0.8004106283187866, "learning_rate": 9.372475162272757e-05, "loss": 0.9158472061157227, "memory(GiB)": 91.52, "step": 31310, "token_acc": 0.7531150491485532, "train_speed(iter/s)": 0.175586 }, { "epoch": 0.40633342956962915, "grad_norm": 0.6976600885391235, "learning_rate": 9.372214976636371e-05, "loss": 0.9721761703491211, "memory(GiB)": 91.52, "step": 31315, "token_acc": 0.7569301666075859, "train_speed(iter/s)": 0.175574 }, { "epoch": 0.40639830797128484, "grad_norm": 0.7799521684646606, "learning_rate": 9.371954740684818e-05, "loss": 0.9856545448303222, "memory(GiB)": 91.52, "step": 31320, "token_acc": 0.7308648200842849, "train_speed(iter/s)": 0.175561 }, { "epoch": 0.40646318637294054, "grad_norm": 0.8294139504432678, "learning_rate": 9.37169445442109e-05, "loss": 0.9269322395324707, "memory(GiB)": 91.52, "step": 31325, "token_acc": 0.7410089307265266, "train_speed(iter/s)": 0.17555 }, { "epoch": 0.4065280647745962, "grad_norm": 0.8142476081848145, "learning_rate": 9.371434117848183e-05, "loss": 0.9500415802001954, "memory(GiB)": 91.52, "step": 31330, "token_acc": 0.7424997588346892, "train_speed(iter/s)": 0.175539 }, { "epoch": 0.4065929431762519, "grad_norm": 0.8266568183898926, "learning_rate": 9.371173730969093e-05, "loss": 0.9369604110717773, "memory(GiB)": 91.52, "step": 31335, "token_acc": 0.7603812335536871, "train_speed(iter/s)": 0.175526 }, { "epoch": 0.4066578215779076, "grad_norm": 0.8317412734031677, "learning_rate": 9.37091329378682e-05, "loss": 0.9163267135620117, "memory(GiB)": 91.52, "step": 31340, "token_acc": 0.7679554088117301, "train_speed(iter/s)": 0.175514 }, { "epoch": 0.4067226999795633, "grad_norm": 0.7762434482574463, "learning_rate": 9.370652806304357e-05, "loss": 0.9655950546264649, "memory(GiB)": 91.52, "step": 31345, "token_acc": 0.7554453899601843, "train_speed(iter/s)": 0.175503 }, { "epoch": 0.406787578381219, "grad_norm": 0.8165576457977295, "learning_rate": 9.370392268524701e-05, "loss": 0.9271703720092773, "memory(GiB)": 91.52, "step": 31350, "token_acc": 0.7626469878271664, "train_speed(iter/s)": 0.17549 }, { "epoch": 0.4068524567828747, "grad_norm": 0.8338214159011841, "learning_rate": 9.370131680450855e-05, "loss": 0.9856126785278321, "memory(GiB)": 91.52, "step": 31355, "token_acc": 0.7432102213223487, "train_speed(iter/s)": 0.175478 }, { "epoch": 0.4069173351845304, "grad_norm": 0.7271679043769836, "learning_rate": 9.369871042085812e-05, "loss": 0.9346115112304687, "memory(GiB)": 91.52, "step": 31360, "token_acc": 0.7291567087743257, "train_speed(iter/s)": 0.175467 }, { "epoch": 0.4069822135861861, "grad_norm": 0.7036030292510986, "learning_rate": 9.369610353432576e-05, "loss": 0.9341917991638183, "memory(GiB)": 91.52, "step": 31365, "token_acc": 0.7347131714893473, "train_speed(iter/s)": 0.175453 }, { "epoch": 0.4070470919878418, "grad_norm": 0.8350387811660767, "learning_rate": 9.369349614494145e-05, "loss": 0.9783109664916992, "memory(GiB)": 91.52, "step": 31370, "token_acc": 0.7425269181537535, "train_speed(iter/s)": 0.17544 }, { "epoch": 0.4071119703894975, "grad_norm": 0.782568097114563, "learning_rate": 9.369088825273519e-05, "loss": 0.8958854675292969, "memory(GiB)": 91.52, "step": 31375, "token_acc": 0.780897699174385, "train_speed(iter/s)": 0.175427 }, { "epoch": 0.4071768487911532, "grad_norm": 0.7445340752601624, "learning_rate": 9.368827985773702e-05, "loss": 0.9741360664367675, "memory(GiB)": 91.52, "step": 31380, "token_acc": 0.7317511463626224, "train_speed(iter/s)": 0.175413 }, { "epoch": 0.4072417271928089, "grad_norm": 0.7928206324577332, "learning_rate": 9.368567095997692e-05, "loss": 0.9661958694458008, "memory(GiB)": 91.52, "step": 31385, "token_acc": 0.7414159113117903, "train_speed(iter/s)": 0.175402 }, { "epoch": 0.4073066055944646, "grad_norm": 0.7267639636993408, "learning_rate": 9.368306155948493e-05, "loss": 0.9860944747924805, "memory(GiB)": 91.52, "step": 31390, "token_acc": 0.7449601160261059, "train_speed(iter/s)": 0.175388 }, { "epoch": 0.4073714839961203, "grad_norm": 0.7404069900512695, "learning_rate": 9.368045165629109e-05, "loss": 0.9416999816894531, "memory(GiB)": 91.52, "step": 31395, "token_acc": 0.7426079302204617, "train_speed(iter/s)": 0.175376 }, { "epoch": 0.407436362397776, "grad_norm": 0.8179210424423218, "learning_rate": 9.36778412504254e-05, "loss": 0.9281782150268555, "memory(GiB)": 91.52, "step": 31400, "token_acc": 0.7474319318745942, "train_speed(iter/s)": 0.175364 }, { "epoch": 0.4075012407994317, "grad_norm": 0.7641791701316833, "learning_rate": 9.367523034191796e-05, "loss": 0.9883798599243164, "memory(GiB)": 91.52, "step": 31405, "token_acc": 0.7287216288384513, "train_speed(iter/s)": 0.175353 }, { "epoch": 0.4075661192010874, "grad_norm": 1.191457748413086, "learning_rate": 9.367261893079875e-05, "loss": 0.9472255706787109, "memory(GiB)": 91.52, "step": 31410, "token_acc": 0.7206438068579426, "train_speed(iter/s)": 0.175339 }, { "epoch": 0.4076309976027431, "grad_norm": 0.839061975479126, "learning_rate": 9.367000701709787e-05, "loss": 0.9662430763244629, "memory(GiB)": 91.52, "step": 31415, "token_acc": 0.7370399070127858, "train_speed(iter/s)": 0.175327 }, { "epoch": 0.40769587600439877, "grad_norm": 0.7894713878631592, "learning_rate": 9.366739460084535e-05, "loss": 0.948310375213623, "memory(GiB)": 91.52, "step": 31420, "token_acc": 0.7501019714479945, "train_speed(iter/s)": 0.175314 }, { "epoch": 0.40776075440605447, "grad_norm": 0.6875790357589722, "learning_rate": 9.366478168207126e-05, "loss": 0.9706798553466797, "memory(GiB)": 91.52, "step": 31425, "token_acc": 0.7487816843826656, "train_speed(iter/s)": 0.175302 }, { "epoch": 0.40782563280771017, "grad_norm": 0.7769115567207336, "learning_rate": 9.366216826080568e-05, "loss": 0.9692638397216797, "memory(GiB)": 91.52, "step": 31430, "token_acc": 0.7279045972550542, "train_speed(iter/s)": 0.175288 }, { "epoch": 0.40789051120936587, "grad_norm": 0.8105557560920715, "learning_rate": 9.365955433707867e-05, "loss": 0.9083111763000489, "memory(GiB)": 91.52, "step": 31435, "token_acc": 0.7562871114215858, "train_speed(iter/s)": 0.175274 }, { "epoch": 0.40795538961102157, "grad_norm": 0.8537291884422302, "learning_rate": 9.365693991092033e-05, "loss": 0.915153694152832, "memory(GiB)": 91.52, "step": 31440, "token_acc": 0.7537797710053341, "train_speed(iter/s)": 0.175262 }, { "epoch": 0.40802026801267727, "grad_norm": 0.9240668416023254, "learning_rate": 9.365432498236073e-05, "loss": 0.9870978355407715, "memory(GiB)": 91.52, "step": 31445, "token_acc": 0.7333741830065359, "train_speed(iter/s)": 0.17525 }, { "epoch": 0.4080851464143329, "grad_norm": 0.7928386926651001, "learning_rate": 9.365170955142997e-05, "loss": 0.8996661186218262, "memory(GiB)": 91.52, "step": 31450, "token_acc": 0.7572502262621493, "train_speed(iter/s)": 0.175238 }, { "epoch": 0.4081500248159886, "grad_norm": 0.986099123954773, "learning_rate": 9.364909361815814e-05, "loss": 0.9247732162475586, "memory(GiB)": 91.52, "step": 31455, "token_acc": 0.7681664156626506, "train_speed(iter/s)": 0.175226 }, { "epoch": 0.4082149032176443, "grad_norm": 0.6868710517883301, "learning_rate": 9.364647718257536e-05, "loss": 0.9084585189819336, "memory(GiB)": 91.52, "step": 31460, "token_acc": 0.7450667880995749, "train_speed(iter/s)": 0.175214 }, { "epoch": 0.4082797816193, "grad_norm": 0.7199627757072449, "learning_rate": 9.364386024471173e-05, "loss": 0.9222372055053711, "memory(GiB)": 91.52, "step": 31465, "token_acc": 0.7565832841827707, "train_speed(iter/s)": 0.175203 }, { "epoch": 0.4083446600209557, "grad_norm": 0.7372714281082153, "learning_rate": 9.364124280459736e-05, "loss": 0.9458662033081054, "memory(GiB)": 91.52, "step": 31470, "token_acc": 0.7503471740036106, "train_speed(iter/s)": 0.17519 }, { "epoch": 0.4084095384226114, "grad_norm": 0.6954222321510315, "learning_rate": 9.363862486226236e-05, "loss": 0.9173864364624024, "memory(GiB)": 91.52, "step": 31475, "token_acc": 0.7439087996423516, "train_speed(iter/s)": 0.175179 }, { "epoch": 0.4084744168242671, "grad_norm": 0.7637940049171448, "learning_rate": 9.363600641773689e-05, "loss": 0.9374167442321777, "memory(GiB)": 91.52, "step": 31480, "token_acc": 0.7484778420038536, "train_speed(iter/s)": 0.175167 }, { "epoch": 0.4085392952259228, "grad_norm": 0.7629547119140625, "learning_rate": 9.363338747105107e-05, "loss": 0.9693747520446777, "memory(GiB)": 91.52, "step": 31485, "token_acc": 0.7310692645404738, "train_speed(iter/s)": 0.175156 }, { "epoch": 0.4086041736275785, "grad_norm": 0.8301675319671631, "learning_rate": 9.363076802223503e-05, "loss": 0.9906142234802247, "memory(GiB)": 91.52, "step": 31490, "token_acc": 0.7411091234347048, "train_speed(iter/s)": 0.175145 }, { "epoch": 0.4086690520292342, "grad_norm": 0.804513692855835, "learning_rate": 9.362814807131892e-05, "loss": 0.9602975845336914, "memory(GiB)": 91.52, "step": 31495, "token_acc": 0.7425090288526411, "train_speed(iter/s)": 0.175134 }, { "epoch": 0.4087339304308899, "grad_norm": 0.7658482789993286, "learning_rate": 9.36255276183329e-05, "loss": 0.9118203163146973, "memory(GiB)": 91.52, "step": 31500, "token_acc": 0.7606592400427286, "train_speed(iter/s)": 0.175119 }, { "epoch": 0.4087988088325456, "grad_norm": 0.7314141988754272, "learning_rate": 9.36229066633071e-05, "loss": 0.9770947456359863, "memory(GiB)": 91.52, "step": 31505, "token_acc": 0.7265032392486246, "train_speed(iter/s)": 0.175109 }, { "epoch": 0.4088636872342013, "grad_norm": 0.83194899559021, "learning_rate": 9.36202852062717e-05, "loss": 0.9036511421203614, "memory(GiB)": 91.52, "step": 31510, "token_acc": 0.7619638426090038, "train_speed(iter/s)": 0.175095 }, { "epoch": 0.408928565635857, "grad_norm": 0.7581440210342407, "learning_rate": 9.361766324725687e-05, "loss": 0.9309793472290039, "memory(GiB)": 91.52, "step": 31515, "token_acc": 0.7588251681458192, "train_speed(iter/s)": 0.175085 }, { "epoch": 0.4089934440375127, "grad_norm": 0.8233210444450378, "learning_rate": 9.361504078629278e-05, "loss": 0.9348659515380859, "memory(GiB)": 91.52, "step": 31520, "token_acc": 0.7468618494407079, "train_speed(iter/s)": 0.175073 }, { "epoch": 0.4090583224391684, "grad_norm": 0.8046860694885254, "learning_rate": 9.361241782340959e-05, "loss": 0.928407096862793, "memory(GiB)": 91.52, "step": 31525, "token_acc": 0.7542194785578249, "train_speed(iter/s)": 0.175059 }, { "epoch": 0.4091232008408241, "grad_norm": 0.8097534775733948, "learning_rate": 9.360979435863752e-05, "loss": 0.9531196594238281, "memory(GiB)": 91.52, "step": 31530, "token_acc": 0.7707218334019487, "train_speed(iter/s)": 0.175046 }, { "epoch": 0.4091880792424798, "grad_norm": 0.7035810947418213, "learning_rate": 9.360717039200675e-05, "loss": 0.9396937370300293, "memory(GiB)": 91.52, "step": 31535, "token_acc": 0.7330995451546821, "train_speed(iter/s)": 0.175034 }, { "epoch": 0.4092529576441355, "grad_norm": 0.7637670636177063, "learning_rate": 9.360454592354746e-05, "loss": 0.9783988952636719, "memory(GiB)": 91.52, "step": 31540, "token_acc": 0.7492729676140119, "train_speed(iter/s)": 0.175023 }, { "epoch": 0.4093178360457912, "grad_norm": 0.8188023567199707, "learning_rate": 9.360192095328985e-05, "loss": 0.9618257522583008, "memory(GiB)": 91.52, "step": 31545, "token_acc": 0.7372080772607551, "train_speed(iter/s)": 0.17501 }, { "epoch": 0.4093827144474469, "grad_norm": 0.8224780559539795, "learning_rate": 9.359929548126415e-05, "loss": 0.9812681198120117, "memory(GiB)": 91.52, "step": 31550, "token_acc": 0.7372248876703983, "train_speed(iter/s)": 0.175 }, { "epoch": 0.4094475928491026, "grad_norm": 0.7351405620574951, "learning_rate": 9.359666950750058e-05, "loss": 0.9003173828125, "memory(GiB)": 91.52, "step": 31555, "token_acc": 0.7690405724347207, "train_speed(iter/s)": 0.174989 }, { "epoch": 0.4095124712507583, "grad_norm": 0.7657321691513062, "learning_rate": 9.359404303202933e-05, "loss": 0.9186503410339355, "memory(GiB)": 91.52, "step": 31560, "token_acc": 0.7503450132956343, "train_speed(iter/s)": 0.174975 }, { "epoch": 0.40957734965241394, "grad_norm": 0.6652483344078064, "learning_rate": 9.359141605488063e-05, "loss": 0.9145872116088867, "memory(GiB)": 91.52, "step": 31565, "token_acc": 0.7663516351635163, "train_speed(iter/s)": 0.174962 }, { "epoch": 0.40964222805406963, "grad_norm": 0.7175774574279785, "learning_rate": 9.358878857608473e-05, "loss": 0.9155398368835449, "memory(GiB)": 91.52, "step": 31570, "token_acc": 0.7518173744284727, "train_speed(iter/s)": 0.17495 }, { "epoch": 0.40970710645572533, "grad_norm": 1.1822668313980103, "learning_rate": 9.358616059567185e-05, "loss": 0.9241745948791504, "memory(GiB)": 91.52, "step": 31575, "token_acc": 0.7572310692232694, "train_speed(iter/s)": 0.174938 }, { "epoch": 0.40977198485738103, "grad_norm": 0.7779126167297363, "learning_rate": 9.358353211367225e-05, "loss": 0.958767318725586, "memory(GiB)": 91.52, "step": 31580, "token_acc": 0.7493497563454812, "train_speed(iter/s)": 0.174925 }, { "epoch": 0.40983686325903673, "grad_norm": 0.8815398216247559, "learning_rate": 9.358090313011617e-05, "loss": 0.899477767944336, "memory(GiB)": 91.52, "step": 31585, "token_acc": 0.7721411416959172, "train_speed(iter/s)": 0.174913 }, { "epoch": 0.40990174166069243, "grad_norm": 0.8497154116630554, "learning_rate": 9.357827364503384e-05, "loss": 0.9728801727294922, "memory(GiB)": 91.52, "step": 31590, "token_acc": 0.7427560920211558, "train_speed(iter/s)": 0.174901 }, { "epoch": 0.40996662006234813, "grad_norm": 0.8375741839408875, "learning_rate": 9.357564365845555e-05, "loss": 0.9839081764221191, "memory(GiB)": 91.52, "step": 31595, "token_acc": 0.7117517417887104, "train_speed(iter/s)": 0.174889 }, { "epoch": 0.41003149846400383, "grad_norm": 0.7658572793006897, "learning_rate": 9.357301317041156e-05, "loss": 0.9657901763916016, "memory(GiB)": 91.52, "step": 31600, "token_acc": 0.7414564628758729, "train_speed(iter/s)": 0.174875 }, { "epoch": 0.41009637686565953, "grad_norm": 0.7897817492485046, "learning_rate": 9.357038218093215e-05, "loss": 0.942985725402832, "memory(GiB)": 91.52, "step": 31605, "token_acc": 0.73515625, "train_speed(iter/s)": 0.174861 }, { "epoch": 0.41016125526731523, "grad_norm": 0.8637897968292236, "learning_rate": 9.356775069004757e-05, "loss": 0.9094560623168946, "memory(GiB)": 91.52, "step": 31610, "token_acc": 0.7557321225879682, "train_speed(iter/s)": 0.174851 }, { "epoch": 0.4102261336689709, "grad_norm": 0.7404661178588867, "learning_rate": 9.356511869778812e-05, "loss": 0.9628458023071289, "memory(GiB)": 91.52, "step": 31615, "token_acc": 0.7497713066016161, "train_speed(iter/s)": 0.174838 }, { "epoch": 0.4102910120706266, "grad_norm": 0.8332704901695251, "learning_rate": 9.356248620418409e-05, "loss": 0.9493865966796875, "memory(GiB)": 91.52, "step": 31620, "token_acc": 0.7410699025807554, "train_speed(iter/s)": 0.174826 }, { "epoch": 0.4103558904722823, "grad_norm": 0.7995108366012573, "learning_rate": 9.355985320926577e-05, "loss": 0.9473699569702149, "memory(GiB)": 91.52, "step": 31625, "token_acc": 0.7226086306185028, "train_speed(iter/s)": 0.174817 }, { "epoch": 0.410420768873938, "grad_norm": 0.8211556673049927, "learning_rate": 9.355721971306348e-05, "loss": 0.9180004119873046, "memory(GiB)": 91.52, "step": 31630, "token_acc": 0.7567828378428534, "train_speed(iter/s)": 0.174804 }, { "epoch": 0.4104856472755937, "grad_norm": 0.7078304290771484, "learning_rate": 9.355458571560749e-05, "loss": 0.9217267990112304, "memory(GiB)": 91.52, "step": 31635, "token_acc": 0.7643585103343176, "train_speed(iter/s)": 0.174792 }, { "epoch": 0.4105505256772494, "grad_norm": 0.7470491528511047, "learning_rate": 9.355195121692814e-05, "loss": 0.926272964477539, "memory(GiB)": 91.52, "step": 31640, "token_acc": 0.756106247217688, "train_speed(iter/s)": 0.17478 }, { "epoch": 0.4106154040789051, "grad_norm": 0.7818893194198608, "learning_rate": 9.354931621705574e-05, "loss": 0.874755859375, "memory(GiB)": 91.52, "step": 31645, "token_acc": 0.7705425840189188, "train_speed(iter/s)": 0.174768 }, { "epoch": 0.4106802824805608, "grad_norm": 0.7638379335403442, "learning_rate": 9.35466807160206e-05, "loss": 0.9666568756103515, "memory(GiB)": 91.52, "step": 31650, "token_acc": 0.7437288135593221, "train_speed(iter/s)": 0.174756 }, { "epoch": 0.4107451608822165, "grad_norm": 0.7183151841163635, "learning_rate": 9.354404471385306e-05, "loss": 0.9335395812988281, "memory(GiB)": 91.52, "step": 31655, "token_acc": 0.7505724204150971, "train_speed(iter/s)": 0.174745 }, { "epoch": 0.4108100392838722, "grad_norm": 0.8409419655799866, "learning_rate": 9.354140821058346e-05, "loss": 0.9168952941894531, "memory(GiB)": 91.52, "step": 31660, "token_acc": 0.7510675808031342, "train_speed(iter/s)": 0.174733 }, { "epoch": 0.4108749176855279, "grad_norm": 0.729133665561676, "learning_rate": 9.353877120624215e-05, "loss": 0.9092868804931641, "memory(GiB)": 91.52, "step": 31665, "token_acc": 0.7689933679164751, "train_speed(iter/s)": 0.174723 }, { "epoch": 0.4109397960871836, "grad_norm": 0.7845401167869568, "learning_rate": 9.353613370085945e-05, "loss": 0.9329994201660157, "memory(GiB)": 91.52, "step": 31670, "token_acc": 0.7431400489605028, "train_speed(iter/s)": 0.174714 }, { "epoch": 0.4110046744888393, "grad_norm": 0.7874565720558167, "learning_rate": 9.353349569446574e-05, "loss": 0.9535151481628418, "memory(GiB)": 91.52, "step": 31675, "token_acc": 0.7410977242302543, "train_speed(iter/s)": 0.174702 }, { "epoch": 0.411069552890495, "grad_norm": 0.7475518584251404, "learning_rate": 9.353085718709133e-05, "loss": 0.9400849342346191, "memory(GiB)": 91.52, "step": 31680, "token_acc": 0.7503147293327738, "train_speed(iter/s)": 0.174692 }, { "epoch": 0.41113443129215066, "grad_norm": 0.7657142281532288, "learning_rate": 9.352821817876664e-05, "loss": 0.9934272766113281, "memory(GiB)": 91.52, "step": 31685, "token_acc": 0.7484064766649572, "train_speed(iter/s)": 0.174681 }, { "epoch": 0.41119930969380636, "grad_norm": 0.8036192059516907, "learning_rate": 9.352557866952203e-05, "loss": 0.9265710830688476, "memory(GiB)": 91.52, "step": 31690, "token_acc": 0.7620533856488912, "train_speed(iter/s)": 0.17467 }, { "epoch": 0.41126418809546206, "grad_norm": 0.7486778497695923, "learning_rate": 9.352293865938785e-05, "loss": 0.9628767967224121, "memory(GiB)": 91.52, "step": 31695, "token_acc": 0.7405304366226987, "train_speed(iter/s)": 0.174659 }, { "epoch": 0.41132906649711776, "grad_norm": 0.7721365690231323, "learning_rate": 9.35202981483945e-05, "loss": 0.9587129592895508, "memory(GiB)": 91.52, "step": 31700, "token_acc": 0.7409042145593869, "train_speed(iter/s)": 0.174647 }, { "epoch": 0.41139394489877346, "grad_norm": 0.6900683045387268, "learning_rate": 9.351765713657235e-05, "loss": 0.9302024841308594, "memory(GiB)": 91.52, "step": 31705, "token_acc": 0.7631890229477171, "train_speed(iter/s)": 0.174635 }, { "epoch": 0.41145882330042916, "grad_norm": 0.8264148831367493, "learning_rate": 9.35150156239518e-05, "loss": 0.9205209732055664, "memory(GiB)": 91.52, "step": 31710, "token_acc": 0.7545309090909091, "train_speed(iter/s)": 0.174623 }, { "epoch": 0.41152370170208485, "grad_norm": 0.7277098298072815, "learning_rate": 9.351237361056324e-05, "loss": 0.9786011695861816, "memory(GiB)": 91.52, "step": 31715, "token_acc": 0.7346484042204601, "train_speed(iter/s)": 0.174611 }, { "epoch": 0.41158858010374055, "grad_norm": 0.7149754166603088, "learning_rate": 9.350973109643711e-05, "loss": 0.9466738700866699, "memory(GiB)": 91.52, "step": 31720, "token_acc": 0.7455194258449942, "train_speed(iter/s)": 0.1746 }, { "epoch": 0.41165345850539625, "grad_norm": 0.8331350684165955, "learning_rate": 9.350708808160378e-05, "loss": 1.0062217712402344, "memory(GiB)": 91.52, "step": 31725, "token_acc": 0.736364486855646, "train_speed(iter/s)": 0.174588 }, { "epoch": 0.41171833690705195, "grad_norm": 0.8529848456382751, "learning_rate": 9.350444456609369e-05, "loss": 0.9670360565185547, "memory(GiB)": 91.52, "step": 31730, "token_acc": 0.7530397505845674, "train_speed(iter/s)": 0.174577 }, { "epoch": 0.41178321530870765, "grad_norm": 0.803662896156311, "learning_rate": 9.350180054993725e-05, "loss": 0.9512554168701172, "memory(GiB)": 91.52, "step": 31735, "token_acc": 0.7551372705241842, "train_speed(iter/s)": 0.174567 }, { "epoch": 0.41184809371036335, "grad_norm": 0.7845818400382996, "learning_rate": 9.349915603316488e-05, "loss": 0.968264389038086, "memory(GiB)": 91.52, "step": 31740, "token_acc": 0.7427940837368646, "train_speed(iter/s)": 0.174554 }, { "epoch": 0.41191297211201905, "grad_norm": 0.7812600135803223, "learning_rate": 9.349651101580702e-05, "loss": 0.8980368614196778, "memory(GiB)": 91.52, "step": 31745, "token_acc": 0.7667895556991372, "train_speed(iter/s)": 0.174542 }, { "epoch": 0.41197785051367475, "grad_norm": 0.846617579460144, "learning_rate": 9.349386549789412e-05, "loss": 0.9426105499267579, "memory(GiB)": 91.52, "step": 31750, "token_acc": 0.7501239259748843, "train_speed(iter/s)": 0.174531 }, { "epoch": 0.41204272891533045, "grad_norm": 0.8813991546630859, "learning_rate": 9.34912194794566e-05, "loss": 0.9207853317260742, "memory(GiB)": 91.52, "step": 31755, "token_acc": 0.7402415244090178, "train_speed(iter/s)": 0.174518 }, { "epoch": 0.41210760731698615, "grad_norm": 0.7993224859237671, "learning_rate": 9.348857296052496e-05, "loss": 0.9138172149658204, "memory(GiB)": 91.52, "step": 31760, "token_acc": 0.7518232044198895, "train_speed(iter/s)": 0.174507 }, { "epoch": 0.41217248571864185, "grad_norm": 0.8090019822120667, "learning_rate": 9.348592594112959e-05, "loss": 0.9144447326660157, "memory(GiB)": 91.52, "step": 31765, "token_acc": 0.7443050749711649, "train_speed(iter/s)": 0.174494 }, { "epoch": 0.41223736412029754, "grad_norm": 0.8135501742362976, "learning_rate": 9.3483278421301e-05, "loss": 0.9773855209350586, "memory(GiB)": 91.52, "step": 31770, "token_acc": 0.7412612483418779, "train_speed(iter/s)": 0.174482 }, { "epoch": 0.41230224252195324, "grad_norm": 0.7657650113105774, "learning_rate": 9.348063040106962e-05, "loss": 0.9388477325439453, "memory(GiB)": 91.52, "step": 31775, "token_acc": 0.7381863734428283, "train_speed(iter/s)": 0.174469 }, { "epoch": 0.41236712092360894, "grad_norm": 0.7880260944366455, "learning_rate": 9.347798188046597e-05, "loss": 0.9649576187133789, "memory(GiB)": 91.52, "step": 31780, "token_acc": 0.7285393606316278, "train_speed(iter/s)": 0.174458 }, { "epoch": 0.41243199932526464, "grad_norm": 0.7944138050079346, "learning_rate": 9.347533285952049e-05, "loss": 0.9538324356079102, "memory(GiB)": 91.52, "step": 31785, "token_acc": 0.7449591816661749, "train_speed(iter/s)": 0.174445 }, { "epoch": 0.41249687772692034, "grad_norm": 0.8043890595436096, "learning_rate": 9.347268333826366e-05, "loss": 0.9980792999267578, "memory(GiB)": 91.52, "step": 31790, "token_acc": 0.7632118411451999, "train_speed(iter/s)": 0.174434 }, { "epoch": 0.41256175612857604, "grad_norm": 0.796920657157898, "learning_rate": 9.347003331672602e-05, "loss": 0.9090570449829102, "memory(GiB)": 91.52, "step": 31795, "token_acc": 0.7546499102333932, "train_speed(iter/s)": 0.174422 }, { "epoch": 0.41262663453023174, "grad_norm": 0.8010686039924622, "learning_rate": 9.3467382794938e-05, "loss": 0.971263313293457, "memory(GiB)": 91.52, "step": 31800, "token_acc": 0.7349867352681083, "train_speed(iter/s)": 0.174412 }, { "epoch": 0.4126915129318874, "grad_norm": 0.7836918830871582, "learning_rate": 9.346473177293017e-05, "loss": 0.9204351425170898, "memory(GiB)": 91.52, "step": 31805, "token_acc": 0.7634878637984586, "train_speed(iter/s)": 0.174398 }, { "epoch": 0.4127563913335431, "grad_norm": 0.7957291603088379, "learning_rate": 9.3462080250733e-05, "loss": 0.9446193695068359, "memory(GiB)": 91.52, "step": 31810, "token_acc": 0.7465978226064681, "train_speed(iter/s)": 0.174387 }, { "epoch": 0.4128212697351988, "grad_norm": 0.8963168263435364, "learning_rate": 9.3459428228377e-05, "loss": 0.9740013122558594, "memory(GiB)": 91.52, "step": 31815, "token_acc": 0.7618044527983926, "train_speed(iter/s)": 0.174372 }, { "epoch": 0.4128861481368545, "grad_norm": 0.6709486246109009, "learning_rate": 9.345677570589269e-05, "loss": 0.9189746856689454, "memory(GiB)": 91.52, "step": 31820, "token_acc": 0.7680345298168777, "train_speed(iter/s)": 0.174361 }, { "epoch": 0.4129510265385102, "grad_norm": 0.8183730244636536, "learning_rate": 9.34541226833106e-05, "loss": 0.9628986358642578, "memory(GiB)": 91.52, "step": 31825, "token_acc": 0.7455254237288136, "train_speed(iter/s)": 0.174347 }, { "epoch": 0.4130159049401659, "grad_norm": 0.7675950527191162, "learning_rate": 9.345146916066126e-05, "loss": 0.9454789161682129, "memory(GiB)": 91.52, "step": 31830, "token_acc": 0.7238233604757239, "train_speed(iter/s)": 0.174338 }, { "epoch": 0.4130807833418216, "grad_norm": 0.8619294762611389, "learning_rate": 9.344881513797522e-05, "loss": 0.9177287101745606, "memory(GiB)": 91.52, "step": 31835, "token_acc": 0.7748176921103741, "train_speed(iter/s)": 0.174326 }, { "epoch": 0.4131456617434773, "grad_norm": 0.7899051904678345, "learning_rate": 9.344616061528302e-05, "loss": 0.9442858695983887, "memory(GiB)": 91.52, "step": 31840, "token_acc": 0.7435836782968658, "train_speed(iter/s)": 0.174317 }, { "epoch": 0.413210540145133, "grad_norm": 0.7145039439201355, "learning_rate": 9.344350559261518e-05, "loss": 0.8956192016601563, "memory(GiB)": 91.52, "step": 31845, "token_acc": 0.7583858267716536, "train_speed(iter/s)": 0.174305 }, { "epoch": 0.4132754185467887, "grad_norm": 0.7160961031913757, "learning_rate": 9.344085007000229e-05, "loss": 0.9576216697692871, "memory(GiB)": 91.52, "step": 31850, "token_acc": 0.7583330639164592, "train_speed(iter/s)": 0.174291 }, { "epoch": 0.4133402969484444, "grad_norm": 0.8291712999343872, "learning_rate": 9.343819404747489e-05, "loss": 0.9947192192077636, "memory(GiB)": 91.52, "step": 31855, "token_acc": 0.7286282306163022, "train_speed(iter/s)": 0.174281 }, { "epoch": 0.4134051753501001, "grad_norm": 0.8000054955482483, "learning_rate": 9.343553752506354e-05, "loss": 0.9757617950439453, "memory(GiB)": 91.52, "step": 31860, "token_acc": 0.7588639385433594, "train_speed(iter/s)": 0.174271 }, { "epoch": 0.4134700537517558, "grad_norm": 0.752980649471283, "learning_rate": 9.343288050279883e-05, "loss": 0.9345248222351075, "memory(GiB)": 91.52, "step": 31865, "token_acc": 0.7523523523523523, "train_speed(iter/s)": 0.17426 }, { "epoch": 0.41353493215341147, "grad_norm": 0.7456662058830261, "learning_rate": 9.34302229807113e-05, "loss": 0.9330748558044434, "memory(GiB)": 91.52, "step": 31870, "token_acc": 0.7491337976411298, "train_speed(iter/s)": 0.174249 }, { "epoch": 0.41359981055506717, "grad_norm": 0.869522213935852, "learning_rate": 9.34275649588316e-05, "loss": 0.8813411712646484, "memory(GiB)": 91.52, "step": 31875, "token_acc": 0.7695889211667795, "train_speed(iter/s)": 0.174236 }, { "epoch": 0.41366468895672287, "grad_norm": 0.7965825796127319, "learning_rate": 9.342490643719026e-05, "loss": 0.9432492256164551, "memory(GiB)": 91.52, "step": 31880, "token_acc": 0.7185355500530598, "train_speed(iter/s)": 0.174224 }, { "epoch": 0.41372956735837857, "grad_norm": 0.7865279912948608, "learning_rate": 9.342224741581788e-05, "loss": 0.9668180465698242, "memory(GiB)": 91.52, "step": 31885, "token_acc": 0.7555696559503666, "train_speed(iter/s)": 0.174212 }, { "epoch": 0.41379444576003427, "grad_norm": 0.8019936680793762, "learning_rate": 9.341958789474509e-05, "loss": 0.9702049255371094, "memory(GiB)": 91.52, "step": 31890, "token_acc": 0.738329052726998, "train_speed(iter/s)": 0.1742 }, { "epoch": 0.41385932416168997, "grad_norm": 0.8207198977470398, "learning_rate": 9.341692787400247e-05, "loss": 0.997523021697998, "memory(GiB)": 91.52, "step": 31895, "token_acc": 0.7489472628834971, "train_speed(iter/s)": 0.174189 }, { "epoch": 0.41392420256334567, "grad_norm": 0.9102993607521057, "learning_rate": 9.341426735362063e-05, "loss": 0.9108345031738281, "memory(GiB)": 91.52, "step": 31900, "token_acc": 0.7506975013950028, "train_speed(iter/s)": 0.174177 }, { "epoch": 0.41398908096500137, "grad_norm": 0.7962172031402588, "learning_rate": 9.34116063336302e-05, "loss": 0.8822535514831543, "memory(GiB)": 91.52, "step": 31905, "token_acc": 0.779369348932171, "train_speed(iter/s)": 0.174167 }, { "epoch": 0.41405395936665707, "grad_norm": 0.8069634437561035, "learning_rate": 9.340894481406181e-05, "loss": 0.9481342315673829, "memory(GiB)": 91.52, "step": 31910, "token_acc": 0.7407879924953096, "train_speed(iter/s)": 0.174156 }, { "epoch": 0.41411883776831276, "grad_norm": 0.7368625402450562, "learning_rate": 9.340628279494606e-05, "loss": 0.9344145774841308, "memory(GiB)": 91.52, "step": 31915, "token_acc": 0.7677127586679457, "train_speed(iter/s)": 0.174146 }, { "epoch": 0.41418371616996846, "grad_norm": 0.7856249809265137, "learning_rate": 9.340362027631362e-05, "loss": 0.9506888389587402, "memory(GiB)": 91.52, "step": 31920, "token_acc": 0.7640471905618876, "train_speed(iter/s)": 0.174134 }, { "epoch": 0.4142485945716241, "grad_norm": 0.7967461943626404, "learning_rate": 9.340095725819508e-05, "loss": 0.9658573150634766, "memory(GiB)": 91.52, "step": 31925, "token_acc": 0.7349341595662278, "train_speed(iter/s)": 0.174121 }, { "epoch": 0.4143134729732798, "grad_norm": 0.7691695094108582, "learning_rate": 9.339829374062114e-05, "loss": 0.9373555183410645, "memory(GiB)": 91.52, "step": 31930, "token_acc": 0.7536562409498987, "train_speed(iter/s)": 0.17411 }, { "epoch": 0.4143783513749355, "grad_norm": 0.7666428089141846, "learning_rate": 9.339562972362243e-05, "loss": 0.9367415428161621, "memory(GiB)": 91.52, "step": 31935, "token_acc": 0.7536346033933904, "train_speed(iter/s)": 0.174099 }, { "epoch": 0.4144432297765912, "grad_norm": 0.7753248810768127, "learning_rate": 9.33929652072296e-05, "loss": 0.9402534484863281, "memory(GiB)": 91.52, "step": 31940, "token_acc": 0.7613481566111526, "train_speed(iter/s)": 0.174089 }, { "epoch": 0.4145081081782469, "grad_norm": 0.8952823877334595, "learning_rate": 9.339030019147332e-05, "loss": 1.0139931678771972, "memory(GiB)": 91.52, "step": 31945, "token_acc": 0.7383234885897808, "train_speed(iter/s)": 0.174079 }, { "epoch": 0.4145729865799026, "grad_norm": 0.7945414781570435, "learning_rate": 9.338763467638426e-05, "loss": 0.9419183731079102, "memory(GiB)": 91.52, "step": 31950, "token_acc": 0.7417058955391168, "train_speed(iter/s)": 0.174068 }, { "epoch": 0.4146378649815583, "grad_norm": 0.8402943015098572, "learning_rate": 9.338496866199309e-05, "loss": 0.9357431411743165, "memory(GiB)": 91.52, "step": 31955, "token_acc": 0.7419212459409187, "train_speed(iter/s)": 0.174059 }, { "epoch": 0.414702743383214, "grad_norm": 0.8392823934555054, "learning_rate": 9.33823021483305e-05, "loss": 0.9110626220703125, "memory(GiB)": 91.52, "step": 31960, "token_acc": 0.7724516576668979, "train_speed(iter/s)": 0.174048 }, { "epoch": 0.4147676217848697, "grad_norm": 0.7456502914428711, "learning_rate": 9.337963513542716e-05, "loss": 0.9695055961608887, "memory(GiB)": 91.52, "step": 31965, "token_acc": 0.7510503580093497, "train_speed(iter/s)": 0.174036 }, { "epoch": 0.4148325001865254, "grad_norm": 0.7852722406387329, "learning_rate": 9.337696762331377e-05, "loss": 0.9700541496276855, "memory(GiB)": 91.52, "step": 31970, "token_acc": 0.7461307385881277, "train_speed(iter/s)": 0.174025 }, { "epoch": 0.4148973785881811, "grad_norm": 0.7272210121154785, "learning_rate": 9.337429961202104e-05, "loss": 0.9559017181396484, "memory(GiB)": 91.52, "step": 31975, "token_acc": 0.7491854056880808, "train_speed(iter/s)": 0.174015 }, { "epoch": 0.4149622569898368, "grad_norm": 0.7807518839836121, "learning_rate": 9.337163110157964e-05, "loss": 0.932887077331543, "memory(GiB)": 91.52, "step": 31980, "token_acc": 0.7438143045320232, "train_speed(iter/s)": 0.174004 }, { "epoch": 0.4150271353914925, "grad_norm": 1.2178934812545776, "learning_rate": 9.336896209202033e-05, "loss": 0.9686018943786621, "memory(GiB)": 91.52, "step": 31985, "token_acc": 0.761233105344111, "train_speed(iter/s)": 0.17399 }, { "epoch": 0.4150920137931482, "grad_norm": 0.8561861515045166, "learning_rate": 9.336629258337378e-05, "loss": 0.978111457824707, "memory(GiB)": 91.52, "step": 31990, "token_acc": 0.7417007578297901, "train_speed(iter/s)": 0.173979 }, { "epoch": 0.4151568921948039, "grad_norm": 0.7763125896453857, "learning_rate": 9.336362257567071e-05, "loss": 0.962308692932129, "memory(GiB)": 91.52, "step": 31995, "token_acc": 0.7551920999845703, "train_speed(iter/s)": 0.173967 }, { "epoch": 0.4152217705964596, "grad_norm": 0.9056085348129272, "learning_rate": 9.336095206894189e-05, "loss": 1.0009237289428712, "memory(GiB)": 91.52, "step": 32000, "token_acc": 0.7320705096311039, "train_speed(iter/s)": 0.173956 }, { "epoch": 0.4152866489981153, "grad_norm": 0.7662197351455688, "learning_rate": 9.3358281063218e-05, "loss": 0.982392406463623, "memory(GiB)": 91.52, "step": 32005, "token_acc": 0.7411546813130668, "train_speed(iter/s)": 0.173943 }, { "epoch": 0.415351527399771, "grad_norm": 0.8447470664978027, "learning_rate": 9.335560955852982e-05, "loss": 0.9770448684692383, "memory(GiB)": 91.52, "step": 32010, "token_acc": 0.7529911662752992, "train_speed(iter/s)": 0.173932 }, { "epoch": 0.4154164058014267, "grad_norm": 0.6607049107551575, "learning_rate": 9.335293755490807e-05, "loss": 0.8747231483459472, "memory(GiB)": 91.52, "step": 32015, "token_acc": 0.769117313365101, "train_speed(iter/s)": 0.17392 }, { "epoch": 0.4154812842030824, "grad_norm": 0.8896887898445129, "learning_rate": 9.335026505238352e-05, "loss": 0.90068359375, "memory(GiB)": 91.52, "step": 32020, "token_acc": 0.7595521182038555, "train_speed(iter/s)": 0.173907 }, { "epoch": 0.4155461626047381, "grad_norm": 0.9000008702278137, "learning_rate": 9.334759205098689e-05, "loss": 0.9278118133544921, "memory(GiB)": 91.52, "step": 32025, "token_acc": 0.7418327257282412, "train_speed(iter/s)": 0.173896 }, { "epoch": 0.4156110410063938, "grad_norm": 0.7693438529968262, "learning_rate": 9.334491855074896e-05, "loss": 0.9399006843566895, "memory(GiB)": 91.52, "step": 32030, "token_acc": 0.745857966571783, "train_speed(iter/s)": 0.173884 }, { "epoch": 0.4156759194080495, "grad_norm": 0.9508394598960876, "learning_rate": 9.334224455170049e-05, "loss": 0.9183358192443848, "memory(GiB)": 91.52, "step": 32035, "token_acc": 0.7545928693756812, "train_speed(iter/s)": 0.17387 }, { "epoch": 0.4157407978097052, "grad_norm": 0.7056887149810791, "learning_rate": 9.333957005387228e-05, "loss": 0.8834968566894531, "memory(GiB)": 91.52, "step": 32040, "token_acc": 0.7677049070282608, "train_speed(iter/s)": 0.17386 }, { "epoch": 0.41580567621136083, "grad_norm": 0.723859429359436, "learning_rate": 9.333689505729509e-05, "loss": 0.9538830757141114, "memory(GiB)": 91.52, "step": 32045, "token_acc": 0.7454928507287422, "train_speed(iter/s)": 0.173848 }, { "epoch": 0.41587055461301653, "grad_norm": 0.8144069314002991, "learning_rate": 9.333421956199968e-05, "loss": 0.9571202278137207, "memory(GiB)": 91.52, "step": 32050, "token_acc": 0.7326642335766423, "train_speed(iter/s)": 0.173836 }, { "epoch": 0.41593543301467223, "grad_norm": 0.75223708152771, "learning_rate": 9.333154356801687e-05, "loss": 0.9322198867797852, "memory(GiB)": 91.52, "step": 32055, "token_acc": 0.735663302978095, "train_speed(iter/s)": 0.173824 }, { "epoch": 0.41600031141632793, "grad_norm": 0.6950082182884216, "learning_rate": 9.332886707537745e-05, "loss": 0.952420997619629, "memory(GiB)": 91.52, "step": 32060, "token_acc": 0.7346013025305382, "train_speed(iter/s)": 0.173813 }, { "epoch": 0.4160651898179836, "grad_norm": 0.8677070140838623, "learning_rate": 9.332619008411221e-05, "loss": 0.9791453361511231, "memory(GiB)": 91.52, "step": 32065, "token_acc": 0.7352244960293219, "train_speed(iter/s)": 0.173803 }, { "epoch": 0.4161300682196393, "grad_norm": 0.9032026529312134, "learning_rate": 9.332351259425196e-05, "loss": 0.9522510528564453, "memory(GiB)": 91.52, "step": 32070, "token_acc": 0.7502106422969732, "train_speed(iter/s)": 0.173793 }, { "epoch": 0.416194946621295, "grad_norm": 0.7163418531417847, "learning_rate": 9.332083460582752e-05, "loss": 0.9334558486938477, "memory(GiB)": 91.52, "step": 32075, "token_acc": 0.7337204199439624, "train_speed(iter/s)": 0.17378 }, { "epoch": 0.4162598250229507, "grad_norm": 0.7454622983932495, "learning_rate": 9.33181561188697e-05, "loss": 0.9314418792724609, "memory(GiB)": 91.52, "step": 32080, "token_acc": 0.7426180113826581, "train_speed(iter/s)": 0.17377 }, { "epoch": 0.4163247034246064, "grad_norm": 0.8118318319320679, "learning_rate": 9.331547713340932e-05, "loss": 0.9525913238525391, "memory(GiB)": 91.52, "step": 32085, "token_acc": 0.7410308072273495, "train_speed(iter/s)": 0.173759 }, { "epoch": 0.4163895818262621, "grad_norm": 0.7557472586631775, "learning_rate": 9.331279764947723e-05, "loss": 0.8675321578979492, "memory(GiB)": 91.52, "step": 32090, "token_acc": 0.7820777878278847, "train_speed(iter/s)": 0.173748 }, { "epoch": 0.4164544602279178, "grad_norm": 0.8068810105323792, "learning_rate": 9.331011766710425e-05, "loss": 0.8981471061706543, "memory(GiB)": 91.52, "step": 32095, "token_acc": 0.7486020319760573, "train_speed(iter/s)": 0.173737 }, { "epoch": 0.4165193386295735, "grad_norm": 0.743834912776947, "learning_rate": 9.330743718632122e-05, "loss": 0.9042172431945801, "memory(GiB)": 91.52, "step": 32100, "token_acc": 0.7594784859677368, "train_speed(iter/s)": 0.173725 }, { "epoch": 0.4165842170312292, "grad_norm": 0.807702898979187, "learning_rate": 9.3304756207159e-05, "loss": 0.909276008605957, "memory(GiB)": 91.52, "step": 32105, "token_acc": 0.7608461951931199, "train_speed(iter/s)": 0.173712 }, { "epoch": 0.4166490954328849, "grad_norm": 0.8898073434829712, "learning_rate": 9.330207472964842e-05, "loss": 0.9594789505004883, "memory(GiB)": 91.52, "step": 32110, "token_acc": 0.7310264515055731, "train_speed(iter/s)": 0.173701 }, { "epoch": 0.4167139738345406, "grad_norm": 0.7192021012306213, "learning_rate": 9.329939275382035e-05, "loss": 0.9483430862426758, "memory(GiB)": 91.52, "step": 32115, "token_acc": 0.7297723983686357, "train_speed(iter/s)": 0.17369 }, { "epoch": 0.4167788522361963, "grad_norm": 0.7453730702400208, "learning_rate": 9.329671027970567e-05, "loss": 0.9371967315673828, "memory(GiB)": 91.52, "step": 32120, "token_acc": 0.7361180880250396, "train_speed(iter/s)": 0.173678 }, { "epoch": 0.416843730637852, "grad_norm": 0.8157517313957214, "learning_rate": 9.329402730733523e-05, "loss": 0.978211784362793, "memory(GiB)": 91.52, "step": 32125, "token_acc": 0.7539219263042685, "train_speed(iter/s)": 0.173667 }, { "epoch": 0.4169086090395077, "grad_norm": 0.791010856628418, "learning_rate": 9.32913438367399e-05, "loss": 0.937799072265625, "memory(GiB)": 91.52, "step": 32130, "token_acc": 0.73623401213339, "train_speed(iter/s)": 0.173656 }, { "epoch": 0.4169734874411634, "grad_norm": 0.7787402868270874, "learning_rate": 9.328865986795058e-05, "loss": 0.9816064834594727, "memory(GiB)": 91.52, "step": 32135, "token_acc": 0.7406192600367357, "train_speed(iter/s)": 0.173646 }, { "epoch": 0.4170383658428191, "grad_norm": 0.7172399163246155, "learning_rate": 9.328597540099814e-05, "loss": 0.9198692321777344, "memory(GiB)": 91.52, "step": 32140, "token_acc": 0.7511480885071867, "train_speed(iter/s)": 0.173635 }, { "epoch": 0.4171032442444748, "grad_norm": 0.8012939095497131, "learning_rate": 9.32832904359135e-05, "loss": 0.9046384811401367, "memory(GiB)": 91.52, "step": 32145, "token_acc": 0.7542695130086724, "train_speed(iter/s)": 0.173625 }, { "epoch": 0.4171681226461305, "grad_norm": 0.8076927065849304, "learning_rate": 9.328060497272751e-05, "loss": 0.9395576477050781, "memory(GiB)": 91.52, "step": 32150, "token_acc": 0.7596071587448243, "train_speed(iter/s)": 0.173613 }, { "epoch": 0.4172330010477862, "grad_norm": 0.8152045011520386, "learning_rate": 9.327791901147113e-05, "loss": 0.9715921401977539, "memory(GiB)": 91.52, "step": 32155, "token_acc": 0.7316267260269208, "train_speed(iter/s)": 0.1736 }, { "epoch": 0.4172978794494419, "grad_norm": 0.7779024243354797, "learning_rate": 9.327523255217523e-05, "loss": 0.9137434005737305, "memory(GiB)": 91.52, "step": 32160, "token_acc": 0.7446587440742906, "train_speed(iter/s)": 0.173589 }, { "epoch": 0.41736275785109755, "grad_norm": 0.7818149924278259, "learning_rate": 9.327254559487074e-05, "loss": 0.9362232208251953, "memory(GiB)": 91.52, "step": 32165, "token_acc": 0.7522985570169838, "train_speed(iter/s)": 0.173577 }, { "epoch": 0.41742763625275325, "grad_norm": 0.8155418038368225, "learning_rate": 9.326985813958858e-05, "loss": 0.947808837890625, "memory(GiB)": 91.52, "step": 32170, "token_acc": 0.746268154922001, "train_speed(iter/s)": 0.173567 }, { "epoch": 0.41749251465440895, "grad_norm": 0.7860286831855774, "learning_rate": 9.326717018635969e-05, "loss": 1.0132615089416503, "memory(GiB)": 91.52, "step": 32175, "token_acc": 0.7340890467085496, "train_speed(iter/s)": 0.173555 }, { "epoch": 0.41755739305606465, "grad_norm": 0.8154483437538147, "learning_rate": 9.326448173521499e-05, "loss": 0.9569797515869141, "memory(GiB)": 91.52, "step": 32180, "token_acc": 0.7417569384043646, "train_speed(iter/s)": 0.173542 }, { "epoch": 0.41762227145772035, "grad_norm": 0.6838920712471008, "learning_rate": 9.326179278618539e-05, "loss": 0.9357392311096191, "memory(GiB)": 91.52, "step": 32185, "token_acc": 0.7514468071034095, "train_speed(iter/s)": 0.173531 }, { "epoch": 0.41768714985937605, "grad_norm": 0.6813107132911682, "learning_rate": 9.325910333930187e-05, "loss": 0.9024242401123047, "memory(GiB)": 91.52, "step": 32190, "token_acc": 0.7519294047262938, "train_speed(iter/s)": 0.173516 }, { "epoch": 0.41775202826103175, "grad_norm": 0.7714915871620178, "learning_rate": 9.325641339459539e-05, "loss": 0.9782487869262695, "memory(GiB)": 91.52, "step": 32195, "token_acc": 0.743458322093337, "train_speed(iter/s)": 0.173505 }, { "epoch": 0.41781690666268745, "grad_norm": 0.7004991173744202, "learning_rate": 9.325372295209689e-05, "loss": 0.908029842376709, "memory(GiB)": 91.52, "step": 32200, "token_acc": 0.7448438797011685, "train_speed(iter/s)": 0.173493 }, { "epoch": 0.41788178506434315, "grad_norm": 1.6824625730514526, "learning_rate": 9.325103201183732e-05, "loss": 0.934384536743164, "memory(GiB)": 91.52, "step": 32205, "token_acc": 0.7632346225454034, "train_speed(iter/s)": 0.173483 }, { "epoch": 0.41794666346599885, "grad_norm": 0.8215696811676025, "learning_rate": 9.324834057384766e-05, "loss": 0.9061208724975586, "memory(GiB)": 91.52, "step": 32210, "token_acc": 0.7568137117167744, "train_speed(iter/s)": 0.17347 }, { "epoch": 0.41801154186765455, "grad_norm": 0.7269037961959839, "learning_rate": 9.324564863815888e-05, "loss": 0.9116191864013672, "memory(GiB)": 91.52, "step": 32215, "token_acc": 0.7464602743745873, "train_speed(iter/s)": 0.173459 }, { "epoch": 0.41807642026931024, "grad_norm": 0.8152003288269043, "learning_rate": 9.324295620480196e-05, "loss": 0.9636337280273437, "memory(GiB)": 91.52, "step": 32220, "token_acc": 0.7447710238198014, "train_speed(iter/s)": 0.17345 }, { "epoch": 0.41814129867096594, "grad_norm": 0.6954739093780518, "learning_rate": 9.324026327380789e-05, "loss": 0.9547998428344726, "memory(GiB)": 91.52, "step": 32225, "token_acc": 0.7422804187145913, "train_speed(iter/s)": 0.17344 }, { "epoch": 0.41820617707262164, "grad_norm": 0.7781709432601929, "learning_rate": 9.323756984520764e-05, "loss": 0.968966293334961, "memory(GiB)": 91.52, "step": 32230, "token_acc": 0.7412958184581669, "train_speed(iter/s)": 0.173429 }, { "epoch": 0.41827105547427734, "grad_norm": 0.8237952589988708, "learning_rate": 9.323487591903222e-05, "loss": 0.9154841423034668, "memory(GiB)": 91.52, "step": 32235, "token_acc": 0.7641031415381151, "train_speed(iter/s)": 0.173419 }, { "epoch": 0.41833593387593304, "grad_norm": 0.7888846397399902, "learning_rate": 9.323218149531263e-05, "loss": 0.9597421646118164, "memory(GiB)": 91.52, "step": 32240, "token_acc": 0.7457723987224744, "train_speed(iter/s)": 0.173408 }, { "epoch": 0.41840081227758874, "grad_norm": 0.7216825485229492, "learning_rate": 9.322948657407988e-05, "loss": 0.938232707977295, "memory(GiB)": 91.52, "step": 32245, "token_acc": 0.7547508191067426, "train_speed(iter/s)": 0.173397 }, { "epoch": 0.41846569067924444, "grad_norm": 0.7614269852638245, "learning_rate": 9.3226791155365e-05, "loss": 0.9487846374511719, "memory(GiB)": 91.52, "step": 32250, "token_acc": 0.755692981608529, "train_speed(iter/s)": 0.173387 }, { "epoch": 0.41853056908090014, "grad_norm": 0.721973717212677, "learning_rate": 9.322409523919896e-05, "loss": 0.984984016418457, "memory(GiB)": 91.52, "step": 32255, "token_acc": 0.7414470422212899, "train_speed(iter/s)": 0.173375 }, { "epoch": 0.41859544748255584, "grad_norm": 0.781671404838562, "learning_rate": 9.322139882561283e-05, "loss": 0.9039813995361328, "memory(GiB)": 91.52, "step": 32260, "token_acc": 0.745457876576384, "train_speed(iter/s)": 0.173362 }, { "epoch": 0.41866032588421154, "grad_norm": 0.7874250411987305, "learning_rate": 9.321870191463762e-05, "loss": 0.9275342941284179, "memory(GiB)": 91.52, "step": 32265, "token_acc": 0.7531049894709699, "train_speed(iter/s)": 0.173351 }, { "epoch": 0.41872520428586724, "grad_norm": 0.7294583916664124, "learning_rate": 9.321600450630437e-05, "loss": 0.9586164474487304, "memory(GiB)": 91.52, "step": 32270, "token_acc": 0.7439082656473961, "train_speed(iter/s)": 0.173342 }, { "epoch": 0.41879008268752294, "grad_norm": 0.8871759176254272, "learning_rate": 9.321330660064411e-05, "loss": 0.9693568229675293, "memory(GiB)": 91.52, "step": 32275, "token_acc": 0.7351924149470161, "train_speed(iter/s)": 0.173331 }, { "epoch": 0.41885496108917863, "grad_norm": 0.7622249126434326, "learning_rate": 9.321060819768792e-05, "loss": 0.9335917472839356, "memory(GiB)": 91.52, "step": 32280, "token_acc": 0.7380073800738007, "train_speed(iter/s)": 0.17332 }, { "epoch": 0.4189198394908343, "grad_norm": 0.7740657329559326, "learning_rate": 9.320790929746682e-05, "loss": 0.9023554801940918, "memory(GiB)": 91.52, "step": 32285, "token_acc": 0.7356551444618146, "train_speed(iter/s)": 0.17331 }, { "epoch": 0.41898471789249, "grad_norm": 0.7744718790054321, "learning_rate": 9.32052099000119e-05, "loss": 0.8769811630249024, "memory(GiB)": 91.52, "step": 32290, "token_acc": 0.7341104955370883, "train_speed(iter/s)": 0.173296 }, { "epoch": 0.4190495962941457, "grad_norm": 0.747398853302002, "learning_rate": 9.320251000535417e-05, "loss": 0.916802978515625, "memory(GiB)": 91.52, "step": 32295, "token_acc": 0.7584209619031201, "train_speed(iter/s)": 0.173286 }, { "epoch": 0.4191144746958014, "grad_norm": 0.9272090792655945, "learning_rate": 9.319980961352475e-05, "loss": 0.9525692939758301, "memory(GiB)": 91.52, "step": 32300, "token_acc": 0.7480046493607129, "train_speed(iter/s)": 0.173277 }, { "epoch": 0.4191793530974571, "grad_norm": 0.7455995678901672, "learning_rate": 9.319710872455472e-05, "loss": 0.9314632415771484, "memory(GiB)": 91.52, "step": 32305, "token_acc": 0.7524940344421125, "train_speed(iter/s)": 0.173265 }, { "epoch": 0.4192442314991128, "grad_norm": 0.9000696539878845, "learning_rate": 9.319440733847511e-05, "loss": 0.9426787376403809, "memory(GiB)": 91.52, "step": 32310, "token_acc": 0.7502472586540528, "train_speed(iter/s)": 0.173252 }, { "epoch": 0.4193091099007685, "grad_norm": 0.7783221006393433, "learning_rate": 9.319170545531707e-05, "loss": 0.9561796188354492, "memory(GiB)": 91.52, "step": 32315, "token_acc": 0.7511903401516017, "train_speed(iter/s)": 0.17324 }, { "epoch": 0.41937398830242417, "grad_norm": 0.7590140700340271, "learning_rate": 9.318900307511165e-05, "loss": 0.9173967361450195, "memory(GiB)": 91.52, "step": 32320, "token_acc": 0.7664442009496297, "train_speed(iter/s)": 0.173229 }, { "epoch": 0.41943886670407987, "grad_norm": 0.8512668609619141, "learning_rate": 9.318630019788997e-05, "loss": 0.9212802886962891, "memory(GiB)": 91.52, "step": 32325, "token_acc": 0.7547331483597193, "train_speed(iter/s)": 0.173217 }, { "epoch": 0.41950374510573557, "grad_norm": 0.7773668169975281, "learning_rate": 9.318359682368312e-05, "loss": 0.9647319793701172, "memory(GiB)": 91.52, "step": 32330, "token_acc": 0.7404936624416277, "train_speed(iter/s)": 0.173207 }, { "epoch": 0.41956862350739127, "grad_norm": 0.6807208061218262, "learning_rate": 9.318089295252221e-05, "loss": 0.9545497894287109, "memory(GiB)": 91.52, "step": 32335, "token_acc": 0.7531589588071772, "train_speed(iter/s)": 0.173195 }, { "epoch": 0.41963350190904697, "grad_norm": 0.8141696453094482, "learning_rate": 9.317818858443837e-05, "loss": 0.9358889579772949, "memory(GiB)": 91.52, "step": 32340, "token_acc": 0.7688887185188025, "train_speed(iter/s)": 0.173183 }, { "epoch": 0.41969838031070267, "grad_norm": 0.8354020118713379, "learning_rate": 9.317548371946272e-05, "loss": 0.9720688819885254, "memory(GiB)": 91.52, "step": 32345, "token_acc": 0.7412630288166769, "train_speed(iter/s)": 0.173173 }, { "epoch": 0.41976325871235837, "grad_norm": 0.754264771938324, "learning_rate": 9.317277835762637e-05, "loss": 0.9313777923583985, "memory(GiB)": 91.52, "step": 32350, "token_acc": 0.7569721115537849, "train_speed(iter/s)": 0.173163 }, { "epoch": 0.41982813711401407, "grad_norm": 0.8763130307197571, "learning_rate": 9.317007249896048e-05, "loss": 0.9674877166748047, "memory(GiB)": 91.52, "step": 32355, "token_acc": 0.7343550446998723, "train_speed(iter/s)": 0.173153 }, { "epoch": 0.41989301551566977, "grad_norm": 0.734741747379303, "learning_rate": 9.316736614349616e-05, "loss": 0.9660905838012696, "memory(GiB)": 91.52, "step": 32360, "token_acc": 0.7461535495314849, "train_speed(iter/s)": 0.173142 }, { "epoch": 0.41995789391732546, "grad_norm": 0.6935851573944092, "learning_rate": 9.316465929126457e-05, "loss": 0.9067309379577637, "memory(GiB)": 91.52, "step": 32365, "token_acc": 0.7545523383694516, "train_speed(iter/s)": 0.17313 }, { "epoch": 0.42002277231898116, "grad_norm": 0.9162092804908752, "learning_rate": 9.316195194229687e-05, "loss": 0.9368988037109375, "memory(GiB)": 91.52, "step": 32370, "token_acc": 0.7505710909807011, "train_speed(iter/s)": 0.173119 }, { "epoch": 0.42008765072063686, "grad_norm": 0.7560697793960571, "learning_rate": 9.31592440966242e-05, "loss": 0.9071830749511719, "memory(GiB)": 91.52, "step": 32375, "token_acc": 0.7541693540956418, "train_speed(iter/s)": 0.173107 }, { "epoch": 0.42015252912229256, "grad_norm": 0.840969443321228, "learning_rate": 9.315653575427773e-05, "loss": 0.9522354125976562, "memory(GiB)": 91.52, "step": 32380, "token_acc": 0.744007319304666, "train_speed(iter/s)": 0.173097 }, { "epoch": 0.42021740752394826, "grad_norm": 0.7702451348304749, "learning_rate": 9.315382691528862e-05, "loss": 0.9282170295715332, "memory(GiB)": 91.52, "step": 32385, "token_acc": 0.7615257847984244, "train_speed(iter/s)": 0.173086 }, { "epoch": 0.42028228592560396, "grad_norm": 0.8006665110588074, "learning_rate": 9.315111757968804e-05, "loss": 0.9575994491577149, "memory(GiB)": 91.52, "step": 32390, "token_acc": 0.731085822975765, "train_speed(iter/s)": 0.173073 }, { "epoch": 0.42034716432725966, "grad_norm": 0.9008704423904419, "learning_rate": 9.31484077475072e-05, "loss": 0.9486900329589844, "memory(GiB)": 91.52, "step": 32395, "token_acc": 0.7383464726096625, "train_speed(iter/s)": 0.173065 }, { "epoch": 0.4204120427289153, "grad_norm": 0.8511435985565186, "learning_rate": 9.314569741877723e-05, "loss": 0.9252523422241211, "memory(GiB)": 91.52, "step": 32400, "token_acc": 0.7554489973844812, "train_speed(iter/s)": 0.173055 }, { "epoch": 0.420476921130571, "grad_norm": 0.7619073390960693, "learning_rate": 9.314298659352936e-05, "loss": 0.9670339584350586, "memory(GiB)": 91.52, "step": 32405, "token_acc": 0.7384620031412348, "train_speed(iter/s)": 0.173045 }, { "epoch": 0.4205417995322267, "grad_norm": 0.8243787288665771, "learning_rate": 9.314027527179479e-05, "loss": 0.9634893417358399, "memory(GiB)": 91.52, "step": 32410, "token_acc": 0.7507643227435863, "train_speed(iter/s)": 0.173033 }, { "epoch": 0.4206066779338824, "grad_norm": 0.8331146836280823, "learning_rate": 9.31375634536047e-05, "loss": 0.9660378456115722, "memory(GiB)": 91.52, "step": 32415, "token_acc": 0.7447313944412394, "train_speed(iter/s)": 0.173022 }, { "epoch": 0.4206715563355381, "grad_norm": 0.7615674734115601, "learning_rate": 9.31348511389903e-05, "loss": 0.9710308074951172, "memory(GiB)": 91.52, "step": 32420, "token_acc": 0.7553787208959623, "train_speed(iter/s)": 0.173009 }, { "epoch": 0.4207364347371938, "grad_norm": 0.8894494771957397, "learning_rate": 9.313213832798282e-05, "loss": 0.9051431655883789, "memory(GiB)": 91.52, "step": 32425, "token_acc": 0.7536276734750017, "train_speed(iter/s)": 0.172997 }, { "epoch": 0.4208013131388495, "grad_norm": 0.8015509247779846, "learning_rate": 9.312942502061345e-05, "loss": 0.9789953231811523, "memory(GiB)": 91.52, "step": 32430, "token_acc": 0.7083318206571065, "train_speed(iter/s)": 0.172986 }, { "epoch": 0.4208661915405052, "grad_norm": 0.8544031381607056, "learning_rate": 9.312671121691345e-05, "loss": 0.965878963470459, "memory(GiB)": 91.52, "step": 32435, "token_acc": 0.7514109376547081, "train_speed(iter/s)": 0.172975 }, { "epoch": 0.4209310699421609, "grad_norm": 0.7858709096908569, "learning_rate": 9.312399691691403e-05, "loss": 0.9507914543151855, "memory(GiB)": 91.52, "step": 32440, "token_acc": 0.7516590368714291, "train_speed(iter/s)": 0.172964 }, { "epoch": 0.4209959483438166, "grad_norm": 0.6618151664733887, "learning_rate": 9.312128212064643e-05, "loss": 0.8911264419555665, "memory(GiB)": 91.52, "step": 32445, "token_acc": 0.7509631080639896, "train_speed(iter/s)": 0.17295 }, { "epoch": 0.4210608267454723, "grad_norm": 0.7645052671432495, "learning_rate": 9.311856682814189e-05, "loss": 0.9534351348876953, "memory(GiB)": 91.52, "step": 32450, "token_acc": 0.7427762664509963, "train_speed(iter/s)": 0.172941 }, { "epoch": 0.421125705147128, "grad_norm": 0.8042992353439331, "learning_rate": 9.311585103943165e-05, "loss": 0.9902907371520996, "memory(GiB)": 91.52, "step": 32455, "token_acc": 0.7362938959063554, "train_speed(iter/s)": 0.172931 }, { "epoch": 0.4211905835487837, "grad_norm": 0.7386267781257629, "learning_rate": 9.3113134754547e-05, "loss": 0.934503173828125, "memory(GiB)": 91.52, "step": 32460, "token_acc": 0.7678493050475493, "train_speed(iter/s)": 0.17292 }, { "epoch": 0.4212554619504394, "grad_norm": 0.7792412638664246, "learning_rate": 9.311041797351914e-05, "loss": 0.961478328704834, "memory(GiB)": 91.52, "step": 32465, "token_acc": 0.7237808592210399, "train_speed(iter/s)": 0.17291 }, { "epoch": 0.4213203403520951, "grad_norm": 0.7506521344184875, "learning_rate": 9.310770069637938e-05, "loss": 0.9687150001525879, "memory(GiB)": 91.52, "step": 32470, "token_acc": 0.7476753700810831, "train_speed(iter/s)": 0.1729 }, { "epoch": 0.4213852187537508, "grad_norm": 0.8158371448516846, "learning_rate": 9.310498292315896e-05, "loss": 0.9624446868896485, "memory(GiB)": 91.52, "step": 32475, "token_acc": 0.7404476670870114, "train_speed(iter/s)": 0.172889 }, { "epoch": 0.4214500971554065, "grad_norm": 0.6874532103538513, "learning_rate": 9.310226465388918e-05, "loss": 0.9791366577148437, "memory(GiB)": 91.52, "step": 32480, "token_acc": 0.7405877680698968, "train_speed(iter/s)": 0.172879 }, { "epoch": 0.4215149755570622, "grad_norm": 0.7626676559448242, "learning_rate": 9.30995458886013e-05, "loss": 0.9449329376220703, "memory(GiB)": 91.52, "step": 32485, "token_acc": 0.7392887951691748, "train_speed(iter/s)": 0.172868 }, { "epoch": 0.4215798539587179, "grad_norm": 0.781201958656311, "learning_rate": 9.309682662732664e-05, "loss": 0.9496639251708985, "memory(GiB)": 91.52, "step": 32490, "token_acc": 0.7354123077944937, "train_speed(iter/s)": 0.172858 }, { "epoch": 0.4216447323603736, "grad_norm": 0.8493373990058899, "learning_rate": 9.309410687009646e-05, "loss": 0.9373881340026855, "memory(GiB)": 91.52, "step": 32495, "token_acc": 0.7523671052164971, "train_speed(iter/s)": 0.172847 }, { "epoch": 0.4217096107620293, "grad_norm": 0.764623761177063, "learning_rate": 9.309138661694208e-05, "loss": 0.9312018394470215, "memory(GiB)": 91.52, "step": 32500, "token_acc": 0.7357573118513681, "train_speed(iter/s)": 0.172836 }, { "epoch": 0.421774489163685, "grad_norm": 0.7322025895118713, "learning_rate": 9.308866586789479e-05, "loss": 0.9287917137145996, "memory(GiB)": 91.52, "step": 32505, "token_acc": 0.7531055900621118, "train_speed(iter/s)": 0.172825 }, { "epoch": 0.4218393675653407, "grad_norm": 0.8719342350959778, "learning_rate": 9.30859446229859e-05, "loss": 0.9533818244934082, "memory(GiB)": 91.52, "step": 32510, "token_acc": 0.742640682762083, "train_speed(iter/s)": 0.172814 }, { "epoch": 0.4219042459669964, "grad_norm": 0.7586506605148315, "learning_rate": 9.308322288224673e-05, "loss": 0.9312021255493164, "memory(GiB)": 91.52, "step": 32515, "token_acc": 0.7333125928998583, "train_speed(iter/s)": 0.172805 }, { "epoch": 0.421969124368652, "grad_norm": 0.7654719352722168, "learning_rate": 9.308050064570863e-05, "loss": 0.9296900749206543, "memory(GiB)": 91.52, "step": 32520, "token_acc": 0.7480579851978715, "train_speed(iter/s)": 0.172793 }, { "epoch": 0.4220340027703077, "grad_norm": 0.7606598734855652, "learning_rate": 9.307777791340289e-05, "loss": 0.9351247787475586, "memory(GiB)": 91.52, "step": 32525, "token_acc": 0.7732042565770026, "train_speed(iter/s)": 0.172782 }, { "epoch": 0.4220988811719634, "grad_norm": 0.791293740272522, "learning_rate": 9.307505468536084e-05, "loss": 0.8991388320922852, "memory(GiB)": 91.52, "step": 32530, "token_acc": 0.7724382327614758, "train_speed(iter/s)": 0.17277 }, { "epoch": 0.4221637595736191, "grad_norm": 0.763117253780365, "learning_rate": 9.307233096161385e-05, "loss": 0.9224233627319336, "memory(GiB)": 91.52, "step": 32535, "token_acc": 0.7598288785178899, "train_speed(iter/s)": 0.172761 }, { "epoch": 0.4222286379752748, "grad_norm": 0.7425333261489868, "learning_rate": 9.306960674219323e-05, "loss": 0.9563119888305665, "memory(GiB)": 91.52, "step": 32540, "token_acc": 0.7478483368225168, "train_speed(iter/s)": 0.17275 }, { "epoch": 0.4222935163769305, "grad_norm": 0.8402734994888306, "learning_rate": 9.306688202713035e-05, "loss": 0.9273073196411132, "memory(GiB)": 91.52, "step": 32545, "token_acc": 0.7544552516482799, "train_speed(iter/s)": 0.172739 }, { "epoch": 0.4223583947785862, "grad_norm": 0.7560285925865173, "learning_rate": 9.306415681645658e-05, "loss": 0.9134984970092773, "memory(GiB)": 91.52, "step": 32550, "token_acc": 0.7579600728119125, "train_speed(iter/s)": 0.172727 }, { "epoch": 0.4224232731802419, "grad_norm": 0.7455723881721497, "learning_rate": 9.306143111020325e-05, "loss": 0.953167724609375, "memory(GiB)": 91.52, "step": 32555, "token_acc": 0.7352487347241081, "train_speed(iter/s)": 0.172715 }, { "epoch": 0.4224881515818976, "grad_norm": 0.8143560290336609, "learning_rate": 9.305870490840175e-05, "loss": 0.9520004272460938, "memory(GiB)": 91.52, "step": 32560, "token_acc": 0.7363821267399733, "train_speed(iter/s)": 0.172704 }, { "epoch": 0.4225530299835533, "grad_norm": 0.7244952321052551, "learning_rate": 9.305597821108343e-05, "loss": 0.9802707672119141, "memory(GiB)": 91.52, "step": 32565, "token_acc": 0.732480195003047, "train_speed(iter/s)": 0.172691 }, { "epoch": 0.422617908385209, "grad_norm": 0.7441573739051819, "learning_rate": 9.305325101827971e-05, "loss": 0.9672607421875, "memory(GiB)": 91.52, "step": 32570, "token_acc": 0.7388183898481381, "train_speed(iter/s)": 0.172679 }, { "epoch": 0.4226827867868647, "grad_norm": 0.720592200756073, "learning_rate": 9.305052333002193e-05, "loss": 0.9316068649291992, "memory(GiB)": 91.52, "step": 32575, "token_acc": 0.7528610354223433, "train_speed(iter/s)": 0.172668 }, { "epoch": 0.4227476651885204, "grad_norm": 0.7797793745994568, "learning_rate": 9.30477951463415e-05, "loss": 0.9543552398681641, "memory(GiB)": 91.52, "step": 32580, "token_acc": 0.7452461317381278, "train_speed(iter/s)": 0.172656 }, { "epoch": 0.4228125435901761, "grad_norm": 0.8215578198432922, "learning_rate": 9.304506646726983e-05, "loss": 0.9533609390258789, "memory(GiB)": 91.52, "step": 32585, "token_acc": 0.724090682610862, "train_speed(iter/s)": 0.172647 }, { "epoch": 0.4228774219918318, "grad_norm": 0.7806787490844727, "learning_rate": 9.30423372928383e-05, "loss": 0.9903804779052734, "memory(GiB)": 91.52, "step": 32590, "token_acc": 0.7204180064308682, "train_speed(iter/s)": 0.172635 }, { "epoch": 0.4229423003934875, "grad_norm": 0.9262691140174866, "learning_rate": 9.30396076230783e-05, "loss": 0.900416374206543, "memory(GiB)": 91.52, "step": 32595, "token_acc": 0.7429427312775331, "train_speed(iter/s)": 0.172626 }, { "epoch": 0.4230071787951432, "grad_norm": 0.8757491111755371, "learning_rate": 9.303687745802128e-05, "loss": 0.9524253845214844, "memory(GiB)": 91.52, "step": 32600, "token_acc": 0.7452722162462264, "train_speed(iter/s)": 0.172614 }, { "epoch": 0.4230720571967989, "grad_norm": 0.8059176802635193, "learning_rate": 9.303414679769863e-05, "loss": 0.924567985534668, "memory(GiB)": 91.52, "step": 32605, "token_acc": 0.7625086549324004, "train_speed(iter/s)": 0.172602 }, { "epoch": 0.4231369355984546, "grad_norm": 0.7330669164657593, "learning_rate": 9.303141564214182e-05, "loss": 1.0079048156738282, "memory(GiB)": 91.52, "step": 32610, "token_acc": 0.7422555173111965, "train_speed(iter/s)": 0.172591 }, { "epoch": 0.4232018140001103, "grad_norm": 0.7607157826423645, "learning_rate": 9.302868399138223e-05, "loss": 0.9440910339355468, "memory(GiB)": 91.52, "step": 32615, "token_acc": 0.7394111852905768, "train_speed(iter/s)": 0.17258 }, { "epoch": 0.423266692401766, "grad_norm": 0.6740568280220032, "learning_rate": 9.30259518454513e-05, "loss": 0.8910097122192383, "memory(GiB)": 91.52, "step": 32620, "token_acc": 0.7686585408444802, "train_speed(iter/s)": 0.172568 }, { "epoch": 0.4233315708034217, "grad_norm": 0.7719643712043762, "learning_rate": 9.302321920438051e-05, "loss": 0.9507868766784668, "memory(GiB)": 91.52, "step": 32625, "token_acc": 0.7414974619289341, "train_speed(iter/s)": 0.172557 }, { "epoch": 0.4233964492050774, "grad_norm": 0.7482474446296692, "learning_rate": 9.302048606820127e-05, "loss": 0.9294488906860352, "memory(GiB)": 91.52, "step": 32630, "token_acc": 0.7609306056959486, "train_speed(iter/s)": 0.172547 }, { "epoch": 0.4234613276067331, "grad_norm": 0.7417513132095337, "learning_rate": 9.301775243694505e-05, "loss": 0.923643970489502, "memory(GiB)": 91.52, "step": 32635, "token_acc": 0.7661819317669107, "train_speed(iter/s)": 0.172534 }, { "epoch": 0.42352620600838875, "grad_norm": 0.9942312836647034, "learning_rate": 9.30150183106433e-05, "loss": 0.9551530838012695, "memory(GiB)": 91.52, "step": 32640, "token_acc": 0.7584267804571466, "train_speed(iter/s)": 0.172523 }, { "epoch": 0.42359108441004445, "grad_norm": 0.8005229830741882, "learning_rate": 9.301228368932748e-05, "loss": 0.9519832611083985, "memory(GiB)": 91.52, "step": 32645, "token_acc": 0.748709469269237, "train_speed(iter/s)": 0.172513 }, { "epoch": 0.42365596281170015, "grad_norm": 0.8290718197822571, "learning_rate": 9.300954857302909e-05, "loss": 0.9183658599853516, "memory(GiB)": 91.52, "step": 32650, "token_acc": 0.76254371293903, "train_speed(iter/s)": 0.172503 }, { "epoch": 0.42372084121335585, "grad_norm": 0.9824446439743042, "learning_rate": 9.300681296177957e-05, "loss": 1.0005319595336915, "memory(GiB)": 91.52, "step": 32655, "token_acc": 0.7480010920862749, "train_speed(iter/s)": 0.172493 }, { "epoch": 0.42378571961501155, "grad_norm": 1.0003600120544434, "learning_rate": 9.300407685561041e-05, "loss": 0.9966327667236328, "memory(GiB)": 91.52, "step": 32660, "token_acc": 0.7415956969968623, "train_speed(iter/s)": 0.172483 }, { "epoch": 0.42385059801666725, "grad_norm": 0.7605562210083008, "learning_rate": 9.30013402545531e-05, "loss": 0.9694563865661621, "memory(GiB)": 91.52, "step": 32665, "token_acc": 0.7453333333333333, "train_speed(iter/s)": 0.172474 }, { "epoch": 0.42391547641832295, "grad_norm": 0.7940008044242859, "learning_rate": 9.299860315863914e-05, "loss": 0.9376513481140136, "memory(GiB)": 91.52, "step": 32670, "token_acc": 0.7555770470664087, "train_speed(iter/s)": 0.172461 }, { "epoch": 0.42398035481997864, "grad_norm": 0.8089720606803894, "learning_rate": 9.299586556790003e-05, "loss": 0.8929481506347656, "memory(GiB)": 91.52, "step": 32675, "token_acc": 0.7655316096861231, "train_speed(iter/s)": 0.17245 }, { "epoch": 0.42404523322163434, "grad_norm": 0.7652925252914429, "learning_rate": 9.299312748236724e-05, "loss": 0.9699928283691406, "memory(GiB)": 91.52, "step": 32680, "token_acc": 0.7494580866414313, "train_speed(iter/s)": 0.172437 }, { "epoch": 0.42411011162329004, "grad_norm": 0.859044075012207, "learning_rate": 9.299038890207233e-05, "loss": 0.9623519897460937, "memory(GiB)": 91.52, "step": 32685, "token_acc": 0.7620547892573097, "train_speed(iter/s)": 0.172427 }, { "epoch": 0.42417499002494574, "grad_norm": 0.8373337388038635, "learning_rate": 9.298764982704678e-05, "loss": 0.887850284576416, "memory(GiB)": 91.52, "step": 32690, "token_acc": 0.7506853023790991, "train_speed(iter/s)": 0.172416 }, { "epoch": 0.42423986842660144, "grad_norm": 0.7455126047134399, "learning_rate": 9.298491025732213e-05, "loss": 0.8769491195678711, "memory(GiB)": 91.52, "step": 32695, "token_acc": 0.7503626616209398, "train_speed(iter/s)": 0.172405 }, { "epoch": 0.42430474682825714, "grad_norm": 0.6899811029434204, "learning_rate": 9.29821701929299e-05, "loss": 0.9363611221313477, "memory(GiB)": 91.52, "step": 32700, "token_acc": 0.725667467156378, "train_speed(iter/s)": 0.172393 }, { "epoch": 0.42436962522991284, "grad_norm": 0.7149273157119751, "learning_rate": 9.297942963390162e-05, "loss": 0.9417560577392579, "memory(GiB)": 91.52, "step": 32705, "token_acc": 0.7521254487058379, "train_speed(iter/s)": 0.172382 }, { "epoch": 0.42443450363156854, "grad_norm": 0.7802834510803223, "learning_rate": 9.297668858026883e-05, "loss": 0.9314191818237305, "memory(GiB)": 91.52, "step": 32710, "token_acc": 0.7608747406727204, "train_speed(iter/s)": 0.17237 }, { "epoch": 0.42449938203322424, "grad_norm": 0.7094376087188721, "learning_rate": 9.297394703206307e-05, "loss": 0.9154994010925293, "memory(GiB)": 91.52, "step": 32715, "token_acc": 0.7439665783878548, "train_speed(iter/s)": 0.172358 }, { "epoch": 0.42456426043487994, "grad_norm": 0.7953419089317322, "learning_rate": 9.29712049893159e-05, "loss": 0.9701442718505859, "memory(GiB)": 91.52, "step": 32720, "token_acc": 0.7605686879068174, "train_speed(iter/s)": 0.172346 }, { "epoch": 0.42462913883653564, "grad_norm": 0.7848103642463684, "learning_rate": 9.296846245205886e-05, "loss": 0.9182826042175293, "memory(GiB)": 91.52, "step": 32725, "token_acc": 0.7447681434235477, "train_speed(iter/s)": 0.172337 }, { "epoch": 0.42469401723819133, "grad_norm": 0.7501035332679749, "learning_rate": 9.296571942032353e-05, "loss": 0.898995304107666, "memory(GiB)": 91.52, "step": 32730, "token_acc": 0.7622701660735468, "train_speed(iter/s)": 0.172326 }, { "epoch": 0.42475889563984703, "grad_norm": 0.7612999677658081, "learning_rate": 9.296297589414145e-05, "loss": 0.9140180587768555, "memory(GiB)": 91.52, "step": 32735, "token_acc": 0.7792974271981905, "train_speed(iter/s)": 0.172315 }, { "epoch": 0.42482377404150273, "grad_norm": 0.8633562326431274, "learning_rate": 9.296023187354423e-05, "loss": 0.9621234893798828, "memory(GiB)": 91.52, "step": 32740, "token_acc": 0.7628156082631982, "train_speed(iter/s)": 0.172305 }, { "epoch": 0.42488865244315843, "grad_norm": 0.7043877243995667, "learning_rate": 9.29574873585634e-05, "loss": 0.9653555870056152, "memory(GiB)": 91.52, "step": 32745, "token_acc": 0.7334864005651713, "train_speed(iter/s)": 0.172293 }, { "epoch": 0.42495353084481413, "grad_norm": 0.7306627035140991, "learning_rate": 9.295474234923058e-05, "loss": 0.9356489181518555, "memory(GiB)": 91.52, "step": 32750, "token_acc": 0.7436421203145747, "train_speed(iter/s)": 0.172284 }, { "epoch": 0.42501840924646983, "grad_norm": 0.8069274425506592, "learning_rate": 9.295199684557735e-05, "loss": 0.9824646949768067, "memory(GiB)": 91.52, "step": 32755, "token_acc": 0.732981530343008, "train_speed(iter/s)": 0.172274 }, { "epoch": 0.4250832876481255, "grad_norm": 0.8179144859313965, "learning_rate": 9.294925084763533e-05, "loss": 0.973057746887207, "memory(GiB)": 91.52, "step": 32760, "token_acc": 0.7217198745155933, "train_speed(iter/s)": 0.172263 }, { "epoch": 0.4251481660497812, "grad_norm": 0.7470949292182922, "learning_rate": 9.294650435543606e-05, "loss": 0.9617672920227051, "memory(GiB)": 91.52, "step": 32765, "token_acc": 0.7446313724065259, "train_speed(iter/s)": 0.172253 }, { "epoch": 0.4252130444514369, "grad_norm": 0.6802018284797668, "learning_rate": 9.29437573690112e-05, "loss": 0.9177227020263672, "memory(GiB)": 91.52, "step": 32770, "token_acc": 0.7641560634384611, "train_speed(iter/s)": 0.172242 }, { "epoch": 0.42527792285309257, "grad_norm": 0.781481146812439, "learning_rate": 9.294100988839234e-05, "loss": 0.9011220932006836, "memory(GiB)": 91.52, "step": 32775, "token_acc": 0.7295472597299444, "train_speed(iter/s)": 0.172229 }, { "epoch": 0.42534280125474827, "grad_norm": 0.8199692964553833, "learning_rate": 9.29382619136111e-05, "loss": 0.9570343971252442, "memory(GiB)": 91.52, "step": 32780, "token_acc": 0.7467555000347005, "train_speed(iter/s)": 0.172219 }, { "epoch": 0.42540767965640397, "grad_norm": 0.7925897240638733, "learning_rate": 9.29355134446991e-05, "loss": 0.9429437637329101, "memory(GiB)": 91.52, "step": 32785, "token_acc": 0.7377868957558661, "train_speed(iter/s)": 0.172208 }, { "epoch": 0.42547255805805967, "grad_norm": 0.6430844664573669, "learning_rate": 9.2932764481688e-05, "loss": 0.8779022216796875, "memory(GiB)": 91.52, "step": 32790, "token_acc": 0.7556435219856639, "train_speed(iter/s)": 0.172197 }, { "epoch": 0.42553743645971537, "grad_norm": 0.7041826844215393, "learning_rate": 9.293001502460938e-05, "loss": 0.9382777214050293, "memory(GiB)": 91.52, "step": 32795, "token_acc": 0.7584432392456552, "train_speed(iter/s)": 0.172187 }, { "epoch": 0.42560231486137107, "grad_norm": 0.7770675420761108, "learning_rate": 9.292726507349493e-05, "loss": 1.0042741775512696, "memory(GiB)": 91.52, "step": 32800, "token_acc": 0.7387236005403446, "train_speed(iter/s)": 0.172177 }, { "epoch": 0.42566719326302677, "grad_norm": 0.8398405313491821, "learning_rate": 9.292451462837626e-05, "loss": 0.9333366394042969, "memory(GiB)": 91.52, "step": 32805, "token_acc": 0.7405586432948553, "train_speed(iter/s)": 0.172168 }, { "epoch": 0.42573207166468247, "grad_norm": 0.7330073714256287, "learning_rate": 9.292176368928504e-05, "loss": 0.9664160728454589, "memory(GiB)": 91.52, "step": 32810, "token_acc": 0.7553281155930163, "train_speed(iter/s)": 0.172156 }, { "epoch": 0.42579695006633816, "grad_norm": 0.7402070164680481, "learning_rate": 9.291901225625295e-05, "loss": 0.9485450744628906, "memory(GiB)": 91.52, "step": 32815, "token_acc": 0.7485592243541935, "train_speed(iter/s)": 0.172146 }, { "epoch": 0.42586182846799386, "grad_norm": 0.8995720744132996, "learning_rate": 9.29162603293116e-05, "loss": 0.9522850036621093, "memory(GiB)": 91.52, "step": 32820, "token_acc": 0.7340895981542012, "train_speed(iter/s)": 0.172136 }, { "epoch": 0.42592670686964956, "grad_norm": 0.7948769330978394, "learning_rate": 9.29135079084927e-05, "loss": 0.968383502960205, "memory(GiB)": 91.52, "step": 32825, "token_acc": 0.7397129186602871, "train_speed(iter/s)": 0.172124 }, { "epoch": 0.42599158527130526, "grad_norm": 0.9041336178779602, "learning_rate": 9.29107549938279e-05, "loss": 0.9564408302307129, "memory(GiB)": 91.52, "step": 32830, "token_acc": 0.7554560291494288, "train_speed(iter/s)": 0.172114 }, { "epoch": 0.42605646367296096, "grad_norm": 0.8019195795059204, "learning_rate": 9.290800158534892e-05, "loss": 0.9394962310791015, "memory(GiB)": 91.52, "step": 32835, "token_acc": 0.7409806952346499, "train_speed(iter/s)": 0.172102 }, { "epoch": 0.42612134207461666, "grad_norm": 0.7519826889038086, "learning_rate": 9.29052476830874e-05, "loss": 0.9019946098327637, "memory(GiB)": 91.52, "step": 32840, "token_acc": 0.7784934805040493, "train_speed(iter/s)": 0.17209 }, { "epoch": 0.42618622047627236, "grad_norm": 0.7627002596855164, "learning_rate": 9.290249328707506e-05, "loss": 0.9656258583068847, "memory(GiB)": 91.52, "step": 32845, "token_acc": 0.724361467652016, "train_speed(iter/s)": 0.172078 }, { "epoch": 0.42625109887792806, "grad_norm": 0.8107022643089294, "learning_rate": 9.289973839734359e-05, "loss": 0.9361730575561523, "memory(GiB)": 91.52, "step": 32850, "token_acc": 0.7520570488206253, "train_speed(iter/s)": 0.172068 }, { "epoch": 0.42631597727958376, "grad_norm": 0.7659928202629089, "learning_rate": 9.289698301392468e-05, "loss": 0.9958139419555664, "memory(GiB)": 91.52, "step": 32855, "token_acc": 0.7367045499544609, "train_speed(iter/s)": 0.172059 }, { "epoch": 0.42638085568123946, "grad_norm": 0.715430498123169, "learning_rate": 9.289422713685005e-05, "loss": 0.9386362075805664, "memory(GiB)": 91.52, "step": 32860, "token_acc": 0.7455341969107595, "train_speed(iter/s)": 0.172049 }, { "epoch": 0.42644573408289516, "grad_norm": 0.7856586575508118, "learning_rate": 9.289147076615143e-05, "loss": 0.9358001708984375, "memory(GiB)": 91.52, "step": 32865, "token_acc": 0.740891162572734, "train_speed(iter/s)": 0.172038 }, { "epoch": 0.42651061248455086, "grad_norm": 0.7990708947181702, "learning_rate": 9.288871390186052e-05, "loss": 0.9075496673583985, "memory(GiB)": 91.52, "step": 32870, "token_acc": 0.7547176261452209, "train_speed(iter/s)": 0.172028 }, { "epoch": 0.42657549088620655, "grad_norm": 0.7355177402496338, "learning_rate": 9.288595654400904e-05, "loss": 0.9802326202392578, "memory(GiB)": 91.52, "step": 32875, "token_acc": 0.7379116779763385, "train_speed(iter/s)": 0.172017 }, { "epoch": 0.4266403692878622, "grad_norm": 0.7554275989532471, "learning_rate": 9.288319869262874e-05, "loss": 0.9139845848083497, "memory(GiB)": 91.52, "step": 32880, "token_acc": 0.7930201786304995, "train_speed(iter/s)": 0.172005 }, { "epoch": 0.4267052476895179, "grad_norm": 0.7332385778427124, "learning_rate": 9.288044034775135e-05, "loss": 0.9820083618164063, "memory(GiB)": 91.52, "step": 32885, "token_acc": 0.7341843314924436, "train_speed(iter/s)": 0.171993 }, { "epoch": 0.4267701260911736, "grad_norm": 0.8529233932495117, "learning_rate": 9.287768150940861e-05, "loss": 0.914206600189209, "memory(GiB)": 91.52, "step": 32890, "token_acc": 0.7711958993013931, "train_speed(iter/s)": 0.171983 }, { "epoch": 0.4268350044928293, "grad_norm": 0.7600998878479004, "learning_rate": 9.287492217763228e-05, "loss": 0.9209607124328614, "memory(GiB)": 91.52, "step": 32895, "token_acc": 0.7475331866259672, "train_speed(iter/s)": 0.171972 }, { "epoch": 0.426899882894485, "grad_norm": 0.8106878995895386, "learning_rate": 9.287216235245409e-05, "loss": 0.9010542869567871, "memory(GiB)": 91.52, "step": 32900, "token_acc": 0.7695371000680735, "train_speed(iter/s)": 0.171959 }, { "epoch": 0.4269647612961407, "grad_norm": 0.6774662137031555, "learning_rate": 9.286940203390582e-05, "loss": 0.9122540473937988, "memory(GiB)": 91.52, "step": 32905, "token_acc": 0.759866014707581, "train_speed(iter/s)": 0.171948 }, { "epoch": 0.4270296396977964, "grad_norm": 0.7917654514312744, "learning_rate": 9.286664122201922e-05, "loss": 0.9205349922180176, "memory(GiB)": 91.52, "step": 32910, "token_acc": 0.7478629382787598, "train_speed(iter/s)": 0.171937 }, { "epoch": 0.4270945180994521, "grad_norm": 0.9005681276321411, "learning_rate": 9.28638799168261e-05, "loss": 0.8935918807983398, "memory(GiB)": 91.52, "step": 32915, "token_acc": 0.7682547282738807, "train_speed(iter/s)": 0.171926 }, { "epoch": 0.4271593965011078, "grad_norm": 0.6900621652603149, "learning_rate": 9.286111811835818e-05, "loss": 0.9269016265869141, "memory(GiB)": 91.52, "step": 32920, "token_acc": 0.7654276917585114, "train_speed(iter/s)": 0.171915 }, { "epoch": 0.4272242749027635, "grad_norm": 0.7973517775535583, "learning_rate": 9.285835582664728e-05, "loss": 0.9146608352661133, "memory(GiB)": 91.52, "step": 32925, "token_acc": 0.7579375346516356, "train_speed(iter/s)": 0.171905 }, { "epoch": 0.4272891533044192, "grad_norm": 0.7285781502723694, "learning_rate": 9.28555930417252e-05, "loss": 0.9194406509399414, "memory(GiB)": 91.52, "step": 32930, "token_acc": 0.7577272727272727, "train_speed(iter/s)": 0.171894 }, { "epoch": 0.4273540317060749, "grad_norm": 0.7369920015335083, "learning_rate": 9.285282976362368e-05, "loss": 0.9430593490600586, "memory(GiB)": 91.52, "step": 32935, "token_acc": 0.733270433658315, "train_speed(iter/s)": 0.171881 }, { "epoch": 0.4274189101077306, "grad_norm": 0.8613285422325134, "learning_rate": 9.285006599237458e-05, "loss": 0.9415386199951172, "memory(GiB)": 91.52, "step": 32940, "token_acc": 0.7548778374631779, "train_speed(iter/s)": 0.171871 }, { "epoch": 0.4274837885093863, "grad_norm": 0.8200610876083374, "learning_rate": 9.284730172800966e-05, "loss": 0.9250995635986328, "memory(GiB)": 91.52, "step": 32945, "token_acc": 0.7461538461538462, "train_speed(iter/s)": 0.17186 }, { "epoch": 0.427548666911042, "grad_norm": 0.7937828302383423, "learning_rate": 9.284453697056076e-05, "loss": 0.9497953414916992, "memory(GiB)": 91.52, "step": 32950, "token_acc": 0.7446512920255627, "train_speed(iter/s)": 0.171849 }, { "epoch": 0.4276135453126977, "grad_norm": 0.7983458638191223, "learning_rate": 9.284177172005968e-05, "loss": 0.9122255325317383, "memory(GiB)": 91.52, "step": 32955, "token_acc": 0.7445459258963826, "train_speed(iter/s)": 0.171837 }, { "epoch": 0.4276784237143534, "grad_norm": 0.8479536175727844, "learning_rate": 9.283900597653826e-05, "loss": 0.9313133239746094, "memory(GiB)": 91.52, "step": 32960, "token_acc": 0.7427831715210356, "train_speed(iter/s)": 0.171828 }, { "epoch": 0.4277433021160091, "grad_norm": 0.8065454363822937, "learning_rate": 9.28362397400283e-05, "loss": 0.9365495681762696, "memory(GiB)": 91.52, "step": 32965, "token_acc": 0.7562299104828397, "train_speed(iter/s)": 0.171818 }, { "epoch": 0.4278081805176648, "grad_norm": 0.7006327509880066, "learning_rate": 9.283347301056165e-05, "loss": 0.9375410079956055, "memory(GiB)": 91.52, "step": 32970, "token_acc": 0.7616669488177868, "train_speed(iter/s)": 0.171806 }, { "epoch": 0.4278730589193205, "grad_norm": 0.7716831564903259, "learning_rate": 9.283070578817017e-05, "loss": 0.9225408554077148, "memory(GiB)": 91.52, "step": 32975, "token_acc": 0.7605837939867871, "train_speed(iter/s)": 0.171795 }, { "epoch": 0.4279379373209762, "grad_norm": 0.7677512168884277, "learning_rate": 9.282793807288566e-05, "loss": 0.9343215942382812, "memory(GiB)": 91.52, "step": 32980, "token_acc": 0.7302070219611134, "train_speed(iter/s)": 0.171785 }, { "epoch": 0.4280028157226319, "grad_norm": 0.7428388595581055, "learning_rate": 9.282516986474001e-05, "loss": 0.9475820541381836, "memory(GiB)": 91.52, "step": 32985, "token_acc": 0.7682820855614974, "train_speed(iter/s)": 0.171775 }, { "epoch": 0.4280676941242876, "grad_norm": 0.7648767828941345, "learning_rate": 9.282240116376506e-05, "loss": 0.9352354049682617, "memory(GiB)": 91.52, "step": 32990, "token_acc": 0.734868653173814, "train_speed(iter/s)": 0.171765 }, { "epoch": 0.4281325725259433, "grad_norm": 0.7825779914855957, "learning_rate": 9.281963196999267e-05, "loss": 0.9377377510070801, "memory(GiB)": 91.52, "step": 32995, "token_acc": 0.7454677830360393, "train_speed(iter/s)": 0.171755 }, { "epoch": 0.4281974509275989, "grad_norm": 0.8137017488479614, "learning_rate": 9.281686228345471e-05, "loss": 0.9527253150939942, "memory(GiB)": 91.52, "step": 33000, "token_acc": 0.7591166642452419, "train_speed(iter/s)": 0.171745 }, { "epoch": 0.4282623293292546, "grad_norm": 0.7203220725059509, "learning_rate": 9.281409210418305e-05, "loss": 0.9271066665649415, "memory(GiB)": 91.52, "step": 33005, "token_acc": 0.7386043600709293, "train_speed(iter/s)": 0.171734 }, { "epoch": 0.4283272077309103, "grad_norm": 0.75566166639328, "learning_rate": 9.281132143220959e-05, "loss": 0.8998455047607422, "memory(GiB)": 91.52, "step": 33010, "token_acc": 0.7598577274639537, "train_speed(iter/s)": 0.171724 }, { "epoch": 0.428392086132566, "grad_norm": 0.8329281210899353, "learning_rate": 9.280855026756618e-05, "loss": 0.9166492462158203, "memory(GiB)": 91.52, "step": 33015, "token_acc": 0.755624106538259, "train_speed(iter/s)": 0.171715 }, { "epoch": 0.4284569645342217, "grad_norm": 0.8856369256973267, "learning_rate": 9.280577861028474e-05, "loss": 1.008000087738037, "memory(GiB)": 91.52, "step": 33020, "token_acc": 0.7116758522147377, "train_speed(iter/s)": 0.171704 }, { "epoch": 0.4285218429358774, "grad_norm": 0.8916617631912231, "learning_rate": 9.280300646039716e-05, "loss": 0.9559803009033203, "memory(GiB)": 91.52, "step": 33025, "token_acc": 0.7363018649703629, "train_speed(iter/s)": 0.171693 }, { "epoch": 0.4285867213375331, "grad_norm": 0.7776075601577759, "learning_rate": 9.280023381793533e-05, "loss": 0.9137098312377929, "memory(GiB)": 91.52, "step": 33030, "token_acc": 0.7546226886556722, "train_speed(iter/s)": 0.171681 }, { "epoch": 0.4286515997391888, "grad_norm": 0.7182288765907288, "learning_rate": 9.279746068293116e-05, "loss": 0.918883991241455, "memory(GiB)": 91.52, "step": 33035, "token_acc": 0.7533731688511951, "train_speed(iter/s)": 0.171669 }, { "epoch": 0.4287164781408445, "grad_norm": 0.7669334411621094, "learning_rate": 9.279468705541657e-05, "loss": 0.9829940795898438, "memory(GiB)": 91.52, "step": 33040, "token_acc": 0.7399527902191018, "train_speed(iter/s)": 0.171658 }, { "epoch": 0.4287813565425002, "grad_norm": 0.8492539525032043, "learning_rate": 9.279191293542347e-05, "loss": 0.9146085739135742, "memory(GiB)": 91.52, "step": 33045, "token_acc": 0.7610461054837583, "train_speed(iter/s)": 0.17165 }, { "epoch": 0.4288462349441559, "grad_norm": 0.7846406698226929, "learning_rate": 9.27891383229838e-05, "loss": 0.9238364219665527, "memory(GiB)": 91.52, "step": 33050, "token_acc": 0.742636110571632, "train_speed(iter/s)": 0.171639 }, { "epoch": 0.4289111133458116, "grad_norm": 0.7718328237533569, "learning_rate": 9.278636321812946e-05, "loss": 0.8851064682006836, "memory(GiB)": 91.52, "step": 33055, "token_acc": 0.7627655869035179, "train_speed(iter/s)": 0.171627 }, { "epoch": 0.4289759917474673, "grad_norm": 0.8559707403182983, "learning_rate": 9.278358762089244e-05, "loss": 0.961580467224121, "memory(GiB)": 91.52, "step": 33060, "token_acc": 0.7317760046214391, "train_speed(iter/s)": 0.171616 }, { "epoch": 0.429040870149123, "grad_norm": 0.7684522271156311, "learning_rate": 9.278081153130463e-05, "loss": 0.9548014640808106, "memory(GiB)": 91.52, "step": 33065, "token_acc": 0.7487519469627382, "train_speed(iter/s)": 0.171606 }, { "epoch": 0.4291057485507787, "grad_norm": 0.7913159132003784, "learning_rate": 9.277803494939797e-05, "loss": 0.9631150245666504, "memory(GiB)": 91.52, "step": 33070, "token_acc": 0.7484644844280656, "train_speed(iter/s)": 0.171595 }, { "epoch": 0.4291706269524344, "grad_norm": 0.7779655456542969, "learning_rate": 9.277525787520447e-05, "loss": 0.9246139526367188, "memory(GiB)": 91.52, "step": 33075, "token_acc": 0.76689112260499, "train_speed(iter/s)": 0.171583 }, { "epoch": 0.4292355053540901, "grad_norm": 0.7007750868797302, "learning_rate": 9.277248030875604e-05, "loss": 0.967263126373291, "memory(GiB)": 91.52, "step": 33080, "token_acc": 0.7497551020408163, "train_speed(iter/s)": 0.171573 }, { "epoch": 0.4293003837557458, "grad_norm": 0.7615530490875244, "learning_rate": 9.276970225008465e-05, "loss": 0.9412984848022461, "memory(GiB)": 91.52, "step": 33085, "token_acc": 0.738684794477123, "train_speed(iter/s)": 0.171562 }, { "epoch": 0.4293652621574015, "grad_norm": 0.8424212336540222, "learning_rate": 9.276692369922231e-05, "loss": 0.8818740844726562, "memory(GiB)": 91.52, "step": 33090, "token_acc": 0.768987936412492, "train_speed(iter/s)": 0.17155 }, { "epoch": 0.4294301405590572, "grad_norm": 0.8271918296813965, "learning_rate": 9.276414465620094e-05, "loss": 0.9410987854003906, "memory(GiB)": 91.52, "step": 33095, "token_acc": 0.7526921648718901, "train_speed(iter/s)": 0.17154 }, { "epoch": 0.4294950189607129, "grad_norm": 0.7461459636688232, "learning_rate": 9.276136512105256e-05, "loss": 0.9658568382263184, "memory(GiB)": 91.52, "step": 33100, "token_acc": 0.7464143803216651, "train_speed(iter/s)": 0.17153 }, { "epoch": 0.4295598973623686, "grad_norm": 0.7578951716423035, "learning_rate": 9.275858509380913e-05, "loss": 0.9780394554138183, "memory(GiB)": 91.52, "step": 33105, "token_acc": 0.7508084207747641, "train_speed(iter/s)": 0.171519 }, { "epoch": 0.4296247757640243, "grad_norm": 0.7876623868942261, "learning_rate": 9.275580457450266e-05, "loss": 0.9446338653564453, "memory(GiB)": 91.52, "step": 33110, "token_acc": 0.7488414572269391, "train_speed(iter/s)": 0.171509 }, { "epoch": 0.42968965416568, "grad_norm": 0.7834365367889404, "learning_rate": 9.275302356316514e-05, "loss": 0.9573583602905273, "memory(GiB)": 91.52, "step": 33115, "token_acc": 0.7514084507042254, "train_speed(iter/s)": 0.171499 }, { "epoch": 0.42975453256733565, "grad_norm": 0.7602387070655823, "learning_rate": 9.275024205982858e-05, "loss": 0.9486506462097168, "memory(GiB)": 91.52, "step": 33120, "token_acc": 0.7497052069757338, "train_speed(iter/s)": 0.17149 }, { "epoch": 0.42981941096899134, "grad_norm": 0.765941858291626, "learning_rate": 9.2747460064525e-05, "loss": 0.9165376663208008, "memory(GiB)": 91.52, "step": 33125, "token_acc": 0.763268108479231, "train_speed(iter/s)": 0.171479 }, { "epoch": 0.42988428937064704, "grad_norm": 0.7649083733558655, "learning_rate": 9.274467757728637e-05, "loss": 0.8987583160400391, "memory(GiB)": 91.52, "step": 33130, "token_acc": 0.7671906830149517, "train_speed(iter/s)": 0.171468 }, { "epoch": 0.42994916777230274, "grad_norm": 0.8070810437202454, "learning_rate": 9.274189459814475e-05, "loss": 0.8973822593688965, "memory(GiB)": 91.52, "step": 33135, "token_acc": 0.7450965824665676, "train_speed(iter/s)": 0.171456 }, { "epoch": 0.43001404617395844, "grad_norm": 0.7292694449424744, "learning_rate": 9.273911112713218e-05, "loss": 0.9164716720581054, "memory(GiB)": 91.52, "step": 33140, "token_acc": 0.7510551418865594, "train_speed(iter/s)": 0.171445 }, { "epoch": 0.43007892457561414, "grad_norm": 0.7742223739624023, "learning_rate": 9.273632716428064e-05, "loss": 0.9052474975585938, "memory(GiB)": 91.52, "step": 33145, "token_acc": 0.7358530704864559, "train_speed(iter/s)": 0.171435 }, { "epoch": 0.43014380297726984, "grad_norm": 0.8973662257194519, "learning_rate": 9.273354270962222e-05, "loss": 0.9609600067138672, "memory(GiB)": 91.52, "step": 33150, "token_acc": 0.7308248914616498, "train_speed(iter/s)": 0.171424 }, { "epoch": 0.43020868137892554, "grad_norm": 0.8487192392349243, "learning_rate": 9.273075776318894e-05, "loss": 0.9401692390441895, "memory(GiB)": 91.52, "step": 33155, "token_acc": 0.754235956115809, "train_speed(iter/s)": 0.171414 }, { "epoch": 0.43027355978058124, "grad_norm": 0.8267116546630859, "learning_rate": 9.272797232501283e-05, "loss": 0.9222957611083984, "memory(GiB)": 91.52, "step": 33160, "token_acc": 0.7658057191513518, "train_speed(iter/s)": 0.171402 }, { "epoch": 0.43033843818223694, "grad_norm": 0.8687804341316223, "learning_rate": 9.272518639512598e-05, "loss": 0.9555642127990722, "memory(GiB)": 91.52, "step": 33165, "token_acc": 0.7396806266947876, "train_speed(iter/s)": 0.171392 }, { "epoch": 0.43040331658389264, "grad_norm": 0.7040562033653259, "learning_rate": 9.272239997356044e-05, "loss": 0.9464986801147461, "memory(GiB)": 91.52, "step": 33170, "token_acc": 0.7372348322068992, "train_speed(iter/s)": 0.17138 }, { "epoch": 0.43046819498554834, "grad_norm": 0.7169488668441772, "learning_rate": 9.271961306034826e-05, "loss": 0.8894092559814453, "memory(GiB)": 91.52, "step": 33175, "token_acc": 0.7581168831168831, "train_speed(iter/s)": 0.171369 }, { "epoch": 0.43053307338720403, "grad_norm": 0.9826798439025879, "learning_rate": 9.271682565552152e-05, "loss": 0.8893760681152344, "memory(GiB)": 91.52, "step": 33180, "token_acc": 0.7771420397654126, "train_speed(iter/s)": 0.171358 }, { "epoch": 0.43059795178885973, "grad_norm": 0.6887859106063843, "learning_rate": 9.27140377591123e-05, "loss": 0.924889087677002, "memory(GiB)": 91.52, "step": 33185, "token_acc": 0.7628659577261389, "train_speed(iter/s)": 0.171347 }, { "epoch": 0.43066283019051543, "grad_norm": 0.8781493306159973, "learning_rate": 9.27112493711527e-05, "loss": 0.9462767601013183, "memory(GiB)": 91.52, "step": 33190, "token_acc": 0.7500362897372623, "train_speed(iter/s)": 0.171337 }, { "epoch": 0.43072770859217113, "grad_norm": 0.7936499714851379, "learning_rate": 9.270846049167476e-05, "loss": 0.9930133819580078, "memory(GiB)": 91.52, "step": 33195, "token_acc": 0.7252629179826132, "train_speed(iter/s)": 0.171327 }, { "epoch": 0.43079258699382683, "grad_norm": 0.7454954385757446, "learning_rate": 9.270567112071064e-05, "loss": 0.973147964477539, "memory(GiB)": 91.52, "step": 33200, "token_acc": 0.7395702893297305, "train_speed(iter/s)": 0.171316 }, { "epoch": 0.43085746539548253, "grad_norm": 0.7807418704032898, "learning_rate": 9.270288125829238e-05, "loss": 0.9279993057250977, "memory(GiB)": 91.52, "step": 33205, "token_acc": 0.7574747218497445, "train_speed(iter/s)": 0.171304 }, { "epoch": 0.43092234379713823, "grad_norm": 0.7568601965904236, "learning_rate": 9.270009090445211e-05, "loss": 0.9129487991333007, "memory(GiB)": 91.52, "step": 33210, "token_acc": 0.7642907003800512, "train_speed(iter/s)": 0.171293 }, { "epoch": 0.43098722219879393, "grad_norm": 0.7769526839256287, "learning_rate": 9.269730005922197e-05, "loss": 0.9266579627990723, "memory(GiB)": 91.52, "step": 33215, "token_acc": 0.7475824846596203, "train_speed(iter/s)": 0.171282 }, { "epoch": 0.43105210060044963, "grad_norm": 0.7798435688018799, "learning_rate": 9.269450872263402e-05, "loss": 0.9577569007873535, "memory(GiB)": 91.52, "step": 33220, "token_acc": 0.762293956043956, "train_speed(iter/s)": 0.171272 }, { "epoch": 0.4311169790021053, "grad_norm": 0.7472542524337769, "learning_rate": 9.269171689472042e-05, "loss": 0.964320182800293, "memory(GiB)": 91.52, "step": 33225, "token_acc": 0.748662484123013, "train_speed(iter/s)": 0.171262 }, { "epoch": 0.431181857403761, "grad_norm": 0.7854889631271362, "learning_rate": 9.26889245755133e-05, "loss": 0.9592438697814941, "memory(GiB)": 91.52, "step": 33230, "token_acc": 0.7371844728841123, "train_speed(iter/s)": 0.171253 }, { "epoch": 0.4312467358054167, "grad_norm": 0.8241093754768372, "learning_rate": 9.268613176504477e-05, "loss": 0.9440121650695801, "memory(GiB)": 91.52, "step": 33235, "token_acc": 0.7373391352652532, "train_speed(iter/s)": 0.171245 }, { "epoch": 0.43131161420707237, "grad_norm": 0.7626962661743164, "learning_rate": 9.268333846334698e-05, "loss": 0.9286535263061524, "memory(GiB)": 91.52, "step": 33240, "token_acc": 0.7519914318227459, "train_speed(iter/s)": 0.171235 }, { "epoch": 0.43137649260872807, "grad_norm": 0.7615876793861389, "learning_rate": 9.268054467045209e-05, "loss": 0.9134188652038574, "memory(GiB)": 91.52, "step": 33245, "token_acc": 0.7364813112745098, "train_speed(iter/s)": 0.171225 }, { "epoch": 0.43144137101038377, "grad_norm": 0.7644191980361938, "learning_rate": 9.267775038639223e-05, "loss": 0.9515879631042481, "memory(GiB)": 91.52, "step": 33250, "token_acc": 0.7402419538918055, "train_speed(iter/s)": 0.171216 }, { "epoch": 0.43150624941203947, "grad_norm": 0.7774769067764282, "learning_rate": 9.267495561119957e-05, "loss": 0.930174732208252, "memory(GiB)": 91.52, "step": 33255, "token_acc": 0.7624684946154434, "train_speed(iter/s)": 0.171206 }, { "epoch": 0.43157112781369517, "grad_norm": 0.7992874383926392, "learning_rate": 9.267216034490628e-05, "loss": 0.948579978942871, "memory(GiB)": 91.52, "step": 33260, "token_acc": 0.740770901194354, "train_speed(iter/s)": 0.171196 }, { "epoch": 0.43163600621535086, "grad_norm": 0.7245952486991882, "learning_rate": 9.26693645875445e-05, "loss": 0.9860145568847656, "memory(GiB)": 91.52, "step": 33265, "token_acc": 0.7250899126562768, "train_speed(iter/s)": 0.171186 }, { "epoch": 0.43170088461700656, "grad_norm": 0.7371408939361572, "learning_rate": 9.266656833914643e-05, "loss": 0.9305960655212402, "memory(GiB)": 91.52, "step": 33270, "token_acc": 0.7551005900334116, "train_speed(iter/s)": 0.171175 }, { "epoch": 0.43176576301866226, "grad_norm": 0.7913194298744202, "learning_rate": 9.266377159974423e-05, "loss": 0.9349349975585938, "memory(GiB)": 91.52, "step": 33275, "token_acc": 0.7382612126792055, "train_speed(iter/s)": 0.171166 }, { "epoch": 0.43183064142031796, "grad_norm": 0.769270658493042, "learning_rate": 9.266097436937008e-05, "loss": 0.9641519546508789, "memory(GiB)": 91.52, "step": 33280, "token_acc": 0.7582772439014901, "train_speed(iter/s)": 0.171157 }, { "epoch": 0.43189551982197366, "grad_norm": 0.8203083872795105, "learning_rate": 9.265817664805619e-05, "loss": 0.9293632507324219, "memory(GiB)": 91.52, "step": 33285, "token_acc": 0.7727594823406556, "train_speed(iter/s)": 0.171146 }, { "epoch": 0.43196039822362936, "grad_norm": 0.7834833860397339, "learning_rate": 9.265537843583477e-05, "loss": 0.9194789886474609, "memory(GiB)": 91.52, "step": 33290, "token_acc": 0.746741355165356, "train_speed(iter/s)": 0.171137 }, { "epoch": 0.43202527662528506, "grad_norm": 0.7471171617507935, "learning_rate": 9.265257973273796e-05, "loss": 0.9480162620544433, "memory(GiB)": 91.52, "step": 33295, "token_acc": 0.7624199241730946, "train_speed(iter/s)": 0.171125 }, { "epoch": 0.43209015502694076, "grad_norm": 0.7775827646255493, "learning_rate": 9.264978053879802e-05, "loss": 0.9593620300292969, "memory(GiB)": 91.52, "step": 33300, "token_acc": 0.7367845307422541, "train_speed(iter/s)": 0.171115 }, { "epoch": 0.43215503342859646, "grad_norm": 0.762872040271759, "learning_rate": 9.264698085404717e-05, "loss": 0.9416796684265136, "memory(GiB)": 91.52, "step": 33305, "token_acc": 0.7489503751118453, "train_speed(iter/s)": 0.171104 }, { "epoch": 0.43221991183025216, "grad_norm": 0.8034043312072754, "learning_rate": 9.264418067851761e-05, "loss": 0.9148475646972656, "memory(GiB)": 91.52, "step": 33310, "token_acc": 0.7399816106045514, "train_speed(iter/s)": 0.171095 }, { "epoch": 0.43228479023190786, "grad_norm": 0.8265343308448792, "learning_rate": 9.264138001224155e-05, "loss": 0.9586256980895996, "memory(GiB)": 91.52, "step": 33315, "token_acc": 0.7451858838324295, "train_speed(iter/s)": 0.171084 }, { "epoch": 0.43234966863356356, "grad_norm": 0.8715009689331055, "learning_rate": 9.263857885525122e-05, "loss": 0.9544194221496582, "memory(GiB)": 91.52, "step": 33320, "token_acc": 0.7503757461244471, "train_speed(iter/s)": 0.171076 }, { "epoch": 0.43241454703521925, "grad_norm": 0.7855721116065979, "learning_rate": 9.26357772075789e-05, "loss": 0.9579656600952149, "memory(GiB)": 91.52, "step": 33325, "token_acc": 0.7434260212838998, "train_speed(iter/s)": 0.171065 }, { "epoch": 0.43247942543687495, "grad_norm": 0.736732542514801, "learning_rate": 9.263297506925677e-05, "loss": 0.9133920669555664, "memory(GiB)": 91.52, "step": 33330, "token_acc": 0.7663802552950745, "train_speed(iter/s)": 0.171054 }, { "epoch": 0.43254430383853065, "grad_norm": 0.8188751339912415, "learning_rate": 9.263017244031712e-05, "loss": 0.9167379379272461, "memory(GiB)": 91.52, "step": 33335, "token_acc": 0.752196098695259, "train_speed(iter/s)": 0.171044 }, { "epoch": 0.43260918224018635, "grad_norm": 0.8121152520179749, "learning_rate": 9.26273693207922e-05, "loss": 0.9575944900512695, "memory(GiB)": 91.52, "step": 33340, "token_acc": 0.7445001692255623, "train_speed(iter/s)": 0.171034 }, { "epoch": 0.43267406064184205, "grad_norm": 0.7603461742401123, "learning_rate": 9.262456571071425e-05, "loss": 0.9568965911865235, "memory(GiB)": 91.52, "step": 33345, "token_acc": 0.7401555053925257, "train_speed(iter/s)": 0.171023 }, { "epoch": 0.43273893904349775, "grad_norm": 0.7650533318519592, "learning_rate": 9.262176161011553e-05, "loss": 0.9564571380615234, "memory(GiB)": 91.52, "step": 33350, "token_acc": 0.7576149425287356, "train_speed(iter/s)": 0.171014 }, { "epoch": 0.4328038174451534, "grad_norm": 0.7878081202507019, "learning_rate": 9.261895701902834e-05, "loss": 0.9065903663635254, "memory(GiB)": 91.52, "step": 33355, "token_acc": 0.7512786187484982, "train_speed(iter/s)": 0.171003 }, { "epoch": 0.4328686958468091, "grad_norm": 0.8193385004997253, "learning_rate": 9.261615193748491e-05, "loss": 0.9432729721069336, "memory(GiB)": 91.52, "step": 33360, "token_acc": 0.7323484267075978, "train_speed(iter/s)": 0.170994 }, { "epoch": 0.4329335742484648, "grad_norm": 0.8093302249908447, "learning_rate": 9.261334636551757e-05, "loss": 0.9259601593017578, "memory(GiB)": 91.52, "step": 33365, "token_acc": 0.7618187181945572, "train_speed(iter/s)": 0.170984 }, { "epoch": 0.4329984526501205, "grad_norm": 0.7872223854064941, "learning_rate": 9.261054030315857e-05, "loss": 0.9133127212524415, "memory(GiB)": 91.52, "step": 33370, "token_acc": 0.7489405331510595, "train_speed(iter/s)": 0.170974 }, { "epoch": 0.4330633310517762, "grad_norm": 0.8115139007568359, "learning_rate": 9.260773375044023e-05, "loss": 0.9576563835144043, "memory(GiB)": 91.52, "step": 33375, "token_acc": 0.7407042309057214, "train_speed(iter/s)": 0.170964 }, { "epoch": 0.4331282094534319, "grad_norm": 0.7652854323387146, "learning_rate": 9.260492670739482e-05, "loss": 0.9549823760986328, "memory(GiB)": 91.52, "step": 33380, "token_acc": 0.7517608914396452, "train_speed(iter/s)": 0.170954 }, { "epoch": 0.4331930878550876, "grad_norm": 0.827589213848114, "learning_rate": 9.260211917405466e-05, "loss": 0.9374238967895507, "memory(GiB)": 91.52, "step": 33385, "token_acc": 0.7652972900992077, "train_speed(iter/s)": 0.170944 }, { "epoch": 0.4332579662567433, "grad_norm": 0.8100990056991577, "learning_rate": 9.259931115045204e-05, "loss": 0.9261740684509278, "memory(GiB)": 91.52, "step": 33390, "token_acc": 0.7702878590671066, "train_speed(iter/s)": 0.170934 }, { "epoch": 0.433322844658399, "grad_norm": 0.7784375548362732, "learning_rate": 9.25965026366193e-05, "loss": 0.9641585350036621, "memory(GiB)": 91.52, "step": 33395, "token_acc": 0.7305072804133396, "train_speed(iter/s)": 0.170922 }, { "epoch": 0.4333877230600547, "grad_norm": 0.8001490831375122, "learning_rate": 9.259369363258874e-05, "loss": 0.9323354721069336, "memory(GiB)": 91.52, "step": 33400, "token_acc": 0.7504870920603994, "train_speed(iter/s)": 0.170912 }, { "epoch": 0.4334526014617104, "grad_norm": 0.753176212310791, "learning_rate": 9.259088413839271e-05, "loss": 0.9327160835266113, "memory(GiB)": 91.52, "step": 33405, "token_acc": 0.7518386449743704, "train_speed(iter/s)": 0.170902 }, { "epoch": 0.4335174798633661, "grad_norm": 0.7699146270751953, "learning_rate": 9.258807415406352e-05, "loss": 0.973979377746582, "memory(GiB)": 91.52, "step": 33410, "token_acc": 0.7422353057916846, "train_speed(iter/s)": 0.170892 }, { "epoch": 0.4335823582650218, "grad_norm": 0.8315110206604004, "learning_rate": 9.258526367963352e-05, "loss": 0.95841064453125, "memory(GiB)": 91.52, "step": 33415, "token_acc": 0.7461892945763914, "train_speed(iter/s)": 0.170883 }, { "epoch": 0.4336472366666775, "grad_norm": 0.7559326887130737, "learning_rate": 9.258245271513504e-05, "loss": 0.9430898666381836, "memory(GiB)": 91.52, "step": 33420, "token_acc": 0.7551187551187551, "train_speed(iter/s)": 0.170873 }, { "epoch": 0.4337121150683332, "grad_norm": 0.7961052656173706, "learning_rate": 9.257964126060043e-05, "loss": 0.9589414596557617, "memory(GiB)": 91.52, "step": 33425, "token_acc": 0.7649685320632761, "train_speed(iter/s)": 0.170864 }, { "epoch": 0.4337769934699889, "grad_norm": 0.7073022127151489, "learning_rate": 9.257682931606205e-05, "loss": 0.9382081985473633, "memory(GiB)": 91.52, "step": 33430, "token_acc": 0.7569678982021628, "train_speed(iter/s)": 0.170854 }, { "epoch": 0.4338418718716446, "grad_norm": 0.8655390739440918, "learning_rate": 9.257401688155227e-05, "loss": 0.8960741996765137, "memory(GiB)": 91.52, "step": 33435, "token_acc": 0.7604800064430395, "train_speed(iter/s)": 0.170844 }, { "epoch": 0.4339067502733003, "grad_norm": 0.7031536102294922, "learning_rate": 9.257120395710344e-05, "loss": 0.9029474258422852, "memory(GiB)": 91.52, "step": 33440, "token_acc": 0.7578084997439836, "train_speed(iter/s)": 0.170833 }, { "epoch": 0.433971628674956, "grad_norm": 0.801712155342102, "learning_rate": 9.256839054274792e-05, "loss": 0.9689216613769531, "memory(GiB)": 91.52, "step": 33445, "token_acc": 0.7174571140262361, "train_speed(iter/s)": 0.170822 }, { "epoch": 0.4340365070766117, "grad_norm": 0.7722235918045044, "learning_rate": 9.256557663851811e-05, "loss": 0.8982349395751953, "memory(GiB)": 91.52, "step": 33450, "token_acc": 0.7535058756701104, "train_speed(iter/s)": 0.170813 }, { "epoch": 0.4341013854782674, "grad_norm": 0.7012496590614319, "learning_rate": 9.25627622444464e-05, "loss": 0.930377197265625, "memory(GiB)": 91.52, "step": 33455, "token_acc": 0.7504413267676299, "train_speed(iter/s)": 0.170803 }, { "epoch": 0.4341662638799231, "grad_norm": 0.7985218167304993, "learning_rate": 9.255994736056514e-05, "loss": 0.9213797569274902, "memory(GiB)": 91.52, "step": 33460, "token_acc": 0.7432930453734892, "train_speed(iter/s)": 0.17079 }, { "epoch": 0.4342311422815788, "grad_norm": 0.9039619565010071, "learning_rate": 9.255713198690674e-05, "loss": 0.9286369323730469, "memory(GiB)": 91.52, "step": 33465, "token_acc": 0.7385039067503523, "train_speed(iter/s)": 0.170781 }, { "epoch": 0.4342960206832345, "grad_norm": 0.7938376069068909, "learning_rate": 9.255431612350363e-05, "loss": 0.9342395782470703, "memory(GiB)": 91.52, "step": 33470, "token_acc": 0.7471382593981221, "train_speed(iter/s)": 0.17077 }, { "epoch": 0.4343608990848901, "grad_norm": 0.8182578682899475, "learning_rate": 9.255149977038816e-05, "loss": 0.9519638061523438, "memory(GiB)": 91.52, "step": 33475, "token_acc": 0.7228511169457238, "train_speed(iter/s)": 0.170758 }, { "epoch": 0.4344257774865458, "grad_norm": 0.7528185248374939, "learning_rate": 9.254868292759278e-05, "loss": 0.913233757019043, "memory(GiB)": 91.52, "step": 33480, "token_acc": 0.7397911030013213, "train_speed(iter/s)": 0.170747 }, { "epoch": 0.4344906558882015, "grad_norm": 0.6631625294685364, "learning_rate": 9.254586559514991e-05, "loss": 0.9094910621643066, "memory(GiB)": 91.52, "step": 33485, "token_acc": 0.7492153070606512, "train_speed(iter/s)": 0.170736 }, { "epoch": 0.4345555342898572, "grad_norm": 0.7264717817306519, "learning_rate": 9.254304777309194e-05, "loss": 0.9035477638244629, "memory(GiB)": 91.52, "step": 33490, "token_acc": 0.7636634888290518, "train_speed(iter/s)": 0.170727 }, { "epoch": 0.4346204126915129, "grad_norm": 0.7620152235031128, "learning_rate": 9.254022946145132e-05, "loss": 0.8984287261962891, "memory(GiB)": 91.52, "step": 33495, "token_acc": 0.7600786248361983, "train_speed(iter/s)": 0.170716 }, { "epoch": 0.4346852910931686, "grad_norm": 0.7117815613746643, "learning_rate": 9.253741066026047e-05, "loss": 0.9458907127380372, "memory(GiB)": 91.52, "step": 33500, "token_acc": 0.7270251872021783, "train_speed(iter/s)": 0.170705 }, { "epoch": 0.4347501694948243, "grad_norm": 0.7876895666122437, "learning_rate": 9.253459136955184e-05, "loss": 0.9295398712158203, "memory(GiB)": 91.52, "step": 33505, "token_acc": 0.7369164900029097, "train_speed(iter/s)": 0.170696 }, { "epoch": 0.43481504789648, "grad_norm": 0.7041345834732056, "learning_rate": 9.253177158935789e-05, "loss": 0.8922348022460938, "memory(GiB)": 91.52, "step": 33510, "token_acc": 0.7352226454748456, "train_speed(iter/s)": 0.170686 }, { "epoch": 0.4348799262981357, "grad_norm": 0.6881921291351318, "learning_rate": 9.252895131971103e-05, "loss": 0.904025650024414, "memory(GiB)": 91.52, "step": 33515, "token_acc": 0.7530521642619312, "train_speed(iter/s)": 0.170676 }, { "epoch": 0.4349448046997914, "grad_norm": 0.7232113480567932, "learning_rate": 9.252613056064375e-05, "loss": 0.9197210311889649, "memory(GiB)": 91.52, "step": 33520, "token_acc": 0.7474302496328928, "train_speed(iter/s)": 0.170666 }, { "epoch": 0.4350096831014471, "grad_norm": 0.6826912760734558, "learning_rate": 9.252330931218846e-05, "loss": 0.9649792671203613, "memory(GiB)": 91.52, "step": 33525, "token_acc": 0.7407933016391116, "train_speed(iter/s)": 0.170657 }, { "epoch": 0.4350745615031028, "grad_norm": 0.8513585925102234, "learning_rate": 9.25204875743777e-05, "loss": 0.9252571105957031, "memory(GiB)": 91.52, "step": 33530, "token_acc": 0.7702119071644803, "train_speed(iter/s)": 0.170648 }, { "epoch": 0.4351394399047585, "grad_norm": 0.792766273021698, "learning_rate": 9.251766534724389e-05, "loss": 0.9171867370605469, "memory(GiB)": 91.52, "step": 33535, "token_acc": 0.7723818734457032, "train_speed(iter/s)": 0.170638 }, { "epoch": 0.4352043183064142, "grad_norm": 0.752438485622406, "learning_rate": 9.251484263081953e-05, "loss": 0.9457172393798828, "memory(GiB)": 91.52, "step": 33540, "token_acc": 0.7578460072162843, "train_speed(iter/s)": 0.170629 }, { "epoch": 0.4352691967080699, "grad_norm": 0.7345693111419678, "learning_rate": 9.25120194251371e-05, "loss": 0.935662841796875, "memory(GiB)": 91.52, "step": 33545, "token_acc": 0.7499294084427502, "train_speed(iter/s)": 0.170619 }, { "epoch": 0.4353340751097256, "grad_norm": 0.7821974754333496, "learning_rate": 9.250919573022907e-05, "loss": 0.9674261093139649, "memory(GiB)": 91.52, "step": 33550, "token_acc": 0.7498184082183252, "train_speed(iter/s)": 0.170608 }, { "epoch": 0.4353989535113813, "grad_norm": 0.7714020013809204, "learning_rate": 9.250637154612797e-05, "loss": 0.932734489440918, "memory(GiB)": 91.52, "step": 33555, "token_acc": 0.7283979293867799, "train_speed(iter/s)": 0.170597 }, { "epoch": 0.435463831913037, "grad_norm": 0.8173750638961792, "learning_rate": 9.250354687286627e-05, "loss": 0.982239818572998, "memory(GiB)": 91.52, "step": 33560, "token_acc": 0.752069076708108, "train_speed(iter/s)": 0.170586 }, { "epoch": 0.4355287103146927, "grad_norm": 0.8627483248710632, "learning_rate": 9.25007217104765e-05, "loss": 0.8854719161987304, "memory(GiB)": 91.52, "step": 33565, "token_acc": 0.7801566228615602, "train_speed(iter/s)": 0.170576 }, { "epoch": 0.4355935887163484, "grad_norm": 0.706807553768158, "learning_rate": 9.249789605899115e-05, "loss": 0.9331052780151368, "memory(GiB)": 91.52, "step": 33570, "token_acc": 0.7564579606440072, "train_speed(iter/s)": 0.170566 }, { "epoch": 0.4356584671180041, "grad_norm": 0.7580415606498718, "learning_rate": 9.249506991844274e-05, "loss": 0.9259419441223145, "memory(GiB)": 91.52, "step": 33575, "token_acc": 0.7503461696792785, "train_speed(iter/s)": 0.170555 }, { "epoch": 0.4357233455196598, "grad_norm": 0.7696175575256348, "learning_rate": 9.249224328886381e-05, "loss": 0.9652099609375, "memory(GiB)": 91.52, "step": 33580, "token_acc": 0.7515426072014372, "train_speed(iter/s)": 0.170547 }, { "epoch": 0.4357882239213155, "grad_norm": 0.7766745090484619, "learning_rate": 9.248941617028689e-05, "loss": 0.9273707389831543, "memory(GiB)": 91.52, "step": 33585, "token_acc": 0.719859235948973, "train_speed(iter/s)": 0.170536 }, { "epoch": 0.4358531023229712, "grad_norm": 0.8351068496704102, "learning_rate": 9.248658856274448e-05, "loss": 0.887753677368164, "memory(GiB)": 91.52, "step": 33590, "token_acc": 0.7659396243435678, "train_speed(iter/s)": 0.170525 }, { "epoch": 0.43591798072462684, "grad_norm": 0.7287572026252747, "learning_rate": 9.248376046626917e-05, "loss": 0.9394326210021973, "memory(GiB)": 91.52, "step": 33595, "token_acc": 0.7376239322103674, "train_speed(iter/s)": 0.170515 }, { "epoch": 0.43598285912628254, "grad_norm": 0.7971779704093933, "learning_rate": 9.248093188089346e-05, "loss": 0.9480690956115723, "memory(GiB)": 91.52, "step": 33600, "token_acc": 0.7526417081230256, "train_speed(iter/s)": 0.170506 }, { "epoch": 0.43604773752793824, "grad_norm": 0.819849967956543, "learning_rate": 9.247810280664993e-05, "loss": 0.9429159164428711, "memory(GiB)": 91.52, "step": 33605, "token_acc": 0.7534100303113805, "train_speed(iter/s)": 0.170498 }, { "epoch": 0.43611261592959394, "grad_norm": 0.7532683610916138, "learning_rate": 9.24752732435711e-05, "loss": 0.9160734176635742, "memory(GiB)": 91.52, "step": 33610, "token_acc": 0.7472634577897735, "train_speed(iter/s)": 0.170488 }, { "epoch": 0.43617749433124964, "grad_norm": 0.8487735986709595, "learning_rate": 9.24724431916896e-05, "loss": 0.9304521560668946, "memory(GiB)": 91.52, "step": 33615, "token_acc": 0.7459352801894238, "train_speed(iter/s)": 0.170477 }, { "epoch": 0.43624237273290534, "grad_norm": 0.7987513542175293, "learning_rate": 9.246961265103793e-05, "loss": 0.9424533843994141, "memory(GiB)": 91.52, "step": 33620, "token_acc": 0.7275023956790662, "train_speed(iter/s)": 0.170466 }, { "epoch": 0.43630725113456104, "grad_norm": 0.8336693644523621, "learning_rate": 9.24667816216487e-05, "loss": 0.9681585311889649, "memory(GiB)": 91.52, "step": 33625, "token_acc": 0.732348305752561, "train_speed(iter/s)": 0.170456 }, { "epoch": 0.43637212953621674, "grad_norm": 0.8654475808143616, "learning_rate": 9.246395010355448e-05, "loss": 0.9230144500732422, "memory(GiB)": 91.52, "step": 33630, "token_acc": 0.7641087472288126, "train_speed(iter/s)": 0.170447 }, { "epoch": 0.43643700793787243, "grad_norm": 0.896148681640625, "learning_rate": 9.246111809678785e-05, "loss": 0.9368474960327149, "memory(GiB)": 91.52, "step": 33635, "token_acc": 0.7590400944169426, "train_speed(iter/s)": 0.170438 }, { "epoch": 0.43650188633952813, "grad_norm": 0.8439935445785522, "learning_rate": 9.245828560138141e-05, "loss": 0.9499959945678711, "memory(GiB)": 91.52, "step": 33640, "token_acc": 0.7377429538625385, "train_speed(iter/s)": 0.170428 }, { "epoch": 0.43656676474118383, "grad_norm": 0.8128577470779419, "learning_rate": 9.245545261736775e-05, "loss": 0.9360135078430176, "memory(GiB)": 91.52, "step": 33645, "token_acc": 0.7566577082824334, "train_speed(iter/s)": 0.170416 }, { "epoch": 0.43663164314283953, "grad_norm": 0.7842325568199158, "learning_rate": 9.245261914477947e-05, "loss": 0.928325080871582, "memory(GiB)": 91.52, "step": 33650, "token_acc": 0.7546922220454907, "train_speed(iter/s)": 0.170406 }, { "epoch": 0.43669652154449523, "grad_norm": 0.7562484741210938, "learning_rate": 9.24497851836492e-05, "loss": 0.917175006866455, "memory(GiB)": 91.52, "step": 33655, "token_acc": 0.7470858697059967, "train_speed(iter/s)": 0.170396 }, { "epoch": 0.43676139994615093, "grad_norm": 0.8057266473770142, "learning_rate": 9.24469507340095e-05, "loss": 0.954989242553711, "memory(GiB)": 91.52, "step": 33660, "token_acc": 0.7480218886341788, "train_speed(iter/s)": 0.170386 }, { "epoch": 0.43682627834780663, "grad_norm": 0.795674204826355, "learning_rate": 9.244411579589305e-05, "loss": 0.9376821517944336, "memory(GiB)": 91.52, "step": 33665, "token_acc": 0.7610399003495801, "train_speed(iter/s)": 0.170376 }, { "epoch": 0.43689115674946233, "grad_norm": 0.8353824019432068, "learning_rate": 9.244128036933244e-05, "loss": 0.9504529953002929, "memory(GiB)": 91.52, "step": 33670, "token_acc": 0.7483420992453693, "train_speed(iter/s)": 0.170366 }, { "epoch": 0.436956035151118, "grad_norm": 0.7877737283706665, "learning_rate": 9.243844445436032e-05, "loss": 0.986359977722168, "memory(GiB)": 91.52, "step": 33675, "token_acc": 0.738637909919717, "train_speed(iter/s)": 0.170357 }, { "epoch": 0.4370209135527737, "grad_norm": 0.8267471194267273, "learning_rate": 9.243560805100929e-05, "loss": 0.9118124961853027, "memory(GiB)": 91.52, "step": 33680, "token_acc": 0.7511128619263344, "train_speed(iter/s)": 0.170348 }, { "epoch": 0.4370857919544294, "grad_norm": 0.7071099877357483, "learning_rate": 9.243277115931202e-05, "loss": 0.921965217590332, "memory(GiB)": 91.52, "step": 33685, "token_acc": 0.7316757051741192, "train_speed(iter/s)": 0.170337 }, { "epoch": 0.4371506703560851, "grad_norm": 0.7699211239814758, "learning_rate": 9.242993377930116e-05, "loss": 0.9340574264526367, "memory(GiB)": 91.52, "step": 33690, "token_acc": 0.7435621267012631, "train_speed(iter/s)": 0.170326 }, { "epoch": 0.4372155487577408, "grad_norm": 0.7771443128585815, "learning_rate": 9.242709591100935e-05, "loss": 0.885678482055664, "memory(GiB)": 91.52, "step": 33695, "token_acc": 0.7759623992837958, "train_speed(iter/s)": 0.170316 }, { "epoch": 0.4372804271593965, "grad_norm": 0.7451342344284058, "learning_rate": 9.242425755446924e-05, "loss": 0.9463203430175782, "memory(GiB)": 91.52, "step": 33700, "token_acc": 0.7569208719921551, "train_speed(iter/s)": 0.170306 }, { "epoch": 0.4373453055610522, "grad_norm": 0.7662222981452942, "learning_rate": 9.242141870971351e-05, "loss": 0.9832649230957031, "memory(GiB)": 91.52, "step": 33705, "token_acc": 0.7482786829368056, "train_speed(iter/s)": 0.170296 }, { "epoch": 0.4374101839627079, "grad_norm": 0.7756611704826355, "learning_rate": 9.241857937677482e-05, "loss": 0.9511259078979493, "memory(GiB)": 91.52, "step": 33710, "token_acc": 0.7468026083510466, "train_speed(iter/s)": 0.170288 }, { "epoch": 0.43747506236436356, "grad_norm": 0.7243354320526123, "learning_rate": 9.241573955568586e-05, "loss": 0.8970471382141113, "memory(GiB)": 91.52, "step": 33715, "token_acc": 0.7724059536073319, "train_speed(iter/s)": 0.170278 }, { "epoch": 0.43753994076601926, "grad_norm": 0.7563467621803284, "learning_rate": 9.24128992464793e-05, "loss": 0.9594463348388672, "memory(GiB)": 91.52, "step": 33720, "token_acc": 0.731106786863111, "train_speed(iter/s)": 0.170269 }, { "epoch": 0.43760481916767496, "grad_norm": 0.754776120185852, "learning_rate": 9.241005844918782e-05, "loss": 0.9776059150695801, "memory(GiB)": 91.52, "step": 33725, "token_acc": 0.721460245436733, "train_speed(iter/s)": 0.17026 }, { "epoch": 0.43766969756933066, "grad_norm": 0.8256752490997314, "learning_rate": 9.24072171638441e-05, "loss": 0.9370277404785157, "memory(GiB)": 91.52, "step": 33730, "token_acc": 0.7584350794275961, "train_speed(iter/s)": 0.170251 }, { "epoch": 0.43773457597098636, "grad_norm": 0.8162552118301392, "learning_rate": 9.240437539048089e-05, "loss": 0.9248397827148438, "memory(GiB)": 91.52, "step": 33735, "token_acc": 0.7329525585732602, "train_speed(iter/s)": 0.170241 }, { "epoch": 0.43779945437264206, "grad_norm": 0.8828868269920349, "learning_rate": 9.240153312913082e-05, "loss": 0.897076416015625, "memory(GiB)": 91.52, "step": 33740, "token_acc": 0.7646318632415677, "train_speed(iter/s)": 0.17023 }, { "epoch": 0.43786433277429776, "grad_norm": 0.7671847939491272, "learning_rate": 9.239869037982666e-05, "loss": 0.9370839118957519, "memory(GiB)": 91.52, "step": 33745, "token_acc": 0.7511006289308176, "train_speed(iter/s)": 0.170222 }, { "epoch": 0.43792921117595346, "grad_norm": 0.6977105140686035, "learning_rate": 9.23958471426011e-05, "loss": 0.9563562393188476, "memory(GiB)": 91.52, "step": 33750, "token_acc": 0.765414912399837, "train_speed(iter/s)": 0.170211 }, { "epoch": 0.43799408957760916, "grad_norm": 0.8426721692085266, "learning_rate": 9.239300341748684e-05, "loss": 0.9124946594238281, "memory(GiB)": 91.52, "step": 33755, "token_acc": 0.7621050572239069, "train_speed(iter/s)": 0.1702 }, { "epoch": 0.43805896797926486, "grad_norm": 0.9479579329490662, "learning_rate": 9.239015920451664e-05, "loss": 0.9420228958129883, "memory(GiB)": 91.52, "step": 33760, "token_acc": 0.7266168026594138, "train_speed(iter/s)": 0.170189 }, { "epoch": 0.43812384638092056, "grad_norm": 0.6828603744506836, "learning_rate": 9.238731450372322e-05, "loss": 0.9305112838745118, "memory(GiB)": 91.52, "step": 33765, "token_acc": 0.7549140783860293, "train_speed(iter/s)": 0.170181 }, { "epoch": 0.43818872478257626, "grad_norm": 0.6789276599884033, "learning_rate": 9.23844693151393e-05, "loss": 0.9002050399780274, "memory(GiB)": 91.52, "step": 33770, "token_acc": 0.7734017147555103, "train_speed(iter/s)": 0.170171 }, { "epoch": 0.43825360318423195, "grad_norm": 0.7971832752227783, "learning_rate": 9.238162363879765e-05, "loss": 0.9378412246704102, "memory(GiB)": 91.52, "step": 33775, "token_acc": 0.7543101843232173, "train_speed(iter/s)": 0.170163 }, { "epoch": 0.43831848158588765, "grad_norm": 0.7624059915542603, "learning_rate": 9.237877747473099e-05, "loss": 0.937592887878418, "memory(GiB)": 91.52, "step": 33780, "token_acc": 0.7399531513066393, "train_speed(iter/s)": 0.170154 }, { "epoch": 0.43838335998754335, "grad_norm": 0.9002411961555481, "learning_rate": 9.23759308229721e-05, "loss": 0.9346725463867187, "memory(GiB)": 91.52, "step": 33785, "token_acc": 0.7592667130154781, "train_speed(iter/s)": 0.170144 }, { "epoch": 0.43844823838919905, "grad_norm": 0.7283272743225098, "learning_rate": 9.237308368355369e-05, "loss": 0.9161815643310547, "memory(GiB)": 91.52, "step": 33790, "token_acc": 0.7568576155118686, "train_speed(iter/s)": 0.170136 }, { "epoch": 0.43851311679085475, "grad_norm": 0.8021551370620728, "learning_rate": 9.23702360565086e-05, "loss": 0.935028076171875, "memory(GiB)": 91.52, "step": 33795, "token_acc": 0.7358065323936608, "train_speed(iter/s)": 0.170127 }, { "epoch": 0.43857799519251045, "grad_norm": 0.7256999611854553, "learning_rate": 9.236738794186954e-05, "loss": 0.9772188186645507, "memory(GiB)": 91.52, "step": 33800, "token_acc": 0.7320298125799108, "train_speed(iter/s)": 0.170116 }, { "epoch": 0.43864287359416615, "grad_norm": 0.6910710334777832, "learning_rate": 9.23645393396693e-05, "loss": 0.947603416442871, "memory(GiB)": 91.52, "step": 33805, "token_acc": 0.7441146723398319, "train_speed(iter/s)": 0.170106 }, { "epoch": 0.43870775199582185, "grad_norm": 1.005016565322876, "learning_rate": 9.23616902499407e-05, "loss": 0.9541141510009765, "memory(GiB)": 91.52, "step": 33810, "token_acc": 0.7461260812745037, "train_speed(iter/s)": 0.170096 }, { "epoch": 0.43877263039747755, "grad_norm": 0.837246835231781, "learning_rate": 9.235884067271647e-05, "loss": 0.9532991409301758, "memory(GiB)": 91.52, "step": 33815, "token_acc": 0.7468863467770273, "train_speed(iter/s)": 0.170086 }, { "epoch": 0.43883750879913325, "grad_norm": 0.7500336170196533, "learning_rate": 9.235599060802944e-05, "loss": 0.8978459358215332, "memory(GiB)": 91.52, "step": 33820, "token_acc": 0.7547606434884434, "train_speed(iter/s)": 0.170075 }, { "epoch": 0.43890238720078895, "grad_norm": 0.8304175734519958, "learning_rate": 9.235314005591239e-05, "loss": 0.9389826774597168, "memory(GiB)": 91.52, "step": 33825, "token_acc": 0.76270960873037, "train_speed(iter/s)": 0.170064 }, { "epoch": 0.43896726560244465, "grad_norm": 0.754930317401886, "learning_rate": 9.235028901639812e-05, "loss": 0.986022663116455, "memory(GiB)": 91.52, "step": 33830, "token_acc": 0.7293439579142813, "train_speed(iter/s)": 0.170055 }, { "epoch": 0.4390321440041003, "grad_norm": 0.7669550776481628, "learning_rate": 9.234743748951947e-05, "loss": 0.988980484008789, "memory(GiB)": 91.52, "step": 33835, "token_acc": 0.7436812852448376, "train_speed(iter/s)": 0.170046 }, { "epoch": 0.439097022405756, "grad_norm": 0.8642609715461731, "learning_rate": 9.234458547530921e-05, "loss": 0.9404729843139649, "memory(GiB)": 91.52, "step": 33840, "token_acc": 0.7400766918851694, "train_speed(iter/s)": 0.170037 }, { "epoch": 0.4391619008074117, "grad_norm": 0.7747504711151123, "learning_rate": 9.23417329738002e-05, "loss": 0.9656744956970215, "memory(GiB)": 91.52, "step": 33845, "token_acc": 0.7427234727921157, "train_speed(iter/s)": 0.170027 }, { "epoch": 0.4392267792090674, "grad_norm": 0.7606679797172546, "learning_rate": 9.233887998502525e-05, "loss": 0.9395647048950195, "memory(GiB)": 91.52, "step": 33850, "token_acc": 0.7640853733546562, "train_speed(iter/s)": 0.170016 }, { "epoch": 0.4392916576107231, "grad_norm": 0.8735952973365784, "learning_rate": 9.23360265090172e-05, "loss": 0.9469255447387696, "memory(GiB)": 91.52, "step": 33855, "token_acc": 0.7379900168505389, "train_speed(iter/s)": 0.170006 }, { "epoch": 0.4393565360123788, "grad_norm": 0.7297995686531067, "learning_rate": 9.233317254580887e-05, "loss": 0.922788143157959, "memory(GiB)": 91.52, "step": 33860, "token_acc": 0.7397704253882512, "train_speed(iter/s)": 0.169995 }, { "epoch": 0.4394214144140345, "grad_norm": 0.7468858957290649, "learning_rate": 9.233031809543312e-05, "loss": 0.9442741394042968, "memory(GiB)": 91.52, "step": 33865, "token_acc": 0.7508746618575293, "train_speed(iter/s)": 0.169984 }, { "epoch": 0.4394862928156902, "grad_norm": 0.8839508295059204, "learning_rate": 9.23274631579228e-05, "loss": 0.8895586013793946, "memory(GiB)": 91.52, "step": 33870, "token_acc": 0.7658834862001089, "train_speed(iter/s)": 0.169974 }, { "epoch": 0.4395511712173459, "grad_norm": 0.7902483344078064, "learning_rate": 9.232460773331076e-05, "loss": 0.9883085250854492, "memory(GiB)": 91.52, "step": 33875, "token_acc": 0.7394967381174278, "train_speed(iter/s)": 0.169966 }, { "epoch": 0.4396160496190016, "grad_norm": 0.7793551087379456, "learning_rate": 9.232175182162984e-05, "loss": 0.9166516304016114, "memory(GiB)": 91.52, "step": 33880, "token_acc": 0.7561955313273321, "train_speed(iter/s)": 0.169957 }, { "epoch": 0.4396809280206573, "grad_norm": 0.7960692644119263, "learning_rate": 9.231889542291295e-05, "loss": 0.9420559883117676, "memory(GiB)": 91.52, "step": 33885, "token_acc": 0.7585744908896034, "train_speed(iter/s)": 0.169947 }, { "epoch": 0.439745806422313, "grad_norm": 0.7350149154663086, "learning_rate": 9.23160385371929e-05, "loss": 0.9282304763793945, "memory(GiB)": 91.52, "step": 33890, "token_acc": 0.7513942680092951, "train_speed(iter/s)": 0.169936 }, { "epoch": 0.4398106848239687, "grad_norm": 0.7676815390586853, "learning_rate": 9.231318116450262e-05, "loss": 0.9974637985229492, "memory(GiB)": 91.52, "step": 33895, "token_acc": 0.7525187032418953, "train_speed(iter/s)": 0.169928 }, { "epoch": 0.4398755632256244, "grad_norm": 0.8241065740585327, "learning_rate": 9.2310323304875e-05, "loss": 0.9575862884521484, "memory(GiB)": 91.52, "step": 33900, "token_acc": 0.7506280758509137, "train_speed(iter/s)": 0.169918 }, { "epoch": 0.4399404416272801, "grad_norm": 0.7259421348571777, "learning_rate": 9.230746495834286e-05, "loss": 0.8746095657348633, "memory(GiB)": 91.52, "step": 33905, "token_acc": 0.7487139182623607, "train_speed(iter/s)": 0.169909 }, { "epoch": 0.4400053200289358, "grad_norm": 0.7162346243858337, "learning_rate": 9.230460612493917e-05, "loss": 0.9180507659912109, "memory(GiB)": 91.52, "step": 33910, "token_acc": 0.7610337610337611, "train_speed(iter/s)": 0.1699 }, { "epoch": 0.4400701984305915, "grad_norm": 0.7143326997756958, "learning_rate": 9.230174680469677e-05, "loss": 0.9457890510559082, "memory(GiB)": 91.52, "step": 33915, "token_acc": 0.7527036048064085, "train_speed(iter/s)": 0.169889 }, { "epoch": 0.4401350768322472, "grad_norm": 0.7403339147567749, "learning_rate": 9.22988869976486e-05, "loss": 0.8770224571228027, "memory(GiB)": 91.52, "step": 33920, "token_acc": 0.7669910322457546, "train_speed(iter/s)": 0.169878 }, { "epoch": 0.4401999552339029, "grad_norm": 0.7354872822761536, "learning_rate": 9.229602670382757e-05, "loss": 0.9011994361877441, "memory(GiB)": 91.52, "step": 33925, "token_acc": 0.7711361613764461, "train_speed(iter/s)": 0.169868 }, { "epoch": 0.4402648336355586, "grad_norm": 0.6926062703132629, "learning_rate": 9.229316592326657e-05, "loss": 0.9055639266967773, "memory(GiB)": 91.52, "step": 33930, "token_acc": 0.762910035725885, "train_speed(iter/s)": 0.169857 }, { "epoch": 0.44032971203721427, "grad_norm": 0.8662548661231995, "learning_rate": 9.229030465599855e-05, "loss": 0.9717914581298828, "memory(GiB)": 91.52, "step": 33935, "token_acc": 0.7292331495986848, "train_speed(iter/s)": 0.169846 }, { "epoch": 0.44039459043886997, "grad_norm": 0.7439454197883606, "learning_rate": 9.228744290205643e-05, "loss": 0.9176023483276368, "memory(GiB)": 91.52, "step": 33940, "token_acc": 0.753921534068878, "train_speed(iter/s)": 0.169836 }, { "epoch": 0.44045946884052567, "grad_norm": 0.797348141670227, "learning_rate": 9.228458066147311e-05, "loss": 0.9722408294677735, "memory(GiB)": 91.52, "step": 33945, "token_acc": 0.7395540699121653, "train_speed(iter/s)": 0.169826 }, { "epoch": 0.44052434724218137, "grad_norm": 0.7886691689491272, "learning_rate": 9.22817179342816e-05, "loss": 0.9146665573120117, "memory(GiB)": 91.52, "step": 33950, "token_acc": 0.7984424539046387, "train_speed(iter/s)": 0.169814 }, { "epoch": 0.440589225643837, "grad_norm": 0.8132298588752747, "learning_rate": 9.227885472051478e-05, "loss": 0.9614105224609375, "memory(GiB)": 91.52, "step": 33955, "token_acc": 0.7480562562631743, "train_speed(iter/s)": 0.169804 }, { "epoch": 0.4406541040454927, "grad_norm": 0.8104344010353088, "learning_rate": 9.227599102020562e-05, "loss": 0.9510153770446778, "memory(GiB)": 91.52, "step": 33960, "token_acc": 0.7439726457555563, "train_speed(iter/s)": 0.169793 }, { "epoch": 0.4407189824471484, "grad_norm": 0.9361721277236938, "learning_rate": 9.227312683338707e-05, "loss": 0.9339005470275878, "memory(GiB)": 91.52, "step": 33965, "token_acc": 0.7577371278398285, "train_speed(iter/s)": 0.169783 }, { "epoch": 0.4407838608488041, "grad_norm": 0.7568673491477966, "learning_rate": 9.227026216009212e-05, "loss": 0.9480829238891602, "memory(GiB)": 91.52, "step": 33970, "token_acc": 0.7651896490606168, "train_speed(iter/s)": 0.169773 }, { "epoch": 0.4408487392504598, "grad_norm": 0.7503663301467896, "learning_rate": 9.226739700035368e-05, "loss": 0.9536516189575195, "memory(GiB)": 91.52, "step": 33975, "token_acc": 0.7585843320563737, "train_speed(iter/s)": 0.169763 }, { "epoch": 0.4409136176521155, "grad_norm": 0.711746871471405, "learning_rate": 9.226453135420479e-05, "loss": 0.8996395111083985, "memory(GiB)": 91.52, "step": 33980, "token_acc": 0.7394086367089348, "train_speed(iter/s)": 0.169752 }, { "epoch": 0.4409784960537712, "grad_norm": 0.7546747922897339, "learning_rate": 9.226166522167836e-05, "loss": 0.9137405395507813, "memory(GiB)": 91.52, "step": 33985, "token_acc": 0.7437568583114421, "train_speed(iter/s)": 0.169743 }, { "epoch": 0.4410433744554269, "grad_norm": 0.7304615378379822, "learning_rate": 9.225879860280744e-05, "loss": 0.9231181144714355, "memory(GiB)": 91.52, "step": 33990, "token_acc": 0.7666165118409002, "train_speed(iter/s)": 0.169732 }, { "epoch": 0.4411082528570826, "grad_norm": 0.752501368522644, "learning_rate": 9.225593149762496e-05, "loss": 0.8990097999572754, "memory(GiB)": 91.52, "step": 33995, "token_acc": 0.7469571025679086, "train_speed(iter/s)": 0.169721 }, { "epoch": 0.4411731312587383, "grad_norm": 0.7639393210411072, "learning_rate": 9.225306390616396e-05, "loss": 0.9030625343322753, "memory(GiB)": 91.52, "step": 34000, "token_acc": 0.7519234057103779, "train_speed(iter/s)": 0.169712 }, { "epoch": 0.441238009660394, "grad_norm": 0.8061012029647827, "learning_rate": 9.225019582845739e-05, "loss": 0.9293054580688477, "memory(GiB)": 91.52, "step": 34005, "token_acc": 0.7546842617265153, "train_speed(iter/s)": 0.169702 }, { "epoch": 0.4413028880620497, "grad_norm": 0.8572454452514648, "learning_rate": 9.22473272645383e-05, "loss": 0.9817166328430176, "memory(GiB)": 91.52, "step": 34010, "token_acc": 0.753600269179004, "train_speed(iter/s)": 0.169693 }, { "epoch": 0.4413677664637054, "grad_norm": 0.76175856590271, "learning_rate": 9.22444582144397e-05, "loss": 0.915123462677002, "memory(GiB)": 91.52, "step": 34015, "token_acc": 0.7712604984868577, "train_speed(iter/s)": 0.169682 }, { "epoch": 0.4414326448653611, "grad_norm": 0.7862226963043213, "learning_rate": 9.22415886781946e-05, "loss": 0.9940812110900878, "memory(GiB)": 91.52, "step": 34020, "token_acc": 0.7323788902174967, "train_speed(iter/s)": 0.169671 }, { "epoch": 0.4414975232670168, "grad_norm": 0.81510329246521, "learning_rate": 9.223871865583599e-05, "loss": 0.9106935501098633, "memory(GiB)": 91.52, "step": 34025, "token_acc": 0.7512992678578203, "train_speed(iter/s)": 0.16966 }, { "epoch": 0.4415624016686725, "grad_norm": 0.7785477042198181, "learning_rate": 9.223584814739695e-05, "loss": 0.9123677253723145, "memory(GiB)": 91.52, "step": 34030, "token_acc": 0.7590263848171579, "train_speed(iter/s)": 0.169651 }, { "epoch": 0.4416272800703282, "grad_norm": 0.7871537208557129, "learning_rate": 9.223297715291046e-05, "loss": 0.963101577758789, "memory(GiB)": 91.52, "step": 34035, "token_acc": 0.7502827744736746, "train_speed(iter/s)": 0.16964 }, { "epoch": 0.4416921584719839, "grad_norm": 0.718424379825592, "learning_rate": 9.22301056724096e-05, "loss": 0.9069293975830078, "memory(GiB)": 91.52, "step": 34040, "token_acc": 0.7648386109836316, "train_speed(iter/s)": 0.169631 }, { "epoch": 0.4417570368736396, "grad_norm": 0.763141393661499, "learning_rate": 9.22272337059274e-05, "loss": 0.949822998046875, "memory(GiB)": 91.52, "step": 34045, "token_acc": 0.7375057383320581, "train_speed(iter/s)": 0.169622 }, { "epoch": 0.4418219152752953, "grad_norm": 0.8465983271598816, "learning_rate": 9.222436125349691e-05, "loss": 0.9469644546508789, "memory(GiB)": 91.52, "step": 34050, "token_acc": 0.753826054497947, "train_speed(iter/s)": 0.169613 }, { "epoch": 0.441886793676951, "grad_norm": 0.8226732611656189, "learning_rate": 9.22214883151512e-05, "loss": 0.9638435363769531, "memory(GiB)": 91.52, "step": 34055, "token_acc": 0.7545561903546009, "train_speed(iter/s)": 0.169605 }, { "epoch": 0.4419516720786067, "grad_norm": 0.7058073282241821, "learning_rate": 9.221861489092331e-05, "loss": 0.8976522445678711, "memory(GiB)": 91.52, "step": 34060, "token_acc": 0.7570929789044392, "train_speed(iter/s)": 0.169594 }, { "epoch": 0.4420165504802624, "grad_norm": 0.7312268018722534, "learning_rate": 9.221574098084631e-05, "loss": 0.8968175888061524, "memory(GiB)": 91.52, "step": 34065, "token_acc": 0.7779670641680864, "train_speed(iter/s)": 0.169584 }, { "epoch": 0.4420814288819181, "grad_norm": 0.7234767079353333, "learning_rate": 9.22128665849533e-05, "loss": 0.9228202819824218, "memory(GiB)": 91.52, "step": 34070, "token_acc": 0.7677685113279535, "train_speed(iter/s)": 0.169575 }, { "epoch": 0.44214630728357374, "grad_norm": 0.7195411324501038, "learning_rate": 9.220999170327732e-05, "loss": 0.9250959396362305, "memory(GiB)": 91.52, "step": 34075, "token_acc": 0.7522334157004372, "train_speed(iter/s)": 0.169565 }, { "epoch": 0.44221118568522944, "grad_norm": 0.7480044960975647, "learning_rate": 9.220711633585148e-05, "loss": 0.9476076126098633, "memory(GiB)": 91.52, "step": 34080, "token_acc": 0.7462641049100336, "train_speed(iter/s)": 0.169556 }, { "epoch": 0.44227606408688513, "grad_norm": 0.7463935017585754, "learning_rate": 9.220424048270886e-05, "loss": 0.9377892494201661, "memory(GiB)": 91.52, "step": 34085, "token_acc": 0.7355055292259084, "train_speed(iter/s)": 0.169546 }, { "epoch": 0.44234094248854083, "grad_norm": 0.7299858331680298, "learning_rate": 9.220136414388256e-05, "loss": 0.9798888206481934, "memory(GiB)": 91.52, "step": 34090, "token_acc": 0.7258548076236705, "train_speed(iter/s)": 0.169536 }, { "epoch": 0.44240582089019653, "grad_norm": 0.7265331149101257, "learning_rate": 9.219848731940568e-05, "loss": 0.9187160491943359, "memory(GiB)": 91.52, "step": 34095, "token_acc": 0.7511750362229211, "train_speed(iter/s)": 0.169526 }, { "epoch": 0.44247069929185223, "grad_norm": 0.8618683218955994, "learning_rate": 9.219561000931131e-05, "loss": 0.9138494491577148, "memory(GiB)": 91.52, "step": 34100, "token_acc": 0.7423007246376812, "train_speed(iter/s)": 0.169517 }, { "epoch": 0.44253557769350793, "grad_norm": 0.7121118903160095, "learning_rate": 9.219273221363258e-05, "loss": 0.9508133888244629, "memory(GiB)": 91.52, "step": 34105, "token_acc": 0.7318506907024591, "train_speed(iter/s)": 0.169507 }, { "epoch": 0.44260045609516363, "grad_norm": 0.8499022126197815, "learning_rate": 9.218985393240262e-05, "loss": 0.9151228904724121, "memory(GiB)": 91.52, "step": 34110, "token_acc": 0.7395266100341248, "train_speed(iter/s)": 0.169498 }, { "epoch": 0.44266533449681933, "grad_norm": 0.7839766144752502, "learning_rate": 9.218697516565453e-05, "loss": 0.9795654296875, "memory(GiB)": 91.52, "step": 34115, "token_acc": 0.7364293716264385, "train_speed(iter/s)": 0.169489 }, { "epoch": 0.44273021289847503, "grad_norm": 0.7302044034004211, "learning_rate": 9.218409591342143e-05, "loss": 0.9410538673400879, "memory(GiB)": 91.52, "step": 34120, "token_acc": 0.753792485438136, "train_speed(iter/s)": 0.16948 }, { "epoch": 0.4427950913001307, "grad_norm": 0.7248607277870178, "learning_rate": 9.218121617573647e-05, "loss": 0.9122085571289062, "memory(GiB)": 91.52, "step": 34125, "token_acc": 0.7624201753436519, "train_speed(iter/s)": 0.16947 }, { "epoch": 0.4428599697017864, "grad_norm": 0.7059208750724792, "learning_rate": 9.217833595263282e-05, "loss": 0.9622247695922852, "memory(GiB)": 91.52, "step": 34130, "token_acc": 0.7720167104642545, "train_speed(iter/s)": 0.16946 }, { "epoch": 0.4429248481034421, "grad_norm": 0.8754905462265015, "learning_rate": 9.217545524414355e-05, "loss": 0.9616292953491211, "memory(GiB)": 91.52, "step": 34135, "token_acc": 0.759694559786051, "train_speed(iter/s)": 0.169451 }, { "epoch": 0.4429897265050978, "grad_norm": 0.8709549903869629, "learning_rate": 9.217257405030188e-05, "loss": 0.9299408912658691, "memory(GiB)": 91.52, "step": 34140, "token_acc": 0.743579930345593, "train_speed(iter/s)": 0.169443 }, { "epoch": 0.4430546049067535, "grad_norm": 0.7932848930358887, "learning_rate": 9.216969237114095e-05, "loss": 0.9571500778198242, "memory(GiB)": 91.52, "step": 34145, "token_acc": 0.7409541021875824, "train_speed(iter/s)": 0.169434 }, { "epoch": 0.4431194833084092, "grad_norm": 0.8278056979179382, "learning_rate": 9.216681020669389e-05, "loss": 0.969459342956543, "memory(GiB)": 91.52, "step": 34150, "token_acc": 0.7465432198593029, "train_speed(iter/s)": 0.169425 }, { "epoch": 0.4431843617100649, "grad_norm": 0.6673687100410461, "learning_rate": 9.216392755699392e-05, "loss": 0.8725224494934082, "memory(GiB)": 91.52, "step": 34155, "token_acc": 0.7589009974796777, "train_speed(iter/s)": 0.169415 }, { "epoch": 0.4432492401117206, "grad_norm": 0.8273966908454895, "learning_rate": 9.216104442207416e-05, "loss": 0.9352373123168946, "memory(GiB)": 91.52, "step": 34160, "token_acc": 0.7508682423087562, "train_speed(iter/s)": 0.169404 }, { "epoch": 0.4433141185133763, "grad_norm": 0.774625837802887, "learning_rate": 9.215816080196782e-05, "loss": 0.9377130508422852, "memory(GiB)": 91.52, "step": 34165, "token_acc": 0.7460014480352795, "train_speed(iter/s)": 0.169393 }, { "epoch": 0.443378996915032, "grad_norm": 0.8448056578636169, "learning_rate": 9.21552766967081e-05, "loss": 0.9521217346191406, "memory(GiB)": 91.52, "step": 34170, "token_acc": 0.7537389798488665, "train_speed(iter/s)": 0.169385 }, { "epoch": 0.4434438753166877, "grad_norm": 0.7230356335639954, "learning_rate": 9.215239210632813e-05, "loss": 0.9261600494384765, "memory(GiB)": 91.52, "step": 34175, "token_acc": 0.7503478579274991, "train_speed(iter/s)": 0.169375 }, { "epoch": 0.4435087537183434, "grad_norm": 0.7452117800712585, "learning_rate": 9.214950703086119e-05, "loss": 0.9018363952636719, "memory(GiB)": 91.52, "step": 34180, "token_acc": 0.7485134216785593, "train_speed(iter/s)": 0.169364 }, { "epoch": 0.4435736321199991, "grad_norm": 0.7345507740974426, "learning_rate": 9.21466214703404e-05, "loss": 0.9589988708496093, "memory(GiB)": 91.52, "step": 34185, "token_acc": 0.7397436964300574, "train_speed(iter/s)": 0.169353 }, { "epoch": 0.44363851052165476, "grad_norm": 0.7194031476974487, "learning_rate": 9.214373542479901e-05, "loss": 0.9077530860900879, "memory(GiB)": 91.52, "step": 34190, "token_acc": 0.7708970099667775, "train_speed(iter/s)": 0.169345 }, { "epoch": 0.44370338892331046, "grad_norm": 0.7190639972686768, "learning_rate": 9.214084889427024e-05, "loss": 0.9023685455322266, "memory(GiB)": 91.52, "step": 34195, "token_acc": 0.7536348949919225, "train_speed(iter/s)": 0.169334 }, { "epoch": 0.44376826732496616, "grad_norm": 0.7799274921417236, "learning_rate": 9.213796187878729e-05, "loss": 0.9090166091918945, "memory(GiB)": 91.52, "step": 34200, "token_acc": 0.7625458157777549, "train_speed(iter/s)": 0.169324 }, { "epoch": 0.44383314572662186, "grad_norm": 0.8682419657707214, "learning_rate": 9.213507437838337e-05, "loss": 0.9605320930480957, "memory(GiB)": 91.52, "step": 34205, "token_acc": 0.7414969612965134, "train_speed(iter/s)": 0.169315 }, { "epoch": 0.44389802412827756, "grad_norm": 0.7298011183738708, "learning_rate": 9.213218639309173e-05, "loss": 0.9746139526367188, "memory(GiB)": 91.52, "step": 34210, "token_acc": 0.7420625985138483, "train_speed(iter/s)": 0.169306 }, { "epoch": 0.44396290252993326, "grad_norm": 0.7230516076087952, "learning_rate": 9.212929792294563e-05, "loss": 0.914430809020996, "memory(GiB)": 91.52, "step": 34215, "token_acc": 0.745274975131448, "train_speed(iter/s)": 0.169295 }, { "epoch": 0.44402778093158896, "grad_norm": 0.7832767963409424, "learning_rate": 9.212640896797824e-05, "loss": 0.9423107147216797, "memory(GiB)": 91.52, "step": 34220, "token_acc": 0.7399244332493703, "train_speed(iter/s)": 0.169286 }, { "epoch": 0.44409265933324465, "grad_norm": 0.685255229473114, "learning_rate": 9.212351952822286e-05, "loss": 0.9189702987670898, "memory(GiB)": 91.52, "step": 34225, "token_acc": 0.7481075801444169, "train_speed(iter/s)": 0.169276 }, { "epoch": 0.44415753773490035, "grad_norm": 0.7594262361526489, "learning_rate": 9.212062960371274e-05, "loss": 0.9484151840209961, "memory(GiB)": 91.52, "step": 34230, "token_acc": 0.7418744397596362, "train_speed(iter/s)": 0.169267 }, { "epoch": 0.44422241613655605, "grad_norm": 0.702927827835083, "learning_rate": 9.211773919448112e-05, "loss": 0.9042963027954102, "memory(GiB)": 91.52, "step": 34235, "token_acc": 0.7411064473955825, "train_speed(iter/s)": 0.169258 }, { "epoch": 0.44428729453821175, "grad_norm": 0.7949833273887634, "learning_rate": 9.211484830056126e-05, "loss": 0.979802417755127, "memory(GiB)": 91.52, "step": 34240, "token_acc": 0.7456136938338844, "train_speed(iter/s)": 0.169248 }, { "epoch": 0.44435217293986745, "grad_norm": 0.7690462470054626, "learning_rate": 9.211195692198645e-05, "loss": 0.9404741287231445, "memory(GiB)": 91.52, "step": 34245, "token_acc": 0.7399274860471748, "train_speed(iter/s)": 0.169238 }, { "epoch": 0.44441705134152315, "grad_norm": 0.7429558038711548, "learning_rate": 9.210906505878993e-05, "loss": 0.9413281440734863, "memory(GiB)": 91.52, "step": 34250, "token_acc": 0.7674161953372879, "train_speed(iter/s)": 0.169228 }, { "epoch": 0.44448192974317885, "grad_norm": 0.7820587754249573, "learning_rate": 9.210617271100502e-05, "loss": 0.9191633224487304, "memory(GiB)": 91.52, "step": 34255, "token_acc": 0.7545708754570876, "train_speed(iter/s)": 0.16922 }, { "epoch": 0.44454680814483455, "grad_norm": 0.8101846575737, "learning_rate": 9.210327987866498e-05, "loss": 0.9402215957641602, "memory(GiB)": 91.52, "step": 34260, "token_acc": 0.7408626118954109, "train_speed(iter/s)": 0.169209 }, { "epoch": 0.44461168654649025, "grad_norm": 0.7764964699745178, "learning_rate": 9.21003865618031e-05, "loss": 0.9173020362854004, "memory(GiB)": 91.52, "step": 34265, "token_acc": 0.74486160591943, "train_speed(iter/s)": 0.169201 }, { "epoch": 0.44467656494814595, "grad_norm": 0.7319511771202087, "learning_rate": 9.209749276045268e-05, "loss": 0.9327764511108398, "memory(GiB)": 91.52, "step": 34270, "token_acc": 0.7231088360266319, "train_speed(iter/s)": 0.16919 }, { "epoch": 0.44474144334980165, "grad_norm": 0.7258176207542419, "learning_rate": 9.209459847464703e-05, "loss": 0.9237781524658203, "memory(GiB)": 91.52, "step": 34275, "token_acc": 0.7563192488262911, "train_speed(iter/s)": 0.16918 }, { "epoch": 0.44480632175145735, "grad_norm": 0.8242348432540894, "learning_rate": 9.209170370441944e-05, "loss": 0.9205497741699219, "memory(GiB)": 91.52, "step": 34280, "token_acc": 0.7638696125135976, "train_speed(iter/s)": 0.169171 }, { "epoch": 0.44487120015311304, "grad_norm": 0.7903441190719604, "learning_rate": 9.208880844980323e-05, "loss": 0.9541692733764648, "memory(GiB)": 91.52, "step": 34285, "token_acc": 0.7475905286955142, "train_speed(iter/s)": 0.169162 }, { "epoch": 0.44493607855476874, "grad_norm": 0.8006429076194763, "learning_rate": 9.208591271083173e-05, "loss": 0.9690756797790527, "memory(GiB)": 91.52, "step": 34290, "token_acc": 0.7397589396548272, "train_speed(iter/s)": 0.169153 }, { "epoch": 0.44500095695642444, "grad_norm": 0.78294837474823, "learning_rate": 9.208301648753826e-05, "loss": 0.9265508651733398, "memory(GiB)": 91.52, "step": 34295, "token_acc": 0.7573057999105413, "train_speed(iter/s)": 0.169145 }, { "epoch": 0.44506583535808014, "grad_norm": 0.7510607242584229, "learning_rate": 9.208011977995615e-05, "loss": 0.9031476974487305, "memory(GiB)": 91.52, "step": 34300, "token_acc": 0.780283543698032, "train_speed(iter/s)": 0.169134 }, { "epoch": 0.44513071375973584, "grad_norm": 0.777443528175354, "learning_rate": 9.20772225881187e-05, "loss": 0.9276498794555664, "memory(GiB)": 91.52, "step": 34305, "token_acc": 0.7509420734379108, "train_speed(iter/s)": 0.169126 }, { "epoch": 0.4451955921613915, "grad_norm": 0.7809839248657227, "learning_rate": 9.207432491205931e-05, "loss": 0.9308399200439453, "memory(GiB)": 91.52, "step": 34310, "token_acc": 0.7293362743132143, "train_speed(iter/s)": 0.169117 }, { "epoch": 0.4452604705630472, "grad_norm": 0.7078399062156677, "learning_rate": 9.207142675181129e-05, "loss": 0.9144779205322265, "memory(GiB)": 91.52, "step": 34315, "token_acc": 0.7452715176425511, "train_speed(iter/s)": 0.169108 }, { "epoch": 0.4453253489647029, "grad_norm": 0.738568902015686, "learning_rate": 9.206852810740798e-05, "loss": 0.9048168182373046, "memory(GiB)": 91.52, "step": 34320, "token_acc": 0.764707882745018, "train_speed(iter/s)": 0.169099 }, { "epoch": 0.4453902273663586, "grad_norm": 0.7224847674369812, "learning_rate": 9.206562897888278e-05, "loss": 0.9438845634460449, "memory(GiB)": 91.52, "step": 34325, "token_acc": 0.7522439365968553, "train_speed(iter/s)": 0.169088 }, { "epoch": 0.4454551057680143, "grad_norm": 0.78795325756073, "learning_rate": 9.2062729366269e-05, "loss": 0.9299579620361328, "memory(GiB)": 91.52, "step": 34330, "token_acc": 0.7601211787386395, "train_speed(iter/s)": 0.169079 }, { "epoch": 0.44551998416967, "grad_norm": 0.8292548656463623, "learning_rate": 9.205982926960007e-05, "loss": 0.9697319030761719, "memory(GiB)": 91.52, "step": 34335, "token_acc": 0.7545595723933692, "train_speed(iter/s)": 0.169069 }, { "epoch": 0.4455848625713257, "grad_norm": 0.6711727380752563, "learning_rate": 9.205692868890932e-05, "loss": 0.8846742630004882, "memory(GiB)": 91.52, "step": 34340, "token_acc": 0.7645151380687443, "train_speed(iter/s)": 0.169058 }, { "epoch": 0.4456497409729814, "grad_norm": 0.7587552666664124, "learning_rate": 9.205402762423014e-05, "loss": 0.9383758544921875, "memory(GiB)": 91.52, "step": 34345, "token_acc": 0.7527613482505523, "train_speed(iter/s)": 0.169048 }, { "epoch": 0.4457146193746371, "grad_norm": 0.7746407985687256, "learning_rate": 9.205112607559592e-05, "loss": 0.9480138778686523, "memory(GiB)": 91.52, "step": 34350, "token_acc": 0.7446713880050635, "train_speed(iter/s)": 0.169037 }, { "epoch": 0.4457794977762928, "grad_norm": 0.7163345217704773, "learning_rate": 9.204822404304004e-05, "loss": 0.9248527526855469, "memory(GiB)": 91.52, "step": 34355, "token_acc": 0.7554121565362198, "train_speed(iter/s)": 0.169027 }, { "epoch": 0.4458443761779485, "grad_norm": 0.7997617721557617, "learning_rate": 9.20453215265959e-05, "loss": 0.9878945350646973, "memory(GiB)": 91.52, "step": 34360, "token_acc": 0.749234622877818, "train_speed(iter/s)": 0.169019 }, { "epoch": 0.4459092545796042, "grad_norm": 0.7250449061393738, "learning_rate": 9.204241852629692e-05, "loss": 0.9205577850341797, "memory(GiB)": 91.52, "step": 34365, "token_acc": 0.7619143509195865, "train_speed(iter/s)": 0.169008 }, { "epoch": 0.4459741329812599, "grad_norm": 0.7361339926719666, "learning_rate": 9.203951504217649e-05, "loss": 0.9047649383544922, "memory(GiB)": 91.52, "step": 34370, "token_acc": 0.7410039619327696, "train_speed(iter/s)": 0.168998 }, { "epoch": 0.4460390113829156, "grad_norm": 0.8129934072494507, "learning_rate": 9.203661107426803e-05, "loss": 0.9222352027893066, "memory(GiB)": 91.52, "step": 34375, "token_acc": 0.7626054607055072, "train_speed(iter/s)": 0.168985 }, { "epoch": 0.4461038897845713, "grad_norm": 0.8574717044830322, "learning_rate": 9.203370662260496e-05, "loss": 0.9033550262451172, "memory(GiB)": 91.52, "step": 34380, "token_acc": 0.7478593879836699, "train_speed(iter/s)": 0.168976 }, { "epoch": 0.44616876818622697, "grad_norm": 0.8158627152442932, "learning_rate": 9.203080168722068e-05, "loss": 0.9797244071960449, "memory(GiB)": 91.52, "step": 34385, "token_acc": 0.738038486134527, "train_speed(iter/s)": 0.168967 }, { "epoch": 0.44623364658788267, "grad_norm": 0.7034274339675903, "learning_rate": 9.202789626814865e-05, "loss": 0.9309968948364258, "memory(GiB)": 91.52, "step": 34390, "token_acc": 0.7592146277193997, "train_speed(iter/s)": 0.168958 }, { "epoch": 0.44629852498953837, "grad_norm": 0.6987966895103455, "learning_rate": 9.202499036542229e-05, "loss": 0.9019556045532227, "memory(GiB)": 91.52, "step": 34395, "token_acc": 0.7671228496514677, "train_speed(iter/s)": 0.168948 }, { "epoch": 0.44636340339119407, "grad_norm": 0.825166642665863, "learning_rate": 9.202208397907504e-05, "loss": 0.9281049728393554, "memory(GiB)": 91.52, "step": 34400, "token_acc": 0.7543674409675561, "train_speed(iter/s)": 0.16894 }, { "epoch": 0.44642828179284977, "grad_norm": 0.7205750346183777, "learning_rate": 9.201917710914037e-05, "loss": 0.9398805618286132, "memory(GiB)": 91.52, "step": 34405, "token_acc": 0.7509937437350939, "train_speed(iter/s)": 0.168931 }, { "epoch": 0.44649316019450547, "grad_norm": 0.7351528406143188, "learning_rate": 9.20162697556517e-05, "loss": 0.9103946685791016, "memory(GiB)": 91.52, "step": 34410, "token_acc": 0.7610561814976623, "train_speed(iter/s)": 0.168921 }, { "epoch": 0.44655803859616117, "grad_norm": 0.7929400205612183, "learning_rate": 9.201336191864252e-05, "loss": 0.9259160995483399, "memory(GiB)": 91.52, "step": 34415, "token_acc": 0.7475612000736241, "train_speed(iter/s)": 0.168909 }, { "epoch": 0.44662291699781687, "grad_norm": 0.8530884981155396, "learning_rate": 9.201045359814623e-05, "loss": 0.9260276794433594, "memory(GiB)": 91.52, "step": 34420, "token_acc": 0.7684821967410984, "train_speed(iter/s)": 0.168901 }, { "epoch": 0.44668779539947256, "grad_norm": 0.7376468777656555, "learning_rate": 9.20075447941964e-05, "loss": 0.9385696411132812, "memory(GiB)": 91.52, "step": 34425, "token_acc": 0.7452131276618589, "train_speed(iter/s)": 0.168892 }, { "epoch": 0.4467526738011282, "grad_norm": 0.8318372964859009, "learning_rate": 9.200463550682643e-05, "loss": 0.9828626632690429, "memory(GiB)": 91.52, "step": 34430, "token_acc": 0.7345313078918118, "train_speed(iter/s)": 0.168882 }, { "epoch": 0.4468175522027839, "grad_norm": 0.8026096820831299, "learning_rate": 9.20017257360698e-05, "loss": 0.9248214721679687, "memory(GiB)": 91.52, "step": 34435, "token_acc": 0.7485589953048059, "train_speed(iter/s)": 0.168873 }, { "epoch": 0.4468824306044396, "grad_norm": 0.7281906008720398, "learning_rate": 9.199881548196002e-05, "loss": 0.886497688293457, "memory(GiB)": 91.52, "step": 34440, "token_acc": 0.7684313725490196, "train_speed(iter/s)": 0.168865 }, { "epoch": 0.4469473090060953, "grad_norm": 0.6980749368667603, "learning_rate": 9.199590474453059e-05, "loss": 0.9159908294677734, "memory(GiB)": 91.52, "step": 34445, "token_acc": 0.7611734770243975, "train_speed(iter/s)": 0.168856 }, { "epoch": 0.447012187407751, "grad_norm": 0.7208905220031738, "learning_rate": 9.199299352381497e-05, "loss": 0.9053175926208497, "memory(GiB)": 91.52, "step": 34450, "token_acc": 0.7657184011523227, "train_speed(iter/s)": 0.168846 }, { "epoch": 0.4470770658094067, "grad_norm": 0.7717043161392212, "learning_rate": 9.199008181984669e-05, "loss": 0.9132896423339844, "memory(GiB)": 91.52, "step": 34455, "token_acc": 0.7509534706331045, "train_speed(iter/s)": 0.168837 }, { "epoch": 0.4471419442110624, "grad_norm": 0.8294475078582764, "learning_rate": 9.198716963265925e-05, "loss": 0.9204582214355469, "memory(GiB)": 91.52, "step": 34460, "token_acc": 0.7446953254073666, "train_speed(iter/s)": 0.168827 }, { "epoch": 0.4472068226127181, "grad_norm": 0.8001875877380371, "learning_rate": 9.198425696228617e-05, "loss": 0.9668360710144043, "memory(GiB)": 91.52, "step": 34465, "token_acc": 0.7429994614970382, "train_speed(iter/s)": 0.168818 }, { "epoch": 0.4472717010143738, "grad_norm": 0.8110756278038025, "learning_rate": 9.198134380876096e-05, "loss": 0.9767145156860352, "memory(GiB)": 91.52, "step": 34470, "token_acc": 0.7508263310899606, "train_speed(iter/s)": 0.168809 }, { "epoch": 0.4473365794160295, "grad_norm": 0.933260977268219, "learning_rate": 9.197843017211716e-05, "loss": 0.990632152557373, "memory(GiB)": 91.52, "step": 34475, "token_acc": 0.7191391741315272, "train_speed(iter/s)": 0.168801 }, { "epoch": 0.4474014578176852, "grad_norm": 0.7353588342666626, "learning_rate": 9.197551605238828e-05, "loss": 0.8423562049865723, "memory(GiB)": 91.52, "step": 34480, "token_acc": 0.7682742903106881, "train_speed(iter/s)": 0.168791 }, { "epoch": 0.4474663362193409, "grad_norm": 0.800258994102478, "learning_rate": 9.197260144960786e-05, "loss": 0.9115304946899414, "memory(GiB)": 91.52, "step": 34485, "token_acc": 0.7587945981993998, "train_speed(iter/s)": 0.168784 }, { "epoch": 0.4475312146209966, "grad_norm": 0.8308528661727905, "learning_rate": 9.196968636380944e-05, "loss": 0.8906702041625977, "memory(GiB)": 91.52, "step": 34490, "token_acc": 0.7579951277665474, "train_speed(iter/s)": 0.168776 }, { "epoch": 0.4475960930226523, "grad_norm": 0.8269880414009094, "learning_rate": 9.196677079502657e-05, "loss": 0.9497161865234375, "memory(GiB)": 91.52, "step": 34495, "token_acc": 0.7656827855697439, "train_speed(iter/s)": 0.168768 }, { "epoch": 0.447660971424308, "grad_norm": 0.7907686829566956, "learning_rate": 9.196385474329281e-05, "loss": 0.9368995666503906, "memory(GiB)": 91.52, "step": 34500, "token_acc": 0.7535503159611334, "train_speed(iter/s)": 0.168757 }, { "epoch": 0.4477258498259637, "grad_norm": 0.7735003232955933, "learning_rate": 9.19609382086417e-05, "loss": 0.95081787109375, "memory(GiB)": 91.52, "step": 34505, "token_acc": 0.7312608061836797, "train_speed(iter/s)": 0.168747 }, { "epoch": 0.4477907282276194, "grad_norm": 0.8933537006378174, "learning_rate": 9.195802119110681e-05, "loss": 0.9830268859863281, "memory(GiB)": 91.52, "step": 34510, "token_acc": 0.732472684703434, "train_speed(iter/s)": 0.168737 }, { "epoch": 0.4478556066292751, "grad_norm": 0.796840250492096, "learning_rate": 9.195510369072174e-05, "loss": 0.9363405227661132, "memory(GiB)": 91.52, "step": 34515, "token_acc": 0.7490854796565882, "train_speed(iter/s)": 0.168728 }, { "epoch": 0.4479204850309308, "grad_norm": 0.8361436128616333, "learning_rate": 9.195218570752002e-05, "loss": 0.936821174621582, "memory(GiB)": 91.52, "step": 34520, "token_acc": 0.7591347916412073, "train_speed(iter/s)": 0.16872 }, { "epoch": 0.4479853634325865, "grad_norm": 0.8215303421020508, "learning_rate": 9.194926724153523e-05, "loss": 0.9048238754272461, "memory(GiB)": 91.52, "step": 34525, "token_acc": 0.7531786888296222, "train_speed(iter/s)": 0.16871 }, { "epoch": 0.4480502418342422, "grad_norm": 0.7813591957092285, "learning_rate": 9.1946348292801e-05, "loss": 0.948669719696045, "memory(GiB)": 91.52, "step": 34530, "token_acc": 0.7451917236911366, "train_speed(iter/s)": 0.168701 }, { "epoch": 0.4481151202358979, "grad_norm": 0.7788256406784058, "learning_rate": 9.194342886135087e-05, "loss": 0.9416092872619629, "memory(GiB)": 91.52, "step": 34535, "token_acc": 0.735505066864097, "train_speed(iter/s)": 0.168693 }, { "epoch": 0.4481799986375536, "grad_norm": 0.8664308786392212, "learning_rate": 9.194050894721848e-05, "loss": 0.9269571304321289, "memory(GiB)": 91.52, "step": 34540, "token_acc": 0.7432535780618377, "train_speed(iter/s)": 0.168684 }, { "epoch": 0.4482448770392093, "grad_norm": 0.7579393982887268, "learning_rate": 9.19375885504374e-05, "loss": 0.916837215423584, "memory(GiB)": 91.52, "step": 34545, "token_acc": 0.7691101528812231, "train_speed(iter/s)": 0.168675 }, { "epoch": 0.44830975544086493, "grad_norm": 0.7418537735939026, "learning_rate": 9.193466767104126e-05, "loss": 0.9074533462524415, "memory(GiB)": 91.52, "step": 34550, "token_acc": 0.7552279010420359, "train_speed(iter/s)": 0.168665 }, { "epoch": 0.44837463384252063, "grad_norm": 0.7508977651596069, "learning_rate": 9.193174630906366e-05, "loss": 0.9641000747680664, "memory(GiB)": 91.52, "step": 34555, "token_acc": 0.758124318429662, "train_speed(iter/s)": 0.168655 }, { "epoch": 0.44843951224417633, "grad_norm": 0.7808078527450562, "learning_rate": 9.192882446453822e-05, "loss": 0.9362448692321778, "memory(GiB)": 91.52, "step": 34560, "token_acc": 0.742011077971879, "train_speed(iter/s)": 0.168646 }, { "epoch": 0.44850439064583203, "grad_norm": 0.7940340638160706, "learning_rate": 9.192590213749856e-05, "loss": 0.9725242614746094, "memory(GiB)": 91.52, "step": 34565, "token_acc": 0.7549529388534193, "train_speed(iter/s)": 0.168636 }, { "epoch": 0.44856926904748773, "grad_norm": 0.7567580342292786, "learning_rate": 9.192297932797834e-05, "loss": 1.072452449798584, "memory(GiB)": 91.52, "step": 34570, "token_acc": 0.7508274904794107, "train_speed(iter/s)": 0.168627 }, { "epoch": 0.44863414744914343, "grad_norm": 0.7873967289924622, "learning_rate": 9.192005603601115e-05, "loss": 0.9044087409973145, "memory(GiB)": 91.52, "step": 34575, "token_acc": 0.737673434192992, "train_speed(iter/s)": 0.168618 }, { "epoch": 0.4486990258507991, "grad_norm": 0.774776816368103, "learning_rate": 9.191713226163065e-05, "loss": 0.9109911918640137, "memory(GiB)": 91.52, "step": 34580, "token_acc": 0.7428982513096731, "train_speed(iter/s)": 0.168608 }, { "epoch": 0.4487639042524548, "grad_norm": 0.7376053929328918, "learning_rate": 9.191420800487049e-05, "loss": 0.9636913299560547, "memory(GiB)": 91.52, "step": 34585, "token_acc": 0.7328917457082386, "train_speed(iter/s)": 0.168599 }, { "epoch": 0.4488287826541105, "grad_norm": 0.8004452586174011, "learning_rate": 9.191128326576434e-05, "loss": 0.9130496978759766, "memory(GiB)": 91.52, "step": 34590, "token_acc": 0.7710392925189632, "train_speed(iter/s)": 0.16859 }, { "epoch": 0.4488936610557662, "grad_norm": 0.8152819871902466, "learning_rate": 9.190835804434583e-05, "loss": 0.9658109664916992, "memory(GiB)": 91.52, "step": 34595, "token_acc": 0.7516536165039865, "train_speed(iter/s)": 0.16858 }, { "epoch": 0.4489585394574219, "grad_norm": 0.8409842252731323, "learning_rate": 9.190543234064863e-05, "loss": 0.8931568145751954, "memory(GiB)": 91.52, "step": 34600, "token_acc": 0.7516479519934537, "train_speed(iter/s)": 0.168571 }, { "epoch": 0.4490234178590776, "grad_norm": 0.7421589493751526, "learning_rate": 9.190250615470642e-05, "loss": 0.9518063545227051, "memory(GiB)": 91.52, "step": 34605, "token_acc": 0.7255129834755766, "train_speed(iter/s)": 0.168561 }, { "epoch": 0.4490882962607333, "grad_norm": 0.7560704350471497, "learning_rate": 9.189957948655285e-05, "loss": 0.9084373474121094, "memory(GiB)": 91.52, "step": 34610, "token_acc": 0.7854681647940075, "train_speed(iter/s)": 0.168551 }, { "epoch": 0.449153174662389, "grad_norm": 0.8547929525375366, "learning_rate": 9.189665233622162e-05, "loss": 0.9137098312377929, "memory(GiB)": 91.52, "step": 34615, "token_acc": 0.7496865382423344, "train_speed(iter/s)": 0.16854 }, { "epoch": 0.4492180530640447, "grad_norm": 0.7348806858062744, "learning_rate": 9.189372470374642e-05, "loss": 0.9188022613525391, "memory(GiB)": 91.52, "step": 34620, "token_acc": 0.7527772617499535, "train_speed(iter/s)": 0.16853 }, { "epoch": 0.4492829314657004, "grad_norm": 0.8094965219497681, "learning_rate": 9.189079658916093e-05, "loss": 0.9140888214111328, "memory(GiB)": 91.52, "step": 34625, "token_acc": 0.7781627838395754, "train_speed(iter/s)": 0.16852 }, { "epoch": 0.4493478098673561, "grad_norm": 0.8037238121032715, "learning_rate": 9.188786799249885e-05, "loss": 0.9084712982177734, "memory(GiB)": 91.52, "step": 34630, "token_acc": 0.7430263660680169, "train_speed(iter/s)": 0.168474 }, { "epoch": 0.4494126882690118, "grad_norm": 0.803832471370697, "learning_rate": 9.188493891379388e-05, "loss": 0.8895380020141601, "memory(GiB)": 91.52, "step": 34635, "token_acc": 0.7518033317159954, "train_speed(iter/s)": 0.168411 }, { "epoch": 0.4494775666706675, "grad_norm": 0.7190906405448914, "learning_rate": 9.188200935307974e-05, "loss": 0.9080434799194336, "memory(GiB)": 91.52, "step": 34640, "token_acc": 0.7587531059408177, "train_speed(iter/s)": 0.16836 }, { "epoch": 0.4495424450723232, "grad_norm": 0.7378900051116943, "learning_rate": 9.18790793103901e-05, "loss": 0.9315778732299804, "memory(GiB)": 91.52, "step": 34645, "token_acc": 0.7606379368849677, "train_speed(iter/s)": 0.168351 }, { "epoch": 0.4496073234739789, "grad_norm": 0.8137529492378235, "learning_rate": 9.187614878575874e-05, "loss": 0.9280198097229004, "memory(GiB)": 91.52, "step": 34650, "token_acc": 0.7365699600582097, "train_speed(iter/s)": 0.168342 }, { "epoch": 0.4496722018756346, "grad_norm": 0.8525418639183044, "learning_rate": 9.187321777921935e-05, "loss": 0.9654909133911133, "memory(GiB)": 91.52, "step": 34655, "token_acc": 0.7305222343390282, "train_speed(iter/s)": 0.168334 }, { "epoch": 0.4497370802772903, "grad_norm": 0.8372376561164856, "learning_rate": 9.187028629080565e-05, "loss": 0.9262643814086914, "memory(GiB)": 91.52, "step": 34660, "token_acc": 0.7390201363564294, "train_speed(iter/s)": 0.168325 }, { "epoch": 0.449801958678946, "grad_norm": 0.8879560828208923, "learning_rate": 9.186735432055141e-05, "loss": 0.8777887344360351, "memory(GiB)": 91.52, "step": 34665, "token_acc": 0.7441544581352479, "train_speed(iter/s)": 0.168317 }, { "epoch": 0.44986683708060166, "grad_norm": 0.7069893479347229, "learning_rate": 9.186442186849034e-05, "loss": 0.9341886520385743, "memory(GiB)": 91.52, "step": 34670, "token_acc": 0.7404713237946375, "train_speed(iter/s)": 0.168307 }, { "epoch": 0.44993171548225736, "grad_norm": 0.6979290843009949, "learning_rate": 9.18614889346562e-05, "loss": 0.881960105895996, "memory(GiB)": 91.52, "step": 34675, "token_acc": 0.7637014148642055, "train_speed(iter/s)": 0.168298 }, { "epoch": 0.44999659388391305, "grad_norm": 0.7497026920318604, "learning_rate": 9.185855551908273e-05, "loss": 0.9401533126831054, "memory(GiB)": 91.52, "step": 34680, "token_acc": 0.7606740751571868, "train_speed(iter/s)": 0.168288 }, { "epoch": 0.45006147228556875, "grad_norm": 0.8346364498138428, "learning_rate": 9.185562162180372e-05, "loss": 0.9820891380310058, "memory(GiB)": 91.52, "step": 34685, "token_acc": 0.7498693932383014, "train_speed(iter/s)": 0.16828 }, { "epoch": 0.45012635068722445, "grad_norm": 0.8494754433631897, "learning_rate": 9.185268724285289e-05, "loss": 0.939082145690918, "memory(GiB)": 91.52, "step": 34690, "token_acc": 0.7605454545454545, "train_speed(iter/s)": 0.168271 }, { "epoch": 0.45019122908888015, "grad_norm": 0.8055437207221985, "learning_rate": 9.184975238226403e-05, "loss": 0.915189552307129, "memory(GiB)": 91.52, "step": 34695, "token_acc": 0.7488956846755012, "train_speed(iter/s)": 0.168261 }, { "epoch": 0.45025610749053585, "grad_norm": 0.7362242341041565, "learning_rate": 9.184681704007092e-05, "loss": 0.9418656349182128, "memory(GiB)": 91.52, "step": 34700, "token_acc": 0.7452304853537184, "train_speed(iter/s)": 0.168253 }, { "epoch": 0.45032098589219155, "grad_norm": 0.7759969234466553, "learning_rate": 9.184388121630733e-05, "loss": 0.9043281555175782, "memory(GiB)": 91.52, "step": 34705, "token_acc": 0.7719130642464772, "train_speed(iter/s)": 0.168243 }, { "epoch": 0.45038586429384725, "grad_norm": 0.7364447116851807, "learning_rate": 9.184094491100705e-05, "loss": 0.9086784362792969, "memory(GiB)": 91.52, "step": 34710, "token_acc": 0.7394866933026343, "train_speed(iter/s)": 0.168235 }, { "epoch": 0.45045074269550295, "grad_norm": 0.831207275390625, "learning_rate": 9.183800812420386e-05, "loss": 0.9184648513793945, "memory(GiB)": 91.52, "step": 34715, "token_acc": 0.7433191673269466, "train_speed(iter/s)": 0.168226 }, { "epoch": 0.45051562109715865, "grad_norm": 0.8110468983650208, "learning_rate": 9.183507085593159e-05, "loss": 0.961241626739502, "memory(GiB)": 91.52, "step": 34720, "token_acc": 0.7354525681780157, "train_speed(iter/s)": 0.168217 }, { "epoch": 0.45058049949881435, "grad_norm": 0.6340048313140869, "learning_rate": 9.183213310622398e-05, "loss": 0.8799868583679199, "memory(GiB)": 91.52, "step": 34725, "token_acc": 0.7641387714142356, "train_speed(iter/s)": 0.168207 }, { "epoch": 0.45064537790047005, "grad_norm": 0.8722267746925354, "learning_rate": 9.18291948751149e-05, "loss": 0.9387825012207032, "memory(GiB)": 91.52, "step": 34730, "token_acc": 0.7293831775700934, "train_speed(iter/s)": 0.168198 }, { "epoch": 0.45071025630212574, "grad_norm": 0.760854959487915, "learning_rate": 9.182625616263814e-05, "loss": 0.9488674163818359, "memory(GiB)": 91.52, "step": 34735, "token_acc": 0.7527868448605037, "train_speed(iter/s)": 0.168189 }, { "epoch": 0.45077513470378144, "grad_norm": 0.8072935938835144, "learning_rate": 9.18233169688275e-05, "loss": 0.8901485443115235, "memory(GiB)": 91.52, "step": 34740, "token_acc": 0.7593622688812285, "train_speed(iter/s)": 0.168178 }, { "epoch": 0.45084001310543714, "grad_norm": 0.8166311383247375, "learning_rate": 9.182037729371683e-05, "loss": 0.9650669097900391, "memory(GiB)": 91.52, "step": 34745, "token_acc": 0.7251190774630233, "train_speed(iter/s)": 0.16817 }, { "epoch": 0.45090489150709284, "grad_norm": 0.6921681761741638, "learning_rate": 9.181743713733995e-05, "loss": 0.8958015441894531, "memory(GiB)": 91.52, "step": 34750, "token_acc": 0.7740027991602519, "train_speed(iter/s)": 0.168161 }, { "epoch": 0.45096976990874854, "grad_norm": 0.785194456577301, "learning_rate": 9.181449649973068e-05, "loss": 0.9072355270385742, "memory(GiB)": 91.52, "step": 34755, "token_acc": 0.7568325373594207, "train_speed(iter/s)": 0.168151 }, { "epoch": 0.45103464831040424, "grad_norm": 0.7280822396278381, "learning_rate": 9.18115553809229e-05, "loss": 0.9303714752197265, "memory(GiB)": 91.52, "step": 34760, "token_acc": 0.7587439016010444, "train_speed(iter/s)": 0.168142 }, { "epoch": 0.45109952671205994, "grad_norm": 0.7621335387229919, "learning_rate": 9.180861378095042e-05, "loss": 0.9599955558776856, "memory(GiB)": 91.52, "step": 34765, "token_acc": 0.7560975609756098, "train_speed(iter/s)": 0.168134 }, { "epoch": 0.45116440511371564, "grad_norm": 0.7534878849983215, "learning_rate": 9.180567169984709e-05, "loss": 0.9679143905639649, "memory(GiB)": 91.52, "step": 34770, "token_acc": 0.7465428058693128, "train_speed(iter/s)": 0.168125 }, { "epoch": 0.45122928351537134, "grad_norm": 0.6694162487983704, "learning_rate": 9.18027291376468e-05, "loss": 0.911400032043457, "memory(GiB)": 91.52, "step": 34775, "token_acc": 0.7599465606569129, "train_speed(iter/s)": 0.168115 }, { "epoch": 0.45129416191702704, "grad_norm": 0.7736457586288452, "learning_rate": 9.179978609438339e-05, "loss": 0.9457161903381348, "memory(GiB)": 91.52, "step": 34780, "token_acc": 0.7638898644377327, "train_speed(iter/s)": 0.168107 }, { "epoch": 0.45135904031868274, "grad_norm": 0.7914275527000427, "learning_rate": 9.179684257009072e-05, "loss": 0.9454702377319336, "memory(GiB)": 91.52, "step": 34785, "token_acc": 0.7576529008737193, "train_speed(iter/s)": 0.168098 }, { "epoch": 0.4514239187203384, "grad_norm": 0.9110901355743408, "learning_rate": 9.179389856480269e-05, "loss": 0.9333501815795898, "memory(GiB)": 91.52, "step": 34790, "token_acc": 0.7507598784194529, "train_speed(iter/s)": 0.168089 }, { "epoch": 0.4514887971219941, "grad_norm": 0.7277281880378723, "learning_rate": 9.179095407855315e-05, "loss": 0.9325742721557617, "memory(GiB)": 91.52, "step": 34795, "token_acc": 0.7680474118908014, "train_speed(iter/s)": 0.168081 }, { "epoch": 0.4515536755236498, "grad_norm": 0.7518046498298645, "learning_rate": 9.178800911137601e-05, "loss": 0.9913908958435058, "memory(GiB)": 91.52, "step": 34800, "token_acc": 0.727699186395954, "train_speed(iter/s)": 0.168071 }, { "epoch": 0.4516185539253055, "grad_norm": 0.7753762006759644, "learning_rate": 9.178506366330515e-05, "loss": 0.9348020553588867, "memory(GiB)": 91.52, "step": 34805, "token_acc": 0.7463559447825442, "train_speed(iter/s)": 0.168061 }, { "epoch": 0.4516834323269612, "grad_norm": 0.8701016306877136, "learning_rate": 9.178211773437448e-05, "loss": 0.9951760292053222, "memory(GiB)": 91.52, "step": 34810, "token_acc": 0.7394636015325671, "train_speed(iter/s)": 0.168054 }, { "epoch": 0.4517483107286169, "grad_norm": 0.7863270044326782, "learning_rate": 9.177917132461787e-05, "loss": 0.9697797775268555, "memory(GiB)": 91.52, "step": 34815, "token_acc": 0.7375311322562105, "train_speed(iter/s)": 0.168045 }, { "epoch": 0.4518131891302726, "grad_norm": 0.8181257843971252, "learning_rate": 9.177622443406925e-05, "loss": 0.9518377304077148, "memory(GiB)": 91.52, "step": 34820, "token_acc": 0.7496294515883507, "train_speed(iter/s)": 0.168036 }, { "epoch": 0.4518780675319283, "grad_norm": 0.7283294200897217, "learning_rate": 9.177327706276253e-05, "loss": 0.9287506103515625, "memory(GiB)": 91.52, "step": 34825, "token_acc": 0.7551184263348053, "train_speed(iter/s)": 0.168027 }, { "epoch": 0.451942945933584, "grad_norm": 0.7081485390663147, "learning_rate": 9.177032921073162e-05, "loss": 0.9389151573181153, "memory(GiB)": 91.52, "step": 34830, "token_acc": 0.7440206851971558, "train_speed(iter/s)": 0.168017 }, { "epoch": 0.45200782433523967, "grad_norm": 0.7403780221939087, "learning_rate": 9.176738087801045e-05, "loss": 0.9542919158935547, "memory(GiB)": 91.52, "step": 34835, "token_acc": 0.7544496124031008, "train_speed(iter/s)": 0.168008 }, { "epoch": 0.45207270273689537, "grad_norm": 0.7874975204467773, "learning_rate": 9.176443206463297e-05, "loss": 0.9822231292724609, "memory(GiB)": 91.52, "step": 34840, "token_acc": 0.7340117607629262, "train_speed(iter/s)": 0.167998 }, { "epoch": 0.45213758113855107, "grad_norm": 0.7202053070068359, "learning_rate": 9.176148277063307e-05, "loss": 0.879153060913086, "memory(GiB)": 91.52, "step": 34845, "token_acc": 0.7669772220057388, "train_speed(iter/s)": 0.167989 }, { "epoch": 0.45220245954020677, "grad_norm": 0.7862081527709961, "learning_rate": 9.17585329960447e-05, "loss": 0.9302162170410156, "memory(GiB)": 91.52, "step": 34850, "token_acc": 0.7559995271308665, "train_speed(iter/s)": 0.16798 }, { "epoch": 0.45226733794186247, "grad_norm": 0.7946388125419617, "learning_rate": 9.175558274090186e-05, "loss": 0.9165701866149902, "memory(GiB)": 91.52, "step": 34855, "token_acc": 0.7426771371413216, "train_speed(iter/s)": 0.16797 }, { "epoch": 0.45233221634351817, "grad_norm": 1.4713153839111328, "learning_rate": 9.175263200523844e-05, "loss": 0.9694816589355468, "memory(GiB)": 91.52, "step": 34860, "token_acc": 0.7567612355826594, "train_speed(iter/s)": 0.167962 }, { "epoch": 0.45239709474517387, "grad_norm": 0.7450053095817566, "learning_rate": 9.174968078908841e-05, "loss": 0.9210927963256836, "memory(GiB)": 91.52, "step": 34865, "token_acc": 0.7410166965827887, "train_speed(iter/s)": 0.167952 }, { "epoch": 0.45246197314682957, "grad_norm": 0.7799027562141418, "learning_rate": 9.174672909248577e-05, "loss": 0.9517557144165039, "memory(GiB)": 91.52, "step": 34870, "token_acc": 0.7542829676932056, "train_speed(iter/s)": 0.167942 }, { "epoch": 0.45252685154848526, "grad_norm": 0.7419429421424866, "learning_rate": 9.174377691546444e-05, "loss": 0.9467685699462891, "memory(GiB)": 91.52, "step": 34875, "token_acc": 0.7429553539310124, "train_speed(iter/s)": 0.167933 }, { "epoch": 0.45259172995014096, "grad_norm": 0.7428159713745117, "learning_rate": 9.174082425805839e-05, "loss": 0.9572141647338868, "memory(GiB)": 91.52, "step": 34880, "token_acc": 0.7439123979213066, "train_speed(iter/s)": 0.167924 }, { "epoch": 0.45265660835179666, "grad_norm": 0.6867218613624573, "learning_rate": 9.173787112030165e-05, "loss": 0.9397597312927246, "memory(GiB)": 91.52, "step": 34885, "token_acc": 0.7548559487513274, "train_speed(iter/s)": 0.167916 }, { "epoch": 0.45272148675345236, "grad_norm": 0.8201705813407898, "learning_rate": 9.173491750222816e-05, "loss": 0.9698999404907227, "memory(GiB)": 91.52, "step": 34890, "token_acc": 0.7221304979527143, "train_speed(iter/s)": 0.167907 }, { "epoch": 0.45278636515510806, "grad_norm": 0.854033350944519, "learning_rate": 9.173196340387193e-05, "loss": 0.932696533203125, "memory(GiB)": 91.52, "step": 34895, "token_acc": 0.7520527661865661, "train_speed(iter/s)": 0.167897 }, { "epoch": 0.45285124355676376, "grad_norm": 0.8282272815704346, "learning_rate": 9.172900882526696e-05, "loss": 0.9404664993286133, "memory(GiB)": 91.52, "step": 34900, "token_acc": 0.722449418854929, "train_speed(iter/s)": 0.167888 }, { "epoch": 0.45291612195841946, "grad_norm": 0.815483570098877, "learning_rate": 9.172605376644722e-05, "loss": 0.9554287910461425, "memory(GiB)": 91.52, "step": 34905, "token_acc": 0.7545853608738557, "train_speed(iter/s)": 0.167879 }, { "epoch": 0.4529810003600751, "grad_norm": 0.7745267152786255, "learning_rate": 9.172309822744675e-05, "loss": 0.9444992065429687, "memory(GiB)": 91.52, "step": 34910, "token_acc": 0.7517001838825487, "train_speed(iter/s)": 0.167871 }, { "epoch": 0.4530458787617308, "grad_norm": 0.8006210327148438, "learning_rate": 9.172014220829956e-05, "loss": 0.923861312866211, "memory(GiB)": 91.52, "step": 34915, "token_acc": 0.7537647234232892, "train_speed(iter/s)": 0.167861 }, { "epoch": 0.4531107571633865, "grad_norm": 0.7849236130714417, "learning_rate": 9.171718570903966e-05, "loss": 0.9488470077514648, "memory(GiB)": 91.52, "step": 34920, "token_acc": 0.7528890180972454, "train_speed(iter/s)": 0.167853 }, { "epoch": 0.4531756355650422, "grad_norm": 0.7046148180961609, "learning_rate": 9.171422872970107e-05, "loss": 0.9161289215087891, "memory(GiB)": 91.52, "step": 34925, "token_acc": 0.7550212723854755, "train_speed(iter/s)": 0.167843 }, { "epoch": 0.4532405139666979, "grad_norm": 0.7709460258483887, "learning_rate": 9.171127127031782e-05, "loss": 0.902254867553711, "memory(GiB)": 91.52, "step": 34930, "token_acc": 0.7691490007896495, "train_speed(iter/s)": 0.167835 }, { "epoch": 0.4533053923683536, "grad_norm": 0.7130882143974304, "learning_rate": 9.170831333092393e-05, "loss": 0.931539249420166, "memory(GiB)": 91.52, "step": 34935, "token_acc": 0.7821685173089484, "train_speed(iter/s)": 0.167824 }, { "epoch": 0.4533702707700093, "grad_norm": 0.8145925402641296, "learning_rate": 9.170535491155348e-05, "loss": 0.9365930557250977, "memory(GiB)": 91.52, "step": 34940, "token_acc": 0.7654662122747229, "train_speed(iter/s)": 0.167815 }, { "epoch": 0.453435149171665, "grad_norm": 0.9652552604675293, "learning_rate": 9.170239601224047e-05, "loss": 0.9566853523254395, "memory(GiB)": 91.52, "step": 34945, "token_acc": 0.7520615341972143, "train_speed(iter/s)": 0.167806 }, { "epoch": 0.4535000275733207, "grad_norm": 0.8805146813392639, "learning_rate": 9.169943663301897e-05, "loss": 0.9013219833374023, "memory(GiB)": 91.52, "step": 34950, "token_acc": 0.7564610519428592, "train_speed(iter/s)": 0.167799 }, { "epoch": 0.4535649059749764, "grad_norm": 0.8715558648109436, "learning_rate": 9.169647677392305e-05, "loss": 0.9471593856811523, "memory(GiB)": 91.52, "step": 34955, "token_acc": 0.7573936422584217, "train_speed(iter/s)": 0.167791 }, { "epoch": 0.4536297843766321, "grad_norm": 0.7694317698478699, "learning_rate": 9.169351643498674e-05, "loss": 0.9071227073669433, "memory(GiB)": 91.52, "step": 34960, "token_acc": 0.7562730862310534, "train_speed(iter/s)": 0.167783 }, { "epoch": 0.4536946627782878, "grad_norm": 0.7125844955444336, "learning_rate": 9.169055561624413e-05, "loss": 0.9654855728149414, "memory(GiB)": 91.52, "step": 34965, "token_acc": 0.7582723143942904, "train_speed(iter/s)": 0.167775 }, { "epoch": 0.4537595411799435, "grad_norm": 0.7453534603118896, "learning_rate": 9.168759431772931e-05, "loss": 0.9651365280151367, "memory(GiB)": 91.52, "step": 34970, "token_acc": 0.7492616656822209, "train_speed(iter/s)": 0.167767 }, { "epoch": 0.4538244195815992, "grad_norm": 0.7604342103004456, "learning_rate": 9.168463253947632e-05, "loss": 0.9478255271911621, "memory(GiB)": 91.52, "step": 34975, "token_acc": 0.7435776988344441, "train_speed(iter/s)": 0.167757 }, { "epoch": 0.4538892979832549, "grad_norm": 0.7478224039077759, "learning_rate": 9.168167028151926e-05, "loss": 0.918367576599121, "memory(GiB)": 91.52, "step": 34980, "token_acc": 0.7332094607672357, "train_speed(iter/s)": 0.167747 }, { "epoch": 0.4539541763849106, "grad_norm": 0.7085460424423218, "learning_rate": 9.167870754389222e-05, "loss": 0.8899293899536133, "memory(GiB)": 91.52, "step": 34985, "token_acc": 0.748331804813257, "train_speed(iter/s)": 0.167738 }, { "epoch": 0.4540190547865663, "grad_norm": 0.7810055613517761, "learning_rate": 9.167574432662931e-05, "loss": 0.9237375259399414, "memory(GiB)": 91.52, "step": 34990, "token_acc": 0.7448598483419474, "train_speed(iter/s)": 0.167728 }, { "epoch": 0.454083933188222, "grad_norm": 0.7628288269042969, "learning_rate": 9.16727806297646e-05, "loss": 0.9124008178710937, "memory(GiB)": 91.52, "step": 34995, "token_acc": 0.7640644562012843, "train_speed(iter/s)": 0.167718 }, { "epoch": 0.4541488115898777, "grad_norm": 0.787085771560669, "learning_rate": 9.166981645333223e-05, "loss": 0.9489184379577636, "memory(GiB)": 91.52, "step": 35000, "token_acc": 0.7443600323936601, "train_speed(iter/s)": 0.167708 }, { "epoch": 0.4542136899915334, "grad_norm": 0.7205386757850647, "learning_rate": 9.16668517973663e-05, "loss": 0.9157720565795898, "memory(GiB)": 91.52, "step": 35005, "token_acc": 0.7519833653230966, "train_speed(iter/s)": 0.1677 }, { "epoch": 0.4542785683931891, "grad_norm": 0.7901052832603455, "learning_rate": 9.16638866619009e-05, "loss": 0.9391443252563476, "memory(GiB)": 91.52, "step": 35010, "token_acc": 0.7547025335419649, "train_speed(iter/s)": 0.16769 }, { "epoch": 0.4543434467948448, "grad_norm": 0.8007659912109375, "learning_rate": 9.166092104697018e-05, "loss": 0.9350727081298829, "memory(GiB)": 91.52, "step": 35015, "token_acc": 0.7445898523175903, "train_speed(iter/s)": 0.16768 }, { "epoch": 0.4544083251965005, "grad_norm": 0.7910777926445007, "learning_rate": 9.165795495260826e-05, "loss": 0.9050752639770507, "memory(GiB)": 91.52, "step": 35020, "token_acc": 0.7545018007202882, "train_speed(iter/s)": 0.167669 }, { "epoch": 0.4544732035981562, "grad_norm": 0.7654515504837036, "learning_rate": 9.165498837884928e-05, "loss": 0.9587240219116211, "memory(GiB)": 91.52, "step": 35025, "token_acc": 0.7294666396268505, "train_speed(iter/s)": 0.167661 }, { "epoch": 0.4545380819998118, "grad_norm": 0.7687620520591736, "learning_rate": 9.165202132572737e-05, "loss": 0.9414295196533203, "memory(GiB)": 91.52, "step": 35030, "token_acc": 0.7443749444164468, "train_speed(iter/s)": 0.16765 }, { "epoch": 0.4546029604014675, "grad_norm": 0.77130126953125, "learning_rate": 9.164905379327668e-05, "loss": 0.9331441879272461, "memory(GiB)": 91.52, "step": 35035, "token_acc": 0.748276604911676, "train_speed(iter/s)": 0.167642 }, { "epoch": 0.4546678388031232, "grad_norm": 0.741213858127594, "learning_rate": 9.164608578153137e-05, "loss": 0.920262622833252, "memory(GiB)": 91.52, "step": 35040, "token_acc": 0.7415691672401927, "train_speed(iter/s)": 0.167634 }, { "epoch": 0.4547327172047789, "grad_norm": 0.8392524719238281, "learning_rate": 9.164311729052558e-05, "loss": 0.9077022552490235, "memory(GiB)": 91.52, "step": 35045, "token_acc": 0.7644134137310906, "train_speed(iter/s)": 0.167625 }, { "epoch": 0.4547975956064346, "grad_norm": 0.8580380082130432, "learning_rate": 9.164014832029346e-05, "loss": 0.9358585357666016, "memory(GiB)": 91.52, "step": 35050, "token_acc": 0.7435844584149233, "train_speed(iter/s)": 0.167616 }, { "epoch": 0.4548624740080903, "grad_norm": 0.816996157169342, "learning_rate": 9.163717887086921e-05, "loss": 0.943875503540039, "memory(GiB)": 91.52, "step": 35055, "token_acc": 0.7583092230611813, "train_speed(iter/s)": 0.167607 }, { "epoch": 0.454927352409746, "grad_norm": 0.8259018063545227, "learning_rate": 9.163420894228699e-05, "loss": 0.914954948425293, "memory(GiB)": 91.52, "step": 35060, "token_acc": 0.7542869315156033, "train_speed(iter/s)": 0.167598 }, { "epoch": 0.4549922308114017, "grad_norm": 0.7724080085754395, "learning_rate": 9.163123853458095e-05, "loss": 0.9357036590576172, "memory(GiB)": 91.52, "step": 35065, "token_acc": 0.7429402973334382, "train_speed(iter/s)": 0.167588 }, { "epoch": 0.4550571092130574, "grad_norm": 0.8045805096626282, "learning_rate": 9.162826764778532e-05, "loss": 0.9259279251098633, "memory(GiB)": 91.52, "step": 35070, "token_acc": 0.7744016854857293, "train_speed(iter/s)": 0.167579 }, { "epoch": 0.4551219876147131, "grad_norm": 0.7893227338790894, "learning_rate": 9.162529628193425e-05, "loss": 0.9353017807006836, "memory(GiB)": 91.52, "step": 35075, "token_acc": 0.7251256471299551, "train_speed(iter/s)": 0.167572 }, { "epoch": 0.4551868660163688, "grad_norm": 0.7609859108924866, "learning_rate": 9.162232443706196e-05, "loss": 0.9166243553161622, "memory(GiB)": 91.52, "step": 35080, "token_acc": 0.7345783814374647, "train_speed(iter/s)": 0.167563 }, { "epoch": 0.4552517444180245, "grad_norm": 0.7443593144416809, "learning_rate": 9.161935211320263e-05, "loss": 0.929375171661377, "memory(GiB)": 91.52, "step": 35085, "token_acc": 0.7560445638432364, "train_speed(iter/s)": 0.167553 }, { "epoch": 0.4553166228196802, "grad_norm": 0.9474352598190308, "learning_rate": 9.161637931039048e-05, "loss": 0.9701238632202148, "memory(GiB)": 91.52, "step": 35090, "token_acc": 0.744977894981773, "train_speed(iter/s)": 0.167544 }, { "epoch": 0.4553815012213359, "grad_norm": 0.7841624021530151, "learning_rate": 9.161340602865971e-05, "loss": 0.9120545387268066, "memory(GiB)": 91.52, "step": 35095, "token_acc": 0.7504871295251769, "train_speed(iter/s)": 0.167536 }, { "epoch": 0.4554463796229916, "grad_norm": 0.7574517130851746, "learning_rate": 9.161043226804456e-05, "loss": 0.9153092384338379, "memory(GiB)": 91.52, "step": 35100, "token_acc": 0.7524799416484318, "train_speed(iter/s)": 0.167528 }, { "epoch": 0.4555112580246473, "grad_norm": 0.9489496946334839, "learning_rate": 9.160745802857921e-05, "loss": 0.9887943267822266, "memory(GiB)": 91.52, "step": 35105, "token_acc": 0.7338976052848886, "train_speed(iter/s)": 0.167519 }, { "epoch": 0.455576136426303, "grad_norm": 0.7223383784294128, "learning_rate": 9.160448331029792e-05, "loss": 0.8967300415039062, "memory(GiB)": 91.52, "step": 35110, "token_acc": 0.7617549439911492, "train_speed(iter/s)": 0.167509 }, { "epoch": 0.4556410148279587, "grad_norm": 0.8224212527275085, "learning_rate": 9.160150811323491e-05, "loss": 0.9166227340698242, "memory(GiB)": 91.52, "step": 35115, "token_acc": 0.7436979043633787, "train_speed(iter/s)": 0.1675 }, { "epoch": 0.4557058932296144, "grad_norm": 0.8079283237457275, "learning_rate": 9.159853243742442e-05, "loss": 0.9773153305053711, "memory(GiB)": 91.52, "step": 35120, "token_acc": 0.75333511715913, "train_speed(iter/s)": 0.167491 }, { "epoch": 0.4557707716312701, "grad_norm": 0.7924146056175232, "learning_rate": 9.15955562829007e-05, "loss": 0.8697761535644531, "memory(GiB)": 91.52, "step": 35125, "token_acc": 0.750544856908071, "train_speed(iter/s)": 0.167482 }, { "epoch": 0.4558356500329258, "grad_norm": 0.8258086442947388, "learning_rate": 9.1592579649698e-05, "loss": 0.9273817062377929, "memory(GiB)": 91.52, "step": 35130, "token_acc": 0.7401306330361911, "train_speed(iter/s)": 0.167473 }, { "epoch": 0.4559005284345815, "grad_norm": 0.8360722661018372, "learning_rate": 9.158960253785058e-05, "loss": 0.8983041763305664, "memory(GiB)": 91.52, "step": 35135, "token_acc": 0.7781044289640187, "train_speed(iter/s)": 0.167463 }, { "epoch": 0.4559654068362372, "grad_norm": 0.7792496681213379, "learning_rate": 9.158662494739268e-05, "loss": 0.9058518409729004, "memory(GiB)": 91.52, "step": 35140, "token_acc": 0.7449624458194091, "train_speed(iter/s)": 0.167453 }, { "epoch": 0.45603028523789285, "grad_norm": 0.8312479853630066, "learning_rate": 9.158364687835856e-05, "loss": 0.9589261054992676, "memory(GiB)": 91.52, "step": 35145, "token_acc": 0.7540181691125087, "train_speed(iter/s)": 0.167446 }, { "epoch": 0.45609516363954855, "grad_norm": 0.7306687235832214, "learning_rate": 9.158066833078254e-05, "loss": 0.9328989028930664, "memory(GiB)": 91.52, "step": 35150, "token_acc": 0.7433523945675482, "train_speed(iter/s)": 0.167437 }, { "epoch": 0.45616004204120425, "grad_norm": 0.8026595115661621, "learning_rate": 9.157768930469883e-05, "loss": 0.971010398864746, "memory(GiB)": 91.52, "step": 35155, "token_acc": 0.7485816118708176, "train_speed(iter/s)": 0.167428 }, { "epoch": 0.45622492044285995, "grad_norm": 0.7859755158424377, "learning_rate": 9.157470980014177e-05, "loss": 0.9626235961914062, "memory(GiB)": 91.52, "step": 35160, "token_acc": 0.7444403337855058, "train_speed(iter/s)": 0.167419 }, { "epoch": 0.45628979884451565, "grad_norm": 0.8370198607444763, "learning_rate": 9.157172981714562e-05, "loss": 0.9518228530883789, "memory(GiB)": 91.52, "step": 35165, "token_acc": 0.7347404449515117, "train_speed(iter/s)": 0.167411 }, { "epoch": 0.45635467724617135, "grad_norm": 0.8393673300743103, "learning_rate": 9.15687493557447e-05, "loss": 0.957308006286621, "memory(GiB)": 91.52, "step": 35170, "token_acc": 0.7440289241107296, "train_speed(iter/s)": 0.167403 }, { "epoch": 0.45641955564782705, "grad_norm": 0.8405842185020447, "learning_rate": 9.156576841597325e-05, "loss": 0.9187222480773926, "memory(GiB)": 91.52, "step": 35175, "token_acc": 0.7474931971521228, "train_speed(iter/s)": 0.167394 }, { "epoch": 0.45648443404948275, "grad_norm": 0.8131040334701538, "learning_rate": 9.156278699786563e-05, "loss": 0.9267402648925781, "memory(GiB)": 91.52, "step": 35180, "token_acc": 0.7776064157927205, "train_speed(iter/s)": 0.167384 }, { "epoch": 0.45654931245113844, "grad_norm": 0.7548953294754028, "learning_rate": 9.155980510145613e-05, "loss": 0.9111480712890625, "memory(GiB)": 91.52, "step": 35185, "token_acc": 0.7808304204370978, "train_speed(iter/s)": 0.167375 }, { "epoch": 0.45661419085279414, "grad_norm": 0.684268057346344, "learning_rate": 9.155682272677907e-05, "loss": 0.8924978256225586, "memory(GiB)": 91.52, "step": 35190, "token_acc": 0.7444972109151213, "train_speed(iter/s)": 0.167365 }, { "epoch": 0.45667906925444984, "grad_norm": 0.6836310029029846, "learning_rate": 9.155383987386876e-05, "loss": 0.8986825942993164, "memory(GiB)": 91.52, "step": 35195, "token_acc": 0.7713156408808582, "train_speed(iter/s)": 0.167358 }, { "epoch": 0.45674394765610554, "grad_norm": 0.631680965423584, "learning_rate": 9.155085654275955e-05, "loss": 0.9330350875854492, "memory(GiB)": 91.52, "step": 35200, "token_acc": 0.7527225378787878, "train_speed(iter/s)": 0.167348 }, { "epoch": 0.45680882605776124, "grad_norm": 0.7379662990570068, "learning_rate": 9.154787273348575e-05, "loss": 0.9489073753356934, "memory(GiB)": 91.52, "step": 35205, "token_acc": 0.7384554569303194, "train_speed(iter/s)": 0.16734 }, { "epoch": 0.45687370445941694, "grad_norm": 0.8046874403953552, "learning_rate": 9.154488844608171e-05, "loss": 0.9237844467163085, "memory(GiB)": 91.52, "step": 35210, "token_acc": 0.748399068548974, "train_speed(iter/s)": 0.16733 }, { "epoch": 0.45693858286107264, "grad_norm": 0.8558252453804016, "learning_rate": 9.154190368058175e-05, "loss": 0.9112079620361329, "memory(GiB)": 91.52, "step": 35215, "token_acc": 0.7516737175011295, "train_speed(iter/s)": 0.167322 }, { "epoch": 0.45700346126272834, "grad_norm": 0.7952781319618225, "learning_rate": 9.153891843702025e-05, "loss": 0.8798693656921387, "memory(GiB)": 91.52, "step": 35220, "token_acc": 0.7704457185130387, "train_speed(iter/s)": 0.167313 }, { "epoch": 0.45706833966438404, "grad_norm": 0.7419692873954773, "learning_rate": 9.153593271543155e-05, "loss": 0.9609298706054688, "memory(GiB)": 91.52, "step": 35225, "token_acc": 0.7403233872312052, "train_speed(iter/s)": 0.167303 }, { "epoch": 0.45713321806603974, "grad_norm": 0.7541870474815369, "learning_rate": 9.153294651585001e-05, "loss": 0.8960060119628906, "memory(GiB)": 91.52, "step": 35230, "token_acc": 0.7573314061846259, "train_speed(iter/s)": 0.167294 }, { "epoch": 0.45719809646769544, "grad_norm": 0.8920648097991943, "learning_rate": 9.152995983831e-05, "loss": 0.9509211540222168, "memory(GiB)": 91.52, "step": 35235, "token_acc": 0.7432859674113191, "train_speed(iter/s)": 0.167283 }, { "epoch": 0.45726297486935114, "grad_norm": 0.7928977012634277, "learning_rate": 9.152697268284587e-05, "loss": 0.926696491241455, "memory(GiB)": 91.52, "step": 35240, "token_acc": 0.7638185935002664, "train_speed(iter/s)": 0.167274 }, { "epoch": 0.45732785327100683, "grad_norm": 0.7910410165786743, "learning_rate": 9.152398504949202e-05, "loss": 0.9685531616210937, "memory(GiB)": 91.52, "step": 35245, "token_acc": 0.7384253656512568, "train_speed(iter/s)": 0.167266 }, { "epoch": 0.45739273167266253, "grad_norm": 0.8254736661911011, "learning_rate": 9.152099693828283e-05, "loss": 0.9272407531738281, "memory(GiB)": 91.52, "step": 35250, "token_acc": 0.7651208729188224, "train_speed(iter/s)": 0.167257 }, { "epoch": 0.45745761007431823, "grad_norm": 0.7327966690063477, "learning_rate": 9.151800834925266e-05, "loss": 0.9354045867919922, "memory(GiB)": 91.52, "step": 35255, "token_acc": 0.7479866010975696, "train_speed(iter/s)": 0.167247 }, { "epoch": 0.45752248847597393, "grad_norm": 0.8240915536880493, "learning_rate": 9.151501928243593e-05, "loss": 0.9486912727355957, "memory(GiB)": 91.52, "step": 35260, "token_acc": 0.7621299969484284, "train_speed(iter/s)": 0.167238 }, { "epoch": 0.4575873668776296, "grad_norm": 0.7732603549957275, "learning_rate": 9.151202973786704e-05, "loss": 0.8893621444702149, "memory(GiB)": 91.52, "step": 35265, "token_acc": 0.7322880371660859, "train_speed(iter/s)": 0.167229 }, { "epoch": 0.4576522452792853, "grad_norm": 0.7682240009307861, "learning_rate": 9.150903971558037e-05, "loss": 0.9250563621520996, "memory(GiB)": 91.52, "step": 35270, "token_acc": 0.7617137234123079, "train_speed(iter/s)": 0.167221 }, { "epoch": 0.457717123680941, "grad_norm": 0.8060373663902283, "learning_rate": 9.150604921561035e-05, "loss": 0.9444077491760254, "memory(GiB)": 91.52, "step": 35275, "token_acc": 0.7574254827875735, "train_speed(iter/s)": 0.16721 }, { "epoch": 0.4577820020825967, "grad_norm": 0.7441921234130859, "learning_rate": 9.15030582379914e-05, "loss": 0.9451492309570313, "memory(GiB)": 91.52, "step": 35280, "token_acc": 0.7368836736996502, "train_speed(iter/s)": 0.167204 }, { "epoch": 0.45784688048425237, "grad_norm": 0.7144469618797302, "learning_rate": 9.150006678275789e-05, "loss": 0.9327427864074707, "memory(GiB)": 91.52, "step": 35285, "token_acc": 0.754508509017018, "train_speed(iter/s)": 0.167194 }, { "epoch": 0.45791175888590807, "grad_norm": 0.7171749472618103, "learning_rate": 9.149707484994431e-05, "loss": 0.9036110877990723, "memory(GiB)": 91.52, "step": 35290, "token_acc": 0.7370122191564841, "train_speed(iter/s)": 0.167183 }, { "epoch": 0.45797663728756377, "grad_norm": 0.7782747149467468, "learning_rate": 9.149408243958505e-05, "loss": 0.9391253471374512, "memory(GiB)": 91.52, "step": 35295, "token_acc": 0.7454997861669453, "train_speed(iter/s)": 0.167175 }, { "epoch": 0.45804151568921947, "grad_norm": 0.7630223035812378, "learning_rate": 9.149108955171458e-05, "loss": 0.9330636978149414, "memory(GiB)": 91.52, "step": 35300, "token_acc": 0.7754300051389015, "train_speed(iter/s)": 0.167165 }, { "epoch": 0.45810639409087517, "grad_norm": 0.7626183032989502, "learning_rate": 9.148809618636732e-05, "loss": 0.9211993217468262, "memory(GiB)": 91.52, "step": 35305, "token_acc": 0.7316862013851891, "train_speed(iter/s)": 0.167155 }, { "epoch": 0.45817127249253087, "grad_norm": 0.7996779084205627, "learning_rate": 9.14851023435777e-05, "loss": 0.8945535659790039, "memory(GiB)": 91.52, "step": 35310, "token_acc": 0.776230269266481, "train_speed(iter/s)": 0.167145 }, { "epoch": 0.45823615089418657, "grad_norm": 0.7508857250213623, "learning_rate": 9.14821080233802e-05, "loss": 0.9460671424865723, "memory(GiB)": 91.52, "step": 35315, "token_acc": 0.7520386808111423, "train_speed(iter/s)": 0.167136 }, { "epoch": 0.45830102929584227, "grad_norm": 0.7571578025817871, "learning_rate": 9.147911322580927e-05, "loss": 0.945466136932373, "memory(GiB)": 91.52, "step": 35320, "token_acc": 0.756483664533513, "train_speed(iter/s)": 0.167127 }, { "epoch": 0.45836590769749797, "grad_norm": 0.7992700338363647, "learning_rate": 9.147611795089936e-05, "loss": 0.9146917343139649, "memory(GiB)": 91.52, "step": 35325, "token_acc": 0.7645820575627679, "train_speed(iter/s)": 0.167119 }, { "epoch": 0.45843078609915366, "grad_norm": 0.8118877410888672, "learning_rate": 9.147312219868498e-05, "loss": 0.9474557876586914, "memory(GiB)": 91.52, "step": 35330, "token_acc": 0.7539976133651551, "train_speed(iter/s)": 0.167111 }, { "epoch": 0.45849566450080936, "grad_norm": 0.7623152732849121, "learning_rate": 9.147012596920057e-05, "loss": 0.9393316268920898, "memory(GiB)": 91.52, "step": 35335, "token_acc": 0.7453832752613241, "train_speed(iter/s)": 0.167102 }, { "epoch": 0.45856054290246506, "grad_norm": 0.7280309796333313, "learning_rate": 9.146712926248063e-05, "loss": 0.9280558586120605, "memory(GiB)": 91.52, "step": 35340, "token_acc": 0.7514791148976453, "train_speed(iter/s)": 0.167092 }, { "epoch": 0.45862542130412076, "grad_norm": 0.7181221842765808, "learning_rate": 9.146413207855961e-05, "loss": 0.9433403015136719, "memory(GiB)": 91.52, "step": 35345, "token_acc": 0.7470912626310924, "train_speed(iter/s)": 0.167084 }, { "epoch": 0.45869029970577646, "grad_norm": 0.7769860625267029, "learning_rate": 9.146113441747204e-05, "loss": 0.9740989685058594, "memory(GiB)": 91.52, "step": 35350, "token_acc": 0.7308185966913862, "train_speed(iter/s)": 0.167074 }, { "epoch": 0.45875517810743216, "grad_norm": 0.7248488068580627, "learning_rate": 9.14581362792524e-05, "loss": 0.9371294021606446, "memory(GiB)": 91.52, "step": 35355, "token_acc": 0.7569415331032926, "train_speed(iter/s)": 0.167065 }, { "epoch": 0.45882005650908786, "grad_norm": 0.7750352025032043, "learning_rate": 9.145513766393521e-05, "loss": 0.8879049301147461, "memory(GiB)": 91.52, "step": 35360, "token_acc": 0.7599793021880544, "train_speed(iter/s)": 0.167057 }, { "epoch": 0.45888493491074356, "grad_norm": 0.7583371996879578, "learning_rate": 9.145213857155495e-05, "loss": 0.9488187789916992, "memory(GiB)": 91.52, "step": 35365, "token_acc": 0.7440822533721237, "train_speed(iter/s)": 0.167048 }, { "epoch": 0.45894981331239926, "grad_norm": 0.7553989887237549, "learning_rate": 9.144913900214615e-05, "loss": 0.9091544151306152, "memory(GiB)": 91.52, "step": 35370, "token_acc": 0.7582386564109802, "train_speed(iter/s)": 0.167039 }, { "epoch": 0.45901469171405496, "grad_norm": 0.7817268967628479, "learning_rate": 9.144613895574332e-05, "loss": 0.9377092361450196, "memory(GiB)": 91.52, "step": 35375, "token_acc": 0.739754994895727, "train_speed(iter/s)": 0.16703 }, { "epoch": 0.45907957011571066, "grad_norm": 0.8044393658638, "learning_rate": 9.144313843238101e-05, "loss": 0.9474781036376954, "memory(GiB)": 91.52, "step": 35380, "token_acc": 0.7374525904923224, "train_speed(iter/s)": 0.167022 }, { "epoch": 0.4591444485173663, "grad_norm": 0.8107441663742065, "learning_rate": 9.144013743209372e-05, "loss": 0.9178991317749023, "memory(GiB)": 91.52, "step": 35385, "token_acc": 0.7488601438751731, "train_speed(iter/s)": 0.167014 }, { "epoch": 0.459209326919022, "grad_norm": 0.7202238440513611, "learning_rate": 9.143713595491598e-05, "loss": 0.9142513275146484, "memory(GiB)": 91.52, "step": 35390, "token_acc": 0.7451613877230593, "train_speed(iter/s)": 0.167004 }, { "epoch": 0.4592742053206777, "grad_norm": 0.7842820882797241, "learning_rate": 9.143413400088236e-05, "loss": 0.9623559951782227, "memory(GiB)": 91.52, "step": 35395, "token_acc": 0.757703081232493, "train_speed(iter/s)": 0.166994 }, { "epoch": 0.4593390837223334, "grad_norm": 0.8564779758453369, "learning_rate": 9.14311315700274e-05, "loss": 0.9068746566772461, "memory(GiB)": 91.52, "step": 35400, "token_acc": 0.7687581274382315, "train_speed(iter/s)": 0.166985 }, { "epoch": 0.4594039621239891, "grad_norm": 0.7561297416687012, "learning_rate": 9.142812866238564e-05, "loss": 1.004516315460205, "memory(GiB)": 91.52, "step": 35405, "token_acc": 0.7425092439117684, "train_speed(iter/s)": 0.166978 }, { "epoch": 0.4594688405256448, "grad_norm": 0.7685716152191162, "learning_rate": 9.142512527799163e-05, "loss": 0.9084005355834961, "memory(GiB)": 91.52, "step": 35410, "token_acc": 0.757263382187797, "train_speed(iter/s)": 0.166967 }, { "epoch": 0.4595337189273005, "grad_norm": 0.7549513578414917, "learning_rate": 9.142212141687996e-05, "loss": 0.9308911323547363, "memory(GiB)": 91.52, "step": 35415, "token_acc": 0.7227478555387844, "train_speed(iter/s)": 0.166958 }, { "epoch": 0.4595985973289562, "grad_norm": 0.7888973951339722, "learning_rate": 9.141911707908517e-05, "loss": 0.9703088760375976, "memory(GiB)": 91.52, "step": 35420, "token_acc": 0.7239893505219646, "train_speed(iter/s)": 0.16695 }, { "epoch": 0.4596634757306119, "grad_norm": 0.7315007448196411, "learning_rate": 9.141611226464184e-05, "loss": 0.9189985275268555, "memory(GiB)": 91.52, "step": 35425, "token_acc": 0.7719242230736484, "train_speed(iter/s)": 0.166941 }, { "epoch": 0.4597283541322676, "grad_norm": 0.7876704335212708, "learning_rate": 9.141310697358457e-05, "loss": 0.942232322692871, "memory(GiB)": 91.52, "step": 35430, "token_acc": 0.747780956031966, "train_speed(iter/s)": 0.166932 }, { "epoch": 0.4597932325339233, "grad_norm": 0.8264650106430054, "learning_rate": 9.141010120594793e-05, "loss": 0.9201860427856445, "memory(GiB)": 91.52, "step": 35435, "token_acc": 0.757125748502994, "train_speed(iter/s)": 0.166921 }, { "epoch": 0.459858110935579, "grad_norm": 0.7905394434928894, "learning_rate": 9.140709496176652e-05, "loss": 0.9381107330322266, "memory(GiB)": 91.52, "step": 35440, "token_acc": 0.7420038608577161, "train_speed(iter/s)": 0.166913 }, { "epoch": 0.4599229893372347, "grad_norm": 0.8623343110084534, "learning_rate": 9.14040882410749e-05, "loss": 0.8926301956176758, "memory(GiB)": 91.52, "step": 35445, "token_acc": 0.7528312775012623, "train_speed(iter/s)": 0.166903 }, { "epoch": 0.4599878677388904, "grad_norm": 0.8163372874259949, "learning_rate": 9.140108104390772e-05, "loss": 0.8938450813293457, "memory(GiB)": 91.52, "step": 35450, "token_acc": 0.7471502590673575, "train_speed(iter/s)": 0.166894 }, { "epoch": 0.4600527461405461, "grad_norm": 0.7656959295272827, "learning_rate": 9.139807337029955e-05, "loss": 0.904643440246582, "memory(GiB)": 91.52, "step": 35455, "token_acc": 0.7495574699079537, "train_speed(iter/s)": 0.166884 }, { "epoch": 0.4601176245422018, "grad_norm": 0.7549076080322266, "learning_rate": 9.139506522028503e-05, "loss": 0.8996420860290527, "memory(GiB)": 91.52, "step": 35460, "token_acc": 0.7474084450670402, "train_speed(iter/s)": 0.166875 }, { "epoch": 0.4601825029438575, "grad_norm": 0.7616444230079651, "learning_rate": 9.139205659389874e-05, "loss": 0.9189483642578125, "memory(GiB)": 91.52, "step": 35465, "token_acc": 0.7618301376597837, "train_speed(iter/s)": 0.166866 }, { "epoch": 0.4602473813455132, "grad_norm": 0.7662871479988098, "learning_rate": 9.138904749117534e-05, "loss": 0.9047295570373535, "memory(GiB)": 91.52, "step": 35470, "token_acc": 0.7362998188405797, "train_speed(iter/s)": 0.166857 }, { "epoch": 0.4603122597471689, "grad_norm": 0.7126630544662476, "learning_rate": 9.138603791214945e-05, "loss": 0.9098491668701172, "memory(GiB)": 91.52, "step": 35475, "token_acc": 0.7699421965317919, "train_speed(iter/s)": 0.166847 }, { "epoch": 0.4603771381488246, "grad_norm": 0.8184905648231506, "learning_rate": 9.138302785685569e-05, "loss": 0.9055086135864258, "memory(GiB)": 91.52, "step": 35480, "token_acc": 0.7565859631107074, "train_speed(iter/s)": 0.166839 }, { "epoch": 0.4604420165504803, "grad_norm": 0.804025411605835, "learning_rate": 9.138001732532872e-05, "loss": 0.9383968353271485, "memory(GiB)": 91.52, "step": 35485, "token_acc": 0.7339222165734621, "train_speed(iter/s)": 0.166831 }, { "epoch": 0.460506894952136, "grad_norm": 0.8360942006111145, "learning_rate": 9.137700631760317e-05, "loss": 0.9213227272033692, "memory(GiB)": 91.52, "step": 35490, "token_acc": 0.7349770558671426, "train_speed(iter/s)": 0.166823 }, { "epoch": 0.4605717733537917, "grad_norm": 0.7844931483268738, "learning_rate": 9.137399483371369e-05, "loss": 0.9256100654602051, "memory(GiB)": 91.52, "step": 35495, "token_acc": 0.7512435798924253, "train_speed(iter/s)": 0.166814 }, { "epoch": 0.4606366517554474, "grad_norm": 0.7612779140472412, "learning_rate": 9.137098287369495e-05, "loss": 0.8967966079711914, "memory(GiB)": 91.52, "step": 35500, "token_acc": 0.7610337126449006, "train_speed(iter/s)": 0.166805 }, { "epoch": 0.460701530157103, "grad_norm": 0.8121239542961121, "learning_rate": 9.136797043758158e-05, "loss": 0.946647834777832, "memory(GiB)": 91.52, "step": 35505, "token_acc": 0.7501853224610823, "train_speed(iter/s)": 0.166796 }, { "epoch": 0.4607664085587587, "grad_norm": 0.8713660836219788, "learning_rate": 9.136495752540828e-05, "loss": 0.9869257926940918, "memory(GiB)": 91.52, "step": 35510, "token_acc": 0.7417898777367075, "train_speed(iter/s)": 0.166786 }, { "epoch": 0.4608312869604144, "grad_norm": 0.8198699355125427, "learning_rate": 9.136194413720972e-05, "loss": 0.9148340225219727, "memory(GiB)": 91.52, "step": 35515, "token_acc": 0.7637590496942434, "train_speed(iter/s)": 0.166777 }, { "epoch": 0.4608961653620701, "grad_norm": 0.7333528995513916, "learning_rate": 9.135893027302056e-05, "loss": 0.931328010559082, "memory(GiB)": 91.52, "step": 35520, "token_acc": 0.744765702891326, "train_speed(iter/s)": 0.166769 }, { "epoch": 0.4609610437637258, "grad_norm": 0.7659488320350647, "learning_rate": 9.13559159328755e-05, "loss": 0.9605263710021973, "memory(GiB)": 91.52, "step": 35525, "token_acc": 0.7270279227773635, "train_speed(iter/s)": 0.166761 }, { "epoch": 0.4610259221653815, "grad_norm": 0.7799906730651855, "learning_rate": 9.135290111680922e-05, "loss": 0.9283688545227051, "memory(GiB)": 91.52, "step": 35530, "token_acc": 0.7726169002358818, "train_speed(iter/s)": 0.166751 }, { "epoch": 0.4610908005670372, "grad_norm": 0.7222355008125305, "learning_rate": 9.13498858248564e-05, "loss": 0.9238626480102539, "memory(GiB)": 91.52, "step": 35535, "token_acc": 0.7551325121313923, "train_speed(iter/s)": 0.166743 }, { "epoch": 0.4611556789686929, "grad_norm": 0.8200193047523499, "learning_rate": 9.134687005705178e-05, "loss": 0.8907730102539062, "memory(GiB)": 91.52, "step": 35540, "token_acc": 0.7579142064451875, "train_speed(iter/s)": 0.166733 }, { "epoch": 0.4612205573703486, "grad_norm": 0.8864449858665466, "learning_rate": 9.134385381343003e-05, "loss": 0.9059579849243165, "memory(GiB)": 91.52, "step": 35545, "token_acc": 0.7702062845601139, "train_speed(iter/s)": 0.166724 }, { "epoch": 0.4612854357720043, "grad_norm": 0.8201239705085754, "learning_rate": 9.134083709402585e-05, "loss": 0.9076533317565918, "memory(GiB)": 91.52, "step": 35550, "token_acc": 0.7599338888506301, "train_speed(iter/s)": 0.166716 }, { "epoch": 0.46135031417366, "grad_norm": 0.8319482803344727, "learning_rate": 9.133781989887401e-05, "loss": 0.9406947135925293, "memory(GiB)": 91.52, "step": 35555, "token_acc": 0.7626675628243904, "train_speed(iter/s)": 0.166708 }, { "epoch": 0.4614151925753157, "grad_norm": 0.9492636322975159, "learning_rate": 9.133480222800919e-05, "loss": 0.9630245208740235, "memory(GiB)": 91.52, "step": 35560, "token_acc": 0.745663786849568, "train_speed(iter/s)": 0.166701 }, { "epoch": 0.4614800709769714, "grad_norm": 0.78179532289505, "learning_rate": 9.133178408146612e-05, "loss": 0.9433943748474121, "memory(GiB)": 91.52, "step": 35565, "token_acc": 0.7593736775285654, "train_speed(iter/s)": 0.166692 }, { "epoch": 0.4615449493786271, "grad_norm": 0.8661077618598938, "learning_rate": 9.132876545927953e-05, "loss": 0.9495449066162109, "memory(GiB)": 91.52, "step": 35570, "token_acc": 0.7522776572668113, "train_speed(iter/s)": 0.166684 }, { "epoch": 0.4616098277802828, "grad_norm": 0.7118105888366699, "learning_rate": 9.132574636148418e-05, "loss": 0.942473030090332, "memory(GiB)": 91.52, "step": 35575, "token_acc": 0.7588767931630889, "train_speed(iter/s)": 0.166675 }, { "epoch": 0.4616747061819385, "grad_norm": 0.7825154662132263, "learning_rate": 9.13227267881148e-05, "loss": 0.9106910705566407, "memory(GiB)": 91.52, "step": 35580, "token_acc": 0.7669426441389128, "train_speed(iter/s)": 0.166668 }, { "epoch": 0.4617395845835942, "grad_norm": 0.8217349648475647, "learning_rate": 9.131970673920614e-05, "loss": 0.9575450897216797, "memory(GiB)": 91.52, "step": 35585, "token_acc": 0.7533959629300495, "train_speed(iter/s)": 0.166659 }, { "epoch": 0.4618044629852499, "grad_norm": 0.8281091451644897, "learning_rate": 9.131668621479295e-05, "loss": 0.9228837966918946, "memory(GiB)": 91.52, "step": 35590, "token_acc": 0.7549874890106174, "train_speed(iter/s)": 0.166651 }, { "epoch": 0.4618693413869056, "grad_norm": 0.741206169128418, "learning_rate": 9.131366521491e-05, "loss": 0.9719862937927246, "memory(GiB)": 91.52, "step": 35595, "token_acc": 0.7246764981122333, "train_speed(iter/s)": 0.166641 }, { "epoch": 0.4619342197885613, "grad_norm": 0.8665567636489868, "learning_rate": 9.131064373959203e-05, "loss": 0.9316033363342285, "memory(GiB)": 91.52, "step": 35600, "token_acc": 0.7354609929078014, "train_speed(iter/s)": 0.166633 }, { "epoch": 0.461999098190217, "grad_norm": 0.7828603386878967, "learning_rate": 9.130762178887386e-05, "loss": 0.9442758560180664, "memory(GiB)": 91.52, "step": 35605, "token_acc": 0.7301796604582166, "train_speed(iter/s)": 0.166624 }, { "epoch": 0.4620639765918727, "grad_norm": 0.7323658466339111, "learning_rate": 9.130459936279021e-05, "loss": 0.9446653366088867, "memory(GiB)": 91.52, "step": 35610, "token_acc": 0.739530214816302, "train_speed(iter/s)": 0.166615 }, { "epoch": 0.4621288549935284, "grad_norm": 0.7220640182495117, "learning_rate": 9.130157646137592e-05, "loss": 0.9073759078979492, "memory(GiB)": 91.52, "step": 35615, "token_acc": 0.7531978341049989, "train_speed(iter/s)": 0.166605 }, { "epoch": 0.4621937333951841, "grad_norm": 0.7060415148735046, "learning_rate": 9.129855308466572e-05, "loss": 0.9824475288391114, "memory(GiB)": 91.52, "step": 35620, "token_acc": 0.7355657271340239, "train_speed(iter/s)": 0.166598 }, { "epoch": 0.46225861179683975, "grad_norm": 0.8205859065055847, "learning_rate": 9.129552923269444e-05, "loss": 0.9745527267456054, "memory(GiB)": 91.52, "step": 35625, "token_acc": 0.7224888764320498, "train_speed(iter/s)": 0.166591 }, { "epoch": 0.46232349019849545, "grad_norm": 0.7455087900161743, "learning_rate": 9.129250490549685e-05, "loss": 0.8968417167663574, "memory(GiB)": 91.52, "step": 35630, "token_acc": 0.7550108763206961, "train_speed(iter/s)": 0.166582 }, { "epoch": 0.46238836860015115, "grad_norm": 0.7218388915061951, "learning_rate": 9.12894801031078e-05, "loss": 0.9507087707519531, "memory(GiB)": 91.52, "step": 35635, "token_acc": 0.7410197170617464, "train_speed(iter/s)": 0.166574 }, { "epoch": 0.46245324700180684, "grad_norm": 0.8252098560333252, "learning_rate": 9.128645482556205e-05, "loss": 0.9301471710205078, "memory(GiB)": 91.52, "step": 35640, "token_acc": 0.7670178282009724, "train_speed(iter/s)": 0.166566 }, { "epoch": 0.46251812540346254, "grad_norm": 0.7565166354179382, "learning_rate": 9.128342907289444e-05, "loss": 0.8893364906311035, "memory(GiB)": 91.52, "step": 35645, "token_acc": 0.7603517653167186, "train_speed(iter/s)": 0.166557 }, { "epoch": 0.46258300380511824, "grad_norm": 0.6951299905776978, "learning_rate": 9.12804028451398e-05, "loss": 0.8751635551452637, "memory(GiB)": 91.52, "step": 35650, "token_acc": 0.7754591494294953, "train_speed(iter/s)": 0.166547 }, { "epoch": 0.46264788220677394, "grad_norm": 0.7653360366821289, "learning_rate": 9.127737614233291e-05, "loss": 0.9135297775268555, "memory(GiB)": 91.52, "step": 35655, "token_acc": 0.7791110941261895, "train_speed(iter/s)": 0.166538 }, { "epoch": 0.46271276060842964, "grad_norm": 0.7393662333488464, "learning_rate": 9.127434896450865e-05, "loss": 0.8810188293457031, "memory(GiB)": 91.52, "step": 35660, "token_acc": 0.7730116451932606, "train_speed(iter/s)": 0.166528 }, { "epoch": 0.46277763901008534, "grad_norm": 0.7905455827713013, "learning_rate": 9.127132131170184e-05, "loss": 0.91234130859375, "memory(GiB)": 91.52, "step": 35665, "token_acc": 0.7381205964279862, "train_speed(iter/s)": 0.16652 }, { "epoch": 0.46284251741174104, "grad_norm": 0.7799211144447327, "learning_rate": 9.126829318394732e-05, "loss": 0.9272113800048828, "memory(GiB)": 91.52, "step": 35670, "token_acc": 0.7539629567829134, "train_speed(iter/s)": 0.166511 }, { "epoch": 0.46290739581339674, "grad_norm": 0.8152400851249695, "learning_rate": 9.126526458127993e-05, "loss": 0.949708366394043, "memory(GiB)": 91.52, "step": 35675, "token_acc": 0.7665423893661858, "train_speed(iter/s)": 0.166503 }, { "epoch": 0.46297227421505244, "grad_norm": 0.7914885878562927, "learning_rate": 9.126223550373454e-05, "loss": 0.8941079139709472, "memory(GiB)": 91.52, "step": 35680, "token_acc": 0.7583395691386712, "train_speed(iter/s)": 0.166493 }, { "epoch": 0.46303715261670814, "grad_norm": 0.6979026198387146, "learning_rate": 9.1259205951346e-05, "loss": 0.8998285293579101, "memory(GiB)": 91.52, "step": 35685, "token_acc": 0.762936221419976, "train_speed(iter/s)": 0.166484 }, { "epoch": 0.46310203101836384, "grad_norm": 0.7600184082984924, "learning_rate": 9.125617592414917e-05, "loss": 0.8682948112487793, "memory(GiB)": 91.52, "step": 35690, "token_acc": 0.7784459508710547, "train_speed(iter/s)": 0.166476 }, { "epoch": 0.46316690942001953, "grad_norm": 0.739739179611206, "learning_rate": 9.125314542217893e-05, "loss": 0.8681186676025391, "memory(GiB)": 91.52, "step": 35695, "token_acc": 0.7779431041201686, "train_speed(iter/s)": 0.166466 }, { "epoch": 0.46323178782167523, "grad_norm": 0.7847562432289124, "learning_rate": 9.125011444547013e-05, "loss": 0.9706170082092285, "memory(GiB)": 91.52, "step": 35700, "token_acc": 0.7521969591295857, "train_speed(iter/s)": 0.166457 }, { "epoch": 0.46329666622333093, "grad_norm": 0.8213387131690979, "learning_rate": 9.124708299405767e-05, "loss": 0.9528064727783203, "memory(GiB)": 91.52, "step": 35705, "token_acc": 0.7411083259941739, "train_speed(iter/s)": 0.166449 }, { "epoch": 0.46336154462498663, "grad_norm": 0.7260035872459412, "learning_rate": 9.124405106797645e-05, "loss": 0.88571138381958, "memory(GiB)": 91.52, "step": 35710, "token_acc": 0.7623408212905995, "train_speed(iter/s)": 0.166441 }, { "epoch": 0.46342642302664233, "grad_norm": 0.8016999959945679, "learning_rate": 9.124101866726133e-05, "loss": 0.9063982963562012, "memory(GiB)": 91.52, "step": 35715, "token_acc": 0.7747402597402597, "train_speed(iter/s)": 0.166433 }, { "epoch": 0.46349130142829803, "grad_norm": 0.7975872159004211, "learning_rate": 9.123798579194724e-05, "loss": 0.9415852546691894, "memory(GiB)": 91.52, "step": 35720, "token_acc": 0.7470174066106005, "train_speed(iter/s)": 0.166425 }, { "epoch": 0.46355617982995373, "grad_norm": 0.8152958154678345, "learning_rate": 9.123495244206905e-05, "loss": 0.947809886932373, "memory(GiB)": 91.52, "step": 35725, "token_acc": 0.7401887197083423, "train_speed(iter/s)": 0.166414 }, { "epoch": 0.46362105823160943, "grad_norm": 0.7702805399894714, "learning_rate": 9.123191861766167e-05, "loss": 0.9218620300292969, "memory(GiB)": 91.52, "step": 35730, "token_acc": 0.7608789745297153, "train_speed(iter/s)": 0.166406 }, { "epoch": 0.46368593663326513, "grad_norm": 0.820853590965271, "learning_rate": 9.122888431876004e-05, "loss": 0.9160327911376953, "memory(GiB)": 91.52, "step": 35735, "token_acc": 0.761825761442913, "train_speed(iter/s)": 0.166398 }, { "epoch": 0.4637508150349208, "grad_norm": 0.7798216342926025, "learning_rate": 9.122584954539906e-05, "loss": 0.9490525245666503, "memory(GiB)": 91.52, "step": 35740, "token_acc": 0.7641216635630044, "train_speed(iter/s)": 0.166389 }, { "epoch": 0.46381569343657647, "grad_norm": 0.8441559076309204, "learning_rate": 9.122281429761366e-05, "loss": 0.916468620300293, "memory(GiB)": 91.52, "step": 35745, "token_acc": 0.7444023911428672, "train_speed(iter/s)": 0.166381 }, { "epoch": 0.46388057183823217, "grad_norm": 0.7855513095855713, "learning_rate": 9.121977857543875e-05, "loss": 0.8819696426391601, "memory(GiB)": 91.52, "step": 35750, "token_acc": 0.7637798000069201, "train_speed(iter/s)": 0.166373 }, { "epoch": 0.46394545023988787, "grad_norm": 0.8063040971755981, "learning_rate": 9.12167423789093e-05, "loss": 0.9315191268920898, "memory(GiB)": 91.52, "step": 35755, "token_acc": 0.7299521013597033, "train_speed(iter/s)": 0.166365 }, { "epoch": 0.46401032864154357, "grad_norm": 0.8193349838256836, "learning_rate": 9.121370570806022e-05, "loss": 0.9729252815246582, "memory(GiB)": 91.52, "step": 35760, "token_acc": 0.7530446270073897, "train_speed(iter/s)": 0.166357 }, { "epoch": 0.46407520704319927, "grad_norm": 0.8168878555297852, "learning_rate": 9.121066856292649e-05, "loss": 0.8822763442993165, "memory(GiB)": 91.52, "step": 35765, "token_acc": 0.7757622004092817, "train_speed(iter/s)": 0.166349 }, { "epoch": 0.46414008544485497, "grad_norm": 0.7384392619132996, "learning_rate": 9.120763094354301e-05, "loss": 0.9917253494262696, "memory(GiB)": 91.52, "step": 35770, "token_acc": 0.750737377826615, "train_speed(iter/s)": 0.166338 }, { "epoch": 0.46420496384651067, "grad_norm": 0.7567626237869263, "learning_rate": 9.120459284994478e-05, "loss": 0.9452543258666992, "memory(GiB)": 91.52, "step": 35775, "token_acc": 0.7777484641766856, "train_speed(iter/s)": 0.16633 }, { "epoch": 0.46426984224816636, "grad_norm": 0.7865010499954224, "learning_rate": 9.120155428216674e-05, "loss": 0.9217641830444336, "memory(GiB)": 91.52, "step": 35780, "token_acc": 0.7499047473900785, "train_speed(iter/s)": 0.166322 }, { "epoch": 0.46433472064982206, "grad_norm": 0.8258365392684937, "learning_rate": 9.119851524024387e-05, "loss": 0.9488582611083984, "memory(GiB)": 91.52, "step": 35785, "token_acc": 0.7595488613433266, "train_speed(iter/s)": 0.166313 }, { "epoch": 0.46439959905147776, "grad_norm": 0.7320767045021057, "learning_rate": 9.119547572421113e-05, "loss": 0.9432514190673829, "memory(GiB)": 91.52, "step": 35790, "token_acc": 0.7370290498541164, "train_speed(iter/s)": 0.166305 }, { "epoch": 0.46446447745313346, "grad_norm": 0.6914042830467224, "learning_rate": 9.11924357341035e-05, "loss": 0.9411608695983886, "memory(GiB)": 91.52, "step": 35795, "token_acc": 0.7395870416518334, "train_speed(iter/s)": 0.166294 }, { "epoch": 0.46452935585478916, "grad_norm": 0.78339022397995, "learning_rate": 9.1189395269956e-05, "loss": 0.9335192680358887, "memory(GiB)": 91.52, "step": 35800, "token_acc": 0.7491282321205343, "train_speed(iter/s)": 0.166286 }, { "epoch": 0.46459423425644486, "grad_norm": 0.8537744879722595, "learning_rate": 9.118635433180357e-05, "loss": 0.9394443511962891, "memory(GiB)": 91.52, "step": 35805, "token_acc": 0.7414970632427264, "train_speed(iter/s)": 0.166277 }, { "epoch": 0.46465911265810056, "grad_norm": 0.7137032151222229, "learning_rate": 9.118331291968123e-05, "loss": 0.901151466369629, "memory(GiB)": 91.52, "step": 35810, "token_acc": 0.7577450319432405, "train_speed(iter/s)": 0.166268 }, { "epoch": 0.46472399105975626, "grad_norm": 0.8036795854568481, "learning_rate": 9.118027103362396e-05, "loss": 0.9025524139404297, "memory(GiB)": 91.52, "step": 35815, "token_acc": 0.7674640472068125, "train_speed(iter/s)": 0.16626 }, { "epoch": 0.46478886946141196, "grad_norm": 0.8054662942886353, "learning_rate": 9.11772286736668e-05, "loss": 0.974001121520996, "memory(GiB)": 91.52, "step": 35820, "token_acc": 0.7518149259510212, "train_speed(iter/s)": 0.166252 }, { "epoch": 0.46485374786306766, "grad_norm": 0.8617955446243286, "learning_rate": 9.117418583984474e-05, "loss": 0.9040582656860352, "memory(GiB)": 91.52, "step": 35825, "token_acc": 0.76307295029051, "train_speed(iter/s)": 0.166242 }, { "epoch": 0.46491862626472336, "grad_norm": 0.7864068746566772, "learning_rate": 9.117114253219279e-05, "loss": 0.92254638671875, "memory(GiB)": 91.52, "step": 35830, "token_acc": 0.7425305274097168, "train_speed(iter/s)": 0.166233 }, { "epoch": 0.46498350466637905, "grad_norm": 0.7238946557044983, "learning_rate": 9.1168098750746e-05, "loss": 0.9064784049987793, "memory(GiB)": 91.52, "step": 35835, "token_acc": 0.7647778887349805, "train_speed(iter/s)": 0.166225 }, { "epoch": 0.46504838306803475, "grad_norm": 0.7272093296051025, "learning_rate": 9.116505449553937e-05, "loss": 0.9208235740661621, "memory(GiB)": 91.52, "step": 35840, "token_acc": 0.7502325162761393, "train_speed(iter/s)": 0.166217 }, { "epoch": 0.46511326146969045, "grad_norm": 0.8333846926689148, "learning_rate": 9.116200976660794e-05, "loss": 0.9334959030151367, "memory(GiB)": 91.52, "step": 35845, "token_acc": 0.7340938441670254, "train_speed(iter/s)": 0.166209 }, { "epoch": 0.46517813987134615, "grad_norm": 0.8519284129142761, "learning_rate": 9.115896456398674e-05, "loss": 0.9193341255187988, "memory(GiB)": 91.52, "step": 35850, "token_acc": 0.7547651797995677, "train_speed(iter/s)": 0.166202 }, { "epoch": 0.46524301827300185, "grad_norm": 0.7448782920837402, "learning_rate": 9.115591888771084e-05, "loss": 0.9296687126159668, "memory(GiB)": 91.52, "step": 35855, "token_acc": 0.73039167726576, "train_speed(iter/s)": 0.166193 }, { "epoch": 0.46530789667465755, "grad_norm": 0.7461305260658264, "learning_rate": 9.115287273781528e-05, "loss": 0.9219827651977539, "memory(GiB)": 91.52, "step": 35860, "token_acc": 0.7609335650693435, "train_speed(iter/s)": 0.166184 }, { "epoch": 0.4653727750763132, "grad_norm": 0.8492403626441956, "learning_rate": 9.114982611433509e-05, "loss": 0.9345785140991211, "memory(GiB)": 91.52, "step": 35865, "token_acc": 0.7667424816780389, "train_speed(iter/s)": 0.166177 }, { "epoch": 0.4654376534779689, "grad_norm": 0.7500830292701721, "learning_rate": 9.114677901730536e-05, "loss": 0.9460210800170898, "memory(GiB)": 91.52, "step": 35870, "token_acc": 0.7521780919604378, "train_speed(iter/s)": 0.166168 }, { "epoch": 0.4655025318796246, "grad_norm": 0.8161824941635132, "learning_rate": 9.114373144676114e-05, "loss": 0.9704781532287597, "memory(GiB)": 91.52, "step": 35875, "token_acc": 0.7375462807135644, "train_speed(iter/s)": 0.166161 }, { "epoch": 0.4655674102812803, "grad_norm": 0.7497175931930542, "learning_rate": 9.114068340273752e-05, "loss": 0.960573959350586, "memory(GiB)": 91.52, "step": 35880, "token_acc": 0.7630955147571462, "train_speed(iter/s)": 0.166155 }, { "epoch": 0.465632288682936, "grad_norm": 0.7563576698303223, "learning_rate": 9.113763488526955e-05, "loss": 0.9341526985168457, "memory(GiB)": 91.52, "step": 35885, "token_acc": 0.7328088682281423, "train_speed(iter/s)": 0.166147 }, { "epoch": 0.4656971670845917, "grad_norm": 0.7484958171844482, "learning_rate": 9.113458589439233e-05, "loss": 0.9368831634521484, "memory(GiB)": 91.52, "step": 35890, "token_acc": 0.7499830588873078, "train_speed(iter/s)": 0.166138 }, { "epoch": 0.4657620454862474, "grad_norm": 0.8250815272331238, "learning_rate": 9.113153643014094e-05, "loss": 0.9900413513183594, "memory(GiB)": 91.52, "step": 35895, "token_acc": 0.7336395384148193, "train_speed(iter/s)": 0.16613 }, { "epoch": 0.4658269238879031, "grad_norm": 0.7612385153770447, "learning_rate": 9.112848649255048e-05, "loss": 0.9406881332397461, "memory(GiB)": 91.52, "step": 35900, "token_acc": 0.7512743955723856, "train_speed(iter/s)": 0.166122 }, { "epoch": 0.4658918022895588, "grad_norm": 0.9486637711524963, "learning_rate": 9.112543608165603e-05, "loss": 0.9945316314697266, "memory(GiB)": 91.52, "step": 35905, "token_acc": 0.7300332651753288, "train_speed(iter/s)": 0.166111 }, { "epoch": 0.4659566806912145, "grad_norm": 0.8008556962013245, "learning_rate": 9.112238519749272e-05, "loss": 0.9568675994873047, "memory(GiB)": 91.52, "step": 35910, "token_acc": 0.7581775263217055, "train_speed(iter/s)": 0.166103 }, { "epoch": 0.4660215590928702, "grad_norm": 0.7599888443946838, "learning_rate": 9.111933384009566e-05, "loss": 0.9843852996826172, "memory(GiB)": 91.52, "step": 35915, "token_acc": 0.7574101796407186, "train_speed(iter/s)": 0.166094 }, { "epoch": 0.4660864374945259, "grad_norm": 0.7808365821838379, "learning_rate": 9.111628200949993e-05, "loss": 0.9676178932189942, "memory(GiB)": 91.52, "step": 35920, "token_acc": 0.7247810758557944, "train_speed(iter/s)": 0.166086 }, { "epoch": 0.4661513158961816, "grad_norm": 0.8013666272163391, "learning_rate": 9.11132297057407e-05, "loss": 0.9752618789672851, "memory(GiB)": 91.52, "step": 35925, "token_acc": 0.739565086076714, "train_speed(iter/s)": 0.166077 }, { "epoch": 0.4662161942978373, "grad_norm": 0.7016047239303589, "learning_rate": 9.111017692885304e-05, "loss": 0.9459152221679688, "memory(GiB)": 91.52, "step": 35930, "token_acc": 0.7342951794173195, "train_speed(iter/s)": 0.166069 }, { "epoch": 0.466281072699493, "grad_norm": 0.7835100889205933, "learning_rate": 9.110712367887214e-05, "loss": 0.9251908302307129, "memory(GiB)": 91.52, "step": 35935, "token_acc": 0.768502975736304, "train_speed(iter/s)": 0.16606 }, { "epoch": 0.4663459511011487, "grad_norm": 0.6815703511238098, "learning_rate": 9.110406995583306e-05, "loss": 0.9508419036865234, "memory(GiB)": 91.52, "step": 35940, "token_acc": 0.7482384073356103, "train_speed(iter/s)": 0.166051 }, { "epoch": 0.4664108295028044, "grad_norm": 0.7261002659797668, "learning_rate": 9.110101575977103e-05, "loss": 0.9261186599731446, "memory(GiB)": 91.52, "step": 35945, "token_acc": 0.7564952226062208, "train_speed(iter/s)": 0.166041 }, { "epoch": 0.4664757079044601, "grad_norm": 0.7572284936904907, "learning_rate": 9.109796109072114e-05, "loss": 0.9345888137817383, "memory(GiB)": 91.52, "step": 35950, "token_acc": 0.7627203065134099, "train_speed(iter/s)": 0.166033 }, { "epoch": 0.4665405863061158, "grad_norm": 0.7486462593078613, "learning_rate": 9.109490594871857e-05, "loss": 0.932682991027832, "memory(GiB)": 91.52, "step": 35955, "token_acc": 0.7412772837510105, "train_speed(iter/s)": 0.166023 }, { "epoch": 0.4666054647077715, "grad_norm": 0.756775438785553, "learning_rate": 9.109185033379845e-05, "loss": 0.8996933937072754, "memory(GiB)": 91.52, "step": 35960, "token_acc": 0.7531116642958748, "train_speed(iter/s)": 0.166015 }, { "epoch": 0.4666703431094272, "grad_norm": 0.7523616552352905, "learning_rate": 9.108879424599596e-05, "loss": 0.9007329940795898, "memory(GiB)": 91.52, "step": 35965, "token_acc": 0.764330797863732, "train_speed(iter/s)": 0.166007 }, { "epoch": 0.4667352215110829, "grad_norm": 0.8032159209251404, "learning_rate": 9.108573768534629e-05, "loss": 0.9068845748901367, "memory(GiB)": 91.52, "step": 35970, "token_acc": 0.7534527754156526, "train_speed(iter/s)": 0.165998 }, { "epoch": 0.4668000999127386, "grad_norm": 0.7324484586715698, "learning_rate": 9.108268065188459e-05, "loss": 0.8964084625244141, "memory(GiB)": 91.52, "step": 35975, "token_acc": 0.7389493172876649, "train_speed(iter/s)": 0.165988 }, { "epoch": 0.4668649783143942, "grad_norm": 0.7725326418876648, "learning_rate": 9.107962314564605e-05, "loss": 0.9274182319641113, "memory(GiB)": 91.52, "step": 35980, "token_acc": 0.7472029834842834, "train_speed(iter/s)": 0.165979 }, { "epoch": 0.4669298567160499, "grad_norm": 0.8644530177116394, "learning_rate": 9.107656516666583e-05, "loss": 0.9303768157958985, "memory(GiB)": 91.52, "step": 35985, "token_acc": 0.7579648614131372, "train_speed(iter/s)": 0.165972 }, { "epoch": 0.4669947351177056, "grad_norm": 0.8317886590957642, "learning_rate": 9.107350671497915e-05, "loss": 0.9581354141235352, "memory(GiB)": 91.52, "step": 35990, "token_acc": 0.7536299097427848, "train_speed(iter/s)": 0.165963 }, { "epoch": 0.4670596135193613, "grad_norm": 0.8224300146102905, "learning_rate": 9.10704477906212e-05, "loss": 0.9564565658569336, "memory(GiB)": 91.52, "step": 35995, "token_acc": 0.7350253367404391, "train_speed(iter/s)": 0.165955 }, { "epoch": 0.467124491921017, "grad_norm": 0.8215734958648682, "learning_rate": 9.106738839362718e-05, "loss": 0.960813331604004, "memory(GiB)": 91.52, "step": 36000, "token_acc": 0.759375483073118, "train_speed(iter/s)": 0.165946 }, { "epoch": 0.4671893703226727, "grad_norm": 0.7608416080474854, "learning_rate": 9.10643285240323e-05, "loss": 0.929620361328125, "memory(GiB)": 91.52, "step": 36005, "token_acc": 0.7687311842505835, "train_speed(iter/s)": 0.165937 }, { "epoch": 0.4672542487243284, "grad_norm": 0.7033460140228271, "learning_rate": 9.106126818187177e-05, "loss": 0.9163787841796875, "memory(GiB)": 91.52, "step": 36010, "token_acc": 0.7633174736449295, "train_speed(iter/s)": 0.165929 }, { "epoch": 0.4673191271259841, "grad_norm": 0.8423150181770325, "learning_rate": 9.105820736718081e-05, "loss": 0.8890871047973633, "memory(GiB)": 91.52, "step": 36015, "token_acc": 0.742736776408176, "train_speed(iter/s)": 0.16592 }, { "epoch": 0.4673840055276398, "grad_norm": 0.7834599614143372, "learning_rate": 9.105514607999463e-05, "loss": 0.9740221977233887, "memory(GiB)": 91.52, "step": 36020, "token_acc": 0.7475049575761517, "train_speed(iter/s)": 0.165911 }, { "epoch": 0.4674488839292955, "grad_norm": 0.7904371619224548, "learning_rate": 9.105208432034848e-05, "loss": 0.9844970703125, "memory(GiB)": 91.52, "step": 36025, "token_acc": 0.750950321143007, "train_speed(iter/s)": 0.165902 }, { "epoch": 0.4675137623309512, "grad_norm": 0.7254096269607544, "learning_rate": 9.104902208827759e-05, "loss": 0.9744905471801758, "memory(GiB)": 91.52, "step": 36030, "token_acc": 0.7532047369063607, "train_speed(iter/s)": 0.165894 }, { "epoch": 0.4675786407326069, "grad_norm": 0.8401362299919128, "learning_rate": 9.104595938381719e-05, "loss": 0.9659204483032227, "memory(GiB)": 91.52, "step": 36035, "token_acc": 0.7453047376711499, "train_speed(iter/s)": 0.165885 }, { "epoch": 0.4676435191342626, "grad_norm": 0.8754333257675171, "learning_rate": 9.104289620700253e-05, "loss": 0.9472438812255859, "memory(GiB)": 91.52, "step": 36040, "token_acc": 0.7408950896901613, "train_speed(iter/s)": 0.165877 }, { "epoch": 0.4677083975359183, "grad_norm": 0.72305828332901, "learning_rate": 9.103983255786886e-05, "loss": 0.9211669921875, "memory(GiB)": 91.52, "step": 36045, "token_acc": 0.7442859121762017, "train_speed(iter/s)": 0.165868 }, { "epoch": 0.467773275937574, "grad_norm": 0.7705522775650024, "learning_rate": 9.103676843645145e-05, "loss": 0.9524250030517578, "memory(GiB)": 91.52, "step": 36050, "token_acc": 0.7519836540098757, "train_speed(iter/s)": 0.16586 }, { "epoch": 0.4678381543392297, "grad_norm": 0.8045974969863892, "learning_rate": 9.103370384278553e-05, "loss": 0.9752569198608398, "memory(GiB)": 91.52, "step": 36055, "token_acc": 0.7598690938842299, "train_speed(iter/s)": 0.165852 }, { "epoch": 0.4679030327408854, "grad_norm": 0.7806008458137512, "learning_rate": 9.103063877690641e-05, "loss": 0.9115882873535156, "memory(GiB)": 91.52, "step": 36060, "token_acc": 0.7630799933477466, "train_speed(iter/s)": 0.165842 }, { "epoch": 0.4679679111425411, "grad_norm": 0.7439451813697815, "learning_rate": 9.102757323884933e-05, "loss": 0.9156342506408691, "memory(GiB)": 91.52, "step": 36065, "token_acc": 0.7652679353451708, "train_speed(iter/s)": 0.165834 }, { "epoch": 0.4680327895441968, "grad_norm": 0.8031088709831238, "learning_rate": 9.102450722864959e-05, "loss": 0.9385730743408203, "memory(GiB)": 91.52, "step": 36070, "token_acc": 0.7361282367447596, "train_speed(iter/s)": 0.165826 }, { "epoch": 0.4680976679458525, "grad_norm": 0.8232229351997375, "learning_rate": 9.102144074634244e-05, "loss": 0.9138001441955567, "memory(GiB)": 91.52, "step": 36075, "token_acc": 0.7538664591018308, "train_speed(iter/s)": 0.165819 }, { "epoch": 0.4681625463475082, "grad_norm": 0.790774405002594, "learning_rate": 9.101837379196319e-05, "loss": 0.947877311706543, "memory(GiB)": 91.52, "step": 36080, "token_acc": 0.7489728841413311, "train_speed(iter/s)": 0.165811 }, { "epoch": 0.4682274247491639, "grad_norm": 0.7636467218399048, "learning_rate": 9.101530636554714e-05, "loss": 0.9334991455078125, "memory(GiB)": 91.52, "step": 36085, "token_acc": 0.7375021240441801, "train_speed(iter/s)": 0.165804 }, { "epoch": 0.4682923031508196, "grad_norm": 0.7930740714073181, "learning_rate": 9.101223846712959e-05, "loss": 0.9380815505981446, "memory(GiB)": 91.52, "step": 36090, "token_acc": 0.7594908512330947, "train_speed(iter/s)": 0.165796 }, { "epoch": 0.4683571815524753, "grad_norm": 0.7798535227775574, "learning_rate": 9.100917009674583e-05, "loss": 0.9711666107177734, "memory(GiB)": 91.52, "step": 36095, "token_acc": 0.7298185941043084, "train_speed(iter/s)": 0.165788 }, { "epoch": 0.46842205995413094, "grad_norm": 0.8856799602508545, "learning_rate": 9.100610125443119e-05, "loss": 0.9428348541259766, "memory(GiB)": 91.52, "step": 36100, "token_acc": 0.7330276533731428, "train_speed(iter/s)": 0.165781 }, { "epoch": 0.46848693835578664, "grad_norm": 0.762669026851654, "learning_rate": 9.100303194022097e-05, "loss": 0.9229975700378418, "memory(GiB)": 91.52, "step": 36105, "token_acc": 0.7424049785867237, "train_speed(iter/s)": 0.165773 }, { "epoch": 0.46855181675744234, "grad_norm": 0.7786059379577637, "learning_rate": 9.09999621541505e-05, "loss": 0.9077810287475586, "memory(GiB)": 91.52, "step": 36110, "token_acc": 0.7494152344147259, "train_speed(iter/s)": 0.165765 }, { "epoch": 0.46861669515909804, "grad_norm": 0.7772983312606812, "learning_rate": 9.099689189625511e-05, "loss": 0.9443300247192383, "memory(GiB)": 91.52, "step": 36115, "token_acc": 0.7491793155003738, "train_speed(iter/s)": 0.165757 }, { "epoch": 0.46868157356075374, "grad_norm": 0.7354564070701599, "learning_rate": 9.099382116657011e-05, "loss": 0.9143888473510742, "memory(GiB)": 91.52, "step": 36120, "token_acc": 0.7442269460162104, "train_speed(iter/s)": 0.165749 }, { "epoch": 0.46874645196240944, "grad_norm": 0.8056328892707825, "learning_rate": 9.099074996513087e-05, "loss": 0.9305717468261718, "memory(GiB)": 91.52, "step": 36125, "token_acc": 0.7455008858359494, "train_speed(iter/s)": 0.165741 }, { "epoch": 0.46881133036406514, "grad_norm": 0.8471065163612366, "learning_rate": 9.098767829197271e-05, "loss": 0.9522600173950195, "memory(GiB)": 91.52, "step": 36130, "token_acc": 0.7301410826560344, "train_speed(iter/s)": 0.165735 }, { "epoch": 0.46887620876572084, "grad_norm": 0.6945517659187317, "learning_rate": 9.098460614713099e-05, "loss": 0.9061141967773437, "memory(GiB)": 91.52, "step": 36135, "token_acc": 0.7448885089406568, "train_speed(iter/s)": 0.165726 }, { "epoch": 0.46894108716737654, "grad_norm": 0.7855492830276489, "learning_rate": 9.098153353064106e-05, "loss": 0.9449844360351562, "memory(GiB)": 91.52, "step": 36140, "token_acc": 0.7404906745103282, "train_speed(iter/s)": 0.165718 }, { "epoch": 0.46900596556903223, "grad_norm": 0.7679818272590637, "learning_rate": 9.097846044253829e-05, "loss": 0.9888221740722656, "memory(GiB)": 91.52, "step": 36145, "token_acc": 0.740615957826311, "train_speed(iter/s)": 0.165711 }, { "epoch": 0.46907084397068793, "grad_norm": 0.7397749423980713, "learning_rate": 9.0975386882858e-05, "loss": 0.9392536163330079, "memory(GiB)": 91.52, "step": 36150, "token_acc": 0.7745803357314148, "train_speed(iter/s)": 0.165703 }, { "epoch": 0.46913572237234363, "grad_norm": 0.723077654838562, "learning_rate": 9.097231285163564e-05, "loss": 0.9311046600341797, "memory(GiB)": 91.52, "step": 36155, "token_acc": 0.7637190695873795, "train_speed(iter/s)": 0.165694 }, { "epoch": 0.46920060077399933, "grad_norm": 0.7925923466682434, "learning_rate": 9.09692383489065e-05, "loss": 0.9747591972351074, "memory(GiB)": 91.52, "step": 36160, "token_acc": 0.7303233935321294, "train_speed(iter/s)": 0.165687 }, { "epoch": 0.46926547917565503, "grad_norm": 0.7770748734474182, "learning_rate": 9.096616337470603e-05, "loss": 0.9512445449829101, "memory(GiB)": 91.52, "step": 36165, "token_acc": 0.7489361702127659, "train_speed(iter/s)": 0.165679 }, { "epoch": 0.46933035757731073, "grad_norm": 0.7934471964836121, "learning_rate": 9.096308792906957e-05, "loss": 0.9453906059265137, "memory(GiB)": 91.52, "step": 36170, "token_acc": 0.7506651463321931, "train_speed(iter/s)": 0.165671 }, { "epoch": 0.46939523597896643, "grad_norm": 0.8340616822242737, "learning_rate": 9.096001201203255e-05, "loss": 0.9000360488891601, "memory(GiB)": 91.52, "step": 36175, "token_acc": 0.7534125766871166, "train_speed(iter/s)": 0.165663 }, { "epoch": 0.46946011438062213, "grad_norm": 0.8631490468978882, "learning_rate": 9.095693562363033e-05, "loss": 0.9030658721923828, "memory(GiB)": 91.52, "step": 36180, "token_acc": 0.766048401342519, "train_speed(iter/s)": 0.165655 }, { "epoch": 0.46952499278227783, "grad_norm": 0.8453497886657715, "learning_rate": 9.095385876389833e-05, "loss": 0.9160020828247071, "memory(GiB)": 91.52, "step": 36185, "token_acc": 0.7641033092375313, "train_speed(iter/s)": 0.165648 }, { "epoch": 0.4695898711839335, "grad_norm": 0.8482268452644348, "learning_rate": 9.095078143287197e-05, "loss": 1.0065488815307617, "memory(GiB)": 91.52, "step": 36190, "token_acc": 0.7300346161677866, "train_speed(iter/s)": 0.165641 }, { "epoch": 0.4696547495855892, "grad_norm": 0.8460907936096191, "learning_rate": 9.094770363058664e-05, "loss": 0.9585546493530274, "memory(GiB)": 91.52, "step": 36195, "token_acc": 0.7414511467287264, "train_speed(iter/s)": 0.165633 }, { "epoch": 0.4697196279872449, "grad_norm": 0.7342776656150818, "learning_rate": 9.09446253570778e-05, "loss": 0.9101480484008789, "memory(GiB)": 91.52, "step": 36200, "token_acc": 0.7486851716581446, "train_speed(iter/s)": 0.165625 }, { "epoch": 0.4697845063889006, "grad_norm": 0.7383168339729309, "learning_rate": 9.094154661238083e-05, "loss": 0.8972749710083008, "memory(GiB)": 91.52, "step": 36205, "token_acc": 0.7637862211709048, "train_speed(iter/s)": 0.165617 }, { "epoch": 0.4698493847905563, "grad_norm": 0.8879275321960449, "learning_rate": 9.093846739653116e-05, "loss": 0.9363307952880859, "memory(GiB)": 91.52, "step": 36210, "token_acc": 0.7296100610209852, "train_speed(iter/s)": 0.16561 }, { "epoch": 0.469914263192212, "grad_norm": 0.7415657639503479, "learning_rate": 9.093538770956426e-05, "loss": 0.9621824264526367, "memory(GiB)": 91.52, "step": 36215, "token_acc": 0.7434206052363141, "train_speed(iter/s)": 0.165602 }, { "epoch": 0.46997914159386767, "grad_norm": 0.7516352534294128, "learning_rate": 9.093230755151555e-05, "loss": 0.9491981506347656, "memory(GiB)": 91.52, "step": 36220, "token_acc": 0.7514373531007051, "train_speed(iter/s)": 0.165594 }, { "epoch": 0.47004401999552337, "grad_norm": 0.8618757128715515, "learning_rate": 9.092922692242048e-05, "loss": 0.9076753616333008, "memory(GiB)": 91.52, "step": 36225, "token_acc": 0.756373551465576, "train_speed(iter/s)": 0.165587 }, { "epoch": 0.47010889839717906, "grad_norm": 0.8270345330238342, "learning_rate": 9.092614582231449e-05, "loss": 0.9296125411987305, "memory(GiB)": 91.52, "step": 36230, "token_acc": 0.7609006944941649, "train_speed(iter/s)": 0.165579 }, { "epoch": 0.47017377679883476, "grad_norm": 0.778768002986908, "learning_rate": 9.092306425123307e-05, "loss": 0.9527210235595703, "memory(GiB)": 91.52, "step": 36235, "token_acc": 0.7458130740140465, "train_speed(iter/s)": 0.165571 }, { "epoch": 0.47023865520049046, "grad_norm": 0.7931361198425293, "learning_rate": 9.091998220921164e-05, "loss": 0.9073089599609375, "memory(GiB)": 91.52, "step": 36240, "token_acc": 0.7505411944479816, "train_speed(iter/s)": 0.165563 }, { "epoch": 0.47030353360214616, "grad_norm": 0.8109654784202576, "learning_rate": 9.09168996962857e-05, "loss": 0.978916072845459, "memory(GiB)": 91.52, "step": 36245, "token_acc": 0.7549086227245091, "train_speed(iter/s)": 0.165555 }, { "epoch": 0.47036841200380186, "grad_norm": 0.7249113917350769, "learning_rate": 9.09138167124907e-05, "loss": 0.9197957038879394, "memory(GiB)": 91.52, "step": 36250, "token_acc": 0.7416400116312881, "train_speed(iter/s)": 0.165546 }, { "epoch": 0.47043329040545756, "grad_norm": 0.762100875377655, "learning_rate": 9.091073325786215e-05, "loss": 0.9160306930541993, "memory(GiB)": 91.52, "step": 36255, "token_acc": 0.7495022490966743, "train_speed(iter/s)": 0.165539 }, { "epoch": 0.47049816880711326, "grad_norm": 0.699017345905304, "learning_rate": 9.090764933243549e-05, "loss": 0.9547619819641113, "memory(GiB)": 91.52, "step": 36260, "token_acc": 0.740466278101582, "train_speed(iter/s)": 0.165531 }, { "epoch": 0.47056304720876896, "grad_norm": 0.8129518628120422, "learning_rate": 9.090456493624624e-05, "loss": 0.9183405876159668, "memory(GiB)": 91.52, "step": 36265, "token_acc": 0.7617897905430829, "train_speed(iter/s)": 0.165523 }, { "epoch": 0.47062792561042466, "grad_norm": 0.7919749617576599, "learning_rate": 9.09014800693299e-05, "loss": 0.9241141319274903, "memory(GiB)": 91.52, "step": 36270, "token_acc": 0.749402328988972, "train_speed(iter/s)": 0.165516 }, { "epoch": 0.47069280401208036, "grad_norm": 0.6851952075958252, "learning_rate": 9.089839473172197e-05, "loss": 0.9188062667846679, "memory(GiB)": 91.52, "step": 36275, "token_acc": 0.753088630259624, "train_speed(iter/s)": 0.165508 }, { "epoch": 0.47075768241373606, "grad_norm": 0.6607781052589417, "learning_rate": 9.089530892345792e-05, "loss": 0.9071931838989258, "memory(GiB)": 91.52, "step": 36280, "token_acc": 0.7569129823465084, "train_speed(iter/s)": 0.165499 }, { "epoch": 0.47082256081539176, "grad_norm": 0.7762923836708069, "learning_rate": 9.08922226445733e-05, "loss": 0.8898017883300782, "memory(GiB)": 91.52, "step": 36285, "token_acc": 0.744914470642626, "train_speed(iter/s)": 0.165491 }, { "epoch": 0.47088743921704745, "grad_norm": 0.7859534621238708, "learning_rate": 9.088913589510362e-05, "loss": 0.9288338661193848, "memory(GiB)": 91.52, "step": 36290, "token_acc": 0.7507600884466556, "train_speed(iter/s)": 0.165482 }, { "epoch": 0.47095231761870315, "grad_norm": 0.81474769115448, "learning_rate": 9.088604867508439e-05, "loss": 0.9446554183959961, "memory(GiB)": 91.52, "step": 36295, "token_acc": 0.7392709871200936, "train_speed(iter/s)": 0.165473 }, { "epoch": 0.47101719602035885, "grad_norm": 0.7302085757255554, "learning_rate": 9.088296098455113e-05, "loss": 0.9028139114379883, "memory(GiB)": 91.52, "step": 36300, "token_acc": 0.7590079392006515, "train_speed(iter/s)": 0.165465 }, { "epoch": 0.47108207442201455, "grad_norm": 0.7601704597473145, "learning_rate": 9.08798728235394e-05, "loss": 0.934266185760498, "memory(GiB)": 91.52, "step": 36305, "token_acc": 0.7702697782793898, "train_speed(iter/s)": 0.165456 }, { "epoch": 0.47114695282367025, "grad_norm": 0.8140437006950378, "learning_rate": 9.087678419208472e-05, "loss": 0.8765340805053711, "memory(GiB)": 91.52, "step": 36310, "token_acc": 0.7593000826674015, "train_speed(iter/s)": 0.165448 }, { "epoch": 0.47121183122532595, "grad_norm": 0.7668571472167969, "learning_rate": 9.087369509022265e-05, "loss": 0.9666481018066406, "memory(GiB)": 91.52, "step": 36315, "token_acc": 0.7664247956134879, "train_speed(iter/s)": 0.16544 }, { "epoch": 0.47127670962698165, "grad_norm": 0.7744784951210022, "learning_rate": 9.087060551798871e-05, "loss": 0.928230094909668, "memory(GiB)": 91.52, "step": 36320, "token_acc": 0.7510655702415292, "train_speed(iter/s)": 0.165433 }, { "epoch": 0.47134158802863735, "grad_norm": 0.7815808653831482, "learning_rate": 9.086751547541849e-05, "loss": 0.8939696311950683, "memory(GiB)": 91.52, "step": 36325, "token_acc": 0.7665980153529302, "train_speed(iter/s)": 0.165426 }, { "epoch": 0.47140646643029305, "grad_norm": 0.7561782598495483, "learning_rate": 9.086442496254751e-05, "loss": 0.9146661758422852, "memory(GiB)": 91.52, "step": 36330, "token_acc": 0.7524270118416841, "train_speed(iter/s)": 0.165417 }, { "epoch": 0.47147134483194875, "grad_norm": 0.8188774585723877, "learning_rate": 9.086133397941137e-05, "loss": 0.9276041030883789, "memory(GiB)": 91.52, "step": 36335, "token_acc": 0.7484554280670785, "train_speed(iter/s)": 0.165409 }, { "epoch": 0.4715362232336044, "grad_norm": 0.7132295966148376, "learning_rate": 9.085824252604564e-05, "loss": 0.8909976959228516, "memory(GiB)": 91.52, "step": 36340, "token_acc": 0.7801045388381633, "train_speed(iter/s)": 0.165401 }, { "epoch": 0.4716011016352601, "grad_norm": 0.7733356356620789, "learning_rate": 9.085515060248586e-05, "loss": 0.8966367721557618, "memory(GiB)": 91.52, "step": 36345, "token_acc": 0.7597329965722532, "train_speed(iter/s)": 0.165393 }, { "epoch": 0.4716659800369158, "grad_norm": 0.7958695888519287, "learning_rate": 9.085205820876766e-05, "loss": 0.9073719024658203, "memory(GiB)": 91.52, "step": 36350, "token_acc": 0.7330830136315403, "train_speed(iter/s)": 0.165385 }, { "epoch": 0.4717308584385715, "grad_norm": 0.7408692240715027, "learning_rate": 9.084896534492659e-05, "loss": 0.8827796936035156, "memory(GiB)": 91.52, "step": 36355, "token_acc": 0.7831138652207591, "train_speed(iter/s)": 0.165375 }, { "epoch": 0.4717957368402272, "grad_norm": 0.8308891654014587, "learning_rate": 9.084587201099826e-05, "loss": 0.9105931282043457, "memory(GiB)": 91.52, "step": 36360, "token_acc": 0.7719440568517377, "train_speed(iter/s)": 0.165367 }, { "epoch": 0.4718606152418829, "grad_norm": 0.6440268158912659, "learning_rate": 9.084277820701826e-05, "loss": 0.897044563293457, "memory(GiB)": 91.52, "step": 36365, "token_acc": 0.7347257048498563, "train_speed(iter/s)": 0.165357 }, { "epoch": 0.4719254936435386, "grad_norm": 0.7545230388641357, "learning_rate": 9.083968393302218e-05, "loss": 0.9250300407409668, "memory(GiB)": 91.52, "step": 36370, "token_acc": 0.7765726681127982, "train_speed(iter/s)": 0.16535 }, { "epoch": 0.4719903720451943, "grad_norm": 0.82390958070755, "learning_rate": 9.083658918904568e-05, "loss": 0.9327763557434082, "memory(GiB)": 91.52, "step": 36375, "token_acc": 0.7427314745787983, "train_speed(iter/s)": 0.165342 }, { "epoch": 0.47205525044685, "grad_norm": 0.7517557740211487, "learning_rate": 9.083349397512433e-05, "loss": 0.9509132385253907, "memory(GiB)": 91.52, "step": 36380, "token_acc": 0.738022911100538, "train_speed(iter/s)": 0.165334 }, { "epoch": 0.4721201288485057, "grad_norm": 0.7723812460899353, "learning_rate": 9.083039829129374e-05, "loss": 0.9118514060974121, "memory(GiB)": 91.52, "step": 36385, "token_acc": 0.7336941382466473, "train_speed(iter/s)": 0.165326 }, { "epoch": 0.4721850072501614, "grad_norm": 0.7825648784637451, "learning_rate": 9.082730213758957e-05, "loss": 0.8753026962280274, "memory(GiB)": 91.52, "step": 36390, "token_acc": 0.7541020759677273, "train_speed(iter/s)": 0.165317 }, { "epoch": 0.4722498856518171, "grad_norm": 0.8277667164802551, "learning_rate": 9.082420551404743e-05, "loss": 1.011889362335205, "memory(GiB)": 91.52, "step": 36395, "token_acc": 0.7265316741100464, "train_speed(iter/s)": 0.165311 }, { "epoch": 0.4723147640534728, "grad_norm": 0.8774515986442566, "learning_rate": 9.082110842070297e-05, "loss": 0.9261795043945312, "memory(GiB)": 91.52, "step": 36400, "token_acc": 0.7536742464680972, "train_speed(iter/s)": 0.165302 }, { "epoch": 0.4723796424551285, "grad_norm": 0.7468969821929932, "learning_rate": 9.08180108575918e-05, "loss": 0.971920394897461, "memory(GiB)": 91.52, "step": 36405, "token_acc": 0.7351097178683386, "train_speed(iter/s)": 0.165295 }, { "epoch": 0.4724445208567842, "grad_norm": 0.7667827606201172, "learning_rate": 9.081491282474961e-05, "loss": 0.9398482322692872, "memory(GiB)": 91.52, "step": 36410, "token_acc": 0.7497188595620824, "train_speed(iter/s)": 0.165287 }, { "epoch": 0.4725093992584399, "grad_norm": 0.8071398138999939, "learning_rate": 9.081181432221203e-05, "loss": 0.9089497566223145, "memory(GiB)": 91.52, "step": 36415, "token_acc": 0.765032860389411, "train_speed(iter/s)": 0.16528 }, { "epoch": 0.4725742776600956, "grad_norm": 0.7655400633811951, "learning_rate": 9.08087153500147e-05, "loss": 0.9228395462036133, "memory(GiB)": 91.52, "step": 36420, "token_acc": 0.7514192262867025, "train_speed(iter/s)": 0.165271 }, { "epoch": 0.4726391560617513, "grad_norm": 0.8195177316665649, "learning_rate": 9.080561590819332e-05, "loss": 0.9558073997497558, "memory(GiB)": 91.52, "step": 36425, "token_acc": 0.7605422200976543, "train_speed(iter/s)": 0.165264 }, { "epoch": 0.472704034463407, "grad_norm": 0.7764137387275696, "learning_rate": 9.080251599678354e-05, "loss": 0.953946876525879, "memory(GiB)": 91.52, "step": 36430, "token_acc": 0.7307745219424955, "train_speed(iter/s)": 0.165257 }, { "epoch": 0.4727689128650627, "grad_norm": 0.8148080110549927, "learning_rate": 9.0799415615821e-05, "loss": 0.9224130630493164, "memory(GiB)": 91.52, "step": 36435, "token_acc": 0.7632231404958678, "train_speed(iter/s)": 0.165248 }, { "epoch": 0.4728337912667184, "grad_norm": 0.7853128910064697, "learning_rate": 9.079631476534144e-05, "loss": 0.9711713790893555, "memory(GiB)": 91.52, "step": 36440, "token_acc": 0.7586603070709884, "train_speed(iter/s)": 0.16524 }, { "epoch": 0.47289866966837407, "grad_norm": 0.7600210309028625, "learning_rate": 9.07932134453805e-05, "loss": 0.9742456436157226, "memory(GiB)": 91.52, "step": 36445, "token_acc": 0.7388019839755818, "train_speed(iter/s)": 0.165233 }, { "epoch": 0.47296354807002977, "grad_norm": 0.8837946653366089, "learning_rate": 9.07901116559739e-05, "loss": 0.9639304161071778, "memory(GiB)": 91.52, "step": 36450, "token_acc": 0.7420179286667938, "train_speed(iter/s)": 0.165225 }, { "epoch": 0.47302842647168547, "grad_norm": 0.7845990061759949, "learning_rate": 9.078700939715731e-05, "loss": 0.9904186248779296, "memory(GiB)": 91.52, "step": 36455, "token_acc": 0.7427920340830279, "train_speed(iter/s)": 0.165218 }, { "epoch": 0.4730933048733411, "grad_norm": 0.7344444990158081, "learning_rate": 9.078390666896643e-05, "loss": 0.9724596023559571, "memory(GiB)": 91.52, "step": 36460, "token_acc": 0.7343803935105281, "train_speed(iter/s)": 0.165211 }, { "epoch": 0.4731581832749968, "grad_norm": 0.7779101729393005, "learning_rate": 9.0780803471437e-05, "loss": 0.8994223594665527, "memory(GiB)": 91.52, "step": 36465, "token_acc": 0.7730536722220367, "train_speed(iter/s)": 0.165202 }, { "epoch": 0.4732230616766525, "grad_norm": 0.7038343548774719, "learning_rate": 9.07776998046047e-05, "loss": 0.9330509185791016, "memory(GiB)": 91.52, "step": 36470, "token_acc": 0.767089072543618, "train_speed(iter/s)": 0.165193 }, { "epoch": 0.4732879400783082, "grad_norm": 0.8196814656257629, "learning_rate": 9.077459566850525e-05, "loss": 0.9344396591186523, "memory(GiB)": 91.52, "step": 36475, "token_acc": 0.7297096881836046, "train_speed(iter/s)": 0.165186 }, { "epoch": 0.4733528184799639, "grad_norm": 0.7282048463821411, "learning_rate": 9.077149106317438e-05, "loss": 0.9375024795532226, "memory(GiB)": 91.52, "step": 36480, "token_acc": 0.755440637793579, "train_speed(iter/s)": 0.165177 }, { "epoch": 0.4734176968816196, "grad_norm": 0.8522635102272034, "learning_rate": 9.07683859886478e-05, "loss": 0.9612265586853027, "memory(GiB)": 91.52, "step": 36485, "token_acc": 0.7427452596867271, "train_speed(iter/s)": 0.16517 }, { "epoch": 0.4734825752832753, "grad_norm": 0.7648376226425171, "learning_rate": 9.076528044496128e-05, "loss": 0.9464211463928223, "memory(GiB)": 91.52, "step": 36490, "token_acc": 0.7247602932882121, "train_speed(iter/s)": 0.165163 }, { "epoch": 0.473547453684931, "grad_norm": 0.8114874958992004, "learning_rate": 9.076217443215054e-05, "loss": 0.91116304397583, "memory(GiB)": 91.52, "step": 36495, "token_acc": 0.7579567960941458, "train_speed(iter/s)": 0.165154 }, { "epoch": 0.4736123320865867, "grad_norm": 0.7670544385910034, "learning_rate": 9.07590679502513e-05, "loss": 0.8691232681274415, "memory(GiB)": 91.52, "step": 36500, "token_acc": 0.7640204177580764, "train_speed(iter/s)": 0.165146 }, { "epoch": 0.4736772104882424, "grad_norm": 0.866558313369751, "learning_rate": 9.075596099929935e-05, "loss": 0.8915213584899903, "memory(GiB)": 91.52, "step": 36505, "token_acc": 0.7501119570085087, "train_speed(iter/s)": 0.165138 }, { "epoch": 0.4737420888898981, "grad_norm": 0.6813935041427612, "learning_rate": 9.075285357933041e-05, "loss": 0.8940363883972168, "memory(GiB)": 91.52, "step": 36510, "token_acc": 0.7616688647987905, "train_speed(iter/s)": 0.16513 }, { "epoch": 0.4738069672915538, "grad_norm": 0.7836729884147644, "learning_rate": 9.074974569038026e-05, "loss": 0.9337120056152344, "memory(GiB)": 91.52, "step": 36515, "token_acc": 0.7488070650238587, "train_speed(iter/s)": 0.165123 }, { "epoch": 0.4738718456932095, "grad_norm": 0.8011822700500488, "learning_rate": 9.074663733248465e-05, "loss": 0.9046119689941406, "memory(GiB)": 91.52, "step": 36520, "token_acc": 0.7582911347264817, "train_speed(iter/s)": 0.165115 }, { "epoch": 0.4739367240948652, "grad_norm": 0.8238263726234436, "learning_rate": 9.074352850567937e-05, "loss": 0.92943115234375, "memory(GiB)": 91.52, "step": 36525, "token_acc": 0.7417270346563276, "train_speed(iter/s)": 0.165108 }, { "epoch": 0.4740016024965209, "grad_norm": 0.7552512884140015, "learning_rate": 9.074041921000018e-05, "loss": 0.9007394790649415, "memory(GiB)": 91.52, "step": 36530, "token_acc": 0.7590604836278116, "train_speed(iter/s)": 0.1651 }, { "epoch": 0.4740664808981766, "grad_norm": 0.7472925186157227, "learning_rate": 9.073730944548286e-05, "loss": 0.9370822906494141, "memory(GiB)": 91.52, "step": 36535, "token_acc": 0.7351113857137953, "train_speed(iter/s)": 0.16509 }, { "epoch": 0.4741313592998323, "grad_norm": 0.774864912033081, "learning_rate": 9.073419921216322e-05, "loss": 0.8950422286987305, "memory(GiB)": 91.52, "step": 36540, "token_acc": 0.7627312919600459, "train_speed(iter/s)": 0.165082 }, { "epoch": 0.474196237701488, "grad_norm": 0.677757203578949, "learning_rate": 9.073108851007703e-05, "loss": 0.9155521392822266, "memory(GiB)": 91.52, "step": 36545, "token_acc": 0.7660707493343477, "train_speed(iter/s)": 0.165074 }, { "epoch": 0.4742611161031437, "grad_norm": 0.7395973801612854, "learning_rate": 9.07279773392601e-05, "loss": 0.9229744911193848, "memory(GiB)": 91.52, "step": 36550, "token_acc": 0.7548632172288087, "train_speed(iter/s)": 0.165066 }, { "epoch": 0.4743259945047994, "grad_norm": 0.7190714478492737, "learning_rate": 9.072486569974822e-05, "loss": 0.8829854965209961, "memory(GiB)": 91.52, "step": 36555, "token_acc": 0.7598453645494617, "train_speed(iter/s)": 0.165056 }, { "epoch": 0.4743908729064551, "grad_norm": 0.6849506497383118, "learning_rate": 9.072175359157722e-05, "loss": 0.8505855560302734, "memory(GiB)": 91.52, "step": 36560, "token_acc": 0.7815635451505016, "train_speed(iter/s)": 0.165048 }, { "epoch": 0.4744557513081108, "grad_norm": 0.7918397188186646, "learning_rate": 9.071864101478289e-05, "loss": 0.9956182479858399, "memory(GiB)": 91.52, "step": 36565, "token_acc": 0.7422644837237836, "train_speed(iter/s)": 0.16504 }, { "epoch": 0.4745206297097665, "grad_norm": 0.7840169072151184, "learning_rate": 9.071552796940106e-05, "loss": 0.9786933898925781, "memory(GiB)": 91.52, "step": 36570, "token_acc": 0.7279058917621968, "train_speed(iter/s)": 0.165033 }, { "epoch": 0.4745855081114222, "grad_norm": 0.718011200428009, "learning_rate": 9.071241445546755e-05, "loss": 0.8879543304443359, "memory(GiB)": 91.52, "step": 36575, "token_acc": 0.7722148026213622, "train_speed(iter/s)": 0.165025 }, { "epoch": 0.47465038651307784, "grad_norm": 0.8046117424964905, "learning_rate": 9.07093004730182e-05, "loss": 0.9214062690734863, "memory(GiB)": 91.52, "step": 36580, "token_acc": 0.7672537800083112, "train_speed(iter/s)": 0.165017 }, { "epoch": 0.47471526491473354, "grad_norm": 0.7710679173469543, "learning_rate": 9.070618602208884e-05, "loss": 0.9038625717163086, "memory(GiB)": 91.52, "step": 36585, "token_acc": 0.752287509646125, "train_speed(iter/s)": 0.165009 }, { "epoch": 0.47478014331638924, "grad_norm": 0.821847677230835, "learning_rate": 9.070307110271532e-05, "loss": 0.9759120941162109, "memory(GiB)": 91.52, "step": 36590, "token_acc": 0.742204256723313, "train_speed(iter/s)": 0.165002 }, { "epoch": 0.47484502171804494, "grad_norm": 0.7107226252555847, "learning_rate": 9.069995571493346e-05, "loss": 0.9179704666137696, "memory(GiB)": 91.52, "step": 36595, "token_acc": 0.7605939876856211, "train_speed(iter/s)": 0.164995 }, { "epoch": 0.47490990011970063, "grad_norm": 0.7080618739128113, "learning_rate": 9.069683985877915e-05, "loss": 0.9532744407653808, "memory(GiB)": 91.52, "step": 36600, "token_acc": 0.7550012742099899, "train_speed(iter/s)": 0.164987 }, { "epoch": 0.47497477852135633, "grad_norm": 0.7910723090171814, "learning_rate": 9.069372353428821e-05, "loss": 0.9243576049804687, "memory(GiB)": 91.52, "step": 36605, "token_acc": 0.7593034789317071, "train_speed(iter/s)": 0.164979 }, { "epoch": 0.47503965692301203, "grad_norm": 0.6851146221160889, "learning_rate": 9.069060674149652e-05, "loss": 0.9177421569824219, "memory(GiB)": 91.52, "step": 36610, "token_acc": 0.7563022284122562, "train_speed(iter/s)": 0.16497 }, { "epoch": 0.47510453532466773, "grad_norm": 0.8104870319366455, "learning_rate": 9.068748948043995e-05, "loss": 0.9099775314331054, "memory(GiB)": 91.52, "step": 36615, "token_acc": 0.7506798389039964, "train_speed(iter/s)": 0.164962 }, { "epoch": 0.47516941372632343, "grad_norm": 0.8162328004837036, "learning_rate": 9.068437175115436e-05, "loss": 0.9030144691467286, "memory(GiB)": 91.52, "step": 36620, "token_acc": 0.7540557667934094, "train_speed(iter/s)": 0.164954 }, { "epoch": 0.47523429212797913, "grad_norm": 0.7043758034706116, "learning_rate": 9.068125355367566e-05, "loss": 0.8910961151123047, "memory(GiB)": 91.52, "step": 36625, "token_acc": 0.7601637537395686, "train_speed(iter/s)": 0.164945 }, { "epoch": 0.47529917052963483, "grad_norm": 0.6789089441299438, "learning_rate": 9.06781348880397e-05, "loss": 0.9592461585998535, "memory(GiB)": 91.52, "step": 36630, "token_acc": 0.7359458350430857, "train_speed(iter/s)": 0.164937 }, { "epoch": 0.47536404893129053, "grad_norm": 0.7641194462776184, "learning_rate": 9.067501575428238e-05, "loss": 0.9338679313659668, "memory(GiB)": 91.52, "step": 36635, "token_acc": 0.7442900612237416, "train_speed(iter/s)": 0.164929 }, { "epoch": 0.4754289273329462, "grad_norm": 0.7621066570281982, "learning_rate": 9.06718961524396e-05, "loss": 0.8711457252502441, "memory(GiB)": 91.52, "step": 36640, "token_acc": 0.7573248407643313, "train_speed(iter/s)": 0.164919 }, { "epoch": 0.4754938057346019, "grad_norm": 0.800676167011261, "learning_rate": 9.066877608254725e-05, "loss": 1.0109060287475586, "memory(GiB)": 91.52, "step": 36645, "token_acc": 0.7447102752039828, "train_speed(iter/s)": 0.16491 }, { "epoch": 0.4755586841362576, "grad_norm": 0.9844778180122375, "learning_rate": 9.066565554464125e-05, "loss": 0.8965396881103516, "memory(GiB)": 91.52, "step": 36650, "token_acc": 0.7661007904479205, "train_speed(iter/s)": 0.164902 }, { "epoch": 0.4756235625379133, "grad_norm": 0.7075468897819519, "learning_rate": 9.06625345387575e-05, "loss": 0.891123104095459, "memory(GiB)": 91.52, "step": 36655, "token_acc": 0.7595116836428999, "train_speed(iter/s)": 0.164894 }, { "epoch": 0.475688440939569, "grad_norm": 0.8131992816925049, "learning_rate": 9.065941306493191e-05, "loss": 0.9319267272949219, "memory(GiB)": 91.52, "step": 36660, "token_acc": 0.7333765480536317, "train_speed(iter/s)": 0.164886 }, { "epoch": 0.4757533193412247, "grad_norm": 0.8203904032707214, "learning_rate": 9.065629112320044e-05, "loss": 0.9214410781860352, "memory(GiB)": 91.52, "step": 36665, "token_acc": 0.7471786450508368, "train_speed(iter/s)": 0.164878 }, { "epoch": 0.4758181977428804, "grad_norm": 0.7521120309829712, "learning_rate": 9.065316871359896e-05, "loss": 0.9447696685791016, "memory(GiB)": 91.52, "step": 36670, "token_acc": 0.7519006422860139, "train_speed(iter/s)": 0.164871 }, { "epoch": 0.4758830761445361, "grad_norm": 0.7713744044303894, "learning_rate": 9.065004583616343e-05, "loss": 0.9230066299438476, "memory(GiB)": 91.52, "step": 36675, "token_acc": 0.7565009080020092, "train_speed(iter/s)": 0.164863 }, { "epoch": 0.4759479545461918, "grad_norm": 0.8250057697296143, "learning_rate": 9.064692249092981e-05, "loss": 0.9322803497314454, "memory(GiB)": 91.52, "step": 36680, "token_acc": 0.7497847566725432, "train_speed(iter/s)": 0.164855 }, { "epoch": 0.4760128329478475, "grad_norm": 0.7451507449150085, "learning_rate": 9.0643798677934e-05, "loss": 0.9534862518310547, "memory(GiB)": 91.52, "step": 36685, "token_acc": 0.7420715561303036, "train_speed(iter/s)": 0.164846 }, { "epoch": 0.4760777113495032, "grad_norm": 0.7170410752296448, "learning_rate": 9.064067439721198e-05, "loss": 0.9494206428527832, "memory(GiB)": 91.52, "step": 36690, "token_acc": 0.7665089416185373, "train_speed(iter/s)": 0.164838 }, { "epoch": 0.4761425897511589, "grad_norm": 0.8609438538551331, "learning_rate": 9.063754964879971e-05, "loss": 0.9291718482971192, "memory(GiB)": 91.52, "step": 36695, "token_acc": 0.7515815515108677, "train_speed(iter/s)": 0.16483 }, { "epoch": 0.47620746815281456, "grad_norm": 0.7685126066207886, "learning_rate": 9.063442443273311e-05, "loss": 0.9483905792236328, "memory(GiB)": 91.52, "step": 36700, "token_acc": 0.722797124798612, "train_speed(iter/s)": 0.164821 }, { "epoch": 0.47627234655447026, "grad_norm": 0.7822903394699097, "learning_rate": 9.063129874904818e-05, "loss": 0.9383434295654297, "memory(GiB)": 91.52, "step": 36705, "token_acc": 0.7551522802587975, "train_speed(iter/s)": 0.164812 }, { "epoch": 0.47633722495612596, "grad_norm": 0.8447145819664001, "learning_rate": 9.062817259778089e-05, "loss": 0.8956136703491211, "memory(GiB)": 91.52, "step": 36710, "token_acc": 0.7581344532076889, "train_speed(iter/s)": 0.164804 }, { "epoch": 0.47640210335778166, "grad_norm": 0.7888914942741394, "learning_rate": 9.062504597896718e-05, "loss": 0.9689522743225097, "memory(GiB)": 91.52, "step": 36715, "token_acc": 0.756392003322062, "train_speed(iter/s)": 0.164794 }, { "epoch": 0.47646698175943736, "grad_norm": 0.7819124460220337, "learning_rate": 9.062191889264307e-05, "loss": 0.9101201057434082, "memory(GiB)": 91.52, "step": 36720, "token_acc": 0.7551951180343665, "train_speed(iter/s)": 0.164786 }, { "epoch": 0.47653186016109306, "grad_norm": 0.8352723717689514, "learning_rate": 9.061879133884453e-05, "loss": 0.9476179122924805, "memory(GiB)": 91.52, "step": 36725, "token_acc": 0.7488278412467807, "train_speed(iter/s)": 0.164777 }, { "epoch": 0.47659673856274876, "grad_norm": 0.8114529252052307, "learning_rate": 9.061566331760756e-05, "loss": 0.9231352806091309, "memory(GiB)": 91.52, "step": 36730, "token_acc": 0.755162999839409, "train_speed(iter/s)": 0.16477 }, { "epoch": 0.47666161696440446, "grad_norm": 0.7131319046020508, "learning_rate": 9.061253482896815e-05, "loss": 0.9177989959716797, "memory(GiB)": 91.52, "step": 36735, "token_acc": 0.7464056286326094, "train_speed(iter/s)": 0.164764 }, { "epoch": 0.47672649536606015, "grad_norm": 0.7571233510971069, "learning_rate": 9.06094058729623e-05, "loss": 0.9069675445556641, "memory(GiB)": 91.52, "step": 36740, "token_acc": 0.7538542581211589, "train_speed(iter/s)": 0.164756 }, { "epoch": 0.47679137376771585, "grad_norm": 0.7295432090759277, "learning_rate": 9.0606276449626e-05, "loss": 0.9260905265808106, "memory(GiB)": 91.52, "step": 36745, "token_acc": 0.7407431320011622, "train_speed(iter/s)": 0.164748 }, { "epoch": 0.47685625216937155, "grad_norm": 0.7353668212890625, "learning_rate": 9.060314655899531e-05, "loss": 0.962308406829834, "memory(GiB)": 91.52, "step": 36750, "token_acc": 0.7474682689711045, "train_speed(iter/s)": 0.16474 }, { "epoch": 0.47692113057102725, "grad_norm": 0.817122757434845, "learning_rate": 9.06000162011062e-05, "loss": 0.9455801010131836, "memory(GiB)": 91.52, "step": 36755, "token_acc": 0.7625396825396825, "train_speed(iter/s)": 0.164733 }, { "epoch": 0.47698600897268295, "grad_norm": 0.765786349773407, "learning_rate": 9.059688537599474e-05, "loss": 0.9627331733703614, "memory(GiB)": 91.52, "step": 36760, "token_acc": 0.7327652982184353, "train_speed(iter/s)": 0.164725 }, { "epoch": 0.47705088737433865, "grad_norm": 0.783405601978302, "learning_rate": 9.059375408369693e-05, "loss": 0.8876033782958984, "memory(GiB)": 91.52, "step": 36765, "token_acc": 0.7760279001468429, "train_speed(iter/s)": 0.164717 }, { "epoch": 0.47711576577599435, "grad_norm": 0.6877284049987793, "learning_rate": 9.059062232424881e-05, "loss": 0.9296354293823242, "memory(GiB)": 91.52, "step": 36770, "token_acc": 0.7486262673167712, "train_speed(iter/s)": 0.164709 }, { "epoch": 0.47718064417765005, "grad_norm": 0.7732095718383789, "learning_rate": 9.058749009768641e-05, "loss": 0.9369707107543945, "memory(GiB)": 91.52, "step": 36775, "token_acc": 0.7559641985353946, "train_speed(iter/s)": 0.164701 }, { "epoch": 0.47724552257930575, "grad_norm": 0.813621997833252, "learning_rate": 9.058435740404581e-05, "loss": 0.9546468734741211, "memory(GiB)": 91.52, "step": 36780, "token_acc": 0.745546683046683, "train_speed(iter/s)": 0.164695 }, { "epoch": 0.47731040098096145, "grad_norm": 0.8350788354873657, "learning_rate": 9.058122424336302e-05, "loss": 0.907440185546875, "memory(GiB)": 91.52, "step": 36785, "token_acc": 0.7544525776020667, "train_speed(iter/s)": 0.164686 }, { "epoch": 0.47737527938261715, "grad_norm": 0.7713717222213745, "learning_rate": 9.05780906156741e-05, "loss": 0.9057125091552735, "memory(GiB)": 91.52, "step": 36790, "token_acc": 0.7675948814890213, "train_speed(iter/s)": 0.164678 }, { "epoch": 0.47744015778427285, "grad_norm": 0.7206760048866272, "learning_rate": 9.057495652101514e-05, "loss": 0.9068282127380372, "memory(GiB)": 91.52, "step": 36795, "token_acc": 0.7657927867881211, "train_speed(iter/s)": 0.16467 }, { "epoch": 0.47750503618592854, "grad_norm": 0.7529616951942444, "learning_rate": 9.05718219594222e-05, "loss": 0.9838865280151368, "memory(GiB)": 91.52, "step": 36800, "token_acc": 0.7429809626992726, "train_speed(iter/s)": 0.164662 }, { "epoch": 0.47756991458758424, "grad_norm": 0.6945664882659912, "learning_rate": 9.056868693093132e-05, "loss": 0.9035053253173828, "memory(GiB)": 91.52, "step": 36805, "token_acc": 0.7567825819881924, "train_speed(iter/s)": 0.164655 }, { "epoch": 0.47763479298923994, "grad_norm": 0.7315641045570374, "learning_rate": 9.056555143557863e-05, "loss": 0.8998524665832519, "memory(GiB)": 91.52, "step": 36810, "token_acc": 0.7513852483493149, "train_speed(iter/s)": 0.164647 }, { "epoch": 0.47769967139089564, "grad_norm": 0.7990166544914246, "learning_rate": 9.056241547340017e-05, "loss": 0.9529747009277344, "memory(GiB)": 91.52, "step": 36815, "token_acc": 0.7509807704675542, "train_speed(iter/s)": 0.164638 }, { "epoch": 0.4777645497925513, "grad_norm": 0.7345138192176819, "learning_rate": 9.055927904443205e-05, "loss": 0.8949934005737304, "memory(GiB)": 91.52, "step": 36820, "token_acc": 0.7755415840907538, "train_speed(iter/s)": 0.164631 }, { "epoch": 0.477829428194207, "grad_norm": 0.7791318297386169, "learning_rate": 9.055614214871034e-05, "loss": 0.8946208953857422, "memory(GiB)": 91.52, "step": 36825, "token_acc": 0.7503984580599725, "train_speed(iter/s)": 0.164623 }, { "epoch": 0.4778943065958627, "grad_norm": 0.6785244345664978, "learning_rate": 9.055300478627118e-05, "loss": 0.9479293823242188, "memory(GiB)": 91.52, "step": 36830, "token_acc": 0.7687680649949222, "train_speed(iter/s)": 0.164615 }, { "epoch": 0.4779591849975184, "grad_norm": 0.8087233901023865, "learning_rate": 9.054986695715063e-05, "loss": 0.9740970611572266, "memory(GiB)": 91.52, "step": 36835, "token_acc": 0.7388299103747065, "train_speed(iter/s)": 0.164607 }, { "epoch": 0.4780240633991741, "grad_norm": 0.7101953625679016, "learning_rate": 9.054672866138482e-05, "loss": 0.9657889366149902, "memory(GiB)": 91.52, "step": 36840, "token_acc": 0.7339503509941566, "train_speed(iter/s)": 0.1646 }, { "epoch": 0.4780889418008298, "grad_norm": 0.8304615020751953, "learning_rate": 9.054358989900988e-05, "loss": 0.935434627532959, "memory(GiB)": 91.52, "step": 36845, "token_acc": 0.7609285714285714, "train_speed(iter/s)": 0.164593 }, { "epoch": 0.4781538202024855, "grad_norm": 0.8257830142974854, "learning_rate": 9.054045067006192e-05, "loss": 0.9606277465820312, "memory(GiB)": 91.52, "step": 36850, "token_acc": 0.7489774330042314, "train_speed(iter/s)": 0.164583 }, { "epoch": 0.4782186986041412, "grad_norm": 0.7918851375579834, "learning_rate": 9.053731097457707e-05, "loss": 0.9256162643432617, "memory(GiB)": 91.52, "step": 36855, "token_acc": 0.758313395055307, "train_speed(iter/s)": 0.164576 }, { "epoch": 0.4782835770057969, "grad_norm": 0.7367454767227173, "learning_rate": 9.053417081259143e-05, "loss": 0.9847670555114746, "memory(GiB)": 91.52, "step": 36860, "token_acc": 0.7438516964919996, "train_speed(iter/s)": 0.164568 }, { "epoch": 0.4783484554074526, "grad_norm": 0.7845854163169861, "learning_rate": 9.053103018414117e-05, "loss": 0.943918514251709, "memory(GiB)": 91.52, "step": 36865, "token_acc": 0.7587985311713495, "train_speed(iter/s)": 0.164561 }, { "epoch": 0.4784133338091083, "grad_norm": 0.853808581829071, "learning_rate": 9.052788908926241e-05, "loss": 0.9902219772338867, "memory(GiB)": 91.52, "step": 36870, "token_acc": 0.7215370539798719, "train_speed(iter/s)": 0.164554 }, { "epoch": 0.478478212210764, "grad_norm": 0.7820347547531128, "learning_rate": 9.052474752799133e-05, "loss": 0.9575437545776367, "memory(GiB)": 91.52, "step": 36875, "token_acc": 0.7375910924842836, "train_speed(iter/s)": 0.164547 }, { "epoch": 0.4785430906124197, "grad_norm": 0.7548966407775879, "learning_rate": 9.052160550036406e-05, "loss": 0.9639217376708984, "memory(GiB)": 91.52, "step": 36880, "token_acc": 0.7313266443701226, "train_speed(iter/s)": 0.164539 }, { "epoch": 0.4786079690140754, "grad_norm": 0.8438993096351624, "learning_rate": 9.051846300641678e-05, "loss": 0.9038761138916016, "memory(GiB)": 91.52, "step": 36885, "token_acc": 0.7658802952092487, "train_speed(iter/s)": 0.164533 }, { "epoch": 0.4786728474157311, "grad_norm": 0.8199331164360046, "learning_rate": 9.051532004618561e-05, "loss": 0.9996214866638183, "memory(GiB)": 91.52, "step": 36890, "token_acc": 0.7262743315913051, "train_speed(iter/s)": 0.164526 }, { "epoch": 0.47873772581738677, "grad_norm": 0.7811220288276672, "learning_rate": 9.051217661970675e-05, "loss": 0.947847557067871, "memory(GiB)": 91.52, "step": 36895, "token_acc": 0.743998904222169, "train_speed(iter/s)": 0.164518 }, { "epoch": 0.47880260421904247, "grad_norm": 0.6846945881843567, "learning_rate": 9.050903272701637e-05, "loss": 0.9148797988891602, "memory(GiB)": 91.52, "step": 36900, "token_acc": 0.7383358345839587, "train_speed(iter/s)": 0.164511 }, { "epoch": 0.47886748262069817, "grad_norm": 0.8151888251304626, "learning_rate": 9.050588836815064e-05, "loss": 0.9205185890197753, "memory(GiB)": 91.52, "step": 36905, "token_acc": 0.7682216676265438, "train_speed(iter/s)": 0.164503 }, { "epoch": 0.47893236102235387, "grad_norm": 0.7942412495613098, "learning_rate": 9.050274354314576e-05, "loss": 0.8909361839294434, "memory(GiB)": 91.52, "step": 36910, "token_acc": 0.754257289132566, "train_speed(iter/s)": 0.164495 }, { "epoch": 0.47899723942400957, "grad_norm": 0.7440272569656372, "learning_rate": 9.049959825203792e-05, "loss": 0.8963729858398437, "memory(GiB)": 91.52, "step": 36915, "token_acc": 0.7425198786785802, "train_speed(iter/s)": 0.164488 }, { "epoch": 0.47906211782566527, "grad_norm": 0.7548236846923828, "learning_rate": 9.04964524948633e-05, "loss": 0.9201166152954101, "memory(GiB)": 91.52, "step": 36920, "token_acc": 0.7484908830667746, "train_speed(iter/s)": 0.164481 }, { "epoch": 0.47912699622732097, "grad_norm": 0.8041019439697266, "learning_rate": 9.049330627165813e-05, "loss": 0.9558422088623046, "memory(GiB)": 91.52, "step": 36925, "token_acc": 0.7417000854644665, "train_speed(iter/s)": 0.164472 }, { "epoch": 0.47919187462897667, "grad_norm": 0.6556921601295471, "learning_rate": 9.049015958245857e-05, "loss": 0.9004322052001953, "memory(GiB)": 91.52, "step": 36930, "token_acc": 0.7705624646693047, "train_speed(iter/s)": 0.164464 }, { "epoch": 0.4792567530306323, "grad_norm": 0.7686808705329895, "learning_rate": 9.048701242730088e-05, "loss": 0.9267889022827148, "memory(GiB)": 91.52, "step": 36935, "token_acc": 0.7702889057436393, "train_speed(iter/s)": 0.164456 }, { "epoch": 0.479321631432288, "grad_norm": 0.735335648059845, "learning_rate": 9.048386480622124e-05, "loss": 0.9230436325073242, "memory(GiB)": 91.52, "step": 36940, "token_acc": 0.7610329279956278, "train_speed(iter/s)": 0.164449 }, { "epoch": 0.4793865098339437, "grad_norm": 0.7070876359939575, "learning_rate": 9.048071671925591e-05, "loss": 0.9308560371398926, "memory(GiB)": 91.52, "step": 36945, "token_acc": 0.7357675551945655, "train_speed(iter/s)": 0.164441 }, { "epoch": 0.4794513882355994, "grad_norm": 0.7849900722503662, "learning_rate": 9.047756816644108e-05, "loss": 0.9408100128173829, "memory(GiB)": 91.52, "step": 36950, "token_acc": 0.7583058964381921, "train_speed(iter/s)": 0.164433 }, { "epoch": 0.4795162666372551, "grad_norm": 0.771279513835907, "learning_rate": 9.0474419147813e-05, "loss": 0.9022798538208008, "memory(GiB)": 91.52, "step": 36955, "token_acc": 0.7688176258428728, "train_speed(iter/s)": 0.164426 }, { "epoch": 0.4795811450389108, "grad_norm": 0.6772321462631226, "learning_rate": 9.047126966340793e-05, "loss": 0.9264381408691407, "memory(GiB)": 91.52, "step": 36960, "token_acc": 0.7411641661858049, "train_speed(iter/s)": 0.164417 }, { "epoch": 0.4796460234405665, "grad_norm": 0.8772809505462646, "learning_rate": 9.046811971326208e-05, "loss": 0.9466159820556641, "memory(GiB)": 91.52, "step": 36965, "token_acc": 0.742087300161667, "train_speed(iter/s)": 0.16441 }, { "epoch": 0.4797109018422222, "grad_norm": 0.7929423451423645, "learning_rate": 9.046496929741171e-05, "loss": 0.9194716453552246, "memory(GiB)": 91.52, "step": 36970, "token_acc": 0.7591831492422296, "train_speed(iter/s)": 0.164403 }, { "epoch": 0.4797757802438779, "grad_norm": 0.7636997699737549, "learning_rate": 9.046181841589308e-05, "loss": 0.9455864906311036, "memory(GiB)": 91.52, "step": 36975, "token_acc": 0.7604641379793873, "train_speed(iter/s)": 0.164395 }, { "epoch": 0.4798406586455336, "grad_norm": 0.8044158220291138, "learning_rate": 9.045866706874244e-05, "loss": 0.887966537475586, "memory(GiB)": 91.52, "step": 36980, "token_acc": 0.750669359763258, "train_speed(iter/s)": 0.164387 }, { "epoch": 0.4799055370471893, "grad_norm": 0.9020739197731018, "learning_rate": 9.045551525599609e-05, "loss": 0.923551368713379, "memory(GiB)": 91.52, "step": 36985, "token_acc": 0.741908951113972, "train_speed(iter/s)": 0.164378 }, { "epoch": 0.479970415448845, "grad_norm": 0.7651898264884949, "learning_rate": 9.045236297769024e-05, "loss": 0.8655980110168457, "memory(GiB)": 91.52, "step": 36990, "token_acc": 0.7535764731447612, "train_speed(iter/s)": 0.16437 }, { "epoch": 0.4800352938505007, "grad_norm": 0.8141764998435974, "learning_rate": 9.044921023386122e-05, "loss": 0.8988022804260254, "memory(GiB)": 91.52, "step": 36995, "token_acc": 0.7629551925670613, "train_speed(iter/s)": 0.164362 }, { "epoch": 0.4801001722521564, "grad_norm": 0.838500440120697, "learning_rate": 9.044605702454529e-05, "loss": 0.9678478240966797, "memory(GiB)": 91.52, "step": 37000, "token_acc": 0.730941555208921, "train_speed(iter/s)": 0.164355 }, { "epoch": 0.4801650506538121, "grad_norm": 0.7555230855941772, "learning_rate": 9.044290334977874e-05, "loss": 0.9339432716369629, "memory(GiB)": 91.52, "step": 37005, "token_acc": 0.764544417510338, "train_speed(iter/s)": 0.164347 }, { "epoch": 0.4802299290554678, "grad_norm": 0.7417700886726379, "learning_rate": 9.043974920959786e-05, "loss": 0.95645751953125, "memory(GiB)": 91.52, "step": 37010, "token_acc": 0.7592309800696974, "train_speed(iter/s)": 0.164339 }, { "epoch": 0.4802948074571235, "grad_norm": 0.7756755948066711, "learning_rate": 9.043659460403895e-05, "loss": 1.0016664505004882, "memory(GiB)": 91.52, "step": 37015, "token_acc": 0.738866180913453, "train_speed(iter/s)": 0.164333 }, { "epoch": 0.4803596858587792, "grad_norm": 0.6541677713394165, "learning_rate": 9.04334395331383e-05, "loss": 0.8765082359313965, "memory(GiB)": 91.52, "step": 37020, "token_acc": 0.7570815164930267, "train_speed(iter/s)": 0.164325 }, { "epoch": 0.4804245642604349, "grad_norm": 0.8350761532783508, "learning_rate": 9.043028399693224e-05, "loss": 0.943935775756836, "memory(GiB)": 91.52, "step": 37025, "token_acc": 0.7446990485014789, "train_speed(iter/s)": 0.164317 }, { "epoch": 0.4804894426620906, "grad_norm": 0.7490673065185547, "learning_rate": 9.042712799545707e-05, "loss": 0.9369075775146485, "memory(GiB)": 91.52, "step": 37030, "token_acc": 0.7562085388671542, "train_speed(iter/s)": 0.16431 }, { "epoch": 0.4805543210637463, "grad_norm": 0.8397456407546997, "learning_rate": 9.042397152874912e-05, "loss": 0.9159801483154297, "memory(GiB)": 91.52, "step": 37035, "token_acc": 0.7650537104562924, "train_speed(iter/s)": 0.164303 }, { "epoch": 0.480619199465402, "grad_norm": 0.8343539834022522, "learning_rate": 9.042081459684471e-05, "loss": 0.9126533508300781, "memory(GiB)": 91.52, "step": 37040, "token_acc": 0.7653615989361287, "train_speed(iter/s)": 0.164295 }, { "epoch": 0.4806840778670577, "grad_norm": 0.7648418545722961, "learning_rate": 9.041765719978015e-05, "loss": 0.9854934692382813, "memory(GiB)": 91.52, "step": 37045, "token_acc": 0.7205095006473737, "train_speed(iter/s)": 0.164289 }, { "epoch": 0.4807489562687134, "grad_norm": 0.8034374713897705, "learning_rate": 9.04144993375918e-05, "loss": 0.9406584739685059, "memory(GiB)": 91.52, "step": 37050, "token_acc": 0.749650809300797, "train_speed(iter/s)": 0.164283 }, { "epoch": 0.48081383467036903, "grad_norm": 0.7949866056442261, "learning_rate": 9.0411341010316e-05, "loss": 0.9194173812866211, "memory(GiB)": 91.52, "step": 37055, "token_acc": 0.7612946256187033, "train_speed(iter/s)": 0.164276 }, { "epoch": 0.48087871307202473, "grad_norm": 0.7866657376289368, "learning_rate": 9.040818221798908e-05, "loss": 0.9087282180786133, "memory(GiB)": 91.52, "step": 37060, "token_acc": 0.7542010061008242, "train_speed(iter/s)": 0.164267 }, { "epoch": 0.48094359147368043, "grad_norm": 0.8290927410125732, "learning_rate": 9.040502296064741e-05, "loss": 0.9483325958251954, "memory(GiB)": 91.52, "step": 37065, "token_acc": 0.7368754705020009, "train_speed(iter/s)": 0.164259 }, { "epoch": 0.48100846987533613, "grad_norm": 0.8643492460250854, "learning_rate": 9.040186323832733e-05, "loss": 0.924810791015625, "memory(GiB)": 91.52, "step": 37070, "token_acc": 0.7613059879460607, "train_speed(iter/s)": 0.16425 }, { "epoch": 0.48107334827699183, "grad_norm": 0.7257255911827087, "learning_rate": 9.03987030510652e-05, "loss": 0.8660164833068847, "memory(GiB)": 91.52, "step": 37075, "token_acc": 0.7784009754398189, "train_speed(iter/s)": 0.164242 }, { "epoch": 0.48113822667864753, "grad_norm": 0.7398331761360168, "learning_rate": 9.039554239889741e-05, "loss": 0.9588119506835937, "memory(GiB)": 91.52, "step": 37080, "token_acc": 0.7329293068420509, "train_speed(iter/s)": 0.164235 }, { "epoch": 0.48120310508030323, "grad_norm": 0.7768439054489136, "learning_rate": 9.039238128186029e-05, "loss": 0.9361793518066406, "memory(GiB)": 91.52, "step": 37085, "token_acc": 0.7515433534977509, "train_speed(iter/s)": 0.164228 }, { "epoch": 0.4812679834819589, "grad_norm": 0.7888131141662598, "learning_rate": 9.038921969999027e-05, "loss": 0.9483927726745606, "memory(GiB)": 91.52, "step": 37090, "token_acc": 0.7588377723970944, "train_speed(iter/s)": 0.164219 }, { "epoch": 0.4813328618836146, "grad_norm": 0.7837308049201965, "learning_rate": 9.038605765332371e-05, "loss": 0.9235008239746094, "memory(GiB)": 91.52, "step": 37095, "token_acc": 0.7531372002230898, "train_speed(iter/s)": 0.164212 }, { "epoch": 0.4813977402852703, "grad_norm": 0.7391892075538635, "learning_rate": 9.0382895141897e-05, "loss": 0.9427674293518067, "memory(GiB)": 91.52, "step": 37100, "token_acc": 0.7401311718846677, "train_speed(iter/s)": 0.164206 }, { "epoch": 0.481462618686926, "grad_norm": 0.769025444984436, "learning_rate": 9.037973216574654e-05, "loss": 0.9020419120788574, "memory(GiB)": 91.52, "step": 37105, "token_acc": 0.7476083182506557, "train_speed(iter/s)": 0.164198 }, { "epoch": 0.4815274970885817, "grad_norm": 0.8267367482185364, "learning_rate": 9.03765687249087e-05, "loss": 0.93363037109375, "memory(GiB)": 91.52, "step": 37110, "token_acc": 0.7596815056098444, "train_speed(iter/s)": 0.16419 }, { "epoch": 0.4815923754902374, "grad_norm": 0.7977683544158936, "learning_rate": 9.03734048194199e-05, "loss": 0.93753662109375, "memory(GiB)": 91.52, "step": 37115, "token_acc": 0.7611710874383356, "train_speed(iter/s)": 0.164182 }, { "epoch": 0.4816572538918931, "grad_norm": 0.7603380084037781, "learning_rate": 9.037024044931656e-05, "loss": 0.9121423721313476, "memory(GiB)": 91.52, "step": 37120, "token_acc": 0.7534215500945179, "train_speed(iter/s)": 0.164174 }, { "epoch": 0.4817221322935488, "grad_norm": 0.813515305519104, "learning_rate": 9.036707561463512e-05, "loss": 0.9379467010498047, "memory(GiB)": 91.52, "step": 37125, "token_acc": 0.7481582437011934, "train_speed(iter/s)": 0.164167 }, { "epoch": 0.4817870106952045, "grad_norm": 0.8350465893745422, "learning_rate": 9.036391031541193e-05, "loss": 0.9677897453308105, "memory(GiB)": 91.52, "step": 37130, "token_acc": 0.7331049768285312, "train_speed(iter/s)": 0.16416 }, { "epoch": 0.4818518890968602, "grad_norm": 0.8547181487083435, "learning_rate": 9.036074455168349e-05, "loss": 0.8740482330322266, "memory(GiB)": 91.52, "step": 37135, "token_acc": 0.7797993538513858, "train_speed(iter/s)": 0.164151 }, { "epoch": 0.4819167674985159, "grad_norm": 0.8387686014175415, "learning_rate": 9.035757832348618e-05, "loss": 0.8661445617675781, "memory(GiB)": 91.52, "step": 37140, "token_acc": 0.7593242499311863, "train_speed(iter/s)": 0.164143 }, { "epoch": 0.4819816459001716, "grad_norm": 0.7620444893836975, "learning_rate": 9.035441163085647e-05, "loss": 0.911752986907959, "memory(GiB)": 91.52, "step": 37145, "token_acc": 0.7733609166136219, "train_speed(iter/s)": 0.164135 }, { "epoch": 0.4820465243018273, "grad_norm": 0.7391924262046814, "learning_rate": 9.035124447383079e-05, "loss": 0.9165339469909668, "memory(GiB)": 91.52, "step": 37150, "token_acc": 0.7641389127791453, "train_speed(iter/s)": 0.164126 }, { "epoch": 0.482111402703483, "grad_norm": 0.7614346146583557, "learning_rate": 9.034807685244557e-05, "loss": 0.8833951950073242, "memory(GiB)": 91.52, "step": 37155, "token_acc": 0.7487717342067668, "train_speed(iter/s)": 0.164117 }, { "epoch": 0.4821762811051387, "grad_norm": 0.8084868788719177, "learning_rate": 9.034490876673728e-05, "loss": 0.9063364028930664, "memory(GiB)": 91.52, "step": 37160, "token_acc": 0.767574718864634, "train_speed(iter/s)": 0.164109 }, { "epoch": 0.4822411595067944, "grad_norm": 0.7348396182060242, "learning_rate": 9.034174021674239e-05, "loss": 0.9097168922424317, "memory(GiB)": 91.52, "step": 37165, "token_acc": 0.7415198898732837, "train_speed(iter/s)": 0.164101 }, { "epoch": 0.4823060379084501, "grad_norm": 0.9410738348960876, "learning_rate": 9.033857120249733e-05, "loss": 0.9233470916748047, "memory(GiB)": 91.52, "step": 37170, "token_acc": 0.7596137265045697, "train_speed(iter/s)": 0.164094 }, { "epoch": 0.48237091631010576, "grad_norm": 0.7018114924430847, "learning_rate": 9.03354017240386e-05, "loss": 0.9369512557983398, "memory(GiB)": 91.52, "step": 37175, "token_acc": 0.7452253874348916, "train_speed(iter/s)": 0.164086 }, { "epoch": 0.48243579471176146, "grad_norm": 0.8134874105453491, "learning_rate": 9.033223178140267e-05, "loss": 0.965241527557373, "memory(GiB)": 91.52, "step": 37180, "token_acc": 0.7395566217222268, "train_speed(iter/s)": 0.16408 }, { "epoch": 0.48250067311341716, "grad_norm": 0.751218318939209, "learning_rate": 9.0329061374626e-05, "loss": 0.8893123626708984, "memory(GiB)": 91.52, "step": 37185, "token_acc": 0.7714399392339133, "train_speed(iter/s)": 0.164072 }, { "epoch": 0.48256555151507285, "grad_norm": 0.9095156192779541, "learning_rate": 9.03258905037451e-05, "loss": 0.8985757827758789, "memory(GiB)": 91.52, "step": 37190, "token_acc": 0.7718528751510966, "train_speed(iter/s)": 0.164063 }, { "epoch": 0.48263042991672855, "grad_norm": 0.7362259030342102, "learning_rate": 9.032271916879645e-05, "loss": 0.9369371414184571, "memory(GiB)": 91.52, "step": 37195, "token_acc": 0.7561467828192795, "train_speed(iter/s)": 0.164056 }, { "epoch": 0.48269530831838425, "grad_norm": 0.6698884963989258, "learning_rate": 9.031954736981652e-05, "loss": 0.9047667503356933, "memory(GiB)": 91.52, "step": 37200, "token_acc": 0.74527983816588, "train_speed(iter/s)": 0.164049 }, { "epoch": 0.48276018672003995, "grad_norm": 0.7590576410293579, "learning_rate": 9.031637510684185e-05, "loss": 0.9247925758361817, "memory(GiB)": 91.52, "step": 37205, "token_acc": 0.7752486282578875, "train_speed(iter/s)": 0.164039 }, { "epoch": 0.48282506512169565, "grad_norm": 0.6945743560791016, "learning_rate": 9.031320237990893e-05, "loss": 0.906588363647461, "memory(GiB)": 91.52, "step": 37210, "token_acc": 0.7578544819173837, "train_speed(iter/s)": 0.16403 }, { "epoch": 0.48288994352335135, "grad_norm": 0.8067488074302673, "learning_rate": 9.031002918905427e-05, "loss": 0.9361480712890625, "memory(GiB)": 91.52, "step": 37215, "token_acc": 0.745014309726071, "train_speed(iter/s)": 0.164022 }, { "epoch": 0.48295482192500705, "grad_norm": 0.8317169547080994, "learning_rate": 9.03068555343144e-05, "loss": 0.9217408180236817, "memory(GiB)": 91.52, "step": 37220, "token_acc": 0.7506890795450134, "train_speed(iter/s)": 0.164015 }, { "epoch": 0.48301970032666275, "grad_norm": 0.8085234761238098, "learning_rate": 9.03036814157258e-05, "loss": 0.9821534156799316, "memory(GiB)": 91.52, "step": 37225, "token_acc": 0.7523103374167204, "train_speed(iter/s)": 0.164009 }, { "epoch": 0.48308457872831845, "grad_norm": 0.8153948187828064, "learning_rate": 9.030050683332505e-05, "loss": 0.9319693565368652, "memory(GiB)": 91.52, "step": 37230, "token_acc": 0.7533672042827428, "train_speed(iter/s)": 0.164001 }, { "epoch": 0.48314945712997415, "grad_norm": 0.8648070693016052, "learning_rate": 9.029733178714866e-05, "loss": 0.9509065628051758, "memory(GiB)": 91.52, "step": 37235, "token_acc": 0.748734228080045, "train_speed(iter/s)": 0.163993 }, { "epoch": 0.48321433553162985, "grad_norm": 0.8636986017227173, "learning_rate": 9.029415627723317e-05, "loss": 0.9455699920654297, "memory(GiB)": 91.52, "step": 37240, "token_acc": 0.7636872844017897, "train_speed(iter/s)": 0.163986 }, { "epoch": 0.48327921393328555, "grad_norm": 0.7983958125114441, "learning_rate": 9.029098030361512e-05, "loss": 0.9155657768249512, "memory(GiB)": 91.52, "step": 37245, "token_acc": 0.7547869709602204, "train_speed(iter/s)": 0.163978 }, { "epoch": 0.48334409233494124, "grad_norm": 0.7625327706336975, "learning_rate": 9.028780386633107e-05, "loss": 0.9023521423339844, "memory(GiB)": 91.52, "step": 37250, "token_acc": 0.7493744191034533, "train_speed(iter/s)": 0.16397 }, { "epoch": 0.48340897073659694, "grad_norm": 0.7542018294334412, "learning_rate": 9.028462696541755e-05, "loss": 0.901743221282959, "memory(GiB)": 91.52, "step": 37255, "token_acc": 0.744502795188887, "train_speed(iter/s)": 0.163963 }, { "epoch": 0.48347384913825264, "grad_norm": 0.761153519153595, "learning_rate": 9.028144960091113e-05, "loss": 0.9445740699768066, "memory(GiB)": 91.52, "step": 37260, "token_acc": 0.7697345302367186, "train_speed(iter/s)": 0.163954 }, { "epoch": 0.48353872753990834, "grad_norm": 0.7652332782745361, "learning_rate": 9.02782717728484e-05, "loss": 0.9050947189331054, "memory(GiB)": 91.52, "step": 37265, "token_acc": 0.7585831458227481, "train_speed(iter/s)": 0.163947 }, { "epoch": 0.48360360594156404, "grad_norm": 0.7244859337806702, "learning_rate": 9.02750934812659e-05, "loss": 0.8798031806945801, "memory(GiB)": 91.52, "step": 37270, "token_acc": 0.7330534718876884, "train_speed(iter/s)": 0.16394 }, { "epoch": 0.48366848434321974, "grad_norm": 0.721591055393219, "learning_rate": 9.027191472620022e-05, "loss": 0.8626821517944336, "memory(GiB)": 91.52, "step": 37275, "token_acc": 0.7727912431587177, "train_speed(iter/s)": 0.163932 }, { "epoch": 0.48373336274487544, "grad_norm": 0.8363763689994812, "learning_rate": 9.026873550768795e-05, "loss": 0.9502909660339356, "memory(GiB)": 91.52, "step": 37280, "token_acc": 0.753707644458995, "train_speed(iter/s)": 0.163925 }, { "epoch": 0.48379824114653114, "grad_norm": 0.830996572971344, "learning_rate": 9.026555582576565e-05, "loss": 0.9330483436584472, "memory(GiB)": 91.52, "step": 37285, "token_acc": 0.7664767013074086, "train_speed(iter/s)": 0.163919 }, { "epoch": 0.48386311954818684, "grad_norm": 0.7393617630004883, "learning_rate": 9.026237568046993e-05, "loss": 0.8954893112182617, "memory(GiB)": 91.52, "step": 37290, "token_acc": 0.7791806861146762, "train_speed(iter/s)": 0.163912 }, { "epoch": 0.4839279979498425, "grad_norm": 0.7353655695915222, "learning_rate": 9.025919507183738e-05, "loss": 0.9137861251831054, "memory(GiB)": 91.52, "step": 37295, "token_acc": 0.7510590727229937, "train_speed(iter/s)": 0.163904 }, { "epoch": 0.4839928763514982, "grad_norm": 0.8537885546684265, "learning_rate": 9.02560139999046e-05, "loss": 0.9665909767150879, "memory(GiB)": 91.52, "step": 37300, "token_acc": 0.7538497511440692, "train_speed(iter/s)": 0.163896 }, { "epoch": 0.4840577547531539, "grad_norm": 0.7612228393554688, "learning_rate": 9.025283246470819e-05, "loss": 0.9016521453857422, "memory(GiB)": 91.52, "step": 37305, "token_acc": 0.7632646982164799, "train_speed(iter/s)": 0.163888 }, { "epoch": 0.4841226331548096, "grad_norm": 0.7490016222000122, "learning_rate": 9.02496504662848e-05, "loss": 0.8607166290283204, "memory(GiB)": 91.52, "step": 37310, "token_acc": 0.7824942524905875, "train_speed(iter/s)": 0.163879 }, { "epoch": 0.4841875115564653, "grad_norm": 0.7629415392875671, "learning_rate": 9.024646800467102e-05, "loss": 0.8966157913208008, "memory(GiB)": 91.52, "step": 37315, "token_acc": 0.7630923555863023, "train_speed(iter/s)": 0.163871 }, { "epoch": 0.484252389958121, "grad_norm": 0.657008945941925, "learning_rate": 9.024328507990346e-05, "loss": 0.9331992149353028, "memory(GiB)": 91.52, "step": 37320, "token_acc": 0.7559167089893347, "train_speed(iter/s)": 0.163864 }, { "epoch": 0.4843172683597767, "grad_norm": 0.7190353870391846, "learning_rate": 9.024010169201877e-05, "loss": 0.9060878753662109, "memory(GiB)": 91.52, "step": 37325, "token_acc": 0.7472930201565884, "train_speed(iter/s)": 0.163855 }, { "epoch": 0.4843821467614324, "grad_norm": 0.7279659509658813, "learning_rate": 9.023691784105359e-05, "loss": 0.8995264053344727, "memory(GiB)": 91.52, "step": 37330, "token_acc": 0.7708040043797904, "train_speed(iter/s)": 0.163847 }, { "epoch": 0.4844470251630881, "grad_norm": 0.7743611931800842, "learning_rate": 9.023373352704454e-05, "loss": 0.9588827133178711, "memory(GiB)": 91.52, "step": 37335, "token_acc": 0.7375170186492733, "train_speed(iter/s)": 0.163839 }, { "epoch": 0.4845119035647438, "grad_norm": 0.7380636930465698, "learning_rate": 9.023054875002828e-05, "loss": 0.8904205322265625, "memory(GiB)": 91.52, "step": 37340, "token_acc": 0.7545391061452514, "train_speed(iter/s)": 0.16383 }, { "epoch": 0.4845767819663995, "grad_norm": 0.8681395053863525, "learning_rate": 9.022736351004145e-05, "loss": 0.9251602172851563, "memory(GiB)": 91.52, "step": 37345, "token_acc": 0.7602872531418312, "train_speed(iter/s)": 0.163825 }, { "epoch": 0.48464166036805517, "grad_norm": 0.7405557632446289, "learning_rate": 9.022417780712071e-05, "loss": 0.9550783157348632, "memory(GiB)": 91.52, "step": 37350, "token_acc": 0.7396508728179552, "train_speed(iter/s)": 0.163817 }, { "epoch": 0.48470653876971087, "grad_norm": 0.7564265727996826, "learning_rate": 9.022099164130271e-05, "loss": 0.8947383880615234, "memory(GiB)": 91.52, "step": 37355, "token_acc": 0.76719708862566, "train_speed(iter/s)": 0.16381 }, { "epoch": 0.48477141717136657, "grad_norm": 0.7592186331748962, "learning_rate": 9.021780501262414e-05, "loss": 0.9358340263366699, "memory(GiB)": 91.52, "step": 37360, "token_acc": 0.7399424169325345, "train_speed(iter/s)": 0.163804 }, { "epoch": 0.48483629557302227, "grad_norm": 0.789216160774231, "learning_rate": 9.021461792112166e-05, "loss": 0.9079769134521485, "memory(GiB)": 91.52, "step": 37365, "token_acc": 0.760505043757098, "train_speed(iter/s)": 0.163796 }, { "epoch": 0.48490117397467797, "grad_norm": 0.7695838809013367, "learning_rate": 9.021143036683194e-05, "loss": 0.953085994720459, "memory(GiB)": 91.52, "step": 37370, "token_acc": 0.7479166666666667, "train_speed(iter/s)": 0.163789 }, { "epoch": 0.48496605237633367, "grad_norm": 0.796725869178772, "learning_rate": 9.020824234979167e-05, "loss": 0.9643032073974609, "memory(GiB)": 91.52, "step": 37375, "token_acc": 0.7534215500945179, "train_speed(iter/s)": 0.163781 }, { "epoch": 0.48503093077798937, "grad_norm": 0.7390077710151672, "learning_rate": 9.020505387003753e-05, "loss": 0.9040847778320312, "memory(GiB)": 91.52, "step": 37380, "token_acc": 0.7480483760061482, "train_speed(iter/s)": 0.163773 }, { "epoch": 0.48509580917964507, "grad_norm": 0.7634555101394653, "learning_rate": 9.020186492760623e-05, "loss": 0.9168580055236817, "memory(GiB)": 91.52, "step": 37385, "token_acc": 0.7551623469097123, "train_speed(iter/s)": 0.163767 }, { "epoch": 0.48516068758130076, "grad_norm": 0.854987621307373, "learning_rate": 9.019867552253445e-05, "loss": 0.9486943244934082, "memory(GiB)": 91.52, "step": 37390, "token_acc": 0.7496793252225289, "train_speed(iter/s)": 0.163761 }, { "epoch": 0.48522556598295646, "grad_norm": 0.7018701434135437, "learning_rate": 9.01954856548589e-05, "loss": 0.9042507171630859, "memory(GiB)": 91.52, "step": 37395, "token_acc": 0.7679806469621313, "train_speed(iter/s)": 0.163752 }, { "epoch": 0.48529044438461216, "grad_norm": 0.7648683190345764, "learning_rate": 9.019229532461628e-05, "loss": 0.9140314102172852, "memory(GiB)": 91.52, "step": 37400, "token_acc": 0.7467783505154639, "train_speed(iter/s)": 0.163745 }, { "epoch": 0.48535532278626786, "grad_norm": 0.8570443391799927, "learning_rate": 9.01891045318433e-05, "loss": 0.9346454620361329, "memory(GiB)": 91.52, "step": 37405, "token_acc": 0.7437874579739804, "train_speed(iter/s)": 0.163736 }, { "epoch": 0.48542020118792356, "grad_norm": 0.7727245688438416, "learning_rate": 9.018591327657671e-05, "loss": 0.8980717658996582, "memory(GiB)": 91.52, "step": 37410, "token_acc": 0.7706397585370184, "train_speed(iter/s)": 0.163728 }, { "epoch": 0.4854850795895792, "grad_norm": 0.7829176783561707, "learning_rate": 9.018272155885322e-05, "loss": 0.9226384162902832, "memory(GiB)": 91.52, "step": 37415, "token_acc": 0.7716788843196853, "train_speed(iter/s)": 0.163722 }, { "epoch": 0.4855499579912349, "grad_norm": 0.7657821774482727, "learning_rate": 9.017952937870955e-05, "loss": 0.9713201522827148, "memory(GiB)": 91.52, "step": 37420, "token_acc": 0.7290756059675613, "train_speed(iter/s)": 0.163715 }, { "epoch": 0.4856148363928906, "grad_norm": 0.7138448357582092, "learning_rate": 9.017633673618245e-05, "loss": 0.9127158164978028, "memory(GiB)": 91.52, "step": 37425, "token_acc": 0.7763973221236182, "train_speed(iter/s)": 0.163708 }, { "epoch": 0.4856797147945463, "grad_norm": 0.7117810249328613, "learning_rate": 9.017314363130865e-05, "loss": 0.9255946159362793, "memory(GiB)": 91.52, "step": 37430, "token_acc": 0.7581251304892477, "train_speed(iter/s)": 0.163701 }, { "epoch": 0.485744593196202, "grad_norm": 0.72123783826828, "learning_rate": 9.016995006412488e-05, "loss": 0.9878374099731445, "memory(GiB)": 91.52, "step": 37435, "token_acc": 0.7423774394010975, "train_speed(iter/s)": 0.163693 }, { "epoch": 0.4858094715978577, "grad_norm": 0.7926099300384521, "learning_rate": 9.016675603466793e-05, "loss": 0.9280417442321778, "memory(GiB)": 91.52, "step": 37440, "token_acc": 0.7460939719334129, "train_speed(iter/s)": 0.163685 }, { "epoch": 0.4858743499995134, "grad_norm": 0.8357413411140442, "learning_rate": 9.016356154297451e-05, "loss": 0.9407583236694336, "memory(GiB)": 91.52, "step": 37445, "token_acc": 0.7418376629940362, "train_speed(iter/s)": 0.163677 }, { "epoch": 0.4859392284011691, "grad_norm": 0.7822026610374451, "learning_rate": 9.016036658908144e-05, "loss": 0.929692554473877, "memory(GiB)": 91.52, "step": 37450, "token_acc": 0.7450859382232233, "train_speed(iter/s)": 0.16367 }, { "epoch": 0.4860041068028248, "grad_norm": 0.6969814300537109, "learning_rate": 9.015717117302544e-05, "loss": 0.9182247161865235, "memory(GiB)": 91.52, "step": 37455, "token_acc": 0.7392546731753781, "train_speed(iter/s)": 0.163662 }, { "epoch": 0.4860689852044805, "grad_norm": 0.69538813829422, "learning_rate": 9.01539752948433e-05, "loss": 0.9117117881774902, "memory(GiB)": 91.52, "step": 37460, "token_acc": 0.7467462206696138, "train_speed(iter/s)": 0.163654 }, { "epoch": 0.4861338636061362, "grad_norm": 0.7572193145751953, "learning_rate": 9.015077895457181e-05, "loss": 0.8796659469604492, "memory(GiB)": 91.52, "step": 37465, "token_acc": 0.7741037243299687, "train_speed(iter/s)": 0.163646 }, { "epoch": 0.4861987420077919, "grad_norm": 0.7228693962097168, "learning_rate": 9.014758215224772e-05, "loss": 0.9603355407714844, "memory(GiB)": 91.52, "step": 37470, "token_acc": 0.7678323213109957, "train_speed(iter/s)": 0.163639 }, { "epoch": 0.4862636204094476, "grad_norm": 0.7578989863395691, "learning_rate": 9.014438488790786e-05, "loss": 0.9198198318481445, "memory(GiB)": 91.52, "step": 37475, "token_acc": 0.7700997801454422, "train_speed(iter/s)": 0.163632 }, { "epoch": 0.4863284988111033, "grad_norm": 0.8005654215812683, "learning_rate": 9.014118716158899e-05, "loss": 0.9090202331542969, "memory(GiB)": 91.52, "step": 37480, "token_acc": 0.7573571113491714, "train_speed(iter/s)": 0.163623 }, { "epoch": 0.486393377212759, "grad_norm": 0.806774377822876, "learning_rate": 9.013798897332792e-05, "loss": 0.8815706253051758, "memory(GiB)": 91.52, "step": 37485, "token_acc": 0.7373012987871377, "train_speed(iter/s)": 0.163616 }, { "epoch": 0.4864582556144147, "grad_norm": 0.706506073474884, "learning_rate": 9.013479032316145e-05, "loss": 0.8972932815551757, "memory(GiB)": 91.52, "step": 37490, "token_acc": 0.7647506661591169, "train_speed(iter/s)": 0.163608 }, { "epoch": 0.4865231340160704, "grad_norm": 0.8348091840744019, "learning_rate": 9.013159121112642e-05, "loss": 0.9423748970031738, "memory(GiB)": 91.52, "step": 37495, "token_acc": 0.7541997320416366, "train_speed(iter/s)": 0.1636 }, { "epoch": 0.4865880124177261, "grad_norm": 0.8079753518104553, "learning_rate": 9.01283916372596e-05, "loss": 0.9477941513061523, "memory(GiB)": 91.52, "step": 37500, "token_acc": 0.7440501452776599, "train_speed(iter/s)": 0.163594 }, { "epoch": 0.4866528908193818, "grad_norm": 0.799561083316803, "learning_rate": 9.012519160159785e-05, "loss": 0.970051383972168, "memory(GiB)": 91.52, "step": 37505, "token_acc": 0.7594203768150726, "train_speed(iter/s)": 0.163586 }, { "epoch": 0.4867177692210375, "grad_norm": 0.7514704465866089, "learning_rate": 9.012199110417796e-05, "loss": 0.9363739967346192, "memory(GiB)": 91.52, "step": 37510, "token_acc": 0.7460812798163492, "train_speed(iter/s)": 0.163578 }, { "epoch": 0.4867826476226932, "grad_norm": 0.7982296347618103, "learning_rate": 9.011879014503677e-05, "loss": 0.9506779670715332, "memory(GiB)": 91.52, "step": 37515, "token_acc": 0.7385108230292506, "train_speed(iter/s)": 0.16357 }, { "epoch": 0.4868475260243489, "grad_norm": 0.7994272708892822, "learning_rate": 9.011558872421116e-05, "loss": 0.9379180908203125, "memory(GiB)": 91.52, "step": 37520, "token_acc": 0.7596695456252347, "train_speed(iter/s)": 0.163564 }, { "epoch": 0.4869124044260046, "grad_norm": 0.8533226251602173, "learning_rate": 9.01123868417379e-05, "loss": 0.9416048049926757, "memory(GiB)": 91.52, "step": 37525, "token_acc": 0.7597107594457485, "train_speed(iter/s)": 0.163558 }, { "epoch": 0.4869772828276603, "grad_norm": 0.7561948895454407, "learning_rate": 9.010918449765389e-05, "loss": 0.9717732429504394, "memory(GiB)": 91.52, "step": 37530, "token_acc": 0.7390186838896761, "train_speed(iter/s)": 0.163551 }, { "epoch": 0.48704216122931593, "grad_norm": 0.8856300115585327, "learning_rate": 9.010598169199597e-05, "loss": 0.9602890014648438, "memory(GiB)": 91.52, "step": 37535, "token_acc": 0.7764312439985431, "train_speed(iter/s)": 0.163543 }, { "epoch": 0.4871070396309716, "grad_norm": 0.7786167860031128, "learning_rate": 9.010277842480099e-05, "loss": 0.8777345657348633, "memory(GiB)": 91.52, "step": 37540, "token_acc": 0.7706213834575092, "train_speed(iter/s)": 0.163536 }, { "epoch": 0.4871719180326273, "grad_norm": 0.7159758806228638, "learning_rate": 9.009957469610581e-05, "loss": 0.9321412086486817, "memory(GiB)": 91.52, "step": 37545, "token_acc": 0.7711529337339871, "train_speed(iter/s)": 0.163529 }, { "epoch": 0.487236796434283, "grad_norm": 0.7548480033874512, "learning_rate": 9.009637050594732e-05, "loss": 0.918426513671875, "memory(GiB)": 91.52, "step": 37550, "token_acc": 0.7595663750475466, "train_speed(iter/s)": 0.163522 }, { "epoch": 0.4873016748359387, "grad_norm": 0.8444288372993469, "learning_rate": 9.009316585436235e-05, "loss": 0.9580515861511231, "memory(GiB)": 91.52, "step": 37555, "token_acc": 0.7440750704586215, "train_speed(iter/s)": 0.163515 }, { "epoch": 0.4873665532375944, "grad_norm": 0.7265641689300537, "learning_rate": 9.008996074138785e-05, "loss": 0.9165166854858399, "memory(GiB)": 91.52, "step": 37560, "token_acc": 0.7547706837440973, "train_speed(iter/s)": 0.163508 }, { "epoch": 0.4874314316392501, "grad_norm": 0.7863579392433167, "learning_rate": 9.008675516706063e-05, "loss": 0.932928466796875, "memory(GiB)": 91.52, "step": 37565, "token_acc": 0.7574271025469959, "train_speed(iter/s)": 0.163501 }, { "epoch": 0.4874963100409058, "grad_norm": 0.8297694325447083, "learning_rate": 9.008354913141762e-05, "loss": 0.9149340629577637, "memory(GiB)": 91.52, "step": 37570, "token_acc": 0.7552885259070826, "train_speed(iter/s)": 0.163495 }, { "epoch": 0.4875611884425615, "grad_norm": 0.780861496925354, "learning_rate": 9.008034263449572e-05, "loss": 0.9030744552612304, "memory(GiB)": 91.52, "step": 37575, "token_acc": 0.7376580667893357, "train_speed(iter/s)": 0.163487 }, { "epoch": 0.4876260668442172, "grad_norm": 0.7821309566497803, "learning_rate": 9.00771356763318e-05, "loss": 0.9594261169433593, "memory(GiB)": 91.52, "step": 37580, "token_acc": 0.7452463708852995, "train_speed(iter/s)": 0.16348 }, { "epoch": 0.4876909452458729, "grad_norm": 0.8081151843070984, "learning_rate": 9.007392825696278e-05, "loss": 0.8556371688842773, "memory(GiB)": 91.52, "step": 37585, "token_acc": 0.7660956366331023, "train_speed(iter/s)": 0.163472 }, { "epoch": 0.4877558236475286, "grad_norm": 0.7895299792289734, "learning_rate": 9.007072037642558e-05, "loss": 0.8935794830322266, "memory(GiB)": 91.52, "step": 37590, "token_acc": 0.7396026823662752, "train_speed(iter/s)": 0.163464 }, { "epoch": 0.4878207020491843, "grad_norm": 0.9286609292030334, "learning_rate": 9.006751203475711e-05, "loss": 0.9626359939575195, "memory(GiB)": 91.52, "step": 37595, "token_acc": 0.7467388188073395, "train_speed(iter/s)": 0.163457 }, { "epoch": 0.48788558045084, "grad_norm": 0.7178387641906738, "learning_rate": 9.006430323199428e-05, "loss": 0.9554773330688476, "memory(GiB)": 91.52, "step": 37600, "token_acc": 0.7415834192470404, "train_speed(iter/s)": 0.16345 }, { "epoch": 0.4879504588524957, "grad_norm": 0.7333540320396423, "learning_rate": 9.006109396817404e-05, "loss": 0.9430015563964844, "memory(GiB)": 91.52, "step": 37605, "token_acc": 0.732397765671333, "train_speed(iter/s)": 0.163444 }, { "epoch": 0.4880153372541514, "grad_norm": 0.7204821705818176, "learning_rate": 9.005788424333331e-05, "loss": 0.9149975776672363, "memory(GiB)": 91.52, "step": 37610, "token_acc": 0.733041653327642, "train_speed(iter/s)": 0.163436 }, { "epoch": 0.4880802156558071, "grad_norm": 0.7367596626281738, "learning_rate": 9.005467405750902e-05, "loss": 0.9165651321411132, "memory(GiB)": 91.52, "step": 37615, "token_acc": 0.7518729289727705, "train_speed(iter/s)": 0.163429 }, { "epoch": 0.4881450940574628, "grad_norm": 0.8347969651222229, "learning_rate": 9.005146341073811e-05, "loss": 0.902902889251709, "memory(GiB)": 91.52, "step": 37620, "token_acc": 0.7559729774262646, "train_speed(iter/s)": 0.163422 }, { "epoch": 0.4882099724591185, "grad_norm": 0.7796126008033752, "learning_rate": 9.004825230305756e-05, "loss": 0.9316045761108398, "memory(GiB)": 91.52, "step": 37625, "token_acc": 0.7487268318032219, "train_speed(iter/s)": 0.163414 }, { "epoch": 0.4882748508607742, "grad_norm": 0.7784769535064697, "learning_rate": 9.004504073450428e-05, "loss": 0.968534278869629, "memory(GiB)": 91.52, "step": 37630, "token_acc": 0.7295968534906588, "train_speed(iter/s)": 0.163406 }, { "epoch": 0.4883397292624299, "grad_norm": 0.7928036451339722, "learning_rate": 9.004182870511525e-05, "loss": 0.9242023468017578, "memory(GiB)": 91.52, "step": 37635, "token_acc": 0.721807209264394, "train_speed(iter/s)": 0.163399 }, { "epoch": 0.4884046076640856, "grad_norm": 0.7338783740997314, "learning_rate": 9.003861621492744e-05, "loss": 0.9346967697143554, "memory(GiB)": 91.52, "step": 37640, "token_acc": 0.745924844294782, "train_speed(iter/s)": 0.163392 }, { "epoch": 0.4884694860657413, "grad_norm": 0.7018629908561707, "learning_rate": 9.003540326397781e-05, "loss": 0.9156661987304687, "memory(GiB)": 91.52, "step": 37645, "token_acc": 0.7467446186553282, "train_speed(iter/s)": 0.163384 }, { "epoch": 0.488534364467397, "grad_norm": 0.8296741247177124, "learning_rate": 9.003218985230333e-05, "loss": 0.946255111694336, "memory(GiB)": 91.52, "step": 37650, "token_acc": 0.7404631896064772, "train_speed(iter/s)": 0.163377 }, { "epoch": 0.48859924286905265, "grad_norm": 0.6990853548049927, "learning_rate": 9.002897597994099e-05, "loss": 0.9644222259521484, "memory(GiB)": 91.52, "step": 37655, "token_acc": 0.7365090843351713, "train_speed(iter/s)": 0.16337 }, { "epoch": 0.48866412127070835, "grad_norm": 0.722322940826416, "learning_rate": 9.002576164692777e-05, "loss": 0.8959528923034668, "memory(GiB)": 91.52, "step": 37660, "token_acc": 0.745498589919734, "train_speed(iter/s)": 0.163362 }, { "epoch": 0.48872899967236405, "grad_norm": 0.7316393852233887, "learning_rate": 9.002254685330067e-05, "loss": 0.9393978118896484, "memory(GiB)": 91.52, "step": 37665, "token_acc": 0.7474713418745785, "train_speed(iter/s)": 0.163354 }, { "epoch": 0.48879387807401975, "grad_norm": 0.756471574306488, "learning_rate": 9.001933159909666e-05, "loss": 0.9399236679077149, "memory(GiB)": 91.52, "step": 37670, "token_acc": 0.7462545098758637, "train_speed(iter/s)": 0.163347 }, { "epoch": 0.48885875647567545, "grad_norm": 0.9112948775291443, "learning_rate": 9.001611588435277e-05, "loss": 0.9059186935424804, "memory(GiB)": 91.52, "step": 37675, "token_acc": 0.7537686479731313, "train_speed(iter/s)": 0.163339 }, { "epoch": 0.48892363487733115, "grad_norm": 0.7148289680480957, "learning_rate": 9.001289970910598e-05, "loss": 0.9098306655883789, "memory(GiB)": 91.52, "step": 37680, "token_acc": 0.7696374622356495, "train_speed(iter/s)": 0.16333 }, { "epoch": 0.48898851327898685, "grad_norm": 0.7419551610946655, "learning_rate": 9.000968307339332e-05, "loss": 0.9386503219604492, "memory(GiB)": 91.52, "step": 37685, "token_acc": 0.7479315687114745, "train_speed(iter/s)": 0.163324 }, { "epoch": 0.48905339168064255, "grad_norm": 0.7808006405830383, "learning_rate": 9.000646597725183e-05, "loss": 0.9193268775939941, "memory(GiB)": 91.52, "step": 37690, "token_acc": 0.7400341567042946, "train_speed(iter/s)": 0.163317 }, { "epoch": 0.48911827008229825, "grad_norm": 0.9740790128707886, "learning_rate": 9.000324842071846e-05, "loss": 1.0027458190917968, "memory(GiB)": 91.52, "step": 37695, "token_acc": 0.735873898255495, "train_speed(iter/s)": 0.163309 }, { "epoch": 0.48918314848395394, "grad_norm": 0.7448439002037048, "learning_rate": 9.000003040383031e-05, "loss": 0.967134666442871, "memory(GiB)": 91.52, "step": 37700, "token_acc": 0.736247219383154, "train_speed(iter/s)": 0.163301 }, { "epoch": 0.48924802688560964, "grad_norm": 0.7282761931419373, "learning_rate": 8.999681192662437e-05, "loss": 0.9421848297119141, "memory(GiB)": 91.52, "step": 37705, "token_acc": 0.7543743142576659, "train_speed(iter/s)": 0.163294 }, { "epoch": 0.48931290528726534, "grad_norm": 0.7040739059448242, "learning_rate": 8.99935929891377e-05, "loss": 0.8786693572998047, "memory(GiB)": 91.52, "step": 37710, "token_acc": 0.7699543809973258, "train_speed(iter/s)": 0.163285 }, { "epoch": 0.48937778368892104, "grad_norm": 0.7520244121551514, "learning_rate": 8.999037359140735e-05, "loss": 0.9394051551818847, "memory(GiB)": 91.52, "step": 37715, "token_acc": 0.7474528588807786, "train_speed(iter/s)": 0.163277 }, { "epoch": 0.48944266209057674, "grad_norm": 0.7873300909996033, "learning_rate": 8.998715373347033e-05, "loss": 0.9041150093078614, "memory(GiB)": 91.52, "step": 37720, "token_acc": 0.7535044545511184, "train_speed(iter/s)": 0.163269 }, { "epoch": 0.48950754049223244, "grad_norm": 0.7264984846115112, "learning_rate": 8.998393341536373e-05, "loss": 0.9214157104492188, "memory(GiB)": 91.52, "step": 37725, "token_acc": 0.7670092670598146, "train_speed(iter/s)": 0.163261 }, { "epoch": 0.48957241889388814, "grad_norm": 0.7408735752105713, "learning_rate": 8.99807126371246e-05, "loss": 0.9070934295654297, "memory(GiB)": 91.52, "step": 37730, "token_acc": 0.7740895374312761, "train_speed(iter/s)": 0.163256 }, { "epoch": 0.48963729729554384, "grad_norm": 0.7929748892784119, "learning_rate": 8.997749139879e-05, "loss": 0.9445230484008789, "memory(GiB)": 91.52, "step": 37735, "token_acc": 0.7633858411607762, "train_speed(iter/s)": 0.163249 }, { "epoch": 0.48970217569719954, "grad_norm": 0.7184607982635498, "learning_rate": 8.9974269700397e-05, "loss": 0.9619832992553711, "memory(GiB)": 91.52, "step": 37740, "token_acc": 0.755857691403495, "train_speed(iter/s)": 0.163242 }, { "epoch": 0.48976705409885524, "grad_norm": 0.7492313385009766, "learning_rate": 8.997104754198266e-05, "loss": 0.9580476760864258, "memory(GiB)": 91.52, "step": 37745, "token_acc": 0.7541622675221311, "train_speed(iter/s)": 0.163234 }, { "epoch": 0.48983193250051094, "grad_norm": 0.7611730694770813, "learning_rate": 8.996782492358411e-05, "loss": 0.8883198738098145, "memory(GiB)": 91.52, "step": 37750, "token_acc": 0.7513674503851772, "train_speed(iter/s)": 0.163226 }, { "epoch": 0.48989681090216664, "grad_norm": 0.8050960302352905, "learning_rate": 8.996460184523838e-05, "loss": 0.9063654899597168, "memory(GiB)": 91.52, "step": 37755, "token_acc": 0.7672572448839445, "train_speed(iter/s)": 0.163218 }, { "epoch": 0.48996168930382233, "grad_norm": 0.7201831340789795, "learning_rate": 8.99613783069826e-05, "loss": 0.9307251930236816, "memory(GiB)": 91.52, "step": 37760, "token_acc": 0.7612616499827408, "train_speed(iter/s)": 0.163212 }, { "epoch": 0.49002656770547803, "grad_norm": 0.8001465201377869, "learning_rate": 8.995815430885384e-05, "loss": 0.9440978050231934, "memory(GiB)": 91.52, "step": 37765, "token_acc": 0.7410240899548091, "train_speed(iter/s)": 0.163206 }, { "epoch": 0.4900914461071337, "grad_norm": 0.807342529296875, "learning_rate": 8.995492985088921e-05, "loss": 0.9607345581054687, "memory(GiB)": 91.52, "step": 37770, "token_acc": 0.7611950915881203, "train_speed(iter/s)": 0.163198 }, { "epoch": 0.4901563245087894, "grad_norm": 0.7495357990264893, "learning_rate": 8.995170493312582e-05, "loss": 0.9499748229980469, "memory(GiB)": 91.52, "step": 37775, "token_acc": 0.7304617535275872, "train_speed(iter/s)": 0.163192 }, { "epoch": 0.4902212029104451, "grad_norm": 0.792893648147583, "learning_rate": 8.99484795556008e-05, "loss": 0.9290783882141114, "memory(GiB)": 91.52, "step": 37780, "token_acc": 0.7570970607547024, "train_speed(iter/s)": 0.163184 }, { "epoch": 0.4902860813121008, "grad_norm": 0.7750411629676819, "learning_rate": 8.994525371835121e-05, "loss": 0.9195589065551758, "memory(GiB)": 91.52, "step": 37785, "token_acc": 0.7489911218724778, "train_speed(iter/s)": 0.163178 }, { "epoch": 0.4903509597137565, "grad_norm": 0.7044247984886169, "learning_rate": 8.994202742141425e-05, "loss": 0.867065143585205, "memory(GiB)": 91.52, "step": 37790, "token_acc": 0.7386318844734296, "train_speed(iter/s)": 0.163171 }, { "epoch": 0.4904158381154122, "grad_norm": 0.7813559770584106, "learning_rate": 8.993880066482698e-05, "loss": 0.9307364463806153, "memory(GiB)": 91.52, "step": 37795, "token_acc": 0.7389692779242997, "train_speed(iter/s)": 0.163163 }, { "epoch": 0.49048071651706787, "grad_norm": 0.7453730702400208, "learning_rate": 8.993557344862658e-05, "loss": 0.9012126922607422, "memory(GiB)": 91.52, "step": 37800, "token_acc": 0.7524335318901642, "train_speed(iter/s)": 0.163157 }, { "epoch": 0.49054559491872357, "grad_norm": 0.7986934185028076, "learning_rate": 8.993234577285016e-05, "loss": 0.922877025604248, "memory(GiB)": 91.52, "step": 37805, "token_acc": 0.7448170509929439, "train_speed(iter/s)": 0.163149 }, { "epoch": 0.49061047332037927, "grad_norm": 0.7287336587905884, "learning_rate": 8.992911763753488e-05, "loss": 0.9135847091674805, "memory(GiB)": 91.52, "step": 37810, "token_acc": 0.7618151815181519, "train_speed(iter/s)": 0.163141 }, { "epoch": 0.49067535172203497, "grad_norm": 0.7623665928840637, "learning_rate": 8.992588904271789e-05, "loss": 0.9398677825927735, "memory(GiB)": 91.52, "step": 37815, "token_acc": 0.7677806684010406, "train_speed(iter/s)": 0.163135 }, { "epoch": 0.49074023012369067, "grad_norm": 0.7561687231063843, "learning_rate": 8.992265998843634e-05, "loss": 0.9612468719482422, "memory(GiB)": 91.52, "step": 37820, "token_acc": 0.7479886168007589, "train_speed(iter/s)": 0.163128 }, { "epoch": 0.49080510852534637, "grad_norm": 0.6963998079299927, "learning_rate": 8.991943047472737e-05, "loss": 0.9378676414489746, "memory(GiB)": 91.52, "step": 37825, "token_acc": 0.7347644648295049, "train_speed(iter/s)": 0.163121 }, { "epoch": 0.49086998692700207, "grad_norm": 0.6886767745018005, "learning_rate": 8.991620050162818e-05, "loss": 0.9262905120849609, "memory(GiB)": 91.52, "step": 37830, "token_acc": 0.7415552775400013, "train_speed(iter/s)": 0.163114 }, { "epoch": 0.49093486532865777, "grad_norm": 0.8297800421714783, "learning_rate": 8.991297006917592e-05, "loss": 0.9262752532958984, "memory(GiB)": 91.52, "step": 37835, "token_acc": 0.7584611949344476, "train_speed(iter/s)": 0.163107 }, { "epoch": 0.49099974373031346, "grad_norm": 0.7576852440834045, "learning_rate": 8.990973917740777e-05, "loss": 0.953227424621582, "memory(GiB)": 91.52, "step": 37840, "token_acc": 0.7393220232287306, "train_speed(iter/s)": 0.1631 }, { "epoch": 0.49106462213196916, "grad_norm": 0.8499167561531067, "learning_rate": 8.99065078263609e-05, "loss": 0.9483733177185059, "memory(GiB)": 91.52, "step": 37845, "token_acc": 0.7571953976448359, "train_speed(iter/s)": 0.163093 }, { "epoch": 0.49112950053362486, "grad_norm": 0.7618536949157715, "learning_rate": 8.990327601607252e-05, "loss": 0.9234390258789062, "memory(GiB)": 91.52, "step": 37850, "token_acc": 0.7713462049083943, "train_speed(iter/s)": 0.163085 }, { "epoch": 0.49119437893528056, "grad_norm": 0.7648212909698486, "learning_rate": 8.99000437465798e-05, "loss": 0.8668168067932129, "memory(GiB)": 91.52, "step": 37855, "token_acc": 0.7509349289454001, "train_speed(iter/s)": 0.163076 }, { "epoch": 0.49125925733693626, "grad_norm": 0.8050549030303955, "learning_rate": 8.989681101791993e-05, "loss": 0.8933136940002442, "memory(GiB)": 91.52, "step": 37860, "token_acc": 0.7711453956442249, "train_speed(iter/s)": 0.16307 }, { "epoch": 0.49132413573859196, "grad_norm": 0.8043829202651978, "learning_rate": 8.989357783013015e-05, "loss": 0.9178472518920898, "memory(GiB)": 91.52, "step": 37865, "token_acc": 0.7618503080157881, "train_speed(iter/s)": 0.163064 }, { "epoch": 0.49138901414024766, "grad_norm": 0.769023060798645, "learning_rate": 8.989034418324764e-05, "loss": 0.9381951332092285, "memory(GiB)": 91.52, "step": 37870, "token_acc": 0.7327959270258804, "train_speed(iter/s)": 0.163057 }, { "epoch": 0.49145389254190336, "grad_norm": 0.7828167080879211, "learning_rate": 8.988711007730961e-05, "loss": 0.9726734161376953, "memory(GiB)": 91.52, "step": 37875, "token_acc": 0.7546356188012074, "train_speed(iter/s)": 0.163049 }, { "epoch": 0.49151877094355906, "grad_norm": 0.7933485507965088, "learning_rate": 8.988387551235328e-05, "loss": 0.9096244812011719, "memory(GiB)": 91.52, "step": 37880, "token_acc": 0.7631839281637622, "train_speed(iter/s)": 0.163042 }, { "epoch": 0.49158364934521476, "grad_norm": 0.8383026719093323, "learning_rate": 8.98806404884159e-05, "loss": 0.9951839447021484, "memory(GiB)": 91.52, "step": 37885, "token_acc": 0.7522809351834252, "train_speed(iter/s)": 0.163036 }, { "epoch": 0.4916485277468704, "grad_norm": 0.7423493266105652, "learning_rate": 8.987740500553465e-05, "loss": 0.8954963684082031, "memory(GiB)": 91.52, "step": 37890, "token_acc": 0.7689115646258503, "train_speed(iter/s)": 0.163029 }, { "epoch": 0.4917134061485261, "grad_norm": 0.8068279027938843, "learning_rate": 8.987416906374682e-05, "loss": 0.9594872474670411, "memory(GiB)": 91.52, "step": 37895, "token_acc": 0.7268498779022443, "train_speed(iter/s)": 0.163023 }, { "epoch": 0.4917782845501818, "grad_norm": 0.8430220484733582, "learning_rate": 8.987093266308959e-05, "loss": 0.8887638092041016, "memory(GiB)": 91.52, "step": 37900, "token_acc": 0.7527688460164345, "train_speed(iter/s)": 0.163017 }, { "epoch": 0.4918431629518375, "grad_norm": 0.7820587754249573, "learning_rate": 8.986769580360024e-05, "loss": 0.9559442520141601, "memory(GiB)": 91.52, "step": 37905, "token_acc": 0.752503085996434, "train_speed(iter/s)": 0.16301 }, { "epoch": 0.4919080413534932, "grad_norm": 0.6694411635398865, "learning_rate": 8.986445848531602e-05, "loss": 0.8507916450500488, "memory(GiB)": 91.52, "step": 37910, "token_acc": 0.7927852348993288, "train_speed(iter/s)": 0.163001 }, { "epoch": 0.4919729197551489, "grad_norm": 0.7697842717170715, "learning_rate": 8.986122070827417e-05, "loss": 0.9758479118347168, "memory(GiB)": 91.52, "step": 37915, "token_acc": 0.7440992500950341, "train_speed(iter/s)": 0.162993 }, { "epoch": 0.4920377981568046, "grad_norm": 0.7323951125144958, "learning_rate": 8.985798247251196e-05, "loss": 0.9495230674743652, "memory(GiB)": 91.52, "step": 37920, "token_acc": 0.7484415630677164, "train_speed(iter/s)": 0.162986 }, { "epoch": 0.4921026765584603, "grad_norm": 0.8351104855537415, "learning_rate": 8.985474377806665e-05, "loss": 0.9050642967224121, "memory(GiB)": 91.52, "step": 37925, "token_acc": 0.7633594706068985, "train_speed(iter/s)": 0.162979 }, { "epoch": 0.492167554960116, "grad_norm": 0.8499119281768799, "learning_rate": 8.985150462497552e-05, "loss": 0.9165524482727051, "memory(GiB)": 91.52, "step": 37930, "token_acc": 0.7639141261188506, "train_speed(iter/s)": 0.162972 }, { "epoch": 0.4922324333617717, "grad_norm": 0.7917971611022949, "learning_rate": 8.984826501327583e-05, "loss": 0.9202252388000488, "memory(GiB)": 91.52, "step": 37935, "token_acc": 0.7592195957786518, "train_speed(iter/s)": 0.162966 }, { "epoch": 0.4922973117634274, "grad_norm": 0.7993192672729492, "learning_rate": 8.984502494300489e-05, "loss": 0.9543083190917969, "memory(GiB)": 91.52, "step": 37940, "token_acc": 0.7391175269974394, "train_speed(iter/s)": 0.162959 }, { "epoch": 0.4923621901650831, "grad_norm": 0.8324752449989319, "learning_rate": 8.984178441419995e-05, "loss": 0.9446172714233398, "memory(GiB)": 91.52, "step": 37945, "token_acc": 0.7328097857556456, "train_speed(iter/s)": 0.162952 }, { "epoch": 0.4924270685667388, "grad_norm": 0.839961051940918, "learning_rate": 8.983854342689832e-05, "loss": 0.963987159729004, "memory(GiB)": 91.52, "step": 37950, "token_acc": 0.7430645826067658, "train_speed(iter/s)": 0.162947 }, { "epoch": 0.4924919469683945, "grad_norm": 0.9531944990158081, "learning_rate": 8.98353019811373e-05, "loss": 0.88231201171875, "memory(GiB)": 91.52, "step": 37955, "token_acc": 0.7657604798226743, "train_speed(iter/s)": 0.162939 }, { "epoch": 0.4925568253700502, "grad_norm": 0.7496179938316345, "learning_rate": 8.983206007695418e-05, "loss": 0.9192880630493164, "memory(GiB)": 91.52, "step": 37960, "token_acc": 0.7805127767586707, "train_speed(iter/s)": 0.162933 }, { "epoch": 0.4926217037717059, "grad_norm": 0.7065154910087585, "learning_rate": 8.98288177143863e-05, "loss": 0.9284145355224609, "memory(GiB)": 91.52, "step": 37965, "token_acc": 0.7583571789261068, "train_speed(iter/s)": 0.162925 }, { "epoch": 0.4926865821733616, "grad_norm": 1.1572115421295166, "learning_rate": 8.982557489347092e-05, "loss": 0.9098936080932617, "memory(GiB)": 91.52, "step": 37970, "token_acc": 0.7295941053166487, "train_speed(iter/s)": 0.162918 }, { "epoch": 0.4927514605750173, "grad_norm": 0.8228792548179626, "learning_rate": 8.98223316142454e-05, "loss": 0.9403656005859375, "memory(GiB)": 91.52, "step": 37975, "token_acc": 0.7547363406500788, "train_speed(iter/s)": 0.162912 }, { "epoch": 0.492816338976673, "grad_norm": 0.8022385835647583, "learning_rate": 8.981908787674706e-05, "loss": 0.9327915191650391, "memory(GiB)": 91.52, "step": 37980, "token_acc": 0.7569155058141842, "train_speed(iter/s)": 0.162905 }, { "epoch": 0.4928812173783287, "grad_norm": 0.8555459380149841, "learning_rate": 8.981584368101321e-05, "loss": 0.965859317779541, "memory(GiB)": 91.52, "step": 37985, "token_acc": 0.735892539672088, "train_speed(iter/s)": 0.162897 }, { "epoch": 0.4929460957799844, "grad_norm": 0.7604363560676575, "learning_rate": 8.98125990270812e-05, "loss": 0.9334288597106933, "memory(GiB)": 91.52, "step": 37990, "token_acc": 0.7496311252620952, "train_speed(iter/s)": 0.162888 }, { "epoch": 0.4930109741816401, "grad_norm": 0.8141499757766724, "learning_rate": 8.980935391498835e-05, "loss": 0.9057876586914062, "memory(GiB)": 91.52, "step": 37995, "token_acc": 0.73349895061464, "train_speed(iter/s)": 0.162882 }, { "epoch": 0.4930758525832958, "grad_norm": 0.7869868278503418, "learning_rate": 8.980610834477202e-05, "loss": 0.905804443359375, "memory(GiB)": 91.52, "step": 38000, "token_acc": 0.7499479405272583, "train_speed(iter/s)": 0.162875 }, { "epoch": 0.4931407309849515, "grad_norm": 0.7064185738563538, "learning_rate": 8.980286231646956e-05, "loss": 0.9158254623413086, "memory(GiB)": 91.52, "step": 38005, "token_acc": 0.7693080788558175, "train_speed(iter/s)": 0.162867 }, { "epoch": 0.4932056093866071, "grad_norm": 0.7358172535896301, "learning_rate": 8.979961583011832e-05, "loss": 0.9041772842407226, "memory(GiB)": 91.52, "step": 38010, "token_acc": 0.7597284907068481, "train_speed(iter/s)": 0.162861 }, { "epoch": 0.4932704877882628, "grad_norm": 0.7331957817077637, "learning_rate": 8.979636888575566e-05, "loss": 0.9137981414794922, "memory(GiB)": 91.52, "step": 38015, "token_acc": 0.7716386320597082, "train_speed(iter/s)": 0.162854 }, { "epoch": 0.4933353661899185, "grad_norm": 0.7984059453010559, "learning_rate": 8.979312148341896e-05, "loss": 0.9351558685302734, "memory(GiB)": 91.52, "step": 38020, "token_acc": 0.7328156362080879, "train_speed(iter/s)": 0.162846 }, { "epoch": 0.4934002445915742, "grad_norm": 0.751941978931427, "learning_rate": 8.978987362314558e-05, "loss": 0.8823101997375489, "memory(GiB)": 91.52, "step": 38025, "token_acc": 0.7525024097278861, "train_speed(iter/s)": 0.162839 }, { "epoch": 0.4934651229932299, "grad_norm": 0.7517489194869995, "learning_rate": 8.978662530497288e-05, "loss": 0.8791802406311036, "memory(GiB)": 91.52, "step": 38030, "token_acc": 0.7640067382081358, "train_speed(iter/s)": 0.16283 }, { "epoch": 0.4935300013948856, "grad_norm": 0.7381437420845032, "learning_rate": 8.978337652893827e-05, "loss": 0.8466215133666992, "memory(GiB)": 91.52, "step": 38035, "token_acc": 0.7641053989312696, "train_speed(iter/s)": 0.162822 }, { "epoch": 0.4935948797965413, "grad_norm": 0.7981733083724976, "learning_rate": 8.978012729507912e-05, "loss": 0.9385726928710938, "memory(GiB)": 91.52, "step": 38040, "token_acc": 0.753218639626296, "train_speed(iter/s)": 0.162814 }, { "epoch": 0.493659758198197, "grad_norm": 0.8084747195243835, "learning_rate": 8.977687760343283e-05, "loss": 0.9442097663879394, "memory(GiB)": 91.52, "step": 38045, "token_acc": 0.7542606187729418, "train_speed(iter/s)": 0.162807 }, { "epoch": 0.4937246365998527, "grad_norm": 0.6759606003761292, "learning_rate": 8.977362745403676e-05, "loss": 0.9144453048706055, "memory(GiB)": 91.52, "step": 38050, "token_acc": 0.7389815568212639, "train_speed(iter/s)": 0.1628 }, { "epoch": 0.4937895150015084, "grad_norm": 0.8009119629859924, "learning_rate": 8.97703768469284e-05, "loss": 0.9284849166870117, "memory(GiB)": 91.52, "step": 38055, "token_acc": 0.7590853791683564, "train_speed(iter/s)": 0.162793 }, { "epoch": 0.4938543934031641, "grad_norm": 0.8547624945640564, "learning_rate": 8.976712578214507e-05, "loss": 0.9488515853881836, "memory(GiB)": 91.52, "step": 38060, "token_acc": 0.7459820324734197, "train_speed(iter/s)": 0.162785 }, { "epoch": 0.4939192718048198, "grad_norm": 0.7411747574806213, "learning_rate": 8.976387425972421e-05, "loss": 0.8953954696655273, "memory(GiB)": 91.52, "step": 38065, "token_acc": 0.7514363230130865, "train_speed(iter/s)": 0.162777 }, { "epoch": 0.4939841502064755, "grad_norm": 0.7731838226318359, "learning_rate": 8.976062227970325e-05, "loss": 0.9282951354980469, "memory(GiB)": 91.52, "step": 38070, "token_acc": 0.7516218721037998, "train_speed(iter/s)": 0.162769 }, { "epoch": 0.4940490286081312, "grad_norm": 0.8121098279953003, "learning_rate": 8.975736984211962e-05, "loss": 0.9315146446228028, "memory(GiB)": 91.52, "step": 38075, "token_acc": 0.7643751597239969, "train_speed(iter/s)": 0.162762 }, { "epoch": 0.4941139070097869, "grad_norm": 0.9095163941383362, "learning_rate": 8.975411694701072e-05, "loss": 0.924864387512207, "memory(GiB)": 91.52, "step": 38080, "token_acc": 0.7788733342307175, "train_speed(iter/s)": 0.162755 }, { "epoch": 0.4941787854114426, "grad_norm": 0.8333474397659302, "learning_rate": 8.975086359441401e-05, "loss": 0.908847427368164, "memory(GiB)": 91.52, "step": 38085, "token_acc": 0.7591030721810114, "train_speed(iter/s)": 0.162749 }, { "epoch": 0.4942436638130983, "grad_norm": 0.7530180811882019, "learning_rate": 8.974760978436693e-05, "loss": 0.8139007568359375, "memory(GiB)": 91.52, "step": 38090, "token_acc": 0.7765688346789563, "train_speed(iter/s)": 0.16274 }, { "epoch": 0.494308542214754, "grad_norm": 0.7811079025268555, "learning_rate": 8.97443555169069e-05, "loss": 0.9652297973632813, "memory(GiB)": 91.52, "step": 38095, "token_acc": 0.7491207064281973, "train_speed(iter/s)": 0.162733 }, { "epoch": 0.4943734206164097, "grad_norm": 0.7146812081336975, "learning_rate": 8.974110079207139e-05, "loss": 0.9186616897583008, "memory(GiB)": 91.52, "step": 38100, "token_acc": 0.7649615098087906, "train_speed(iter/s)": 0.162726 }, { "epoch": 0.4944382990180654, "grad_norm": 0.7933741211891174, "learning_rate": 8.973784560989786e-05, "loss": 0.9163493156433106, "memory(GiB)": 91.52, "step": 38105, "token_acc": 0.772115632243111, "train_speed(iter/s)": 0.162719 }, { "epoch": 0.4945031774197211, "grad_norm": 0.7518889904022217, "learning_rate": 8.973458997042374e-05, "loss": 0.933747673034668, "memory(GiB)": 91.52, "step": 38110, "token_acc": 0.7514999647067128, "train_speed(iter/s)": 0.162711 }, { "epoch": 0.4945680558213768, "grad_norm": 0.8102200031280518, "learning_rate": 8.973133387368654e-05, "loss": 0.9153066635131836, "memory(GiB)": 91.52, "step": 38115, "token_acc": 0.7621749228953281, "train_speed(iter/s)": 0.162705 }, { "epoch": 0.4946329342230325, "grad_norm": 0.7251395583152771, "learning_rate": 8.972807731972368e-05, "loss": 0.9499471664428711, "memory(GiB)": 91.52, "step": 38120, "token_acc": 0.7550349033138088, "train_speed(iter/s)": 0.162698 }, { "epoch": 0.4946978126246882, "grad_norm": 0.6817013025283813, "learning_rate": 8.972482030857267e-05, "loss": 0.924644660949707, "memory(GiB)": 91.52, "step": 38125, "token_acc": 0.7487547781767636, "train_speed(iter/s)": 0.162691 }, { "epoch": 0.49476269102634385, "grad_norm": 0.7343141436576843, "learning_rate": 8.972156284027099e-05, "loss": 0.9444962501525879, "memory(GiB)": 91.52, "step": 38130, "token_acc": 0.7264392744479495, "train_speed(iter/s)": 0.162683 }, { "epoch": 0.49482756942799955, "grad_norm": 0.8356541395187378, "learning_rate": 8.971830491485612e-05, "loss": 0.876902961730957, "memory(GiB)": 91.52, "step": 38135, "token_acc": 0.7620907057148668, "train_speed(iter/s)": 0.162677 }, { "epoch": 0.49489244782965525, "grad_norm": 0.7294390201568604, "learning_rate": 8.971504653236556e-05, "loss": 0.9691146850585938, "memory(GiB)": 91.52, "step": 38140, "token_acc": 0.7570303712035995, "train_speed(iter/s)": 0.16267 }, { "epoch": 0.49495732623131095, "grad_norm": 0.74998539686203, "learning_rate": 8.971178769283679e-05, "loss": 0.9508522987365723, "memory(GiB)": 91.52, "step": 38145, "token_acc": 0.7554874616972047, "train_speed(iter/s)": 0.162663 }, { "epoch": 0.49502220463296664, "grad_norm": 0.7242427468299866, "learning_rate": 8.970852839630733e-05, "loss": 0.9109586715698242, "memory(GiB)": 91.52, "step": 38150, "token_acc": 0.7457614759178911, "train_speed(iter/s)": 0.162657 }, { "epoch": 0.49508708303462234, "grad_norm": 0.8129569888114929, "learning_rate": 8.970526864281468e-05, "loss": 0.9471122741699218, "memory(GiB)": 91.52, "step": 38155, "token_acc": 0.7535138715137243, "train_speed(iter/s)": 0.162652 }, { "epoch": 0.49515196143627804, "grad_norm": 0.7669270634651184, "learning_rate": 8.970200843239634e-05, "loss": 0.9186735153198242, "memory(GiB)": 91.52, "step": 38160, "token_acc": 0.7532565130260521, "train_speed(iter/s)": 0.162645 }, { "epoch": 0.49521683983793374, "grad_norm": 0.8953303098678589, "learning_rate": 8.969874776508986e-05, "loss": 0.9715174674987793, "memory(GiB)": 91.52, "step": 38165, "token_acc": 0.7636256946601774, "train_speed(iter/s)": 0.162637 }, { "epoch": 0.49528171823958944, "grad_norm": 0.7388505935668945, "learning_rate": 8.969548664093275e-05, "loss": 0.9161632537841797, "memory(GiB)": 91.52, "step": 38170, "token_acc": 0.7576943383334886, "train_speed(iter/s)": 0.162632 }, { "epoch": 0.49534659664124514, "grad_norm": 0.724881649017334, "learning_rate": 8.969222505996251e-05, "loss": 0.8924630165100098, "memory(GiB)": 91.52, "step": 38175, "token_acc": 0.7531016904681584, "train_speed(iter/s)": 0.162624 }, { "epoch": 0.49541147504290084, "grad_norm": 0.81378173828125, "learning_rate": 8.968896302221673e-05, "loss": 0.9487400054931641, "memory(GiB)": 91.52, "step": 38180, "token_acc": 0.7485481807145904, "train_speed(iter/s)": 0.162617 }, { "epoch": 0.49547635344455654, "grad_norm": 0.7343239188194275, "learning_rate": 8.968570052773289e-05, "loss": 0.9217014312744141, "memory(GiB)": 91.52, "step": 38185, "token_acc": 0.7615481499131974, "train_speed(iter/s)": 0.162609 }, { "epoch": 0.49554123184621224, "grad_norm": 0.795199990272522, "learning_rate": 8.968243757654858e-05, "loss": 0.952881908416748, "memory(GiB)": 91.52, "step": 38190, "token_acc": 0.7490760552421708, "train_speed(iter/s)": 0.162603 }, { "epoch": 0.49560611024786794, "grad_norm": 0.8042293190956116, "learning_rate": 8.967917416870132e-05, "loss": 0.9341022491455078, "memory(GiB)": 91.52, "step": 38195, "token_acc": 0.7394294536650409, "train_speed(iter/s)": 0.162595 }, { "epoch": 0.49567098864952364, "grad_norm": 0.7650213241577148, "learning_rate": 8.967591030422868e-05, "loss": 0.9135220527648926, "memory(GiB)": 91.52, "step": 38200, "token_acc": 0.7426678253313057, "train_speed(iter/s)": 0.162588 }, { "epoch": 0.49573586705117934, "grad_norm": 0.7239030003547668, "learning_rate": 8.967264598316822e-05, "loss": 0.8880016326904296, "memory(GiB)": 91.52, "step": 38205, "token_acc": 0.7630633303662677, "train_speed(iter/s)": 0.162581 }, { "epoch": 0.49580074545283503, "grad_norm": 0.7701918482780457, "learning_rate": 8.96693812055575e-05, "loss": 0.9094478607177734, "memory(GiB)": 91.52, "step": 38210, "token_acc": 0.7566027964785086, "train_speed(iter/s)": 0.162574 }, { "epoch": 0.49586562385449073, "grad_norm": 0.7830193042755127, "learning_rate": 8.966611597143411e-05, "loss": 0.9790042877197266, "memory(GiB)": 91.52, "step": 38215, "token_acc": 0.7296091685825069, "train_speed(iter/s)": 0.162567 }, { "epoch": 0.49593050225614643, "grad_norm": 0.7167066335678101, "learning_rate": 8.966285028083559e-05, "loss": 1.0015621185302734, "memory(GiB)": 91.52, "step": 38220, "token_acc": 0.7415799646337953, "train_speed(iter/s)": 0.16256 }, { "epoch": 0.49599538065780213, "grad_norm": 0.6819595694541931, "learning_rate": 8.965958413379956e-05, "loss": 0.8927276611328125, "memory(GiB)": 91.52, "step": 38225, "token_acc": 0.7745172940686207, "train_speed(iter/s)": 0.162554 }, { "epoch": 0.49606025905945783, "grad_norm": 0.7569393515586853, "learning_rate": 8.965631753036356e-05, "loss": 0.9294883728027343, "memory(GiB)": 91.52, "step": 38230, "token_acc": 0.7436894628705606, "train_speed(iter/s)": 0.162549 }, { "epoch": 0.49612513746111353, "grad_norm": 0.7499381303787231, "learning_rate": 8.965305047056524e-05, "loss": 0.9751602172851562, "memory(GiB)": 91.52, "step": 38235, "token_acc": 0.7336062078963077, "train_speed(iter/s)": 0.162542 }, { "epoch": 0.49619001586276923, "grad_norm": 0.8192142248153687, "learning_rate": 8.964978295444215e-05, "loss": 0.8971396446228027, "memory(GiB)": 91.52, "step": 38240, "token_acc": 0.767472240365774, "train_speed(iter/s)": 0.162535 }, { "epoch": 0.49625489426442493, "grad_norm": 0.7603088021278381, "learning_rate": 8.964651498203191e-05, "loss": 0.9187599182128906, "memory(GiB)": 91.52, "step": 38245, "token_acc": 0.7409410317023413, "train_speed(iter/s)": 0.162527 }, { "epoch": 0.49631977266608057, "grad_norm": 0.6659363508224487, "learning_rate": 8.964324655337216e-05, "loss": 0.9009744644165039, "memory(GiB)": 91.52, "step": 38250, "token_acc": 0.7524639337237538, "train_speed(iter/s)": 0.162519 }, { "epoch": 0.49638465106773627, "grad_norm": 0.7668530344963074, "learning_rate": 8.963997766850042e-05, "loss": 0.9397212982177734, "memory(GiB)": 91.52, "step": 38255, "token_acc": 0.7503897402712004, "train_speed(iter/s)": 0.162512 }, { "epoch": 0.49644952946939197, "grad_norm": 0.7270321846008301, "learning_rate": 8.963670832745441e-05, "loss": 0.9124156951904296, "memory(GiB)": 91.52, "step": 38260, "token_acc": 0.7635466231091542, "train_speed(iter/s)": 0.162506 }, { "epoch": 0.49651440787104767, "grad_norm": 0.7524052858352661, "learning_rate": 8.963343853027171e-05, "loss": 0.9082839965820313, "memory(GiB)": 91.52, "step": 38265, "token_acc": 0.7465848468962966, "train_speed(iter/s)": 0.162498 }, { "epoch": 0.49657928627270337, "grad_norm": 0.7989341616630554, "learning_rate": 8.963016827698995e-05, "loss": 0.9459156036376953, "memory(GiB)": 91.52, "step": 38270, "token_acc": 0.7471577261809448, "train_speed(iter/s)": 0.162491 }, { "epoch": 0.49664416467435907, "grad_norm": 0.7601532340049744, "learning_rate": 8.962689756764675e-05, "loss": 0.9170625686645508, "memory(GiB)": 91.52, "step": 38275, "token_acc": 0.7429400602745843, "train_speed(iter/s)": 0.162484 }, { "epoch": 0.49670904307601477, "grad_norm": 0.6580492854118347, "learning_rate": 8.962362640227977e-05, "loss": 0.8819729804992675, "memory(GiB)": 91.52, "step": 38280, "token_acc": 0.7498424452497243, "train_speed(iter/s)": 0.162479 }, { "epoch": 0.49677392147767047, "grad_norm": 0.8556050062179565, "learning_rate": 8.962035478092665e-05, "loss": 0.9336820602416992, "memory(GiB)": 91.52, "step": 38285, "token_acc": 0.7386403508771929, "train_speed(iter/s)": 0.162474 }, { "epoch": 0.49683879987932617, "grad_norm": 0.7163655757904053, "learning_rate": 8.961708270362503e-05, "loss": 0.9041786193847656, "memory(GiB)": 91.52, "step": 38290, "token_acc": 0.7520300196850394, "train_speed(iter/s)": 0.162465 }, { "epoch": 0.49690367828098186, "grad_norm": 0.7169774174690247, "learning_rate": 8.961381017041258e-05, "loss": 0.9077013015747071, "memory(GiB)": 91.52, "step": 38295, "token_acc": 0.7589971834903856, "train_speed(iter/s)": 0.162455 }, { "epoch": 0.49696855668263756, "grad_norm": 0.8606668710708618, "learning_rate": 8.961053718132694e-05, "loss": 0.9540736198425293, "memory(GiB)": 91.52, "step": 38300, "token_acc": 0.7636311009804986, "train_speed(iter/s)": 0.162448 }, { "epoch": 0.49703343508429326, "grad_norm": 0.812394917011261, "learning_rate": 8.96072637364058e-05, "loss": 0.9725263595581055, "memory(GiB)": 91.52, "step": 38305, "token_acc": 0.7489534188318161, "train_speed(iter/s)": 0.162441 }, { "epoch": 0.49709831348594896, "grad_norm": 0.7709884643554688, "learning_rate": 8.960398983568682e-05, "loss": 0.9488561630249024, "memory(GiB)": 91.52, "step": 38310, "token_acc": 0.7458831491089556, "train_speed(iter/s)": 0.162434 }, { "epoch": 0.49716319188760466, "grad_norm": 0.7259505391120911, "learning_rate": 8.960071547920767e-05, "loss": 0.9413969993591309, "memory(GiB)": 91.52, "step": 38315, "token_acc": 0.7561918979439506, "train_speed(iter/s)": 0.162426 }, { "epoch": 0.49722807028926036, "grad_norm": 0.7695273756980896, "learning_rate": 8.959744066700602e-05, "loss": 0.949307632446289, "memory(GiB)": 91.52, "step": 38320, "token_acc": 0.7392239023501549, "train_speed(iter/s)": 0.16242 }, { "epoch": 0.49729294869091606, "grad_norm": 0.8834912776947021, "learning_rate": 8.95941653991196e-05, "loss": 0.9690875053405762, "memory(GiB)": 91.52, "step": 38325, "token_acc": 0.7322419250536061, "train_speed(iter/s)": 0.162414 }, { "epoch": 0.49735782709257176, "grad_norm": 0.7472430467605591, "learning_rate": 8.959088967558605e-05, "loss": 0.8963385581970215, "memory(GiB)": 91.52, "step": 38330, "token_acc": 0.7638716599054548, "train_speed(iter/s)": 0.162406 }, { "epoch": 0.49742270549422746, "grad_norm": 0.8463080525398254, "learning_rate": 8.95876134964431e-05, "loss": 0.8837522506713867, "memory(GiB)": 91.52, "step": 38335, "token_acc": 0.7739064398541919, "train_speed(iter/s)": 0.162398 }, { "epoch": 0.49748758389588316, "grad_norm": 0.704210102558136, "learning_rate": 8.958433686172844e-05, "loss": 0.8932065963745117, "memory(GiB)": 91.52, "step": 38340, "token_acc": 0.7676095192555266, "train_speed(iter/s)": 0.16239 }, { "epoch": 0.49755246229753886, "grad_norm": 0.7804048657417297, "learning_rate": 8.958105977147977e-05, "loss": 0.9091009140014649, "memory(GiB)": 91.52, "step": 38345, "token_acc": 0.7515928398058253, "train_speed(iter/s)": 0.162384 }, { "epoch": 0.49761734069919455, "grad_norm": 0.745712161064148, "learning_rate": 8.957778222573483e-05, "loss": 0.9381274223327637, "memory(GiB)": 91.52, "step": 38350, "token_acc": 0.7422074320576817, "train_speed(iter/s)": 0.162376 }, { "epoch": 0.49768221910085025, "grad_norm": 0.7461665868759155, "learning_rate": 8.95745042245313e-05, "loss": 0.9130268096923828, "memory(GiB)": 91.52, "step": 38355, "token_acc": 0.7569558101472995, "train_speed(iter/s)": 0.16237 }, { "epoch": 0.49774709750250595, "grad_norm": 0.7622154355049133, "learning_rate": 8.957122576790693e-05, "loss": 0.8966231346130371, "memory(GiB)": 91.52, "step": 38360, "token_acc": 0.7563626919937672, "train_speed(iter/s)": 0.162364 }, { "epoch": 0.49781197590416165, "grad_norm": 0.7724810242652893, "learning_rate": 8.956794685589944e-05, "loss": 0.866365623474121, "memory(GiB)": 91.52, "step": 38365, "token_acc": 0.7516845444931258, "train_speed(iter/s)": 0.162357 }, { "epoch": 0.4978768543058173, "grad_norm": 0.7758313417434692, "learning_rate": 8.956466748854657e-05, "loss": 0.9071897506713867, "memory(GiB)": 91.52, "step": 38370, "token_acc": 0.7482136328890957, "train_speed(iter/s)": 0.16235 }, { "epoch": 0.497941732707473, "grad_norm": 0.7367122173309326, "learning_rate": 8.956138766588604e-05, "loss": 0.9545245170593262, "memory(GiB)": 91.52, "step": 38375, "token_acc": 0.7373132034881488, "train_speed(iter/s)": 0.162342 }, { "epoch": 0.4980066111091287, "grad_norm": 0.7012284398078918, "learning_rate": 8.955810738795562e-05, "loss": 0.9511062622070312, "memory(GiB)": 91.52, "step": 38380, "token_acc": 0.7380187129490751, "train_speed(iter/s)": 0.162336 }, { "epoch": 0.4980714895107844, "grad_norm": 0.7944450974464417, "learning_rate": 8.955482665479304e-05, "loss": 0.9439920425415039, "memory(GiB)": 91.52, "step": 38385, "token_acc": 0.739005903105243, "train_speed(iter/s)": 0.162328 }, { "epoch": 0.4981363679124401, "grad_norm": 0.7610809206962585, "learning_rate": 8.955154546643607e-05, "loss": 0.9465405464172363, "memory(GiB)": 91.52, "step": 38390, "token_acc": 0.7462635611865565, "train_speed(iter/s)": 0.16232 }, { "epoch": 0.4982012463140958, "grad_norm": 0.7731984257698059, "learning_rate": 8.954826382292246e-05, "loss": 0.9217726707458496, "memory(GiB)": 91.52, "step": 38395, "token_acc": 0.7642048815109814, "train_speed(iter/s)": 0.162314 }, { "epoch": 0.4982661247157515, "grad_norm": 0.8133115172386169, "learning_rate": 8.954498172428995e-05, "loss": 0.9153788566589356, "memory(GiB)": 91.52, "step": 38400, "token_acc": 0.7588474490120084, "train_speed(iter/s)": 0.162306 }, { "epoch": 0.4983310031174072, "grad_norm": 0.7781312465667725, "learning_rate": 8.954169917057635e-05, "loss": 0.9651771545410156, "memory(GiB)": 91.52, "step": 38405, "token_acc": 0.7358071446779069, "train_speed(iter/s)": 0.162299 }, { "epoch": 0.4983958815190629, "grad_norm": 0.8760058283805847, "learning_rate": 8.953841616181944e-05, "loss": 0.9956119537353516, "memory(GiB)": 91.52, "step": 38410, "token_acc": 0.7565696022727273, "train_speed(iter/s)": 0.162294 }, { "epoch": 0.4984607599207186, "grad_norm": 0.7996300458908081, "learning_rate": 8.953513269805696e-05, "loss": 0.8886985778808594, "memory(GiB)": 91.52, "step": 38415, "token_acc": 0.7702861782017455, "train_speed(iter/s)": 0.162287 }, { "epoch": 0.4985256383223743, "grad_norm": 0.7387984991073608, "learning_rate": 8.953184877932672e-05, "loss": 0.953807258605957, "memory(GiB)": 91.52, "step": 38420, "token_acc": 0.7437608821822402, "train_speed(iter/s)": 0.16228 }, { "epoch": 0.49859051672403, "grad_norm": 0.7570274472236633, "learning_rate": 8.952856440566652e-05, "loss": 0.907321834564209, "memory(GiB)": 91.52, "step": 38425, "token_acc": 0.7738163352882277, "train_speed(iter/s)": 0.162273 }, { "epoch": 0.4986553951256857, "grad_norm": 0.7839585542678833, "learning_rate": 8.952527957711414e-05, "loss": 0.9145069122314453, "memory(GiB)": 91.52, "step": 38430, "token_acc": 0.7744209821271195, "train_speed(iter/s)": 0.162266 }, { "epoch": 0.4987202735273414, "grad_norm": 0.8053359985351562, "learning_rate": 8.952199429370738e-05, "loss": 0.9315900802612305, "memory(GiB)": 91.52, "step": 38435, "token_acc": 0.750756099200215, "train_speed(iter/s)": 0.162259 }, { "epoch": 0.4987851519289971, "grad_norm": 0.865111768245697, "learning_rate": 8.951870855548405e-05, "loss": 0.9624740600585937, "memory(GiB)": 91.52, "step": 38440, "token_acc": 0.7350551040854948, "train_speed(iter/s)": 0.162253 }, { "epoch": 0.4988500303306528, "grad_norm": 0.7528988718986511, "learning_rate": 8.951542236248197e-05, "loss": 0.8976270675659179, "memory(GiB)": 91.52, "step": 38445, "token_acc": 0.7583860759493671, "train_speed(iter/s)": 0.162246 }, { "epoch": 0.4989149087323085, "grad_norm": 0.8536551594734192, "learning_rate": 8.951213571473896e-05, "loss": 0.9658552169799804, "memory(GiB)": 91.52, "step": 38450, "token_acc": 0.7434288804759329, "train_speed(iter/s)": 0.162239 }, { "epoch": 0.4989797871339642, "grad_norm": 0.740136444568634, "learning_rate": 8.950884861229283e-05, "loss": 0.8971921920776367, "memory(GiB)": 91.52, "step": 38455, "token_acc": 0.752109280835677, "train_speed(iter/s)": 0.162232 }, { "epoch": 0.4990446655356199, "grad_norm": 0.7954616546630859, "learning_rate": 8.95055610551814e-05, "loss": 0.9507749557495118, "memory(GiB)": 91.52, "step": 38460, "token_acc": 0.7535674724701896, "train_speed(iter/s)": 0.162226 }, { "epoch": 0.4991095439372756, "grad_norm": 0.6710954904556274, "learning_rate": 8.950227304344254e-05, "loss": 0.8986165046691894, "memory(GiB)": 91.52, "step": 38465, "token_acc": 0.7497066875244427, "train_speed(iter/s)": 0.162218 }, { "epoch": 0.4991744223389313, "grad_norm": 0.7769730687141418, "learning_rate": 8.949898457711405e-05, "loss": 0.9177720069885253, "memory(GiB)": 91.52, "step": 38470, "token_acc": 0.7420683441662913, "train_speed(iter/s)": 0.16221 }, { "epoch": 0.499239300740587, "grad_norm": 0.7125351428985596, "learning_rate": 8.949569565623379e-05, "loss": 0.918964958190918, "memory(GiB)": 91.52, "step": 38475, "token_acc": 0.7557445125568519, "train_speed(iter/s)": 0.162203 }, { "epoch": 0.4993041791422427, "grad_norm": 0.7309346795082092, "learning_rate": 8.949240628083961e-05, "loss": 0.9140323638916016, "memory(GiB)": 91.52, "step": 38480, "token_acc": 0.7421864429711812, "train_speed(iter/s)": 0.162195 }, { "epoch": 0.4993690575438984, "grad_norm": 0.8934606313705444, "learning_rate": 8.948911645096936e-05, "loss": 0.9673378944396973, "memory(GiB)": 91.52, "step": 38485, "token_acc": 0.7624817567796898, "train_speed(iter/s)": 0.162189 }, { "epoch": 0.499433935945554, "grad_norm": 0.7413938045501709, "learning_rate": 8.948582616666088e-05, "loss": 0.9276391983032226, "memory(GiB)": 91.52, "step": 38490, "token_acc": 0.73634149552396, "train_speed(iter/s)": 0.162183 }, { "epoch": 0.4994988143472097, "grad_norm": 0.7457877397537231, "learning_rate": 8.948253542795208e-05, "loss": 0.935833740234375, "memory(GiB)": 91.52, "step": 38495, "token_acc": 0.7350173897126121, "train_speed(iter/s)": 0.162177 }, { "epoch": 0.4995636927488654, "grad_norm": 0.7203036546707153, "learning_rate": 8.947924423488079e-05, "loss": 0.8926478385925293, "memory(GiB)": 91.52, "step": 38500, "token_acc": 0.7544372520359156, "train_speed(iter/s)": 0.162169 }, { "epoch": 0.4996285711505211, "grad_norm": 0.7709269523620605, "learning_rate": 8.947595258748489e-05, "loss": 0.9225532531738281, "memory(GiB)": 91.52, "step": 38505, "token_acc": 0.7699494239955044, "train_speed(iter/s)": 0.162162 }, { "epoch": 0.4996934495521768, "grad_norm": 0.7870715260505676, "learning_rate": 8.94726604858023e-05, "loss": 0.9262210845947265, "memory(GiB)": 91.52, "step": 38510, "token_acc": 0.749402973576766, "train_speed(iter/s)": 0.162156 }, { "epoch": 0.4997583279538325, "grad_norm": 0.7348527908325195, "learning_rate": 8.946936792987084e-05, "loss": 0.9136337280273438, "memory(GiB)": 91.52, "step": 38515, "token_acc": 0.7637976569377667, "train_speed(iter/s)": 0.162147 }, { "epoch": 0.4998232063554882, "grad_norm": 0.7954690456390381, "learning_rate": 8.946607491972845e-05, "loss": 0.9117199897766113, "memory(GiB)": 91.52, "step": 38520, "token_acc": 0.7691494609259389, "train_speed(iter/s)": 0.16214 }, { "epoch": 0.4998880847571439, "grad_norm": 0.8576841950416565, "learning_rate": 8.9462781455413e-05, "loss": 0.8866713523864747, "memory(GiB)": 91.52, "step": 38525, "token_acc": 0.7705783708566739, "train_speed(iter/s)": 0.162133 }, { "epoch": 0.4999529631587996, "grad_norm": 0.7007740139961243, "learning_rate": 8.94594875369624e-05, "loss": 0.9067619323730469, "memory(GiB)": 91.52, "step": 38530, "token_acc": 0.742820728367275, "train_speed(iter/s)": 0.162125 }, { "epoch": 0.5000178415604554, "grad_norm": 0.7628640532493591, "learning_rate": 8.945619316441458e-05, "loss": 0.951054573059082, "memory(GiB)": 91.52, "step": 38535, "token_acc": 0.7390011510423328, "train_speed(iter/s)": 0.162118 }, { "epoch": 0.5000827199621111, "grad_norm": 0.8677988648414612, "learning_rate": 8.94528983378074e-05, "loss": 0.9602082252502442, "memory(GiB)": 91.52, "step": 38540, "token_acc": 0.7238134690495917, "train_speed(iter/s)": 0.162112 }, { "epoch": 0.5001475983637668, "grad_norm": 0.8101839423179626, "learning_rate": 8.944960305717882e-05, "loss": 0.9310689926147461, "memory(GiB)": 91.52, "step": 38545, "token_acc": 0.7582164357519371, "train_speed(iter/s)": 0.162106 }, { "epoch": 0.5002124767654224, "grad_norm": 0.7798748016357422, "learning_rate": 8.944630732256675e-05, "loss": 0.9156519889831543, "memory(GiB)": 91.52, "step": 38550, "token_acc": 0.7661215772528769, "train_speed(iter/s)": 0.162099 }, { "epoch": 0.500277355167078, "grad_norm": 0.8689656853675842, "learning_rate": 8.944301113400911e-05, "loss": 0.9689602851867676, "memory(GiB)": 91.52, "step": 38555, "token_acc": 0.7583760029028747, "train_speed(iter/s)": 0.162091 }, { "epoch": 0.5003422335687338, "grad_norm": 0.8537879586219788, "learning_rate": 8.943971449154385e-05, "loss": 0.9181086540222168, "memory(GiB)": 91.52, "step": 38560, "token_acc": 0.7479921409120678, "train_speed(iter/s)": 0.162085 }, { "epoch": 0.5004071119703895, "grad_norm": 0.8135399222373962, "learning_rate": 8.943641739520887e-05, "loss": 0.9391918182373047, "memory(GiB)": 91.52, "step": 38565, "token_acc": 0.7460802725247849, "train_speed(iter/s)": 0.162079 }, { "epoch": 0.5004719903720452, "grad_norm": 0.8001379370689392, "learning_rate": 8.943311984504214e-05, "loss": 0.9014949798583984, "memory(GiB)": 91.52, "step": 38570, "token_acc": 0.7388048177887585, "train_speed(iter/s)": 0.16207 }, { "epoch": 0.5005368687737008, "grad_norm": 0.8243147730827332, "learning_rate": 8.942982184108161e-05, "loss": 0.9473663330078125, "memory(GiB)": 91.52, "step": 38575, "token_acc": 0.753768115942029, "train_speed(iter/s)": 0.162064 }, { "epoch": 0.5006017471753565, "grad_norm": 0.7739299535751343, "learning_rate": 8.942652338336523e-05, "loss": 0.9224983215332031, "memory(GiB)": 91.52, "step": 38580, "token_acc": 0.7525308943657055, "train_speed(iter/s)": 0.162057 }, { "epoch": 0.5006666255770122, "grad_norm": 0.6310040354728699, "learning_rate": 8.942322447193097e-05, "loss": 0.9195443153381347, "memory(GiB)": 91.52, "step": 38585, "token_acc": 0.7522691173133651, "train_speed(iter/s)": 0.162051 }, { "epoch": 0.500731503978668, "grad_norm": 0.7734670639038086, "learning_rate": 8.941992510681676e-05, "loss": 0.8968476295471192, "memory(GiB)": 91.52, "step": 38590, "token_acc": 0.7731856043999841, "train_speed(iter/s)": 0.162044 }, { "epoch": 0.5007963823803236, "grad_norm": 0.7191415429115295, "learning_rate": 8.941662528806059e-05, "loss": 0.9389151573181153, "memory(GiB)": 91.52, "step": 38595, "token_acc": 0.7364204507679011, "train_speed(iter/s)": 0.162036 }, { "epoch": 0.5008612607819793, "grad_norm": 0.734695553779602, "learning_rate": 8.941332501570043e-05, "loss": 0.9321343421936035, "memory(GiB)": 91.52, "step": 38600, "token_acc": 0.7619114497384221, "train_speed(iter/s)": 0.162029 }, { "epoch": 0.500926139183635, "grad_norm": 0.7608436942100525, "learning_rate": 8.941002428977428e-05, "loss": 0.8862142562866211, "memory(GiB)": 91.52, "step": 38605, "token_acc": 0.764770806746456, "train_speed(iter/s)": 0.162022 }, { "epoch": 0.5009910175852907, "grad_norm": 0.7296950221061707, "learning_rate": 8.940672311032011e-05, "loss": 0.8593595504760743, "memory(GiB)": 91.52, "step": 38610, "token_acc": 0.7606831992116933, "train_speed(iter/s)": 0.162015 }, { "epoch": 0.5010558959869464, "grad_norm": 0.8110248446464539, "learning_rate": 8.940342147737588e-05, "loss": 0.9399092674255372, "memory(GiB)": 91.52, "step": 38615, "token_acc": 0.7381204139831459, "train_speed(iter/s)": 0.162008 }, { "epoch": 0.5011207743886021, "grad_norm": 0.6975952982902527, "learning_rate": 8.940011939097962e-05, "loss": 0.9302814483642579, "memory(GiB)": 91.52, "step": 38620, "token_acc": 0.7596524367208161, "train_speed(iter/s)": 0.162001 }, { "epoch": 0.5011856527902578, "grad_norm": 0.7902112603187561, "learning_rate": 8.939681685116933e-05, "loss": 0.9808403015136719, "memory(GiB)": 91.52, "step": 38625, "token_acc": 0.7455239648589539, "train_speed(iter/s)": 0.161996 }, { "epoch": 0.5012505311919135, "grad_norm": 0.681067168712616, "learning_rate": 8.9393513857983e-05, "loss": 0.9333150863647461, "memory(GiB)": 91.52, "step": 38630, "token_acc": 0.7541311120140864, "train_speed(iter/s)": 0.16199 }, { "epoch": 0.5013154095935692, "grad_norm": 0.7654119729995728, "learning_rate": 8.939021041145865e-05, "loss": 0.9463756561279297, "memory(GiB)": 91.52, "step": 38635, "token_acc": 0.7478171192849776, "train_speed(iter/s)": 0.161985 }, { "epoch": 0.5013802879952249, "grad_norm": 0.8558741211891174, "learning_rate": 8.93869065116343e-05, "loss": 0.9993317604064942, "memory(GiB)": 91.52, "step": 38640, "token_acc": 0.7250645105797351, "train_speed(iter/s)": 0.16198 }, { "epoch": 0.5014451663968806, "grad_norm": 0.9679347276687622, "learning_rate": 8.938360215854795e-05, "loss": 0.9088418006896972, "memory(GiB)": 91.52, "step": 38645, "token_acc": 0.7574463425317565, "train_speed(iter/s)": 0.161973 }, { "epoch": 0.5015100447985363, "grad_norm": 0.701939046382904, "learning_rate": 8.938029735223766e-05, "loss": 0.8958545684814453, "memory(GiB)": 91.52, "step": 38650, "token_acc": 0.769630669133125, "train_speed(iter/s)": 0.161966 }, { "epoch": 0.501574923200192, "grad_norm": 0.7506699562072754, "learning_rate": 8.937699209274143e-05, "loss": 0.9077132225036622, "memory(GiB)": 91.52, "step": 38655, "token_acc": 0.7604946606112679, "train_speed(iter/s)": 0.161959 }, { "epoch": 0.5016398016018477, "grad_norm": 0.7921278476715088, "learning_rate": 8.93736863800973e-05, "loss": 0.967078971862793, "memory(GiB)": 91.52, "step": 38660, "token_acc": 0.7488663069742809, "train_speed(iter/s)": 0.161952 }, { "epoch": 0.5017046800035034, "grad_norm": 0.7783830165863037, "learning_rate": 8.937038021434335e-05, "loss": 0.9032402038574219, "memory(GiB)": 91.52, "step": 38665, "token_acc": 0.7500085205003237, "train_speed(iter/s)": 0.161945 }, { "epoch": 0.5017695584051591, "grad_norm": 0.7453623414039612, "learning_rate": 8.936707359551758e-05, "loss": 0.9737883567810058, "memory(GiB)": 91.52, "step": 38670, "token_acc": 0.7394838306297544, "train_speed(iter/s)": 0.161939 }, { "epoch": 0.5018344368068148, "grad_norm": 0.931511402130127, "learning_rate": 8.936376652365807e-05, "loss": 0.9643396377563477, "memory(GiB)": 91.52, "step": 38675, "token_acc": 0.7531137647663071, "train_speed(iter/s)": 0.161932 }, { "epoch": 0.5018993152084705, "grad_norm": 0.7795872092247009, "learning_rate": 8.936045899880286e-05, "loss": 0.9470457077026367, "memory(GiB)": 91.52, "step": 38680, "token_acc": 0.7681383142945354, "train_speed(iter/s)": 0.161925 }, { "epoch": 0.5019641936101262, "grad_norm": 0.7713845372200012, "learning_rate": 8.935715102099003e-05, "loss": 0.9262178421020508, "memory(GiB)": 91.52, "step": 38685, "token_acc": 0.739189513228359, "train_speed(iter/s)": 0.161917 }, { "epoch": 0.5020290720117819, "grad_norm": 0.8252926468849182, "learning_rate": 8.935384259025763e-05, "loss": 0.9037975311279297, "memory(GiB)": 91.52, "step": 38690, "token_acc": 0.7382605676590573, "train_speed(iter/s)": 0.161911 }, { "epoch": 0.5020939504134376, "grad_norm": 0.7894957661628723, "learning_rate": 8.935053370664373e-05, "loss": 0.9879534721374512, "memory(GiB)": 91.52, "step": 38695, "token_acc": 0.7476779233560927, "train_speed(iter/s)": 0.161904 }, { "epoch": 0.5021588288150933, "grad_norm": 0.7087438702583313, "learning_rate": 8.934722437018644e-05, "loss": 0.9117194175720215, "memory(GiB)": 91.52, "step": 38700, "token_acc": 0.7542010454015292, "train_speed(iter/s)": 0.161897 }, { "epoch": 0.502223707216749, "grad_norm": 0.6723414659500122, "learning_rate": 8.934391458092381e-05, "loss": 0.8608575820922851, "memory(GiB)": 91.52, "step": 38705, "token_acc": 0.7785511555615624, "train_speed(iter/s)": 0.161891 }, { "epoch": 0.5022885856184047, "grad_norm": 0.7765722870826721, "learning_rate": 8.934060433889396e-05, "loss": 0.9420351982116699, "memory(GiB)": 91.52, "step": 38710, "token_acc": 0.7524111127105225, "train_speed(iter/s)": 0.161885 }, { "epoch": 0.5023534640200604, "grad_norm": 0.6973863244056702, "learning_rate": 8.933729364413497e-05, "loss": 0.9083847999572754, "memory(GiB)": 91.52, "step": 38715, "token_acc": 0.7324660222399884, "train_speed(iter/s)": 0.161878 }, { "epoch": 0.5024183424217161, "grad_norm": 0.8606512546539307, "learning_rate": 8.933398249668491e-05, "loss": 0.9344868659973145, "memory(GiB)": 91.52, "step": 38720, "token_acc": 0.7587967544033247, "train_speed(iter/s)": 0.16187 }, { "epoch": 0.5024832208233718, "grad_norm": 0.8427032828330994, "learning_rate": 8.933067089658194e-05, "loss": 0.9127320289611817, "memory(GiB)": 91.52, "step": 38725, "token_acc": 0.7658435091070332, "train_speed(iter/s)": 0.161863 }, { "epoch": 0.5025480992250275, "grad_norm": 0.7909460067749023, "learning_rate": 8.932735884386412e-05, "loss": 0.9230718612670898, "memory(GiB)": 91.52, "step": 38730, "token_acc": 0.7299147217235189, "train_speed(iter/s)": 0.161856 }, { "epoch": 0.5026129776266832, "grad_norm": 0.7618365287780762, "learning_rate": 8.932404633856959e-05, "loss": 0.8857785224914551, "memory(GiB)": 91.52, "step": 38735, "token_acc": 0.7559976820552443, "train_speed(iter/s)": 0.161849 }, { "epoch": 0.5026778560283389, "grad_norm": 0.7996286153793335, "learning_rate": 8.932073338073648e-05, "loss": 0.9397514343261719, "memory(GiB)": 91.52, "step": 38740, "token_acc": 0.7497595930142383, "train_speed(iter/s)": 0.161842 }, { "epoch": 0.5027427344299946, "grad_norm": 0.7285304069519043, "learning_rate": 8.931741997040288e-05, "loss": 0.9376855850219726, "memory(GiB)": 91.52, "step": 38745, "token_acc": 0.7621407333994054, "train_speed(iter/s)": 0.161835 }, { "epoch": 0.5028076128316503, "grad_norm": 0.7094509601593018, "learning_rate": 8.931410610760696e-05, "loss": 0.8942115783691407, "memory(GiB)": 91.52, "step": 38750, "token_acc": 0.7453662182361734, "train_speed(iter/s)": 0.161829 }, { "epoch": 0.502872491233306, "grad_norm": 0.746912956237793, "learning_rate": 8.931079179238682e-05, "loss": 0.9566831588745117, "memory(GiB)": 91.52, "step": 38755, "token_acc": 0.7473696292612007, "train_speed(iter/s)": 0.161823 }, { "epoch": 0.5029373696349617, "grad_norm": 0.7761902809143066, "learning_rate": 8.930747702478063e-05, "loss": 0.89453125, "memory(GiB)": 91.52, "step": 38760, "token_acc": 0.759684091763821, "train_speed(iter/s)": 0.161816 }, { "epoch": 0.5030022480366174, "grad_norm": 0.7170781493186951, "learning_rate": 8.930416180482652e-05, "loss": 0.9489152908325196, "memory(GiB)": 91.52, "step": 38765, "token_acc": 0.7463484803776925, "train_speed(iter/s)": 0.16181 }, { "epoch": 0.5030671264382731, "grad_norm": 0.674512505531311, "learning_rate": 8.930084613256264e-05, "loss": 0.8988585472106934, "memory(GiB)": 91.52, "step": 38770, "token_acc": 0.7526150447060448, "train_speed(iter/s)": 0.161802 }, { "epoch": 0.5031320048399288, "grad_norm": 0.6802998185157776, "learning_rate": 8.929753000802717e-05, "loss": 0.9286611557006836, "memory(GiB)": 91.52, "step": 38775, "token_acc": 0.7463968554374727, "train_speed(iter/s)": 0.161795 }, { "epoch": 0.5031968832415845, "grad_norm": 0.7521098256111145, "learning_rate": 8.929421343125824e-05, "loss": 0.8755474090576172, "memory(GiB)": 91.52, "step": 38780, "token_acc": 0.7645507062477043, "train_speed(iter/s)": 0.161787 }, { "epoch": 0.5032617616432401, "grad_norm": 0.7955344915390015, "learning_rate": 8.929089640229402e-05, "loss": 0.9345286369323731, "memory(GiB)": 91.52, "step": 38785, "token_acc": 0.7498728166864508, "train_speed(iter/s)": 0.16178 }, { "epoch": 0.5033266400448958, "grad_norm": 0.8174014091491699, "learning_rate": 8.928757892117272e-05, "loss": 0.8859807968139648, "memory(GiB)": 91.52, "step": 38790, "token_acc": 0.7935271201678155, "train_speed(iter/s)": 0.161774 }, { "epoch": 0.5033915184465515, "grad_norm": 0.7255547046661377, "learning_rate": 8.928426098793247e-05, "loss": 0.9605914115905761, "memory(GiB)": 91.52, "step": 38795, "token_acc": 0.7530380090849089, "train_speed(iter/s)": 0.161768 }, { "epoch": 0.5034563968482072, "grad_norm": 0.749897301197052, "learning_rate": 8.928094260261148e-05, "loss": 0.9367647171020508, "memory(GiB)": 91.52, "step": 38800, "token_acc": 0.7473064432911469, "train_speed(iter/s)": 0.16176 }, { "epoch": 0.5035212752498629, "grad_norm": 0.7804886102676392, "learning_rate": 8.927762376524793e-05, "loss": 0.9802288055419922, "memory(GiB)": 91.52, "step": 38805, "token_acc": 0.7326364692218351, "train_speed(iter/s)": 0.161753 }, { "epoch": 0.5035861536515186, "grad_norm": 0.7226876616477966, "learning_rate": 8.927430447588001e-05, "loss": 0.9068288803100586, "memory(GiB)": 91.52, "step": 38810, "token_acc": 0.7734093067426401, "train_speed(iter/s)": 0.161747 }, { "epoch": 0.5036510320531743, "grad_norm": 0.7273823022842407, "learning_rate": 8.927098473454593e-05, "loss": 0.8719524383544922, "memory(GiB)": 91.52, "step": 38815, "token_acc": 0.7497866590812118, "train_speed(iter/s)": 0.16174 }, { "epoch": 0.50371591045483, "grad_norm": 0.7063352465629578, "learning_rate": 8.926766454128388e-05, "loss": 0.9149072647094727, "memory(GiB)": 91.52, "step": 38820, "token_acc": 0.764403996168058, "train_speed(iter/s)": 0.161732 }, { "epoch": 0.5037807888564857, "grad_norm": 0.8088054656982422, "learning_rate": 8.926434389613208e-05, "loss": 0.9368244171142578, "memory(GiB)": 91.52, "step": 38825, "token_acc": 0.7392184005963156, "train_speed(iter/s)": 0.161724 }, { "epoch": 0.5038456672581414, "grad_norm": 0.767637312412262, "learning_rate": 8.926102279912871e-05, "loss": 0.9719803810119629, "memory(GiB)": 91.52, "step": 38830, "token_acc": 0.7353853259435207, "train_speed(iter/s)": 0.161717 }, { "epoch": 0.5039105456597971, "grad_norm": 0.7491351366043091, "learning_rate": 8.925770125031205e-05, "loss": 0.9207752227783204, "memory(GiB)": 91.52, "step": 38835, "token_acc": 0.746087610196132, "train_speed(iter/s)": 0.16171 }, { "epoch": 0.5039754240614528, "grad_norm": 0.7976060509681702, "learning_rate": 8.925437924972027e-05, "loss": 0.9158709526062012, "memory(GiB)": 91.52, "step": 38840, "token_acc": 0.7610471786670375, "train_speed(iter/s)": 0.161703 }, { "epoch": 0.5040403024631085, "grad_norm": 0.7849993705749512, "learning_rate": 8.925105679739164e-05, "loss": 0.9547555923461915, "memory(GiB)": 91.52, "step": 38845, "token_acc": 0.7490800202034779, "train_speed(iter/s)": 0.161696 }, { "epoch": 0.5041051808647642, "grad_norm": 0.7439182996749878, "learning_rate": 8.924773389336435e-05, "loss": 0.936851692199707, "memory(GiB)": 91.52, "step": 38850, "token_acc": 0.7355109659356043, "train_speed(iter/s)": 0.16169 }, { "epoch": 0.5041700592664199, "grad_norm": 0.7141666412353516, "learning_rate": 8.924441053767669e-05, "loss": 0.9332422256469727, "memory(GiB)": 91.52, "step": 38855, "token_acc": 0.7599727608329451, "train_speed(iter/s)": 0.161683 }, { "epoch": 0.5042349376680756, "grad_norm": 0.696982204914093, "learning_rate": 8.924108673036686e-05, "loss": 0.9614458084106445, "memory(GiB)": 91.52, "step": 38860, "token_acc": 0.7523228498299499, "train_speed(iter/s)": 0.161677 }, { "epoch": 0.5042998160697313, "grad_norm": 0.7273934483528137, "learning_rate": 8.923776247147313e-05, "loss": 0.9395197868347168, "memory(GiB)": 91.52, "step": 38865, "token_acc": 0.7630711775043937, "train_speed(iter/s)": 0.16167 }, { "epoch": 0.504364694471387, "grad_norm": 0.7334424257278442, "learning_rate": 8.923443776103377e-05, "loss": 0.9053567886352539, "memory(GiB)": 91.52, "step": 38870, "token_acc": 0.7801703824542324, "train_speed(iter/s)": 0.161663 }, { "epoch": 0.5044295728730427, "grad_norm": 0.7436926364898682, "learning_rate": 8.923111259908701e-05, "loss": 0.959506607055664, "memory(GiB)": 91.52, "step": 38875, "token_acc": 0.7507876496534341, "train_speed(iter/s)": 0.161656 }, { "epoch": 0.5044944512746984, "grad_norm": 0.715770423412323, "learning_rate": 8.922778698567113e-05, "loss": 0.9023572921752929, "memory(GiB)": 91.52, "step": 38880, "token_acc": 0.7465298507462687, "train_speed(iter/s)": 0.16165 }, { "epoch": 0.5045593296763541, "grad_norm": 0.7377089858055115, "learning_rate": 8.922446092082441e-05, "loss": 0.9343706130981445, "memory(GiB)": 91.52, "step": 38885, "token_acc": 0.7417646654287876, "train_speed(iter/s)": 0.161642 }, { "epoch": 0.5046242080780098, "grad_norm": 0.7110248804092407, "learning_rate": 8.922113440458512e-05, "loss": 0.9065213203430176, "memory(GiB)": 91.52, "step": 38890, "token_acc": 0.7420800729317025, "train_speed(iter/s)": 0.161636 }, { "epoch": 0.5046890864796655, "grad_norm": 0.734672486782074, "learning_rate": 8.921780743699155e-05, "loss": 0.9420881271362305, "memory(GiB)": 91.52, "step": 38895, "token_acc": 0.7383058652447891, "train_speed(iter/s)": 0.16163 }, { "epoch": 0.5047539648813212, "grad_norm": 0.9219724535942078, "learning_rate": 8.921448001808194e-05, "loss": 0.953285026550293, "memory(GiB)": 91.52, "step": 38900, "token_acc": 0.7465470919892534, "train_speed(iter/s)": 0.161624 }, { "epoch": 0.5048188432829769, "grad_norm": 0.8101054430007935, "learning_rate": 8.921115214789466e-05, "loss": 0.9732610702514648, "memory(GiB)": 91.52, "step": 38905, "token_acc": 0.7564340673918811, "train_speed(iter/s)": 0.161618 }, { "epoch": 0.5048837216846326, "grad_norm": 0.7497149705886841, "learning_rate": 8.920782382646794e-05, "loss": 0.9691102027893066, "memory(GiB)": 91.52, "step": 38910, "token_acc": 0.7510533457116332, "train_speed(iter/s)": 0.161612 }, { "epoch": 0.5049486000862883, "grad_norm": 0.7110646963119507, "learning_rate": 8.920449505384012e-05, "loss": 0.9135557174682617, "memory(GiB)": 91.52, "step": 38915, "token_acc": 0.754713661554702, "train_speed(iter/s)": 0.161604 }, { "epoch": 0.505013478487944, "grad_norm": 0.8473029136657715, "learning_rate": 8.920116583004948e-05, "loss": 0.9816705703735351, "memory(GiB)": 91.52, "step": 38920, "token_acc": 0.7452281274391542, "train_speed(iter/s)": 0.161598 }, { "epoch": 0.5050783568895997, "grad_norm": 0.8059958815574646, "learning_rate": 8.919783615513436e-05, "loss": 0.8758844375610352, "memory(GiB)": 91.52, "step": 38925, "token_acc": 0.778777983702527, "train_speed(iter/s)": 0.16159 }, { "epoch": 0.5051432352912554, "grad_norm": 0.7234777212142944, "learning_rate": 8.919450602913304e-05, "loss": 0.8979853630065918, "memory(GiB)": 91.52, "step": 38930, "token_acc": 0.7581311650226606, "train_speed(iter/s)": 0.161583 }, { "epoch": 0.5052081136929111, "grad_norm": 0.7361820936203003, "learning_rate": 8.919117545208389e-05, "loss": 0.9042583465576172, "memory(GiB)": 91.52, "step": 38935, "token_acc": 0.7390207774530606, "train_speed(iter/s)": 0.161576 }, { "epoch": 0.5052729920945668, "grad_norm": 0.8055403232574463, "learning_rate": 8.918784442402521e-05, "loss": 0.8848402023315429, "memory(GiB)": 91.52, "step": 38940, "token_acc": 0.7817672875557246, "train_speed(iter/s)": 0.161568 }, { "epoch": 0.5053378704962225, "grad_norm": 0.7846799492835999, "learning_rate": 8.918451294499535e-05, "loss": 0.918428611755371, "memory(GiB)": 91.52, "step": 38945, "token_acc": 0.7615951302182409, "train_speed(iter/s)": 0.161562 }, { "epoch": 0.5054027488978782, "grad_norm": 0.7595363855361938, "learning_rate": 8.91811810150326e-05, "loss": 0.9232048034667969, "memory(GiB)": 91.52, "step": 38950, "token_acc": 0.744491245170049, "train_speed(iter/s)": 0.161555 }, { "epoch": 0.5054676272995339, "grad_norm": 0.8722442984580994, "learning_rate": 8.917784863417537e-05, "loss": 0.9531918525695801, "memory(GiB)": 91.52, "step": 38955, "token_acc": 0.7379281726271314, "train_speed(iter/s)": 0.16155 }, { "epoch": 0.5055325057011896, "grad_norm": 0.7548800706863403, "learning_rate": 8.917451580246199e-05, "loss": 0.9623327255249023, "memory(GiB)": 91.52, "step": 38960, "token_acc": 0.7244828936594396, "train_speed(iter/s)": 0.161543 }, { "epoch": 0.5055973841028453, "grad_norm": 0.6864344477653503, "learning_rate": 8.917118251993079e-05, "loss": 0.9249536514282226, "memory(GiB)": 91.52, "step": 38965, "token_acc": 0.7387756582287662, "train_speed(iter/s)": 0.161535 }, { "epoch": 0.505662262504501, "grad_norm": 0.7895022034645081, "learning_rate": 8.916784878662014e-05, "loss": 0.8823192596435547, "memory(GiB)": 91.52, "step": 38970, "token_acc": 0.7755936096193468, "train_speed(iter/s)": 0.161527 }, { "epoch": 0.5057271409061567, "grad_norm": 0.7460116147994995, "learning_rate": 8.91645146025684e-05, "loss": 0.9138014793395997, "memory(GiB)": 91.52, "step": 38975, "token_acc": 0.7588224121557455, "train_speed(iter/s)": 0.161521 }, { "epoch": 0.5057920193078124, "grad_norm": 0.7157934308052063, "learning_rate": 8.916117996781396e-05, "loss": 0.9389025688171386, "memory(GiB)": 91.52, "step": 38980, "token_acc": 0.7455962059620597, "train_speed(iter/s)": 0.161515 }, { "epoch": 0.5058568977094681, "grad_norm": 0.7361125349998474, "learning_rate": 8.915784488239518e-05, "loss": 0.941590690612793, "memory(GiB)": 91.52, "step": 38985, "token_acc": 0.7258215624099164, "train_speed(iter/s)": 0.16151 }, { "epoch": 0.5059217761111238, "grad_norm": 0.7028302550315857, "learning_rate": 8.915450934635044e-05, "loss": 0.893242073059082, "memory(GiB)": 91.52, "step": 38990, "token_acc": 0.7487203548882443, "train_speed(iter/s)": 0.161503 }, { "epoch": 0.5059866545127795, "grad_norm": 0.7622078061103821, "learning_rate": 8.915117335971812e-05, "loss": 0.9813505172729492, "memory(GiB)": 91.52, "step": 38995, "token_acc": 0.7440674857876398, "train_speed(iter/s)": 0.161496 }, { "epoch": 0.5060515329144352, "grad_norm": 0.7200149297714233, "learning_rate": 8.914783692253661e-05, "loss": 0.9002681732177734, "memory(GiB)": 91.52, "step": 39000, "token_acc": 0.7704593590500329, "train_speed(iter/s)": 0.161491 }, { "epoch": 0.5061164113160909, "grad_norm": 0.7622548341751099, "learning_rate": 8.914450003484433e-05, "loss": 0.9560449600219727, "memory(GiB)": 91.52, "step": 39005, "token_acc": 0.7349739881193964, "train_speed(iter/s)": 0.161484 }, { "epoch": 0.5061812897177466, "grad_norm": 0.7737246155738831, "learning_rate": 8.914116269667965e-05, "loss": 0.954319953918457, "memory(GiB)": 91.52, "step": 39010, "token_acc": 0.7431670281995661, "train_speed(iter/s)": 0.161478 }, { "epoch": 0.5062461681194023, "grad_norm": 0.8423319458961487, "learning_rate": 8.913782490808101e-05, "loss": 0.9362588882446289, "memory(GiB)": 91.52, "step": 39015, "token_acc": 0.7458509379753253, "train_speed(iter/s)": 0.161471 }, { "epoch": 0.506311046521058, "grad_norm": 0.7955574989318848, "learning_rate": 8.913448666908678e-05, "loss": 0.9373478889465332, "memory(GiB)": 91.52, "step": 39020, "token_acc": 0.7524817237399, "train_speed(iter/s)": 0.161464 }, { "epoch": 0.5063759249227135, "grad_norm": 0.7035645246505737, "learning_rate": 8.913114797973538e-05, "loss": 0.915975284576416, "memory(GiB)": 91.52, "step": 39025, "token_acc": 0.7463118120584853, "train_speed(iter/s)": 0.161457 }, { "epoch": 0.5064408033243692, "grad_norm": 0.7569683790206909, "learning_rate": 8.912780884006528e-05, "loss": 0.8697694778442383, "memory(GiB)": 91.52, "step": 39030, "token_acc": 0.7760879876674968, "train_speed(iter/s)": 0.161449 }, { "epoch": 0.506505681726025, "grad_norm": 0.7093518972396851, "learning_rate": 8.912446925011486e-05, "loss": 0.9519922256469726, "memory(GiB)": 91.52, "step": 39035, "token_acc": 0.7378172588832488, "train_speed(iter/s)": 0.161443 }, { "epoch": 0.5065705601276806, "grad_norm": 0.7632584571838379, "learning_rate": 8.912112920992258e-05, "loss": 0.9518278121948243, "memory(GiB)": 91.52, "step": 39040, "token_acc": 0.7530973150945317, "train_speed(iter/s)": 0.161436 }, { "epoch": 0.5066354385293363, "grad_norm": 0.7365924715995789, "learning_rate": 8.911778871952683e-05, "loss": 0.9090085029602051, "memory(GiB)": 91.52, "step": 39045, "token_acc": 0.74756666045124, "train_speed(iter/s)": 0.161428 }, { "epoch": 0.506700316930992, "grad_norm": 0.6491592526435852, "learning_rate": 8.911444777896612e-05, "loss": 0.9271957397460937, "memory(GiB)": 91.52, "step": 39050, "token_acc": 0.7487270698193486, "train_speed(iter/s)": 0.161421 }, { "epoch": 0.5067651953326477, "grad_norm": 0.7733837366104126, "learning_rate": 8.911110638827884e-05, "loss": 0.9300689697265625, "memory(GiB)": 91.52, "step": 39055, "token_acc": 0.7476112516743563, "train_speed(iter/s)": 0.161412 }, { "epoch": 0.5068300737343034, "grad_norm": 0.6272134184837341, "learning_rate": 8.910776454750347e-05, "loss": 0.8663690567016602, "memory(GiB)": 91.52, "step": 39060, "token_acc": 0.7783413644433173, "train_speed(iter/s)": 0.161406 }, { "epoch": 0.5068949521359591, "grad_norm": 0.7054623961448669, "learning_rate": 8.910442225667847e-05, "loss": 0.8588113784790039, "memory(GiB)": 91.52, "step": 39065, "token_acc": 0.7833516114196767, "train_speed(iter/s)": 0.161399 }, { "epoch": 0.5069598305376148, "grad_norm": 0.812985897064209, "learning_rate": 8.910107951584229e-05, "loss": 0.9101485252380371, "memory(GiB)": 91.52, "step": 39070, "token_acc": 0.7640084899939357, "train_speed(iter/s)": 0.161393 }, { "epoch": 0.5070247089392705, "grad_norm": 0.7853439450263977, "learning_rate": 8.90977363250334e-05, "loss": 0.8776954650878906, "memory(GiB)": 91.52, "step": 39075, "token_acc": 0.7544736936100297, "train_speed(iter/s)": 0.161386 }, { "epoch": 0.5070895873409262, "grad_norm": 0.8931267857551575, "learning_rate": 8.909439268429027e-05, "loss": 0.9293153762817383, "memory(GiB)": 91.52, "step": 39080, "token_acc": 0.7377785830766443, "train_speed(iter/s)": 0.16138 }, { "epoch": 0.5071544657425819, "grad_norm": 0.7367196679115295, "learning_rate": 8.90910485936514e-05, "loss": 0.9173934936523438, "memory(GiB)": 91.52, "step": 39085, "token_acc": 0.7510076713041217, "train_speed(iter/s)": 0.161373 }, { "epoch": 0.5072193441442376, "grad_norm": 0.8113522529602051, "learning_rate": 8.908770405315525e-05, "loss": 0.9423834800720214, "memory(GiB)": 91.52, "step": 39090, "token_acc": 0.735658833034742, "train_speed(iter/s)": 0.161367 }, { "epoch": 0.5072842225458933, "grad_norm": 0.7393420934677124, "learning_rate": 8.908435906284034e-05, "loss": 0.8986207962036132, "memory(GiB)": 91.52, "step": 39095, "token_acc": 0.761958901802257, "train_speed(iter/s)": 0.16136 }, { "epoch": 0.507349100947549, "grad_norm": 0.7414646744728088, "learning_rate": 8.908101362274513e-05, "loss": 0.9555338859558106, "memory(GiB)": 91.52, "step": 39100, "token_acc": 0.7354806797020484, "train_speed(iter/s)": 0.161354 }, { "epoch": 0.5074139793492047, "grad_norm": 0.7559008598327637, "learning_rate": 8.907766773290811e-05, "loss": 0.9537755012512207, "memory(GiB)": 91.52, "step": 39105, "token_acc": 0.762820279847356, "train_speed(iter/s)": 0.161347 }, { "epoch": 0.5074788577508604, "grad_norm": 0.7425857186317444, "learning_rate": 8.907432139336782e-05, "loss": 0.9055905342102051, "memory(GiB)": 91.52, "step": 39110, "token_acc": 0.7312754142800927, "train_speed(iter/s)": 0.16134 }, { "epoch": 0.5075437361525161, "grad_norm": 0.7480002045631409, "learning_rate": 8.907097460416275e-05, "loss": 0.9143001556396484, "memory(GiB)": 91.52, "step": 39115, "token_acc": 0.7668444910419686, "train_speed(iter/s)": 0.161332 }, { "epoch": 0.5076086145541718, "grad_norm": 0.7738467454910278, "learning_rate": 8.906762736533142e-05, "loss": 0.9599584579467774, "memory(GiB)": 91.52, "step": 39120, "token_acc": 0.7409092512256199, "train_speed(iter/s)": 0.161325 }, { "epoch": 0.5076734929558275, "grad_norm": 0.8611069917678833, "learning_rate": 8.906427967691235e-05, "loss": 0.901371955871582, "memory(GiB)": 91.52, "step": 39125, "token_acc": 0.749971566137165, "train_speed(iter/s)": 0.161319 }, { "epoch": 0.5077383713574832, "grad_norm": 0.8263249397277832, "learning_rate": 8.906093153894406e-05, "loss": 0.905706787109375, "memory(GiB)": 91.52, "step": 39130, "token_acc": 0.763707310565635, "train_speed(iter/s)": 0.161313 }, { "epoch": 0.5078032497591389, "grad_norm": 0.7529920935630798, "learning_rate": 8.905758295146509e-05, "loss": 0.9371309280395508, "memory(GiB)": 91.52, "step": 39135, "token_acc": 0.7640684110970997, "train_speed(iter/s)": 0.161305 }, { "epoch": 0.5078681281607946, "grad_norm": 0.807868242263794, "learning_rate": 8.905423391451397e-05, "loss": 0.9014005661010742, "memory(GiB)": 91.52, "step": 39140, "token_acc": 0.7671118530884808, "train_speed(iter/s)": 0.161298 }, { "epoch": 0.5079330065624503, "grad_norm": 0.7925792336463928, "learning_rate": 8.905088442812924e-05, "loss": 0.8874822616577148, "memory(GiB)": 91.52, "step": 39145, "token_acc": 0.7602029226925116, "train_speed(iter/s)": 0.161291 }, { "epoch": 0.507997884964106, "grad_norm": 0.6838642954826355, "learning_rate": 8.904753449234944e-05, "loss": 0.9086341857910156, "memory(GiB)": 91.52, "step": 39150, "token_acc": 0.7451964818240157, "train_speed(iter/s)": 0.161284 }, { "epoch": 0.5080627633657617, "grad_norm": 0.773820161819458, "learning_rate": 8.904418410721313e-05, "loss": 0.8907709121704102, "memory(GiB)": 91.52, "step": 39155, "token_acc": 0.7675770686857761, "train_speed(iter/s)": 0.161278 }, { "epoch": 0.5081276417674174, "grad_norm": 0.8016637563705444, "learning_rate": 8.904083327275885e-05, "loss": 0.9365707397460937, "memory(GiB)": 91.52, "step": 39160, "token_acc": 0.760142893595305, "train_speed(iter/s)": 0.161271 }, { "epoch": 0.5081925201690731, "grad_norm": 0.7452584505081177, "learning_rate": 8.90374819890252e-05, "loss": 0.8741379737854004, "memory(GiB)": 91.52, "step": 39165, "token_acc": 0.7617997874266819, "train_speed(iter/s)": 0.161263 }, { "epoch": 0.5082573985707288, "grad_norm": 0.7192701697349548, "learning_rate": 8.90341302560507e-05, "loss": 0.9191559791564942, "memory(GiB)": 91.52, "step": 39170, "token_acc": 0.7556625525566255, "train_speed(iter/s)": 0.161256 }, { "epoch": 0.5083222769723845, "grad_norm": 0.7409940361976624, "learning_rate": 8.903077807387393e-05, "loss": 0.9456069946289063, "memory(GiB)": 91.52, "step": 39175, "token_acc": 0.7389902212358693, "train_speed(iter/s)": 0.161251 }, { "epoch": 0.5083871553740402, "grad_norm": 0.7775251269340515, "learning_rate": 8.90274254425335e-05, "loss": 0.9405050277709961, "memory(GiB)": 91.52, "step": 39180, "token_acc": 0.7516974994917667, "train_speed(iter/s)": 0.161246 }, { "epoch": 0.5084520337756959, "grad_norm": 0.761651873588562, "learning_rate": 8.902407236206795e-05, "loss": 0.9405765533447266, "memory(GiB)": 91.52, "step": 39185, "token_acc": 0.7359068395780559, "train_speed(iter/s)": 0.161239 }, { "epoch": 0.5085169121773516, "grad_norm": 0.6786972880363464, "learning_rate": 8.90207188325159e-05, "loss": 0.9159461975097656, "memory(GiB)": 91.52, "step": 39190, "token_acc": 0.7461452928094885, "train_speed(iter/s)": 0.161233 }, { "epoch": 0.5085817905790073, "grad_norm": 0.7009294033050537, "learning_rate": 8.901736485391594e-05, "loss": 0.8712657928466797, "memory(GiB)": 91.52, "step": 39195, "token_acc": 0.759211622767291, "train_speed(iter/s)": 0.161227 }, { "epoch": 0.508646668980663, "grad_norm": 0.7869963049888611, "learning_rate": 8.901401042630664e-05, "loss": 0.9150430679321289, "memory(GiB)": 91.52, "step": 39200, "token_acc": 0.7588466107491271, "train_speed(iter/s)": 0.161221 }, { "epoch": 0.5087115473823187, "grad_norm": 0.7560594081878662, "learning_rate": 8.901065554972663e-05, "loss": 0.939445686340332, "memory(GiB)": 91.52, "step": 39205, "token_acc": 0.7527999225381661, "train_speed(iter/s)": 0.161214 }, { "epoch": 0.5087764257839744, "grad_norm": 0.7850143313407898, "learning_rate": 8.900730022421449e-05, "loss": 0.872468376159668, "memory(GiB)": 91.52, "step": 39210, "token_acc": 0.7441107998263821, "train_speed(iter/s)": 0.161208 }, { "epoch": 0.5088413041856301, "grad_norm": 0.8392114639282227, "learning_rate": 8.900394444980886e-05, "loss": 0.9644307136535645, "memory(GiB)": 91.52, "step": 39215, "token_acc": 0.7299088370841322, "train_speed(iter/s)": 0.161201 }, { "epoch": 0.5089061825872858, "grad_norm": 0.8986363410949707, "learning_rate": 8.900058822654836e-05, "loss": 0.9421353340148926, "memory(GiB)": 91.52, "step": 39220, "token_acc": 0.7457274483578541, "train_speed(iter/s)": 0.161196 }, { "epoch": 0.5089710609889415, "grad_norm": 0.8433253169059753, "learning_rate": 8.899723155447158e-05, "loss": 0.9219344139099122, "memory(GiB)": 91.52, "step": 39225, "token_acc": 0.7657645466847091, "train_speed(iter/s)": 0.161189 }, { "epoch": 0.5090359393905972, "grad_norm": 0.9506450295448303, "learning_rate": 8.899387443361717e-05, "loss": 0.9941875457763671, "memory(GiB)": 91.52, "step": 39230, "token_acc": 0.7329443147715905, "train_speed(iter/s)": 0.161182 }, { "epoch": 0.5091008177922529, "grad_norm": 0.7475565671920776, "learning_rate": 8.899051686402378e-05, "loss": 0.9041732788085938, "memory(GiB)": 91.52, "step": 39235, "token_acc": 0.764907793781128, "train_speed(iter/s)": 0.161176 }, { "epoch": 0.5091656961939086, "grad_norm": 0.8698789477348328, "learning_rate": 8.898715884573e-05, "loss": 0.9286567687988281, "memory(GiB)": 91.52, "step": 39240, "token_acc": 0.7362509963046157, "train_speed(iter/s)": 0.16117 }, { "epoch": 0.5092305745955643, "grad_norm": 0.7076787948608398, "learning_rate": 8.898380037877454e-05, "loss": 0.952235221862793, "memory(GiB)": 91.52, "step": 39245, "token_acc": 0.7482857690097683, "train_speed(iter/s)": 0.161164 }, { "epoch": 0.50929545299722, "grad_norm": 0.7824810743331909, "learning_rate": 8.8980441463196e-05, "loss": 0.9110293388366699, "memory(GiB)": 91.52, "step": 39250, "token_acc": 0.7512889328342397, "train_speed(iter/s)": 0.161157 }, { "epoch": 0.5093603313988757, "grad_norm": 0.8063435554504395, "learning_rate": 8.897708209903306e-05, "loss": 0.9172714233398438, "memory(GiB)": 91.52, "step": 39255, "token_acc": 0.7502540733698325, "train_speed(iter/s)": 0.16115 }, { "epoch": 0.5094252098005314, "grad_norm": 0.7925465106964111, "learning_rate": 8.897372228632434e-05, "loss": 0.9212385177612304, "memory(GiB)": 91.52, "step": 39260, "token_acc": 0.748424480818847, "train_speed(iter/s)": 0.161145 }, { "epoch": 0.509490088202187, "grad_norm": 0.778072714805603, "learning_rate": 8.897036202510855e-05, "loss": 0.9025651931762695, "memory(GiB)": 91.52, "step": 39265, "token_acc": 0.7501212219169225, "train_speed(iter/s)": 0.161139 }, { "epoch": 0.5095549666038427, "grad_norm": 0.7199855446815491, "learning_rate": 8.896700131542435e-05, "loss": 0.9192039489746093, "memory(GiB)": 91.52, "step": 39270, "token_acc": 0.7491010647582476, "train_speed(iter/s)": 0.161132 }, { "epoch": 0.5096198450054984, "grad_norm": 0.6619502902030945, "learning_rate": 8.896364015731039e-05, "loss": 0.8645929336547852, "memory(GiB)": 91.52, "step": 39275, "token_acc": 0.7976971647889067, "train_speed(iter/s)": 0.161125 }, { "epoch": 0.5096847234071541, "grad_norm": 0.7841289639472961, "learning_rate": 8.896027855080538e-05, "loss": 0.9406654357910156, "memory(GiB)": 91.52, "step": 39280, "token_acc": 0.7417829274060062, "train_speed(iter/s)": 0.161118 }, { "epoch": 0.5097496018088098, "grad_norm": 0.7712162137031555, "learning_rate": 8.895691649594797e-05, "loss": 0.9000533103942872, "memory(GiB)": 91.52, "step": 39285, "token_acc": 0.7676798768757215, "train_speed(iter/s)": 0.16111 }, { "epoch": 0.5098144802104655, "grad_norm": 0.805444598197937, "learning_rate": 8.895355399277689e-05, "loss": 0.8939458847045898, "memory(GiB)": 91.52, "step": 39290, "token_acc": 0.7464712041884817, "train_speed(iter/s)": 0.161103 }, { "epoch": 0.5098793586121212, "grad_norm": 0.7716233134269714, "learning_rate": 8.895019104133082e-05, "loss": 0.8802297592163086, "memory(GiB)": 91.52, "step": 39295, "token_acc": 0.7721327795132333, "train_speed(iter/s)": 0.161096 }, { "epoch": 0.5099442370137769, "grad_norm": 0.7070086598396301, "learning_rate": 8.894682764164845e-05, "loss": 0.944122314453125, "memory(GiB)": 91.52, "step": 39300, "token_acc": 0.7429377496061261, "train_speed(iter/s)": 0.161089 }, { "epoch": 0.5100091154154326, "grad_norm": 0.7762607932090759, "learning_rate": 8.89434637937685e-05, "loss": 0.928154468536377, "memory(GiB)": 91.52, "step": 39305, "token_acc": 0.7556995679627784, "train_speed(iter/s)": 0.161083 }, { "epoch": 0.5100739938170883, "grad_norm": 0.8716368675231934, "learning_rate": 8.894009949772967e-05, "loss": 0.886760139465332, "memory(GiB)": 91.52, "step": 39310, "token_acc": 0.7536209553158706, "train_speed(iter/s)": 0.161078 }, { "epoch": 0.510138872218744, "grad_norm": 0.7242597937583923, "learning_rate": 8.893673475357069e-05, "loss": 0.899809455871582, "memory(GiB)": 91.52, "step": 39315, "token_acc": 0.7661364802462801, "train_speed(iter/s)": 0.161072 }, { "epoch": 0.5102037506203997, "grad_norm": 0.7730773687362671, "learning_rate": 8.893336956133027e-05, "loss": 0.9695762634277344, "memory(GiB)": 91.52, "step": 39320, "token_acc": 0.7469452437549523, "train_speed(iter/s)": 0.161066 }, { "epoch": 0.5102686290220554, "grad_norm": 0.7337841987609863, "learning_rate": 8.893000392104714e-05, "loss": 0.9142335891723633, "memory(GiB)": 91.52, "step": 39325, "token_acc": 0.7513905760501247, "train_speed(iter/s)": 0.161059 }, { "epoch": 0.5103335074237111, "grad_norm": 0.7495999336242676, "learning_rate": 8.892663783276003e-05, "loss": 0.9561736106872558, "memory(GiB)": 91.52, "step": 39330, "token_acc": 0.7411365837907653, "train_speed(iter/s)": 0.161052 }, { "epoch": 0.5103983858253668, "grad_norm": 0.7361171245574951, "learning_rate": 8.892327129650765e-05, "loss": 0.9103775978088379, "memory(GiB)": 91.52, "step": 39335, "token_acc": 0.7489572628906486, "train_speed(iter/s)": 0.161046 }, { "epoch": 0.5104632642270225, "grad_norm": 0.7721167802810669, "learning_rate": 8.89199043123288e-05, "loss": 0.9812389373779297, "memory(GiB)": 91.52, "step": 39340, "token_acc": 0.7525180902512636, "train_speed(iter/s)": 0.16104 }, { "epoch": 0.5105281426286782, "grad_norm": 0.7563934922218323, "learning_rate": 8.891653688026219e-05, "loss": 0.9802900314331054, "memory(GiB)": 91.52, "step": 39345, "token_acc": 0.7290135975265135, "train_speed(iter/s)": 0.161035 }, { "epoch": 0.5105930210303339, "grad_norm": 0.7509890794754028, "learning_rate": 8.891316900034659e-05, "loss": 0.9169995307922363, "memory(GiB)": 91.52, "step": 39350, "token_acc": 0.7455679086538461, "train_speed(iter/s)": 0.161027 }, { "epoch": 0.5106578994319896, "grad_norm": 0.7856737971305847, "learning_rate": 8.890980067262072e-05, "loss": 0.9119293212890625, "memory(GiB)": 91.52, "step": 39355, "token_acc": 0.7727758931572996, "train_speed(iter/s)": 0.161021 }, { "epoch": 0.5107227778336453, "grad_norm": 0.7088439464569092, "learning_rate": 8.890643189712338e-05, "loss": 0.8887483596801757, "memory(GiB)": 91.52, "step": 39360, "token_acc": 0.7495794953944733, "train_speed(iter/s)": 0.161014 }, { "epoch": 0.510787656235301, "grad_norm": 0.7353898286819458, "learning_rate": 8.890306267389332e-05, "loss": 0.8887514114379883, "memory(GiB)": 91.52, "step": 39365, "token_acc": 0.7664209115281502, "train_speed(iter/s)": 0.161008 }, { "epoch": 0.5108525346369567, "grad_norm": 0.7772915959358215, "learning_rate": 8.889969300296933e-05, "loss": 0.9528061866760253, "memory(GiB)": 91.52, "step": 39370, "token_acc": 0.730753171671752, "train_speed(iter/s)": 0.161002 }, { "epoch": 0.5109174130386124, "grad_norm": 0.745193600654602, "learning_rate": 8.889632288439017e-05, "loss": 0.9445791244506836, "memory(GiB)": 91.52, "step": 39375, "token_acc": 0.7215993310782562, "train_speed(iter/s)": 0.160995 }, { "epoch": 0.5109822914402681, "grad_norm": 0.740790605545044, "learning_rate": 8.889295231819463e-05, "loss": 0.9318880081176758, "memory(GiB)": 91.52, "step": 39380, "token_acc": 0.7484446848796321, "train_speed(iter/s)": 0.160989 }, { "epoch": 0.5110471698419238, "grad_norm": 0.8029330968856812, "learning_rate": 8.88895813044215e-05, "loss": 0.9385171890258789, "memory(GiB)": 91.52, "step": 39385, "token_acc": 0.7517743570836022, "train_speed(iter/s)": 0.160982 }, { "epoch": 0.5111120482435795, "grad_norm": 0.6945297718048096, "learning_rate": 8.888620984310957e-05, "loss": 0.9004888534545898, "memory(GiB)": 91.52, "step": 39390, "token_acc": 0.7594697537714062, "train_speed(iter/s)": 0.160974 }, { "epoch": 0.5111769266452352, "grad_norm": 0.9072394967079163, "learning_rate": 8.888283793429764e-05, "loss": 0.918953800201416, "memory(GiB)": 91.52, "step": 39395, "token_acc": 0.7513055086575837, "train_speed(iter/s)": 0.160967 }, { "epoch": 0.5112418050468909, "grad_norm": 0.8458972573280334, "learning_rate": 8.887946557802451e-05, "loss": 0.8748984336853027, "memory(GiB)": 91.52, "step": 39400, "token_acc": 0.7738152670056035, "train_speed(iter/s)": 0.16096 }, { "epoch": 0.5113066834485466, "grad_norm": 0.7670189738273621, "learning_rate": 8.8876092774329e-05, "loss": 0.9254874229431153, "memory(GiB)": 91.52, "step": 39405, "token_acc": 0.7382878717830175, "train_speed(iter/s)": 0.160954 }, { "epoch": 0.5113715618502023, "grad_norm": 0.8436035513877869, "learning_rate": 8.887271952324991e-05, "loss": 0.9364004135131836, "memory(GiB)": 91.52, "step": 39410, "token_acc": 0.7511750241230118, "train_speed(iter/s)": 0.160946 }, { "epoch": 0.511436440251858, "grad_norm": 0.793913722038269, "learning_rate": 8.886934582482607e-05, "loss": 0.913909912109375, "memory(GiB)": 91.52, "step": 39415, "token_acc": 0.7650872473963721, "train_speed(iter/s)": 0.16094 }, { "epoch": 0.5115013186535137, "grad_norm": 0.7825809121131897, "learning_rate": 8.88659716790963e-05, "loss": 0.9505355834960938, "memory(GiB)": 91.52, "step": 39420, "token_acc": 0.7494157058273605, "train_speed(iter/s)": 0.160934 }, { "epoch": 0.5115661970551694, "grad_norm": 0.8199952840805054, "learning_rate": 8.886259708609943e-05, "loss": 0.9113832473754883, "memory(GiB)": 91.52, "step": 39425, "token_acc": 0.7504444444444445, "train_speed(iter/s)": 0.160927 }, { "epoch": 0.5116310754568251, "grad_norm": 0.7798362970352173, "learning_rate": 8.885922204587428e-05, "loss": 0.9011890411376953, "memory(GiB)": 91.52, "step": 39430, "token_acc": 0.7543644394587394, "train_speed(iter/s)": 0.16092 }, { "epoch": 0.5116959538584808, "grad_norm": 0.7804653644561768, "learning_rate": 8.885584655845971e-05, "loss": 0.9070765495300293, "memory(GiB)": 91.52, "step": 39435, "token_acc": 0.7587125332933188, "train_speed(iter/s)": 0.160914 }, { "epoch": 0.5117608322601365, "grad_norm": 0.768474280834198, "learning_rate": 8.885247062389457e-05, "loss": 0.928109073638916, "memory(GiB)": 91.52, "step": 39440, "token_acc": 0.7561418500320445, "train_speed(iter/s)": 0.160908 }, { "epoch": 0.5118257106617922, "grad_norm": 0.7798163890838623, "learning_rate": 8.884909424221768e-05, "loss": 0.9162284851074218, "memory(GiB)": 91.52, "step": 39445, "token_acc": 0.7655960074221, "train_speed(iter/s)": 0.160901 }, { "epoch": 0.5118905890634479, "grad_norm": 0.7491135597229004, "learning_rate": 8.884571741346793e-05, "loss": 0.9286259651184082, "memory(GiB)": 91.52, "step": 39450, "token_acc": 0.7641509433962265, "train_speed(iter/s)": 0.160896 }, { "epoch": 0.5119554674651036, "grad_norm": 0.7045735716819763, "learning_rate": 8.884234013768416e-05, "loss": 0.8959465980529785, "memory(GiB)": 91.52, "step": 39455, "token_acc": 0.7555761160493011, "train_speed(iter/s)": 0.160889 }, { "epoch": 0.5120203458667593, "grad_norm": 0.6832711696624756, "learning_rate": 8.883896241490524e-05, "loss": 0.9571964263916015, "memory(GiB)": 91.52, "step": 39460, "token_acc": 0.7273378828955159, "train_speed(iter/s)": 0.160882 }, { "epoch": 0.512085224268415, "grad_norm": 0.6937328577041626, "learning_rate": 8.883558424517005e-05, "loss": 0.935344123840332, "memory(GiB)": 91.52, "step": 39465, "token_acc": 0.7367455642174328, "train_speed(iter/s)": 0.160877 }, { "epoch": 0.5121501026700707, "grad_norm": 0.7691918611526489, "learning_rate": 8.883220562851743e-05, "loss": 0.880677604675293, "memory(GiB)": 91.52, "step": 39470, "token_acc": 0.7638024076380241, "train_speed(iter/s)": 0.160869 }, { "epoch": 0.5122149810717264, "grad_norm": 0.825445830821991, "learning_rate": 8.882882656498631e-05, "loss": 0.9257693290710449, "memory(GiB)": 91.52, "step": 39475, "token_acc": 0.7622625677015791, "train_speed(iter/s)": 0.160863 }, { "epoch": 0.5122798594733821, "grad_norm": 0.7565649151802063, "learning_rate": 8.882544705461553e-05, "loss": 0.9396695137023926, "memory(GiB)": 91.52, "step": 39480, "token_acc": 0.7332226031028024, "train_speed(iter/s)": 0.160857 }, { "epoch": 0.5123447378750378, "grad_norm": 0.818543016910553, "learning_rate": 8.882206709744403e-05, "loss": 0.8993643760681153, "memory(GiB)": 91.52, "step": 39485, "token_acc": 0.7698877850046458, "train_speed(iter/s)": 0.160851 }, { "epoch": 0.5124096162766935, "grad_norm": 0.7801651954650879, "learning_rate": 8.881868669351065e-05, "loss": 0.9680779457092286, "memory(GiB)": 91.52, "step": 39490, "token_acc": 0.7430719135374809, "train_speed(iter/s)": 0.160845 }, { "epoch": 0.5124744946783492, "grad_norm": 0.6533637046813965, "learning_rate": 8.881530584285436e-05, "loss": 0.9387955665588379, "memory(GiB)": 91.52, "step": 39495, "token_acc": 0.7414180206794683, "train_speed(iter/s)": 0.160838 }, { "epoch": 0.5125393730800049, "grad_norm": 0.7383666038513184, "learning_rate": 8.8811924545514e-05, "loss": 0.887545108795166, "memory(GiB)": 91.52, "step": 39500, "token_acc": 0.7508553526526238, "train_speed(iter/s)": 0.160831 }, { "epoch": 0.5126042514816604, "grad_norm": 0.8291112184524536, "learning_rate": 8.880854280152853e-05, "loss": 0.9276784896850586, "memory(GiB)": 91.52, "step": 39505, "token_acc": 0.7650853242320819, "train_speed(iter/s)": 0.160824 }, { "epoch": 0.5126691298833161, "grad_norm": 0.7682941555976868, "learning_rate": 8.880516061093682e-05, "loss": 0.886020565032959, "memory(GiB)": 91.52, "step": 39510, "token_acc": 0.7582955979511543, "train_speed(iter/s)": 0.160819 }, { "epoch": 0.5127340082849718, "grad_norm": 0.6709444522857666, "learning_rate": 8.880177797377784e-05, "loss": 0.9014348983764648, "memory(GiB)": 91.52, "step": 39515, "token_acc": 0.7619857413826935, "train_speed(iter/s)": 0.16081 }, { "epoch": 0.5127988866866275, "grad_norm": 0.7216230034828186, "learning_rate": 8.87983948900905e-05, "loss": 0.9220620155334472, "memory(GiB)": 91.52, "step": 39520, "token_acc": 0.7423084590338228, "train_speed(iter/s)": 0.160804 }, { "epoch": 0.5128637650882832, "grad_norm": 0.7353547811508179, "learning_rate": 8.879501135991372e-05, "loss": 0.9187155723571777, "memory(GiB)": 91.52, "step": 39525, "token_acc": 0.7696819892274773, "train_speed(iter/s)": 0.160797 }, { "epoch": 0.5129286434899389, "grad_norm": 0.6923987865447998, "learning_rate": 8.879162738328645e-05, "loss": 0.8434033393859863, "memory(GiB)": 91.52, "step": 39530, "token_acc": 0.7680754453370591, "train_speed(iter/s)": 0.16079 }, { "epoch": 0.5129935218915946, "grad_norm": 0.7973989248275757, "learning_rate": 8.878824296024761e-05, "loss": 0.949582290649414, "memory(GiB)": 91.52, "step": 39535, "token_acc": 0.7321754452205651, "train_speed(iter/s)": 0.160783 }, { "epoch": 0.5130584002932503, "grad_norm": 0.7813458442687988, "learning_rate": 8.878485809083619e-05, "loss": 0.962092399597168, "memory(GiB)": 91.52, "step": 39540, "token_acc": 0.7463508322663253, "train_speed(iter/s)": 0.160776 }, { "epoch": 0.513123278694906, "grad_norm": 0.6404515504837036, "learning_rate": 8.878147277509112e-05, "loss": 0.9226119995117188, "memory(GiB)": 91.52, "step": 39545, "token_acc": 0.7516880486158002, "train_speed(iter/s)": 0.160769 }, { "epoch": 0.5131881570965617, "grad_norm": 0.7878654599189758, "learning_rate": 8.877808701305134e-05, "loss": 0.9229970932006836, "memory(GiB)": 91.52, "step": 39550, "token_acc": 0.7684668561602975, "train_speed(iter/s)": 0.160763 }, { "epoch": 0.5132530354982174, "grad_norm": 0.7370734810829163, "learning_rate": 8.877470080475583e-05, "loss": 0.9450469970703125, "memory(GiB)": 91.52, "step": 39555, "token_acc": 0.7381431899853887, "train_speed(iter/s)": 0.160756 }, { "epoch": 0.5133179138998731, "grad_norm": 0.826820433139801, "learning_rate": 8.877131415024357e-05, "loss": 0.9050561904907226, "memory(GiB)": 91.52, "step": 39560, "token_acc": 0.7507641196013289, "train_speed(iter/s)": 0.160749 }, { "epoch": 0.5133827923015288, "grad_norm": 0.8193526864051819, "learning_rate": 8.876792704955352e-05, "loss": 0.9058294296264648, "memory(GiB)": 91.52, "step": 39565, "token_acc": 0.7383972174401855, "train_speed(iter/s)": 0.160742 }, { "epoch": 0.5134476707031845, "grad_norm": 0.8904451727867126, "learning_rate": 8.876453950272466e-05, "loss": 0.9213336944580078, "memory(GiB)": 91.52, "step": 39570, "token_acc": 0.7330811026085967, "train_speed(iter/s)": 0.160736 }, { "epoch": 0.5135125491048402, "grad_norm": 0.7862693667411804, "learning_rate": 8.876115150979599e-05, "loss": 0.9402843475341797, "memory(GiB)": 91.52, "step": 39575, "token_acc": 0.7484018596542206, "train_speed(iter/s)": 0.16073 }, { "epoch": 0.5135774275064959, "grad_norm": 0.6616705656051636, "learning_rate": 8.875776307080646e-05, "loss": 0.9188697814941407, "memory(GiB)": 91.52, "step": 39580, "token_acc": 0.7642074113928308, "train_speed(iter/s)": 0.160724 }, { "epoch": 0.5136423059081516, "grad_norm": 0.7748188376426697, "learning_rate": 8.875437418579509e-05, "loss": 0.9572066307067871, "memory(GiB)": 91.52, "step": 39585, "token_acc": 0.7512303540244484, "train_speed(iter/s)": 0.160717 }, { "epoch": 0.5137071843098073, "grad_norm": 0.8253615498542786, "learning_rate": 8.875098485480089e-05, "loss": 0.9144445419311523, "memory(GiB)": 91.52, "step": 39590, "token_acc": 0.7686686030169587, "train_speed(iter/s)": 0.16071 }, { "epoch": 0.513772062711463, "grad_norm": 0.779213547706604, "learning_rate": 8.874759507786284e-05, "loss": 0.9128459930419922, "memory(GiB)": 91.52, "step": 39595, "token_acc": 0.7608374626718735, "train_speed(iter/s)": 0.160704 }, { "epoch": 0.5138369411131187, "grad_norm": 0.7386518120765686, "learning_rate": 8.874420485501995e-05, "loss": 0.9702797889709472, "memory(GiB)": 91.52, "step": 39600, "token_acc": 0.7384752525681297, "train_speed(iter/s)": 0.160698 }, { "epoch": 0.5139018195147744, "grad_norm": 0.7378857135772705, "learning_rate": 8.874081418631126e-05, "loss": 0.9327211380004883, "memory(GiB)": 91.52, "step": 39605, "token_acc": 0.763864741156016, "train_speed(iter/s)": 0.160692 }, { "epoch": 0.5139666979164301, "grad_norm": 0.7457571625709534, "learning_rate": 8.873742307177577e-05, "loss": 0.9303882598876954, "memory(GiB)": 91.52, "step": 39610, "token_acc": 0.7529121788162204, "train_speed(iter/s)": 0.160686 }, { "epoch": 0.5140315763180858, "grad_norm": 0.7037056088447571, "learning_rate": 8.87340315114525e-05, "loss": 0.8829092025756836, "memory(GiB)": 91.52, "step": 39615, "token_acc": 0.7458879026751909, "train_speed(iter/s)": 0.160679 }, { "epoch": 0.5140964547197415, "grad_norm": 0.822310745716095, "learning_rate": 8.87306395053805e-05, "loss": 0.9270441055297851, "memory(GiB)": 91.52, "step": 39620, "token_acc": 0.7634466561429176, "train_speed(iter/s)": 0.160673 }, { "epoch": 0.5141613331213972, "grad_norm": 0.8207115530967712, "learning_rate": 8.872724705359879e-05, "loss": 0.939876937866211, "memory(GiB)": 91.52, "step": 39625, "token_acc": 0.7430735370274469, "train_speed(iter/s)": 0.160666 }, { "epoch": 0.5142262115230529, "grad_norm": 0.7348400950431824, "learning_rate": 8.872385415614642e-05, "loss": 0.9236783981323242, "memory(GiB)": 91.52, "step": 39630, "token_acc": 0.7505570495582021, "train_speed(iter/s)": 0.16066 }, { "epoch": 0.5142910899247086, "grad_norm": 0.7416028380393982, "learning_rate": 8.87204608130624e-05, "loss": 0.9158637046813964, "memory(GiB)": 91.52, "step": 39635, "token_acc": 0.743952608194833, "train_speed(iter/s)": 0.160654 }, { "epoch": 0.5143559683263643, "grad_norm": 0.7222446203231812, "learning_rate": 8.871706702438585e-05, "loss": 0.9309268951416015, "memory(GiB)": 91.52, "step": 39640, "token_acc": 0.7437996545768566, "train_speed(iter/s)": 0.160648 }, { "epoch": 0.51442084672802, "grad_norm": 0.8258227109909058, "learning_rate": 8.871367279015576e-05, "loss": 0.9483868598937988, "memory(GiB)": 91.52, "step": 39645, "token_acc": 0.7536250317985246, "train_speed(iter/s)": 0.160642 }, { "epoch": 0.5144857251296757, "grad_norm": 0.7312723994255066, "learning_rate": 8.871027811041122e-05, "loss": 0.9609488487243653, "memory(GiB)": 91.52, "step": 39650, "token_acc": 0.7414146942126146, "train_speed(iter/s)": 0.160636 }, { "epoch": 0.5145506035313314, "grad_norm": 0.770556628704071, "learning_rate": 8.870688298519127e-05, "loss": 0.9189789772033692, "memory(GiB)": 91.52, "step": 39655, "token_acc": 0.7798942540004902, "train_speed(iter/s)": 0.160629 }, { "epoch": 0.5146154819329871, "grad_norm": 0.6705120205879211, "learning_rate": 8.870348741453502e-05, "loss": 1.007650661468506, "memory(GiB)": 91.52, "step": 39660, "token_acc": 0.7460718376457458, "train_speed(iter/s)": 0.160623 }, { "epoch": 0.5146803603346428, "grad_norm": 0.770812451839447, "learning_rate": 8.870009139848154e-05, "loss": 0.9296764373779297, "memory(GiB)": 91.52, "step": 39665, "token_acc": 0.7505641440167053, "train_speed(iter/s)": 0.160617 }, { "epoch": 0.5147452387362985, "grad_norm": 0.7628102898597717, "learning_rate": 8.869669493706989e-05, "loss": 0.9402880668640137, "memory(GiB)": 91.52, "step": 39670, "token_acc": 0.7510573351476592, "train_speed(iter/s)": 0.160611 }, { "epoch": 0.5148101171379542, "grad_norm": 0.8412941694259644, "learning_rate": 8.869329803033916e-05, "loss": 0.9986516952514648, "memory(GiB)": 91.52, "step": 39675, "token_acc": 0.7305145537315492, "train_speed(iter/s)": 0.160604 }, { "epoch": 0.5148749955396099, "grad_norm": 0.6732886433601379, "learning_rate": 8.868990067832845e-05, "loss": 0.9077571868896485, "memory(GiB)": 91.52, "step": 39680, "token_acc": 0.7557533892357134, "train_speed(iter/s)": 0.160599 }, { "epoch": 0.5149398739412656, "grad_norm": 0.7719795107841492, "learning_rate": 8.868650288107685e-05, "loss": 0.9284738540649414, "memory(GiB)": 91.52, "step": 39685, "token_acc": 0.7449936154972127, "train_speed(iter/s)": 0.160594 }, { "epoch": 0.5150047523429213, "grad_norm": 0.7461731433868408, "learning_rate": 8.868310463862347e-05, "loss": 0.8733972549438477, "memory(GiB)": 91.52, "step": 39690, "token_acc": 0.7560806534253595, "train_speed(iter/s)": 0.160586 }, { "epoch": 0.515069630744577, "grad_norm": 0.682548999786377, "learning_rate": 8.86797059510074e-05, "loss": 0.8860931396484375, "memory(GiB)": 91.52, "step": 39695, "token_acc": 0.7541251672365096, "train_speed(iter/s)": 0.160579 }, { "epoch": 0.5151345091462327, "grad_norm": 0.6660832762718201, "learning_rate": 8.867630681826779e-05, "loss": 0.9575565338134766, "memory(GiB)": 91.52, "step": 39700, "token_acc": 0.7456469327618538, "train_speed(iter/s)": 0.160571 }, { "epoch": 0.5151993875478884, "grad_norm": 0.8389596343040466, "learning_rate": 8.867290724044371e-05, "loss": 0.9422883987426758, "memory(GiB)": 91.52, "step": 39705, "token_acc": 0.7436429512516469, "train_speed(iter/s)": 0.160564 }, { "epoch": 0.5152642659495441, "grad_norm": 0.702534556388855, "learning_rate": 8.86695072175743e-05, "loss": 0.8924281120300293, "memory(GiB)": 91.52, "step": 39710, "token_acc": 0.7743582419936449, "train_speed(iter/s)": 0.160556 }, { "epoch": 0.5153291443511998, "grad_norm": 0.8253819346427917, "learning_rate": 8.86661067496987e-05, "loss": 0.9250526428222656, "memory(GiB)": 91.52, "step": 39715, "token_acc": 0.7582981771869777, "train_speed(iter/s)": 0.160551 }, { "epoch": 0.5153940227528555, "grad_norm": 0.8159412145614624, "learning_rate": 8.866270583685603e-05, "loss": 0.9082444190979004, "memory(GiB)": 91.52, "step": 39720, "token_acc": 0.7586014090282178, "train_speed(iter/s)": 0.160546 }, { "epoch": 0.5154589011545112, "grad_norm": 0.8189860582351685, "learning_rate": 8.865930447908541e-05, "loss": 0.9151457786560059, "memory(GiB)": 91.52, "step": 39725, "token_acc": 0.7633364456893869, "train_speed(iter/s)": 0.160539 }, { "epoch": 0.5155237795561669, "grad_norm": 0.805189311504364, "learning_rate": 8.865590267642602e-05, "loss": 0.9786593437194824, "memory(GiB)": 91.52, "step": 39730, "token_acc": 0.731763396996106, "train_speed(iter/s)": 0.160532 }, { "epoch": 0.5155886579578226, "grad_norm": 0.7155797481536865, "learning_rate": 8.865250042891699e-05, "loss": 0.93226318359375, "memory(GiB)": 91.52, "step": 39735, "token_acc": 0.7556007467662355, "train_speed(iter/s)": 0.160526 }, { "epoch": 0.5156535363594782, "grad_norm": 0.8455410599708557, "learning_rate": 8.864909773659749e-05, "loss": 0.934140968322754, "memory(GiB)": 91.52, "step": 39740, "token_acc": 0.7718033918784792, "train_speed(iter/s)": 0.16052 }, { "epoch": 0.5157184147611339, "grad_norm": 0.7033368945121765, "learning_rate": 8.864569459950662e-05, "loss": 0.8929407119750976, "memory(GiB)": 91.52, "step": 39745, "token_acc": 0.7694971199275128, "train_speed(iter/s)": 0.160512 }, { "epoch": 0.5157832931627896, "grad_norm": 0.7668476700782776, "learning_rate": 8.864229101768362e-05, "loss": 0.887509536743164, "memory(GiB)": 91.52, "step": 39750, "token_acc": 0.7703733440385387, "train_speed(iter/s)": 0.160505 }, { "epoch": 0.5158481715644453, "grad_norm": 0.7238761782646179, "learning_rate": 8.863888699116761e-05, "loss": 0.915263557434082, "memory(GiB)": 91.52, "step": 39755, "token_acc": 0.7279861583142804, "train_speed(iter/s)": 0.160498 }, { "epoch": 0.515913049966101, "grad_norm": 0.7185658812522888, "learning_rate": 8.863548251999776e-05, "loss": 0.9251737594604492, "memory(GiB)": 91.52, "step": 39760, "token_acc": 0.7492472713586752, "train_speed(iter/s)": 0.160491 }, { "epoch": 0.5159779283677567, "grad_norm": 0.7881054878234863, "learning_rate": 8.863207760421329e-05, "loss": 0.9423316955566406, "memory(GiB)": 91.52, "step": 39765, "token_acc": 0.7572427938321796, "train_speed(iter/s)": 0.160483 }, { "epoch": 0.5160428067694124, "grad_norm": 0.805627167224884, "learning_rate": 8.862867224385334e-05, "loss": 0.9175674438476562, "memory(GiB)": 91.52, "step": 39770, "token_acc": 0.7457200055225736, "train_speed(iter/s)": 0.160476 }, { "epoch": 0.5161076851710681, "grad_norm": 0.7969523072242737, "learning_rate": 8.862526643895712e-05, "loss": 0.8975191116333008, "memory(GiB)": 91.52, "step": 39775, "token_acc": 0.7530353104887807, "train_speed(iter/s)": 0.16047 }, { "epoch": 0.5161725635727238, "grad_norm": 0.7280052304267883, "learning_rate": 8.862186018956382e-05, "loss": 0.8565019607543946, "memory(GiB)": 91.52, "step": 39780, "token_acc": 0.7716515449527315, "train_speed(iter/s)": 0.160463 }, { "epoch": 0.5162374419743795, "grad_norm": 0.6471118927001953, "learning_rate": 8.861845349571263e-05, "loss": 0.8272016525268555, "memory(GiB)": 91.52, "step": 39785, "token_acc": 0.7641311685081892, "train_speed(iter/s)": 0.160456 }, { "epoch": 0.5163023203760352, "grad_norm": 0.7635462284088135, "learning_rate": 8.861504635744278e-05, "loss": 0.9285269737243652, "memory(GiB)": 91.52, "step": 39790, "token_acc": 0.7567199971366191, "train_speed(iter/s)": 0.16045 }, { "epoch": 0.5163671987776909, "grad_norm": 0.7369228601455688, "learning_rate": 8.861163877479343e-05, "loss": 0.9296289443969726, "memory(GiB)": 91.52, "step": 39795, "token_acc": 0.7560975609756098, "train_speed(iter/s)": 0.160444 }, { "epoch": 0.5164320771793466, "grad_norm": 0.7981152534484863, "learning_rate": 8.860823074780387e-05, "loss": 0.9246253967285156, "memory(GiB)": 91.52, "step": 39800, "token_acc": 0.7225727434679335, "train_speed(iter/s)": 0.160439 }, { "epoch": 0.5164969555810023, "grad_norm": 0.7216131091117859, "learning_rate": 8.860482227651326e-05, "loss": 0.8972311019897461, "memory(GiB)": 91.52, "step": 39805, "token_acc": 0.7703003094625812, "train_speed(iter/s)": 0.160433 }, { "epoch": 0.516561833982658, "grad_norm": 0.8326748609542847, "learning_rate": 8.860141336096082e-05, "loss": 0.9049539566040039, "memory(GiB)": 91.52, "step": 39810, "token_acc": 0.7566221417251529, "train_speed(iter/s)": 0.160426 }, { "epoch": 0.5166267123843137, "grad_norm": 0.8536413311958313, "learning_rate": 8.859800400118581e-05, "loss": 0.9131992340087891, "memory(GiB)": 91.52, "step": 39815, "token_acc": 0.7291643107542689, "train_speed(iter/s)": 0.16042 }, { "epoch": 0.5166915907859694, "grad_norm": 0.8385304808616638, "learning_rate": 8.859459419722746e-05, "loss": 0.9456830978393554, "memory(GiB)": 91.52, "step": 39820, "token_acc": 0.7461748835834134, "train_speed(iter/s)": 0.160413 }, { "epoch": 0.5167564691876251, "grad_norm": 0.8525048494338989, "learning_rate": 8.8591183949125e-05, "loss": 0.9270284652709961, "memory(GiB)": 91.52, "step": 39825, "token_acc": 0.7429451186379279, "train_speed(iter/s)": 0.160407 }, { "epoch": 0.5168213475892808, "grad_norm": 0.8692244291305542, "learning_rate": 8.858777325691768e-05, "loss": 0.9369470596313476, "memory(GiB)": 91.52, "step": 39830, "token_acc": 0.7468137093046292, "train_speed(iter/s)": 0.1604 }, { "epoch": 0.5168862259909365, "grad_norm": 0.7278541922569275, "learning_rate": 8.858436212064475e-05, "loss": 0.8634902000427246, "memory(GiB)": 91.52, "step": 39835, "token_acc": 0.7623091699524821, "train_speed(iter/s)": 0.160395 }, { "epoch": 0.5169511043925922, "grad_norm": 0.8691186904907227, "learning_rate": 8.858095054034546e-05, "loss": 0.9153131484985352, "memory(GiB)": 91.52, "step": 39840, "token_acc": 0.7537429167909335, "train_speed(iter/s)": 0.160388 }, { "epoch": 0.5170159827942479, "grad_norm": 0.8479330539703369, "learning_rate": 8.857753851605907e-05, "loss": 0.9564762115478516, "memory(GiB)": 91.52, "step": 39845, "token_acc": 0.7492073433279837, "train_speed(iter/s)": 0.160383 }, { "epoch": 0.5170808611959036, "grad_norm": 0.7217499017715454, "learning_rate": 8.857412604782487e-05, "loss": 0.9399153709411621, "memory(GiB)": 91.52, "step": 39850, "token_acc": 0.7255186889480908, "train_speed(iter/s)": 0.160376 }, { "epoch": 0.5171457395975593, "grad_norm": 0.7463667392730713, "learning_rate": 8.85707131356821e-05, "loss": 0.9453207015991211, "memory(GiB)": 91.52, "step": 39855, "token_acc": 0.7438617443535073, "train_speed(iter/s)": 0.16037 }, { "epoch": 0.517210617999215, "grad_norm": 0.7388525009155273, "learning_rate": 8.856729977967005e-05, "loss": 0.9308570861816406, "memory(GiB)": 91.52, "step": 39860, "token_acc": 0.7523969319271333, "train_speed(iter/s)": 0.160364 }, { "epoch": 0.5172754964008707, "grad_norm": 0.835677444934845, "learning_rate": 8.856388597982798e-05, "loss": 0.9282342910766601, "memory(GiB)": 91.52, "step": 39865, "token_acc": 0.7422254247572816, "train_speed(iter/s)": 0.160358 }, { "epoch": 0.5173403748025264, "grad_norm": 0.84671950340271, "learning_rate": 8.85604717361952e-05, "loss": 0.9865716934204102, "memory(GiB)": 91.52, "step": 39870, "token_acc": 0.7412925239012196, "train_speed(iter/s)": 0.160352 }, { "epoch": 0.5174052532041821, "grad_norm": 0.8085798621177673, "learning_rate": 8.855705704881099e-05, "loss": 0.9372123718261719, "memory(GiB)": 91.52, "step": 39875, "token_acc": 0.7583170890188434, "train_speed(iter/s)": 0.160345 }, { "epoch": 0.5174701316058378, "grad_norm": 0.8031641244888306, "learning_rate": 8.855364191771465e-05, "loss": 0.9396881103515625, "memory(GiB)": 91.52, "step": 39880, "token_acc": 0.7510362523850254, "train_speed(iter/s)": 0.160339 }, { "epoch": 0.5175350100074935, "grad_norm": 0.8079886436462402, "learning_rate": 8.855022634294548e-05, "loss": 0.9407782554626465, "memory(GiB)": 91.52, "step": 39885, "token_acc": 0.7290569961655944, "train_speed(iter/s)": 0.160331 }, { "epoch": 0.5175998884091492, "grad_norm": 0.7928133606910706, "learning_rate": 8.854681032454279e-05, "loss": 0.9397115707397461, "memory(GiB)": 91.52, "step": 39890, "token_acc": 0.7488146665261827, "train_speed(iter/s)": 0.160326 }, { "epoch": 0.5176647668108049, "grad_norm": 0.7603999376296997, "learning_rate": 8.854339386254589e-05, "loss": 0.9542902946472168, "memory(GiB)": 91.52, "step": 39895, "token_acc": 0.7521907479111474, "train_speed(iter/s)": 0.16032 }, { "epoch": 0.5177296452124606, "grad_norm": 0.8347707986831665, "learning_rate": 8.853997695699409e-05, "loss": 0.9605276107788085, "memory(GiB)": 91.52, "step": 39900, "token_acc": 0.7581104368022776, "train_speed(iter/s)": 0.160315 }, { "epoch": 0.5177945236141163, "grad_norm": 0.6772611737251282, "learning_rate": 8.853655960792672e-05, "loss": 0.9017801284790039, "memory(GiB)": 91.52, "step": 39905, "token_acc": 0.7703156241073978, "train_speed(iter/s)": 0.160308 }, { "epoch": 0.517859402015772, "grad_norm": 0.8605998158454895, "learning_rate": 8.85331418153831e-05, "loss": 0.9306888580322266, "memory(GiB)": 91.52, "step": 39910, "token_acc": 0.7406660309543021, "train_speed(iter/s)": 0.160302 }, { "epoch": 0.5179242804174277, "grad_norm": 0.6820176243782043, "learning_rate": 8.852972357940255e-05, "loss": 0.9360714912414551, "memory(GiB)": 91.52, "step": 39915, "token_acc": 0.744852363987682, "train_speed(iter/s)": 0.160295 }, { "epoch": 0.5179891588190834, "grad_norm": 0.7292795777320862, "learning_rate": 8.852630490002444e-05, "loss": 0.9095382690429688, "memory(GiB)": 91.52, "step": 39920, "token_acc": 0.7413157652262047, "train_speed(iter/s)": 0.160289 }, { "epoch": 0.5180540372207391, "grad_norm": 0.7543690800666809, "learning_rate": 8.852288577728808e-05, "loss": 0.9352819442749023, "memory(GiB)": 91.52, "step": 39925, "token_acc": 0.7575699255837824, "train_speed(iter/s)": 0.160284 }, { "epoch": 0.5181189156223948, "grad_norm": 0.7882603406906128, "learning_rate": 8.851946621123284e-05, "loss": 0.9224735260009765, "memory(GiB)": 91.52, "step": 39930, "token_acc": 0.7401253172588832, "train_speed(iter/s)": 0.160278 }, { "epoch": 0.5181837940240505, "grad_norm": 0.7509085536003113, "learning_rate": 8.851604620189807e-05, "loss": 0.9534910202026368, "memory(GiB)": 91.52, "step": 39935, "token_acc": 0.7428517858816912, "train_speed(iter/s)": 0.16027 }, { "epoch": 0.5182486724257062, "grad_norm": 0.7511221170425415, "learning_rate": 8.85126257493231e-05, "loss": 0.9664731025695801, "memory(GiB)": 91.52, "step": 39940, "token_acc": 0.7281091666407634, "train_speed(iter/s)": 0.160262 }, { "epoch": 0.5183135508273619, "grad_norm": 0.7362819314002991, "learning_rate": 8.850920485354732e-05, "loss": 0.9493961334228516, "memory(GiB)": 91.52, "step": 39945, "token_acc": 0.7417591125198099, "train_speed(iter/s)": 0.160256 }, { "epoch": 0.5183784292290176, "grad_norm": 0.7078364491462708, "learning_rate": 8.850578351461009e-05, "loss": 0.931761360168457, "memory(GiB)": 91.52, "step": 39950, "token_acc": 0.7643521049753964, "train_speed(iter/s)": 0.160249 }, { "epoch": 0.5184433076306733, "grad_norm": 0.7641295194625854, "learning_rate": 8.850236173255077e-05, "loss": 0.8940531730651855, "memory(GiB)": 91.52, "step": 39955, "token_acc": 0.7510548523206751, "train_speed(iter/s)": 0.160243 }, { "epoch": 0.518508186032329, "grad_norm": 0.7555259466171265, "learning_rate": 8.849893950740877e-05, "loss": 0.9240854263305665, "memory(GiB)": 91.52, "step": 39960, "token_acc": 0.7627308946034046, "train_speed(iter/s)": 0.160237 }, { "epoch": 0.5185730644339847, "grad_norm": 0.8107632994651794, "learning_rate": 8.849551683922344e-05, "loss": 0.9415878295898438, "memory(GiB)": 91.52, "step": 39965, "token_acc": 0.7388231968719084, "train_speed(iter/s)": 0.160231 }, { "epoch": 0.5186379428356404, "grad_norm": 0.749622106552124, "learning_rate": 8.849209372803417e-05, "loss": 0.9399875640869141, "memory(GiB)": 91.52, "step": 39970, "token_acc": 0.7506491551574906, "train_speed(iter/s)": 0.160224 }, { "epoch": 0.518702821237296, "grad_norm": 0.7162501811981201, "learning_rate": 8.848867017388038e-05, "loss": 0.909238052368164, "memory(GiB)": 91.52, "step": 39975, "token_acc": 0.7568674608244711, "train_speed(iter/s)": 0.160217 }, { "epoch": 0.5187676996389516, "grad_norm": 0.7722998261451721, "learning_rate": 8.848524617680144e-05, "loss": 0.9159708023071289, "memory(GiB)": 91.52, "step": 39980, "token_acc": 0.7641163344805416, "train_speed(iter/s)": 0.160211 }, { "epoch": 0.5188325780406073, "grad_norm": 0.8272303342819214, "learning_rate": 8.848182173683677e-05, "loss": 0.9441682815551757, "memory(GiB)": 91.52, "step": 39985, "token_acc": 0.7640821925638022, "train_speed(iter/s)": 0.160203 }, { "epoch": 0.518897456442263, "grad_norm": 0.7982874512672424, "learning_rate": 8.847839685402577e-05, "loss": 0.9062732696533203, "memory(GiB)": 91.52, "step": 39990, "token_acc": 0.7593329432416618, "train_speed(iter/s)": 0.160196 }, { "epoch": 0.5189623348439187, "grad_norm": 0.7129908204078674, "learning_rate": 8.847497152840785e-05, "loss": 0.9060268402099609, "memory(GiB)": 91.52, "step": 39995, "token_acc": 0.7704183587958486, "train_speed(iter/s)": 0.16019 }, { "epoch": 0.5190272132455744, "grad_norm": 0.7187349796295166, "learning_rate": 8.847154576002243e-05, "loss": 0.9248245239257813, "memory(GiB)": 91.52, "step": 40000, "token_acc": 0.7309541984732825, "train_speed(iter/s)": 0.160181 }, { "epoch": 0.5190272132455744, "eval_loss": 0.9174764752388, "eval_runtime": 2815.46, "eval_samples_per_second": 17.696, "eval_steps_per_second": 1.106, "eval_token_acc": 0.7534171505148569, "step": 40000 }, { "epoch": 0.5190920916472301, "grad_norm": 0.7578424215316772, "learning_rate": 8.846811954890896e-05, "loss": 0.9291080474853516, "memory(GiB)": 91.52, "step": 40005, "token_acc": 0.755041265201775, "train_speed(iter/s)": 0.158315 }, { "epoch": 0.5191569700488858, "grad_norm": 0.8323163390159607, "learning_rate": 8.846469289510682e-05, "loss": 0.9430960655212403, "memory(GiB)": 91.52, "step": 40010, "token_acc": 0.7267539982319376, "train_speed(iter/s)": 0.15831 }, { "epoch": 0.5192218484505415, "grad_norm": 0.7131609320640564, "learning_rate": 8.846126579865548e-05, "loss": 0.8708202362060546, "memory(GiB)": 91.52, "step": 40015, "token_acc": 0.7763933262335818, "train_speed(iter/s)": 0.158302 }, { "epoch": 0.5192867268521972, "grad_norm": 0.7175420522689819, "learning_rate": 8.845783825959438e-05, "loss": 0.9186573028564453, "memory(GiB)": 91.52, "step": 40020, "token_acc": 0.7622722949740288, "train_speed(iter/s)": 0.158296 }, { "epoch": 0.5193516052538529, "grad_norm": 0.7523294687271118, "learning_rate": 8.845441027796295e-05, "loss": 0.9669205665588378, "memory(GiB)": 91.52, "step": 40025, "token_acc": 0.7242261353104726, "train_speed(iter/s)": 0.158291 }, { "epoch": 0.5194164836555086, "grad_norm": 0.7375420928001404, "learning_rate": 8.845098185380062e-05, "loss": 0.9392776489257812, "memory(GiB)": 91.52, "step": 40030, "token_acc": 0.7298074000386858, "train_speed(iter/s)": 0.158284 }, { "epoch": 0.5194813620571643, "grad_norm": 0.717841386795044, "learning_rate": 8.844755298714689e-05, "loss": 0.8979387283325195, "memory(GiB)": 91.52, "step": 40035, "token_acc": 0.7469480292989187, "train_speed(iter/s)": 0.158279 }, { "epoch": 0.51954624045882, "grad_norm": 0.7827938199043274, "learning_rate": 8.844412367804118e-05, "loss": 0.9371488571166993, "memory(GiB)": 91.52, "step": 40040, "token_acc": 0.7383263657620633, "train_speed(iter/s)": 0.158273 }, { "epoch": 0.5196111188604757, "grad_norm": 0.8031564354896545, "learning_rate": 8.844069392652296e-05, "loss": 0.9137388229370117, "memory(GiB)": 91.52, "step": 40045, "token_acc": 0.7498860909449895, "train_speed(iter/s)": 0.158266 }, { "epoch": 0.5196759972621314, "grad_norm": 0.7657561898231506, "learning_rate": 8.843726373263174e-05, "loss": 0.9642008781433106, "memory(GiB)": 91.52, "step": 40050, "token_acc": 0.7629030073150908, "train_speed(iter/s)": 0.15826 }, { "epoch": 0.5197408756637871, "grad_norm": 0.8373719453811646, "learning_rate": 8.843383309640694e-05, "loss": 0.8877071380615235, "memory(GiB)": 91.52, "step": 40055, "token_acc": 0.7510556406711258, "train_speed(iter/s)": 0.158254 }, { "epoch": 0.5198057540654428, "grad_norm": 0.764820396900177, "learning_rate": 8.843040201788807e-05, "loss": 0.9332043647766113, "memory(GiB)": 91.52, "step": 40060, "token_acc": 0.7603831581957252, "train_speed(iter/s)": 0.158248 }, { "epoch": 0.5198706324670985, "grad_norm": 0.7546483278274536, "learning_rate": 8.842697049711461e-05, "loss": 0.8772529602050781, "memory(GiB)": 91.52, "step": 40065, "token_acc": 0.7746607559811851, "train_speed(iter/s)": 0.158242 }, { "epoch": 0.5199355108687542, "grad_norm": 0.7572035789489746, "learning_rate": 8.842353853412604e-05, "loss": 0.9460844039916992, "memory(GiB)": 91.52, "step": 40070, "token_acc": 0.7502690471591379, "train_speed(iter/s)": 0.158236 }, { "epoch": 0.5200003892704099, "grad_norm": 0.8319973945617676, "learning_rate": 8.842010612896186e-05, "loss": 0.888643741607666, "memory(GiB)": 91.52, "step": 40075, "token_acc": 0.7573565989362109, "train_speed(iter/s)": 0.15823 }, { "epoch": 0.5200652676720656, "grad_norm": 0.8507674336433411, "learning_rate": 8.841667328166157e-05, "loss": 0.9006302833557129, "memory(GiB)": 91.52, "step": 40080, "token_acc": 0.7674919722832516, "train_speed(iter/s)": 0.158225 }, { "epoch": 0.5201301460737213, "grad_norm": 0.8386595845222473, "learning_rate": 8.841323999226468e-05, "loss": 0.921732234954834, "memory(GiB)": 91.52, "step": 40085, "token_acc": 0.7465347255066032, "train_speed(iter/s)": 0.158219 }, { "epoch": 0.520195024475377, "grad_norm": 0.7856482267379761, "learning_rate": 8.840980626081069e-05, "loss": 0.9229936599731445, "memory(GiB)": 91.52, "step": 40090, "token_acc": 0.7421568627450981, "train_speed(iter/s)": 0.158215 }, { "epoch": 0.5202599028770327, "grad_norm": 0.8635492324829102, "learning_rate": 8.840637208733912e-05, "loss": 0.9504623413085938, "memory(GiB)": 91.52, "step": 40095, "token_acc": 0.7445212313832736, "train_speed(iter/s)": 0.158209 }, { "epoch": 0.5203247812786884, "grad_norm": 0.8630961775779724, "learning_rate": 8.84029374718895e-05, "loss": 0.8957660675048829, "memory(GiB)": 91.52, "step": 40100, "token_acc": 0.7646700198140527, "train_speed(iter/s)": 0.158204 }, { "epoch": 0.5203896596803441, "grad_norm": 0.693943977355957, "learning_rate": 8.839950241450135e-05, "loss": 0.9281504631042481, "memory(GiB)": 91.52, "step": 40105, "token_acc": 0.7569373727087576, "train_speed(iter/s)": 0.158198 }, { "epoch": 0.5204545380819998, "grad_norm": 0.7092140316963196, "learning_rate": 8.83960669152142e-05, "loss": 0.8719119071960449, "memory(GiB)": 91.52, "step": 40110, "token_acc": 0.7816886839849868, "train_speed(iter/s)": 0.158191 }, { "epoch": 0.5205194164836555, "grad_norm": 0.7086737155914307, "learning_rate": 8.839263097406758e-05, "loss": 0.9212300300598144, "memory(GiB)": 91.52, "step": 40115, "token_acc": 0.7335608395714076, "train_speed(iter/s)": 0.158186 }, { "epoch": 0.5205842948853112, "grad_norm": 0.701915979385376, "learning_rate": 8.838919459110101e-05, "loss": 0.8962020874023438, "memory(GiB)": 91.52, "step": 40120, "token_acc": 0.7575757575757576, "train_speed(iter/s)": 0.15818 }, { "epoch": 0.5206491732869669, "grad_norm": 0.7377774715423584, "learning_rate": 8.838575776635409e-05, "loss": 0.9212852478027344, "memory(GiB)": 91.52, "step": 40125, "token_acc": 0.7450834519342735, "train_speed(iter/s)": 0.158175 }, { "epoch": 0.5207140516886226, "grad_norm": 0.6710132956504822, "learning_rate": 8.838232049986635e-05, "loss": 0.9281479835510253, "memory(GiB)": 91.52, "step": 40130, "token_acc": 0.7711909711909712, "train_speed(iter/s)": 0.158167 }, { "epoch": 0.5207789300902783, "grad_norm": 0.7801940441131592, "learning_rate": 8.837888279167731e-05, "loss": 0.921206283569336, "memory(GiB)": 91.52, "step": 40135, "token_acc": 0.7518901251279342, "train_speed(iter/s)": 0.158162 }, { "epoch": 0.520843808491934, "grad_norm": 0.7781084775924683, "learning_rate": 8.837544464182657e-05, "loss": 0.9585433959960937, "memory(GiB)": 91.52, "step": 40140, "token_acc": 0.7531808961829246, "train_speed(iter/s)": 0.158156 }, { "epoch": 0.5209086868935897, "grad_norm": 0.9148202538490295, "learning_rate": 8.837200605035367e-05, "loss": 0.8903446197509766, "memory(GiB)": 91.52, "step": 40145, "token_acc": 0.7603344315285634, "train_speed(iter/s)": 0.15815 }, { "epoch": 0.5209735652952454, "grad_norm": 0.6983218789100647, "learning_rate": 8.83685670172982e-05, "loss": 0.928378963470459, "memory(GiB)": 91.52, "step": 40150, "token_acc": 0.7721114996774522, "train_speed(iter/s)": 0.158145 }, { "epoch": 0.5210384436969011, "grad_norm": 0.7610281109809875, "learning_rate": 8.836512754269972e-05, "loss": 0.9783044815063476, "memory(GiB)": 91.52, "step": 40155, "token_acc": 0.7201578009671672, "train_speed(iter/s)": 0.158139 }, { "epoch": 0.5211033220985568, "grad_norm": 0.7725858688354492, "learning_rate": 8.836168762659785e-05, "loss": 0.8748973846435547, "memory(GiB)": 91.52, "step": 40160, "token_acc": 0.761446556009943, "train_speed(iter/s)": 0.158134 }, { "epoch": 0.5211682005002125, "grad_norm": 0.7812219858169556, "learning_rate": 8.835824726903212e-05, "loss": 0.8910701751708985, "memory(GiB)": 91.52, "step": 40165, "token_acc": 0.7590905581718521, "train_speed(iter/s)": 0.158127 }, { "epoch": 0.5212330789018682, "grad_norm": 0.8434615135192871, "learning_rate": 8.835480647004216e-05, "loss": 0.9387059211730957, "memory(GiB)": 91.52, "step": 40170, "token_acc": 0.7487986628595306, "train_speed(iter/s)": 0.158123 }, { "epoch": 0.5212979573035239, "grad_norm": 0.8333449363708496, "learning_rate": 8.835136522966755e-05, "loss": 0.9648982048034668, "memory(GiB)": 91.52, "step": 40175, "token_acc": 0.7368403595475803, "train_speed(iter/s)": 0.158117 }, { "epoch": 0.5213628357051796, "grad_norm": 0.8024142384529114, "learning_rate": 8.83479235479479e-05, "loss": 0.9037823677062988, "memory(GiB)": 91.52, "step": 40180, "token_acc": 0.7586242229660496, "train_speed(iter/s)": 0.158111 }, { "epoch": 0.5214277141068353, "grad_norm": 0.7164941430091858, "learning_rate": 8.834448142492281e-05, "loss": 0.8964984893798829, "memory(GiB)": 91.52, "step": 40185, "token_acc": 0.7608233627204031, "train_speed(iter/s)": 0.158105 }, { "epoch": 0.521492592508491, "grad_norm": 0.7802535891532898, "learning_rate": 8.83410388606319e-05, "loss": 0.9708864212036132, "memory(GiB)": 91.52, "step": 40190, "token_acc": 0.7564583191329834, "train_speed(iter/s)": 0.158099 }, { "epoch": 0.5215574709101467, "grad_norm": 0.8031997680664062, "learning_rate": 8.833759585511478e-05, "loss": 0.92886381149292, "memory(GiB)": 91.52, "step": 40195, "token_acc": 0.7338920239198181, "train_speed(iter/s)": 0.158093 }, { "epoch": 0.5216223493118024, "grad_norm": 0.7368095517158508, "learning_rate": 8.83341524084111e-05, "loss": 0.924908447265625, "memory(GiB)": 91.52, "step": 40200, "token_acc": 0.756098505382173, "train_speed(iter/s)": 0.158088 }, { "epoch": 0.5216872277134581, "grad_norm": 0.775227963924408, "learning_rate": 8.833070852056042e-05, "loss": 0.8994722366333008, "memory(GiB)": 91.52, "step": 40205, "token_acc": 0.7601988925302294, "train_speed(iter/s)": 0.158082 }, { "epoch": 0.5217521061151138, "grad_norm": 0.7695657014846802, "learning_rate": 8.832726419160244e-05, "loss": 0.9550894737243653, "memory(GiB)": 91.52, "step": 40210, "token_acc": 0.7450010236811574, "train_speed(iter/s)": 0.158078 }, { "epoch": 0.5218169845167695, "grad_norm": 0.7118097543716431, "learning_rate": 8.832381942157678e-05, "loss": 0.8863751411437988, "memory(GiB)": 91.52, "step": 40215, "token_acc": 0.7434749189586606, "train_speed(iter/s)": 0.158072 }, { "epoch": 0.5218818629184251, "grad_norm": 0.7068065404891968, "learning_rate": 8.832037421052304e-05, "loss": 0.9034189224243164, "memory(GiB)": 91.52, "step": 40220, "token_acc": 0.7500187195806814, "train_speed(iter/s)": 0.158066 }, { "epoch": 0.5219467413200808, "grad_norm": 0.764308512210846, "learning_rate": 8.83169285584809e-05, "loss": 0.9280649185180664, "memory(GiB)": 91.52, "step": 40225, "token_acc": 0.7399684810280034, "train_speed(iter/s)": 0.15806 }, { "epoch": 0.5220116197217365, "grad_norm": 0.7612823247909546, "learning_rate": 8.831348246549004e-05, "loss": 0.9003124237060547, "memory(GiB)": 91.52, "step": 40230, "token_acc": 0.764328554475769, "train_speed(iter/s)": 0.158054 }, { "epoch": 0.5220764981233922, "grad_norm": 0.74409419298172, "learning_rate": 8.831003593159009e-05, "loss": 0.9134955406188965, "memory(GiB)": 91.52, "step": 40235, "token_acc": 0.7499217233389692, "train_speed(iter/s)": 0.158047 }, { "epoch": 0.5221413765250479, "grad_norm": 0.7561519145965576, "learning_rate": 8.83065889568207e-05, "loss": 0.9249560356140136, "memory(GiB)": 91.52, "step": 40240, "token_acc": 0.7534716024702817, "train_speed(iter/s)": 0.158041 }, { "epoch": 0.5222062549267036, "grad_norm": 0.7927361726760864, "learning_rate": 8.830314154122153e-05, "loss": 0.9434298515319824, "memory(GiB)": 91.52, "step": 40245, "token_acc": 0.7416498542927594, "train_speed(iter/s)": 0.158036 }, { "epoch": 0.5222711333283593, "grad_norm": 0.7789425253868103, "learning_rate": 8.82996936848323e-05, "loss": 0.9532009124755859, "memory(GiB)": 91.52, "step": 40250, "token_acc": 0.7644192362997284, "train_speed(iter/s)": 0.15803 }, { "epoch": 0.522336011730015, "grad_norm": 0.7305173277854919, "learning_rate": 8.829624538769264e-05, "loss": 0.9278102874755859, "memory(GiB)": 91.52, "step": 40255, "token_acc": 0.7646468448124452, "train_speed(iter/s)": 0.158023 }, { "epoch": 0.5224008901316707, "grad_norm": 0.8472911715507507, "learning_rate": 8.829279664984227e-05, "loss": 0.9002625465393066, "memory(GiB)": 91.52, "step": 40260, "token_acc": 0.7709256061857218, "train_speed(iter/s)": 0.158018 }, { "epoch": 0.5224657685333264, "grad_norm": 0.7450332045555115, "learning_rate": 8.828934747132085e-05, "loss": 0.9284103393554688, "memory(GiB)": 91.52, "step": 40265, "token_acc": 0.7523090708574579, "train_speed(iter/s)": 0.158012 }, { "epoch": 0.5225306469349821, "grad_norm": 0.6282486915588379, "learning_rate": 8.82858978521681e-05, "loss": 0.9426576614379882, "memory(GiB)": 91.52, "step": 40270, "token_acc": 0.7423332481472016, "train_speed(iter/s)": 0.158005 }, { "epoch": 0.5225955253366378, "grad_norm": 0.7148241400718689, "learning_rate": 8.828244779242369e-05, "loss": 0.9336827278137207, "memory(GiB)": 91.52, "step": 40275, "token_acc": 0.7297947996147863, "train_speed(iter/s)": 0.157999 }, { "epoch": 0.5226604037382935, "grad_norm": 0.7652750015258789, "learning_rate": 8.827899729212732e-05, "loss": 0.925838851928711, "memory(GiB)": 91.52, "step": 40280, "token_acc": 0.7735345076523246, "train_speed(iter/s)": 0.157993 }, { "epoch": 0.5227252821399492, "grad_norm": 0.8126930594444275, "learning_rate": 8.827554635131874e-05, "loss": 0.9281128883361817, "memory(GiB)": 91.52, "step": 40285, "token_acc": 0.7597785854813431, "train_speed(iter/s)": 0.157988 }, { "epoch": 0.5227901605416049, "grad_norm": 0.773480236530304, "learning_rate": 8.827209497003761e-05, "loss": 0.8987153053283692, "memory(GiB)": 91.52, "step": 40290, "token_acc": 0.7560625921677864, "train_speed(iter/s)": 0.157983 }, { "epoch": 0.5228550389432606, "grad_norm": 0.694680392742157, "learning_rate": 8.826864314832369e-05, "loss": 0.8666776657104492, "memory(GiB)": 91.52, "step": 40295, "token_acc": 0.7659495298017726, "train_speed(iter/s)": 0.157976 }, { "epoch": 0.5229199173449163, "grad_norm": 0.7993465065956116, "learning_rate": 8.826519088621669e-05, "loss": 0.8733068466186523, "memory(GiB)": 91.52, "step": 40300, "token_acc": 0.7722258496855562, "train_speed(iter/s)": 0.15797 }, { "epoch": 0.522984795746572, "grad_norm": 0.7436180114746094, "learning_rate": 8.826173818375632e-05, "loss": 0.9209781646728515, "memory(GiB)": 91.52, "step": 40305, "token_acc": 0.7663764577376554, "train_speed(iter/s)": 0.157965 }, { "epoch": 0.5230496741482277, "grad_norm": 0.7427520751953125, "learning_rate": 8.825828504098235e-05, "loss": 0.8988395690917969, "memory(GiB)": 91.52, "step": 40310, "token_acc": 0.7606863953966296, "train_speed(iter/s)": 0.157958 }, { "epoch": 0.5231145525498834, "grad_norm": 0.7327860593795776, "learning_rate": 8.82548314579345e-05, "loss": 0.8810090065002442, "memory(GiB)": 91.52, "step": 40315, "token_acc": 0.7635500189270106, "train_speed(iter/s)": 0.157952 }, { "epoch": 0.5231794309515391, "grad_norm": 0.8976019620895386, "learning_rate": 8.825137743465248e-05, "loss": 0.9928110122680665, "memory(GiB)": 91.52, "step": 40320, "token_acc": 0.7424529871858878, "train_speed(iter/s)": 0.157946 }, { "epoch": 0.5232443093531948, "grad_norm": 0.7197787761688232, "learning_rate": 8.82479229711761e-05, "loss": 0.9315336227416993, "memory(GiB)": 91.52, "step": 40325, "token_acc": 0.7612018243219719, "train_speed(iter/s)": 0.15794 }, { "epoch": 0.5233091877548505, "grad_norm": 0.7541587948799133, "learning_rate": 8.824446806754507e-05, "loss": 0.9182474136352539, "memory(GiB)": 91.52, "step": 40330, "token_acc": 0.7467079293902169, "train_speed(iter/s)": 0.157934 }, { "epoch": 0.5233740661565062, "grad_norm": 0.791912317276001, "learning_rate": 8.824101272379917e-05, "loss": 0.9193012237548828, "memory(GiB)": 91.52, "step": 40335, "token_acc": 0.751002635499026, "train_speed(iter/s)": 0.157929 }, { "epoch": 0.5234389445581619, "grad_norm": 0.7664456367492676, "learning_rate": 8.823755693997815e-05, "loss": 0.9110596656799317, "memory(GiB)": 91.52, "step": 40340, "token_acc": 0.7595330478993846, "train_speed(iter/s)": 0.157923 }, { "epoch": 0.5235038229598176, "grad_norm": 0.8090678453445435, "learning_rate": 8.82341007161218e-05, "loss": 0.9350669860839844, "memory(GiB)": 91.52, "step": 40345, "token_acc": 0.7508454639023673, "train_speed(iter/s)": 0.157916 }, { "epoch": 0.5235687013614733, "grad_norm": 0.7934585213661194, "learning_rate": 8.823064405226986e-05, "loss": 0.976287841796875, "memory(GiB)": 91.52, "step": 40350, "token_acc": 0.737495881383855, "train_speed(iter/s)": 0.157911 }, { "epoch": 0.523633579763129, "grad_norm": 0.7256094217300415, "learning_rate": 8.822718694846214e-05, "loss": 0.9077073097229004, "memory(GiB)": 91.52, "step": 40355, "token_acc": 0.7386065604926071, "train_speed(iter/s)": 0.157905 }, { "epoch": 0.5236984581647847, "grad_norm": 0.8631538152694702, "learning_rate": 8.82237294047384e-05, "loss": 0.9531086921691895, "memory(GiB)": 91.52, "step": 40360, "token_acc": 0.755862956409844, "train_speed(iter/s)": 0.1579 }, { "epoch": 0.5237633365664404, "grad_norm": 0.7904276847839355, "learning_rate": 8.822027142113847e-05, "loss": 0.9079973220825195, "memory(GiB)": 91.52, "step": 40365, "token_acc": 0.7454739084132055, "train_speed(iter/s)": 0.157894 }, { "epoch": 0.5238282149680961, "grad_norm": 0.800445020198822, "learning_rate": 8.821681299770209e-05, "loss": 0.9419103622436523, "memory(GiB)": 91.52, "step": 40370, "token_acc": 0.7556290090325959, "train_speed(iter/s)": 0.157889 }, { "epoch": 0.5238930933697518, "grad_norm": 0.7004664540290833, "learning_rate": 8.82133541344691e-05, "loss": 0.8879864692687989, "memory(GiB)": 91.52, "step": 40375, "token_acc": 0.7630873149895698, "train_speed(iter/s)": 0.157885 }, { "epoch": 0.5239579717714075, "grad_norm": 0.7886736989021301, "learning_rate": 8.820989483147928e-05, "loss": 0.9140486717224121, "memory(GiB)": 91.52, "step": 40380, "token_acc": 0.7637834944425202, "train_speed(iter/s)": 0.157878 }, { "epoch": 0.5240228501730632, "grad_norm": 0.7593263983726501, "learning_rate": 8.820643508877247e-05, "loss": 0.9750194549560547, "memory(GiB)": 91.52, "step": 40385, "token_acc": 0.7229714372300932, "train_speed(iter/s)": 0.157872 }, { "epoch": 0.5240877285747189, "grad_norm": 0.7014757394790649, "learning_rate": 8.820297490638844e-05, "loss": 0.9406533241271973, "memory(GiB)": 91.52, "step": 40390, "token_acc": 0.7587853974752644, "train_speed(iter/s)": 0.157865 }, { "epoch": 0.5241526069763746, "grad_norm": 0.8141337037086487, "learning_rate": 8.819951428436703e-05, "loss": 0.9307414054870605, "memory(GiB)": 91.52, "step": 40395, "token_acc": 0.7476692266476156, "train_speed(iter/s)": 0.15786 }, { "epoch": 0.5242174853780303, "grad_norm": 0.7663941979408264, "learning_rate": 8.819605322274809e-05, "loss": 0.8977741241455078, "memory(GiB)": 91.52, "step": 40400, "token_acc": 0.7566836575470717, "train_speed(iter/s)": 0.157854 }, { "epoch": 0.524282363779686, "grad_norm": 0.7536264061927795, "learning_rate": 8.819259172157141e-05, "loss": 0.9379358291625977, "memory(GiB)": 91.52, "step": 40405, "token_acc": 0.740669387755102, "train_speed(iter/s)": 0.157848 }, { "epoch": 0.5243472421813417, "grad_norm": 0.762088418006897, "learning_rate": 8.818912978087686e-05, "loss": 0.925634765625, "memory(GiB)": 91.52, "step": 40410, "token_acc": 0.7458040996010455, "train_speed(iter/s)": 0.157842 }, { "epoch": 0.5244121205829974, "grad_norm": 0.7698830366134644, "learning_rate": 8.818566740070425e-05, "loss": 0.9743701934814453, "memory(GiB)": 91.52, "step": 40415, "token_acc": 0.7575415446494725, "train_speed(iter/s)": 0.157837 }, { "epoch": 0.524476998984653, "grad_norm": 0.7134950160980225, "learning_rate": 8.818220458109344e-05, "loss": 0.9271694183349609, "memory(GiB)": 91.52, "step": 40420, "token_acc": 0.7335006273525722, "train_speed(iter/s)": 0.157832 }, { "epoch": 0.5245418773863088, "grad_norm": 0.7325361967086792, "learning_rate": 8.817874132208428e-05, "loss": 0.913667869567871, "memory(GiB)": 91.52, "step": 40425, "token_acc": 0.7467448563761058, "train_speed(iter/s)": 0.157826 }, { "epoch": 0.5246067557879645, "grad_norm": 0.7068336009979248, "learning_rate": 8.817527762371662e-05, "loss": 0.9146163940429688, "memory(GiB)": 91.52, "step": 40430, "token_acc": 0.754981388219838, "train_speed(iter/s)": 0.15782 }, { "epoch": 0.5246716341896202, "grad_norm": 0.7882722616195679, "learning_rate": 8.817181348603032e-05, "loss": 0.889897346496582, "memory(GiB)": 91.52, "step": 40435, "token_acc": 0.7624772943516996, "train_speed(iter/s)": 0.157814 }, { "epoch": 0.5247365125912758, "grad_norm": 0.7699865102767944, "learning_rate": 8.816834890906524e-05, "loss": 0.869017219543457, "memory(GiB)": 91.52, "step": 40440, "token_acc": 0.7691767484917164, "train_speed(iter/s)": 0.157807 }, { "epoch": 0.5248013909929315, "grad_norm": 0.8399893641471863, "learning_rate": 8.816488389286127e-05, "loss": 0.9414752006530762, "memory(GiB)": 91.52, "step": 40445, "token_acc": 0.7264981142617684, "train_speed(iter/s)": 0.157801 }, { "epoch": 0.5248662693945872, "grad_norm": 0.685387134552002, "learning_rate": 8.816141843745827e-05, "loss": 0.9144228935241699, "memory(GiB)": 91.52, "step": 40450, "token_acc": 0.7640900665888652, "train_speed(iter/s)": 0.157796 }, { "epoch": 0.5249311477962428, "grad_norm": 0.8284204602241516, "learning_rate": 8.815795254289611e-05, "loss": 0.9293749809265137, "memory(GiB)": 91.52, "step": 40455, "token_acc": 0.7677316009611106, "train_speed(iter/s)": 0.157791 }, { "epoch": 0.5249960261978985, "grad_norm": 0.7008129954338074, "learning_rate": 8.815448620921472e-05, "loss": 0.925262451171875, "memory(GiB)": 91.52, "step": 40460, "token_acc": 0.7779035624579899, "train_speed(iter/s)": 0.157785 }, { "epoch": 0.5250609045995542, "grad_norm": 0.7742753624916077, "learning_rate": 8.815101943645391e-05, "loss": 0.9203228950500488, "memory(GiB)": 91.52, "step": 40465, "token_acc": 0.7548354430379747, "train_speed(iter/s)": 0.157778 }, { "epoch": 0.5251257830012099, "grad_norm": 0.7737635970115662, "learning_rate": 8.814755222465367e-05, "loss": 0.9264424324035645, "memory(GiB)": 91.52, "step": 40470, "token_acc": 0.7392479994153542, "train_speed(iter/s)": 0.157773 }, { "epoch": 0.5251906614028656, "grad_norm": 0.7291224598884583, "learning_rate": 8.814408457385384e-05, "loss": 0.9207305908203125, "memory(GiB)": 91.52, "step": 40475, "token_acc": 0.7423672103722292, "train_speed(iter/s)": 0.157767 }, { "epoch": 0.5252555398045213, "grad_norm": 0.7762709259986877, "learning_rate": 8.814061648409432e-05, "loss": 0.9468317985534668, "memory(GiB)": 91.52, "step": 40480, "token_acc": 0.7466687630203215, "train_speed(iter/s)": 0.157761 }, { "epoch": 0.525320418206177, "grad_norm": 0.7271079421043396, "learning_rate": 8.813714795541505e-05, "loss": 0.9128026962280273, "memory(GiB)": 91.52, "step": 40485, "token_acc": 0.7553523432914208, "train_speed(iter/s)": 0.157754 }, { "epoch": 0.5253852966078327, "grad_norm": 0.8875125646591187, "learning_rate": 8.813367898785595e-05, "loss": 0.9467693328857422, "memory(GiB)": 91.52, "step": 40490, "token_acc": 0.764312422102202, "train_speed(iter/s)": 0.15775 }, { "epoch": 0.5254501750094884, "grad_norm": 0.7044214606285095, "learning_rate": 8.813020958145691e-05, "loss": 0.9435364723205566, "memory(GiB)": 91.52, "step": 40495, "token_acc": 0.7591673032849503, "train_speed(iter/s)": 0.157743 }, { "epoch": 0.5255150534111441, "grad_norm": 0.7129897475242615, "learning_rate": 8.812673973625789e-05, "loss": 0.9276792526245117, "memory(GiB)": 91.52, "step": 40500, "token_acc": 0.7496796788246348, "train_speed(iter/s)": 0.157738 }, { "epoch": 0.5255799318127998, "grad_norm": 0.7624219059944153, "learning_rate": 8.812326945229878e-05, "loss": 0.978400993347168, "memory(GiB)": 91.52, "step": 40505, "token_acc": 0.731687112934401, "train_speed(iter/s)": 0.157732 }, { "epoch": 0.5256448102144555, "grad_norm": 0.7574115991592407, "learning_rate": 8.811979872961955e-05, "loss": 0.8711605072021484, "memory(GiB)": 91.52, "step": 40510, "token_acc": 0.7523769360527527, "train_speed(iter/s)": 0.157727 }, { "epoch": 0.5257096886161112, "grad_norm": 0.7747073769569397, "learning_rate": 8.811632756826014e-05, "loss": 0.9626970291137695, "memory(GiB)": 91.52, "step": 40515, "token_acc": 0.7420456525527095, "train_speed(iter/s)": 0.157722 }, { "epoch": 0.5257745670177669, "grad_norm": 0.7460055351257324, "learning_rate": 8.811285596826046e-05, "loss": 0.8603067398071289, "memory(GiB)": 91.52, "step": 40520, "token_acc": 0.7787018686296716, "train_speed(iter/s)": 0.157715 }, { "epoch": 0.5258394454194226, "grad_norm": 0.7122834920883179, "learning_rate": 8.810938392966052e-05, "loss": 0.9124021530151367, "memory(GiB)": 91.52, "step": 40525, "token_acc": 0.7558031281032771, "train_speed(iter/s)": 0.157709 }, { "epoch": 0.5259043238210783, "grad_norm": 0.8072582483291626, "learning_rate": 8.810591145250022e-05, "loss": 0.9257807731628418, "memory(GiB)": 91.52, "step": 40530, "token_acc": 0.745507713171724, "train_speed(iter/s)": 0.157704 }, { "epoch": 0.525969202222734, "grad_norm": 0.7735766172409058, "learning_rate": 8.810243853681955e-05, "loss": 0.9611080169677735, "memory(GiB)": 91.52, "step": 40535, "token_acc": 0.7412547655675206, "train_speed(iter/s)": 0.157697 }, { "epoch": 0.5260340806243897, "grad_norm": 0.8529530763626099, "learning_rate": 8.809896518265847e-05, "loss": 0.9725477218627929, "memory(GiB)": 91.52, "step": 40540, "token_acc": 0.7410570043849527, "train_speed(iter/s)": 0.157691 }, { "epoch": 0.5260989590260454, "grad_norm": 0.7634020447731018, "learning_rate": 8.809549139005694e-05, "loss": 0.9052064895629883, "memory(GiB)": 91.52, "step": 40545, "token_acc": 0.7512583892617449, "train_speed(iter/s)": 0.157686 }, { "epoch": 0.5261638374277011, "grad_norm": 0.7648077607154846, "learning_rate": 8.809201715905495e-05, "loss": 0.8790936470031738, "memory(GiB)": 91.52, "step": 40550, "token_acc": 0.7581542489364023, "train_speed(iter/s)": 0.157679 }, { "epoch": 0.5262287158293568, "grad_norm": 0.7435277104377747, "learning_rate": 8.808854248969249e-05, "loss": 0.8780025482177735, "memory(GiB)": 91.52, "step": 40555, "token_acc": 0.7493744238114053, "train_speed(iter/s)": 0.157674 }, { "epoch": 0.5262935942310125, "grad_norm": 0.6648904085159302, "learning_rate": 8.808506738200953e-05, "loss": 0.8444477081298828, "memory(GiB)": 91.52, "step": 40560, "token_acc": 0.7479872554729522, "train_speed(iter/s)": 0.157667 }, { "epoch": 0.5263584726326682, "grad_norm": 0.9505670070648193, "learning_rate": 8.808159183604606e-05, "loss": 0.9143074989318848, "memory(GiB)": 91.52, "step": 40565, "token_acc": 0.7623279316954173, "train_speed(iter/s)": 0.157662 }, { "epoch": 0.5264233510343239, "grad_norm": 0.7186955809593201, "learning_rate": 8.80781158518421e-05, "loss": 0.8955949783325196, "memory(GiB)": 91.52, "step": 40570, "token_acc": 0.7588521334155195, "train_speed(iter/s)": 0.157655 }, { "epoch": 0.5264882294359796, "grad_norm": 0.848686933517456, "learning_rate": 8.80746394294376e-05, "loss": 0.9678859710693359, "memory(GiB)": 91.52, "step": 40575, "token_acc": 0.7183314881611371, "train_speed(iter/s)": 0.157651 }, { "epoch": 0.5265531078376353, "grad_norm": 0.8422302007675171, "learning_rate": 8.807116256887262e-05, "loss": 0.9417578697204589, "memory(GiB)": 91.52, "step": 40580, "token_acc": 0.7461289536866279, "train_speed(iter/s)": 0.157646 }, { "epoch": 0.526617986239291, "grad_norm": 0.7520542740821838, "learning_rate": 8.806768527018716e-05, "loss": 0.9053444862365723, "memory(GiB)": 91.52, "step": 40585, "token_acc": 0.754418058815408, "train_speed(iter/s)": 0.157639 }, { "epoch": 0.5266828646409467, "grad_norm": 0.7961593866348267, "learning_rate": 8.806420753342122e-05, "loss": 0.9099843978881836, "memory(GiB)": 91.52, "step": 40590, "token_acc": 0.7519764216366158, "train_speed(iter/s)": 0.157634 }, { "epoch": 0.5267477430426024, "grad_norm": 0.7869747877120972, "learning_rate": 8.806072935861483e-05, "loss": 0.9785037040710449, "memory(GiB)": 91.52, "step": 40595, "token_acc": 0.7321285676180315, "train_speed(iter/s)": 0.157629 }, { "epoch": 0.5268126214442581, "grad_norm": 0.7642644047737122, "learning_rate": 8.8057250745808e-05, "loss": 0.9359683990478516, "memory(GiB)": 91.52, "step": 40600, "token_acc": 0.7535952800426107, "train_speed(iter/s)": 0.157623 }, { "epoch": 0.5268774998459138, "grad_norm": 0.7906391620635986, "learning_rate": 8.80537716950408e-05, "loss": 0.8824751853942872, "memory(GiB)": 91.52, "step": 40605, "token_acc": 0.7730470645317807, "train_speed(iter/s)": 0.157618 }, { "epoch": 0.5269423782475695, "grad_norm": 0.8328759074211121, "learning_rate": 8.805029220635323e-05, "loss": 0.9333118438720703, "memory(GiB)": 91.52, "step": 40610, "token_acc": 0.7395446966345663, "train_speed(iter/s)": 0.157612 }, { "epoch": 0.5270072566492252, "grad_norm": 0.8332505822181702, "learning_rate": 8.804681227978535e-05, "loss": 0.9241398811340332, "memory(GiB)": 91.52, "step": 40615, "token_acc": 0.7591624885248217, "train_speed(iter/s)": 0.157605 }, { "epoch": 0.5270721350508809, "grad_norm": 0.7559762597084045, "learning_rate": 8.804333191537722e-05, "loss": 0.9405086517333985, "memory(GiB)": 91.52, "step": 40620, "token_acc": 0.7652970997406272, "train_speed(iter/s)": 0.157599 }, { "epoch": 0.5271370134525366, "grad_norm": 0.830764651298523, "learning_rate": 8.803985111316886e-05, "loss": 0.9578826904296875, "memory(GiB)": 91.52, "step": 40625, "token_acc": 0.7737345059206426, "train_speed(iter/s)": 0.157594 }, { "epoch": 0.5272018918541923, "grad_norm": 0.8343607187271118, "learning_rate": 8.803636987320034e-05, "loss": 0.9017136573791504, "memory(GiB)": 91.52, "step": 40630, "token_acc": 0.7547019263649835, "train_speed(iter/s)": 0.157588 }, { "epoch": 0.527266770255848, "grad_norm": 0.7411007285118103, "learning_rate": 8.803288819551171e-05, "loss": 0.8900579452514649, "memory(GiB)": 91.52, "step": 40635, "token_acc": 0.7646507266760432, "train_speed(iter/s)": 0.157582 }, { "epoch": 0.5273316486575037, "grad_norm": 0.797985851764679, "learning_rate": 8.802940608014307e-05, "loss": 0.9583847999572754, "memory(GiB)": 91.52, "step": 40640, "token_acc": 0.7332474714951324, "train_speed(iter/s)": 0.157576 }, { "epoch": 0.5273965270591594, "grad_norm": 0.6695939898490906, "learning_rate": 8.802592352713448e-05, "loss": 0.8995628356933594, "memory(GiB)": 91.52, "step": 40645, "token_acc": 0.7586012606965524, "train_speed(iter/s)": 0.15757 }, { "epoch": 0.5274614054608151, "grad_norm": 0.7329315543174744, "learning_rate": 8.802244053652598e-05, "loss": 0.8711420059204101, "memory(GiB)": 91.52, "step": 40650, "token_acc": 0.7791466772645982, "train_speed(iter/s)": 0.157563 }, { "epoch": 0.5275262838624708, "grad_norm": 0.7841063141822815, "learning_rate": 8.801895710835771e-05, "loss": 0.866611099243164, "memory(GiB)": 91.52, "step": 40655, "token_acc": 0.7762564610139668, "train_speed(iter/s)": 0.157555 }, { "epoch": 0.5275911622641265, "grad_norm": 0.7957519292831421, "learning_rate": 8.801547324266972e-05, "loss": 0.9018341064453125, "memory(GiB)": 91.52, "step": 40660, "token_acc": 0.7547383710631664, "train_speed(iter/s)": 0.15755 }, { "epoch": 0.5276560406657822, "grad_norm": 0.9849435091018677, "learning_rate": 8.801198893950212e-05, "loss": 0.9182909965515137, "memory(GiB)": 91.52, "step": 40665, "token_acc": 0.7255485432131061, "train_speed(iter/s)": 0.157544 }, { "epoch": 0.5277209190674379, "grad_norm": 0.7632284164428711, "learning_rate": 8.800850419889497e-05, "loss": 0.9249715805053711, "memory(GiB)": 91.52, "step": 40670, "token_acc": 0.7443113554447155, "train_speed(iter/s)": 0.157539 }, { "epoch": 0.5277857974690936, "grad_norm": 0.7423215508460999, "learning_rate": 8.800501902088842e-05, "loss": 0.924460506439209, "memory(GiB)": 91.52, "step": 40675, "token_acc": 0.753751636620002, "train_speed(iter/s)": 0.157534 }, { "epoch": 0.5278506758707493, "grad_norm": 0.7418259382247925, "learning_rate": 8.800153340552255e-05, "loss": 0.9185827255249024, "memory(GiB)": 91.52, "step": 40680, "token_acc": 0.7617151495041109, "train_speed(iter/s)": 0.157528 }, { "epoch": 0.527915554272405, "grad_norm": 0.8883485198020935, "learning_rate": 8.799804735283748e-05, "loss": 0.9198625564575196, "memory(GiB)": 91.52, "step": 40685, "token_acc": 0.7646342812880381, "train_speed(iter/s)": 0.157523 }, { "epoch": 0.5279804326740607, "grad_norm": 0.8315388560295105, "learning_rate": 8.799456086287333e-05, "loss": 0.9394715309143067, "memory(GiB)": 91.52, "step": 40690, "token_acc": 0.7416683498283175, "train_speed(iter/s)": 0.157518 }, { "epoch": 0.5280453110757163, "grad_norm": 0.8872925043106079, "learning_rate": 8.799107393567021e-05, "loss": 0.9471025466918945, "memory(GiB)": 91.52, "step": 40695, "token_acc": 0.7244762221483206, "train_speed(iter/s)": 0.157513 }, { "epoch": 0.528110189477372, "grad_norm": 0.7351139783859253, "learning_rate": 8.798758657126827e-05, "loss": 0.9190670013427734, "memory(GiB)": 91.52, "step": 40700, "token_acc": 0.7371893629847946, "train_speed(iter/s)": 0.157507 }, { "epoch": 0.5281750678790277, "grad_norm": 0.732839822769165, "learning_rate": 8.798409876970761e-05, "loss": 0.8913147926330567, "memory(GiB)": 91.52, "step": 40705, "token_acc": 0.7613595056343148, "train_speed(iter/s)": 0.1575 }, { "epoch": 0.5282399462806834, "grad_norm": 0.7068930268287659, "learning_rate": 8.798061053102841e-05, "loss": 0.8715251922607422, "memory(GiB)": 91.52, "step": 40710, "token_acc": 0.7619651558640945, "train_speed(iter/s)": 0.157494 }, { "epoch": 0.5283048246823391, "grad_norm": 0.7427049279212952, "learning_rate": 8.797712185527078e-05, "loss": 0.9424848556518555, "memory(GiB)": 91.52, "step": 40715, "token_acc": 0.7331870616003457, "train_speed(iter/s)": 0.157489 }, { "epoch": 0.5283697030839948, "grad_norm": 0.8202970027923584, "learning_rate": 8.797363274247486e-05, "loss": 0.9306697845458984, "memory(GiB)": 91.52, "step": 40720, "token_acc": 0.7265139329652655, "train_speed(iter/s)": 0.157484 }, { "epoch": 0.5284345814856505, "grad_norm": 0.8509706854820251, "learning_rate": 8.797014319268084e-05, "loss": 0.9410987854003906, "memory(GiB)": 91.52, "step": 40725, "token_acc": 0.729228672858321, "train_speed(iter/s)": 0.157479 }, { "epoch": 0.5284994598873062, "grad_norm": 0.7292749881744385, "learning_rate": 8.796665320592884e-05, "loss": 0.9585153579711914, "memory(GiB)": 91.52, "step": 40730, "token_acc": 0.745125854794122, "train_speed(iter/s)": 0.157472 }, { "epoch": 0.5285643382889619, "grad_norm": 0.7274779081344604, "learning_rate": 8.796316278225905e-05, "loss": 0.9659220695495605, "memory(GiB)": 91.52, "step": 40735, "token_acc": 0.767775208963187, "train_speed(iter/s)": 0.157466 }, { "epoch": 0.5286292166906176, "grad_norm": 0.685886800289154, "learning_rate": 8.795967192171162e-05, "loss": 0.9026547431945801, "memory(GiB)": 91.52, "step": 40740, "token_acc": 0.7683121548799149, "train_speed(iter/s)": 0.15746 }, { "epoch": 0.5286940950922733, "grad_norm": 0.7317732572555542, "learning_rate": 8.795618062432673e-05, "loss": 0.8950634002685547, "memory(GiB)": 91.52, "step": 40745, "token_acc": 0.7482886216466235, "train_speed(iter/s)": 0.157454 }, { "epoch": 0.528758973493929, "grad_norm": 0.7244317531585693, "learning_rate": 8.795268889014456e-05, "loss": 0.8911548614501953, "memory(GiB)": 91.52, "step": 40750, "token_acc": 0.7429428561101673, "train_speed(iter/s)": 0.15745 }, { "epoch": 0.5288238518955847, "grad_norm": 0.7752748727798462, "learning_rate": 8.794919671920528e-05, "loss": 0.9062799453735352, "memory(GiB)": 91.52, "step": 40755, "token_acc": 0.7466016751338734, "train_speed(iter/s)": 0.157445 }, { "epoch": 0.5288887302972404, "grad_norm": 0.8787034153938293, "learning_rate": 8.79457041115491e-05, "loss": 0.9003854751586914, "memory(GiB)": 91.52, "step": 40760, "token_acc": 0.7597753200383175, "train_speed(iter/s)": 0.157439 }, { "epoch": 0.5289536086988961, "grad_norm": 0.9405005574226379, "learning_rate": 8.794221106721619e-05, "loss": 0.9095590591430665, "memory(GiB)": 91.52, "step": 40765, "token_acc": 0.742918307274743, "train_speed(iter/s)": 0.157433 }, { "epoch": 0.5290184871005518, "grad_norm": 0.8691454529762268, "learning_rate": 8.793871758624674e-05, "loss": 0.909344482421875, "memory(GiB)": 91.52, "step": 40770, "token_acc": 0.7462884021755108, "train_speed(iter/s)": 0.157427 }, { "epoch": 0.5290833655022075, "grad_norm": 0.6981282234191895, "learning_rate": 8.793522366868099e-05, "loss": 0.9184775352478027, "memory(GiB)": 91.52, "step": 40775, "token_acc": 0.7559560379548093, "train_speed(iter/s)": 0.157422 }, { "epoch": 0.5291482439038632, "grad_norm": 0.8450264930725098, "learning_rate": 8.793172931455913e-05, "loss": 0.9145654678344727, "memory(GiB)": 91.52, "step": 40780, "token_acc": 0.7619293458219633, "train_speed(iter/s)": 0.157416 }, { "epoch": 0.5292131223055189, "grad_norm": 0.800225555896759, "learning_rate": 8.792823452392136e-05, "loss": 0.9614890098571778, "memory(GiB)": 91.52, "step": 40785, "token_acc": 0.7401352504760029, "train_speed(iter/s)": 0.157411 }, { "epoch": 0.5292780007071746, "grad_norm": 0.802615761756897, "learning_rate": 8.792473929680792e-05, "loss": 0.926169490814209, "memory(GiB)": 91.52, "step": 40790, "token_acc": 0.7760897231944003, "train_speed(iter/s)": 0.157406 }, { "epoch": 0.5293428791088303, "grad_norm": 0.7903833389282227, "learning_rate": 8.792124363325901e-05, "loss": 0.9035185813903809, "memory(GiB)": 91.52, "step": 40795, "token_acc": 0.7622013477639371, "train_speed(iter/s)": 0.157401 }, { "epoch": 0.529407757510486, "grad_norm": 0.7474675178527832, "learning_rate": 8.791774753331488e-05, "loss": 0.8950830459594726, "memory(GiB)": 91.52, "step": 40800, "token_acc": 0.7463244113917543, "train_speed(iter/s)": 0.157395 }, { "epoch": 0.5294726359121417, "grad_norm": 0.7873432040214539, "learning_rate": 8.791425099701573e-05, "loss": 0.9170854568481446, "memory(GiB)": 91.52, "step": 40805, "token_acc": 0.7141624730409777, "train_speed(iter/s)": 0.157388 }, { "epoch": 0.5295375143137974, "grad_norm": 0.8656072020530701, "learning_rate": 8.791075402440185e-05, "loss": 0.9015192031860352, "memory(GiB)": 91.52, "step": 40810, "token_acc": 0.7444883685570929, "train_speed(iter/s)": 0.157383 }, { "epoch": 0.5296023927154531, "grad_norm": 0.884331226348877, "learning_rate": 8.790725661551344e-05, "loss": 0.9435998916625976, "memory(GiB)": 91.52, "step": 40815, "token_acc": 0.7580813347236705, "train_speed(iter/s)": 0.157376 }, { "epoch": 0.5296672711171088, "grad_norm": 0.7843711376190186, "learning_rate": 8.790375877039075e-05, "loss": 0.9122481346130371, "memory(GiB)": 91.52, "step": 40820, "token_acc": 0.7499208359721342, "train_speed(iter/s)": 0.157371 }, { "epoch": 0.5297321495187645, "grad_norm": 0.7539430856704712, "learning_rate": 8.790026048907405e-05, "loss": 0.9201077461242676, "memory(GiB)": 91.52, "step": 40825, "token_acc": 0.7393073997464469, "train_speed(iter/s)": 0.157365 }, { "epoch": 0.5297970279204202, "grad_norm": 0.6971803307533264, "learning_rate": 8.78967617716036e-05, "loss": 0.9531981468200683, "memory(GiB)": 91.52, "step": 40830, "token_acc": 0.7505864472858228, "train_speed(iter/s)": 0.157361 }, { "epoch": 0.5298619063220759, "grad_norm": 0.7708835005760193, "learning_rate": 8.789326261801965e-05, "loss": 0.915545654296875, "memory(GiB)": 91.52, "step": 40835, "token_acc": 0.7629485179407176, "train_speed(iter/s)": 0.157355 }, { "epoch": 0.5299267847237316, "grad_norm": 0.793860673904419, "learning_rate": 8.788976302836247e-05, "loss": 0.9211606025695801, "memory(GiB)": 91.52, "step": 40840, "token_acc": 0.7324061662198391, "train_speed(iter/s)": 0.157349 }, { "epoch": 0.5299916631253873, "grad_norm": 0.7997251749038696, "learning_rate": 8.788626300267233e-05, "loss": 0.9224899291992188, "memory(GiB)": 91.52, "step": 40845, "token_acc": 0.7407116610659213, "train_speed(iter/s)": 0.157345 }, { "epoch": 0.530056541527043, "grad_norm": 0.7439635396003723, "learning_rate": 8.788276254098952e-05, "loss": 0.8870025634765625, "memory(GiB)": 91.52, "step": 40850, "token_acc": 0.765366550949561, "train_speed(iter/s)": 0.157339 }, { "epoch": 0.5301214199286987, "grad_norm": 0.8466097712516785, "learning_rate": 8.787926164335431e-05, "loss": 0.9170675277709961, "memory(GiB)": 91.52, "step": 40855, "token_acc": 0.7557058036173937, "train_speed(iter/s)": 0.157333 }, { "epoch": 0.5301862983303544, "grad_norm": 0.7902077436447144, "learning_rate": 8.7875760309807e-05, "loss": 0.9798743247985839, "memory(GiB)": 91.52, "step": 40860, "token_acc": 0.7436615431441181, "train_speed(iter/s)": 0.157327 }, { "epoch": 0.53025117673201, "grad_norm": 0.8101859092712402, "learning_rate": 8.787225854038787e-05, "loss": 0.9774515151977539, "memory(GiB)": 91.52, "step": 40865, "token_acc": 0.7488992379339543, "train_speed(iter/s)": 0.157322 }, { "epoch": 0.5303160551336658, "grad_norm": 0.7986038327217102, "learning_rate": 8.786875633513725e-05, "loss": 0.8916953086853028, "memory(GiB)": 91.52, "step": 40870, "token_acc": 0.7555043833070739, "train_speed(iter/s)": 0.157318 }, { "epoch": 0.5303809335353215, "grad_norm": 0.7202747464179993, "learning_rate": 8.78652536940954e-05, "loss": 0.9365358352661133, "memory(GiB)": 91.52, "step": 40875, "token_acc": 0.733406121757464, "train_speed(iter/s)": 0.157312 }, { "epoch": 0.5304458119369772, "grad_norm": 0.8352583050727844, "learning_rate": 8.786175061730267e-05, "loss": 0.955109977722168, "memory(GiB)": 91.52, "step": 40880, "token_acc": 0.7610430204820268, "train_speed(iter/s)": 0.157308 }, { "epoch": 0.5305106903386329, "grad_norm": 0.6954050660133362, "learning_rate": 8.785824710479932e-05, "loss": 0.9091192245483398, "memory(GiB)": 91.52, "step": 40885, "token_acc": 0.7531485338887999, "train_speed(iter/s)": 0.157302 }, { "epoch": 0.5305755687402886, "grad_norm": 0.7820427417755127, "learning_rate": 8.78547431566257e-05, "loss": 0.9312276840209961, "memory(GiB)": 91.52, "step": 40890, "token_acc": 0.7518478298945599, "train_speed(iter/s)": 0.157297 }, { "epoch": 0.5306404471419442, "grad_norm": 0.7562230825424194, "learning_rate": 8.785123877282215e-05, "loss": 0.959017562866211, "memory(GiB)": 91.52, "step": 40895, "token_acc": 0.7385321100917431, "train_speed(iter/s)": 0.157292 }, { "epoch": 0.5307053255436, "grad_norm": 0.8301969170570374, "learning_rate": 8.784773395342899e-05, "loss": 0.9250818252563476, "memory(GiB)": 91.52, "step": 40900, "token_acc": 0.7400232210980262, "train_speed(iter/s)": 0.157287 }, { "epoch": 0.5307702039452556, "grad_norm": 0.7724499702453613, "learning_rate": 8.784422869848651e-05, "loss": 0.9464103698730468, "memory(GiB)": 91.52, "step": 40905, "token_acc": 0.7590950779578111, "train_speed(iter/s)": 0.157281 }, { "epoch": 0.5308350823469113, "grad_norm": 0.9029170870780945, "learning_rate": 8.78407230080351e-05, "loss": 0.9016447067260742, "memory(GiB)": 91.52, "step": 40910, "token_acc": 0.746652674241876, "train_speed(iter/s)": 0.157275 }, { "epoch": 0.530899960748567, "grad_norm": 0.7738682627677917, "learning_rate": 8.783721688211508e-05, "loss": 0.908267879486084, "memory(GiB)": 91.52, "step": 40915, "token_acc": 0.7570221584568787, "train_speed(iter/s)": 0.157269 }, { "epoch": 0.5309648391502227, "grad_norm": 0.7459231019020081, "learning_rate": 8.783371032076682e-05, "loss": 0.8863401412963867, "memory(GiB)": 91.52, "step": 40920, "token_acc": 0.7671131701749085, "train_speed(iter/s)": 0.157263 }, { "epoch": 0.5310297175518784, "grad_norm": 0.8228430151939392, "learning_rate": 8.783020332403063e-05, "loss": 0.9217498779296875, "memory(GiB)": 91.52, "step": 40925, "token_acc": 0.7617590206185567, "train_speed(iter/s)": 0.157258 }, { "epoch": 0.5310945959535341, "grad_norm": 0.709656298160553, "learning_rate": 8.782669589194691e-05, "loss": 0.9112242698669434, "memory(GiB)": 91.52, "step": 40930, "token_acc": 0.7386453137306379, "train_speed(iter/s)": 0.157252 }, { "epoch": 0.5311594743551897, "grad_norm": 0.7347441911697388, "learning_rate": 8.782318802455601e-05, "loss": 0.9222244262695313, "memory(GiB)": 91.52, "step": 40935, "token_acc": 0.7449873467004088, "train_speed(iter/s)": 0.157247 }, { "epoch": 0.5312243527568454, "grad_norm": 0.8852797746658325, "learning_rate": 8.78196797218983e-05, "loss": 0.9353471755981445, "memory(GiB)": 91.52, "step": 40940, "token_acc": 0.7475223989760468, "train_speed(iter/s)": 0.157241 }, { "epoch": 0.5312892311585011, "grad_norm": 0.7222773432731628, "learning_rate": 8.781617098401414e-05, "loss": 0.9159919738769531, "memory(GiB)": 91.52, "step": 40945, "token_acc": 0.7577806937855388, "train_speed(iter/s)": 0.157236 }, { "epoch": 0.5313541095601568, "grad_norm": 0.7410185933113098, "learning_rate": 8.781266181094392e-05, "loss": 0.8929437637329102, "memory(GiB)": 91.52, "step": 40950, "token_acc": 0.7535543330713724, "train_speed(iter/s)": 0.157232 }, { "epoch": 0.5314189879618125, "grad_norm": 0.8755536675453186, "learning_rate": 8.780915220272803e-05, "loss": 0.9034856796264649, "memory(GiB)": 91.52, "step": 40955, "token_acc": 0.7364015233168067, "train_speed(iter/s)": 0.157227 }, { "epoch": 0.5314838663634682, "grad_norm": 0.7922112941741943, "learning_rate": 8.780564215940683e-05, "loss": 0.9296730995178223, "memory(GiB)": 91.52, "step": 40960, "token_acc": 0.7408566399374145, "train_speed(iter/s)": 0.157222 }, { "epoch": 0.5315487447651239, "grad_norm": 0.8065581321716309, "learning_rate": 8.780213168102075e-05, "loss": 0.9413663864135742, "memory(GiB)": 91.52, "step": 40965, "token_acc": 0.7223710177614886, "train_speed(iter/s)": 0.157217 }, { "epoch": 0.5316136231667796, "grad_norm": 0.7692359089851379, "learning_rate": 8.779862076761017e-05, "loss": 0.9417181015014648, "memory(GiB)": 91.52, "step": 40970, "token_acc": 0.7639924009937162, "train_speed(iter/s)": 0.157213 }, { "epoch": 0.5316785015684353, "grad_norm": 0.8046033382415771, "learning_rate": 8.779510941921549e-05, "loss": 0.932872200012207, "memory(GiB)": 91.52, "step": 40975, "token_acc": 0.7665898134562985, "train_speed(iter/s)": 0.157208 }, { "epoch": 0.531743379970091, "grad_norm": 0.8067283630371094, "learning_rate": 8.779159763587712e-05, "loss": 0.954318904876709, "memory(GiB)": 91.52, "step": 40980, "token_acc": 0.7693024589576314, "train_speed(iter/s)": 0.157203 }, { "epoch": 0.5318082583717467, "grad_norm": 0.7205145955085754, "learning_rate": 8.77880854176355e-05, "loss": 0.8692741394042969, "memory(GiB)": 91.52, "step": 40985, "token_acc": 0.7811445626784324, "train_speed(iter/s)": 0.157197 }, { "epoch": 0.5318731367734024, "grad_norm": 0.8082208037376404, "learning_rate": 8.7784572764531e-05, "loss": 0.90567626953125, "memory(GiB)": 91.52, "step": 40990, "token_acc": 0.7491567548375644, "train_speed(iter/s)": 0.157193 }, { "epoch": 0.5319380151750581, "grad_norm": 0.8209081888198853, "learning_rate": 8.778105967660408e-05, "loss": 0.8988468170166015, "memory(GiB)": 91.52, "step": 40995, "token_acc": 0.754946268956513, "train_speed(iter/s)": 0.157187 }, { "epoch": 0.5320028935767138, "grad_norm": 0.7759151458740234, "learning_rate": 8.777754615389515e-05, "loss": 0.8670568466186523, "memory(GiB)": 91.52, "step": 41000, "token_acc": 0.7631838830732767, "train_speed(iter/s)": 0.157182 }, { "epoch": 0.5320677719783695, "grad_norm": 0.683170735836029, "learning_rate": 8.777403219644464e-05, "loss": 0.9205193519592285, "memory(GiB)": 91.52, "step": 41005, "token_acc": 0.7620831772199326, "train_speed(iter/s)": 0.157177 }, { "epoch": 0.5321326503800252, "grad_norm": 0.8167744874954224, "learning_rate": 8.777051780429302e-05, "loss": 0.9712947845458985, "memory(GiB)": 91.52, "step": 41010, "token_acc": 0.7381052927927928, "train_speed(iter/s)": 0.157172 }, { "epoch": 0.5321975287816809, "grad_norm": 0.838287353515625, "learning_rate": 8.77670029774807e-05, "loss": 0.9515910148620605, "memory(GiB)": 91.52, "step": 41015, "token_acc": 0.7496649057793897, "train_speed(iter/s)": 0.157167 }, { "epoch": 0.5322624071833366, "grad_norm": 0.8220329880714417, "learning_rate": 8.776348771604814e-05, "loss": 0.9155139923095703, "memory(GiB)": 91.52, "step": 41020, "token_acc": 0.7651379141504815, "train_speed(iter/s)": 0.15716 }, { "epoch": 0.5323272855849923, "grad_norm": 0.8006086945533752, "learning_rate": 8.77599720200358e-05, "loss": 0.9431469917297364, "memory(GiB)": 91.52, "step": 41025, "token_acc": 0.7305934492071187, "train_speed(iter/s)": 0.157155 }, { "epoch": 0.532392163986648, "grad_norm": 0.8480885028839111, "learning_rate": 8.775645588948414e-05, "loss": 0.9129174232482911, "memory(GiB)": 91.52, "step": 41030, "token_acc": 0.7576877703499804, "train_speed(iter/s)": 0.157151 }, { "epoch": 0.5324570423883037, "grad_norm": 0.708975613117218, "learning_rate": 8.775293932443361e-05, "loss": 0.9289512634277344, "memory(GiB)": 91.52, "step": 41035, "token_acc": 0.7494763614602035, "train_speed(iter/s)": 0.157146 }, { "epoch": 0.5325219207899594, "grad_norm": 0.7415723204612732, "learning_rate": 8.774942232492467e-05, "loss": 0.870782470703125, "memory(GiB)": 91.52, "step": 41040, "token_acc": 0.7642108075612851, "train_speed(iter/s)": 0.157141 }, { "epoch": 0.5325867991916151, "grad_norm": 0.7527822256088257, "learning_rate": 8.77459048909978e-05, "loss": 0.917110824584961, "memory(GiB)": 91.52, "step": 41045, "token_acc": 0.7392602357320099, "train_speed(iter/s)": 0.157137 }, { "epoch": 0.5326516775932708, "grad_norm": 0.6508464217185974, "learning_rate": 8.774238702269352e-05, "loss": 0.9122884750366211, "memory(GiB)": 91.52, "step": 41050, "token_acc": 0.7741378186322446, "train_speed(iter/s)": 0.157129 }, { "epoch": 0.5327165559949265, "grad_norm": 0.7605395317077637, "learning_rate": 8.773886872005225e-05, "loss": 0.9174051284790039, "memory(GiB)": 91.52, "step": 41055, "token_acc": 0.7530415834392591, "train_speed(iter/s)": 0.157124 }, { "epoch": 0.5327814343965822, "grad_norm": 0.7491083741188049, "learning_rate": 8.773534998311454e-05, "loss": 0.8994550704956055, "memory(GiB)": 91.52, "step": 41060, "token_acc": 0.747812734349891, "train_speed(iter/s)": 0.157119 }, { "epoch": 0.5328463127982379, "grad_norm": 0.8040991425514221, "learning_rate": 8.773183081192082e-05, "loss": 0.9259689331054688, "memory(GiB)": 91.52, "step": 41065, "token_acc": 0.7445709123757904, "train_speed(iter/s)": 0.157114 }, { "epoch": 0.5329111911998936, "grad_norm": 0.8213948011398315, "learning_rate": 8.772831120651164e-05, "loss": 0.8961538314819336, "memory(GiB)": 91.52, "step": 41070, "token_acc": 0.74925320056899, "train_speed(iter/s)": 0.157108 }, { "epoch": 0.5329760696015493, "grad_norm": 0.7834016680717468, "learning_rate": 8.772479116692747e-05, "loss": 0.932186508178711, "memory(GiB)": 91.52, "step": 41075, "token_acc": 0.7470821487355978, "train_speed(iter/s)": 0.157102 }, { "epoch": 0.533040948003205, "grad_norm": 0.7629474401473999, "learning_rate": 8.772127069320884e-05, "loss": 0.9284245491027832, "memory(GiB)": 91.52, "step": 41080, "token_acc": 0.7656068693432351, "train_speed(iter/s)": 0.157098 }, { "epoch": 0.5331058264048607, "grad_norm": 0.7952733039855957, "learning_rate": 8.771774978539625e-05, "loss": 0.8761402130126953, "memory(GiB)": 91.52, "step": 41085, "token_acc": 0.7593927003385059, "train_speed(iter/s)": 0.157092 }, { "epoch": 0.5331707048065164, "grad_norm": 0.7189408540725708, "learning_rate": 8.771422844353021e-05, "loss": 0.899380874633789, "memory(GiB)": 91.52, "step": 41090, "token_acc": 0.7597358314489788, "train_speed(iter/s)": 0.157088 }, { "epoch": 0.5332355832081721, "grad_norm": 0.6855992078781128, "learning_rate": 8.771070666765126e-05, "loss": 0.8805028915405273, "memory(GiB)": 91.52, "step": 41095, "token_acc": 0.7608543212762228, "train_speed(iter/s)": 0.157082 }, { "epoch": 0.5333004616098278, "grad_norm": 0.795844316482544, "learning_rate": 8.770718445779994e-05, "loss": 0.9302377700805664, "memory(GiB)": 91.52, "step": 41100, "token_acc": 0.7530493013085517, "train_speed(iter/s)": 0.157076 }, { "epoch": 0.5333653400114835, "grad_norm": 0.7840158343315125, "learning_rate": 8.770366181401677e-05, "loss": 0.9479204177856445, "memory(GiB)": 91.52, "step": 41105, "token_acc": 0.7466732035299062, "train_speed(iter/s)": 0.15707 }, { "epoch": 0.5334302184131392, "grad_norm": 0.7671154737472534, "learning_rate": 8.770013873634227e-05, "loss": 0.8981259346008301, "memory(GiB)": 91.52, "step": 41110, "token_acc": 0.7575007951372937, "train_speed(iter/s)": 0.157066 }, { "epoch": 0.5334950968147949, "grad_norm": 0.8846737146377563, "learning_rate": 8.769661522481701e-05, "loss": 0.9683540344238282, "memory(GiB)": 91.52, "step": 41115, "token_acc": 0.7555818618263128, "train_speed(iter/s)": 0.157061 }, { "epoch": 0.5335599752164506, "grad_norm": 0.7379413843154907, "learning_rate": 8.769309127948151e-05, "loss": 0.8760688781738282, "memory(GiB)": 91.52, "step": 41120, "token_acc": 0.7754512635379062, "train_speed(iter/s)": 0.157055 }, { "epoch": 0.5336248536181063, "grad_norm": 0.7213836312294006, "learning_rate": 8.768956690037637e-05, "loss": 0.9037458419799804, "memory(GiB)": 91.52, "step": 41125, "token_acc": 0.7559510730242438, "train_speed(iter/s)": 0.15705 }, { "epoch": 0.533689732019762, "grad_norm": 0.8106138110160828, "learning_rate": 8.76860420875421e-05, "loss": 0.9073102951049805, "memory(GiB)": 91.52, "step": 41130, "token_acc": 0.7483695484190028, "train_speed(iter/s)": 0.157045 }, { "epoch": 0.5337546104214177, "grad_norm": 0.8837756514549255, "learning_rate": 8.768251684101929e-05, "loss": 0.9536537170410156, "memory(GiB)": 91.52, "step": 41135, "token_acc": 0.7518434106448586, "train_speed(iter/s)": 0.157041 }, { "epoch": 0.5338194888230734, "grad_norm": 0.7642806172370911, "learning_rate": 8.76789911608485e-05, "loss": 0.9308725357055664, "memory(GiB)": 91.52, "step": 41140, "token_acc": 0.7258616839127522, "train_speed(iter/s)": 0.157034 }, { "epoch": 0.5338843672247291, "grad_norm": 0.7498519420623779, "learning_rate": 8.76754650470703e-05, "loss": 0.8686923027038574, "memory(GiB)": 91.52, "step": 41145, "token_acc": 0.7623325043613823, "train_speed(iter/s)": 0.157028 }, { "epoch": 0.5339492456263848, "grad_norm": 0.892728328704834, "learning_rate": 8.767193849972528e-05, "loss": 0.9474493026733398, "memory(GiB)": 91.52, "step": 41150, "token_acc": 0.7568662764520486, "train_speed(iter/s)": 0.157023 }, { "epoch": 0.5340141240280405, "grad_norm": 0.748672366142273, "learning_rate": 8.766841151885402e-05, "loss": 0.9336414337158203, "memory(GiB)": 91.52, "step": 41155, "token_acc": 0.7458414966770389, "train_speed(iter/s)": 0.157018 }, { "epoch": 0.5340790024296962, "grad_norm": 0.7878945469856262, "learning_rate": 8.76648841044971e-05, "loss": 0.9582138061523438, "memory(GiB)": 91.52, "step": 41160, "token_acc": 0.7539152338188588, "train_speed(iter/s)": 0.157012 }, { "epoch": 0.5341438808313519, "grad_norm": 0.7716007828712463, "learning_rate": 8.766135625669511e-05, "loss": 0.9450934410095215, "memory(GiB)": 91.52, "step": 41165, "token_acc": 0.7398614188737646, "train_speed(iter/s)": 0.157006 }, { "epoch": 0.5342087592330076, "grad_norm": 0.7667925953865051, "learning_rate": 8.765782797548867e-05, "loss": 0.9452011108398437, "memory(GiB)": 91.52, "step": 41170, "token_acc": 0.7407788390889052, "train_speed(iter/s)": 0.157001 }, { "epoch": 0.5342736376346632, "grad_norm": 0.7428690791130066, "learning_rate": 8.765429926091836e-05, "loss": 0.882689094543457, "memory(GiB)": 91.52, "step": 41175, "token_acc": 0.7691279107690301, "train_speed(iter/s)": 0.156996 }, { "epoch": 0.5343385160363189, "grad_norm": 0.911457359790802, "learning_rate": 8.76507701130248e-05, "loss": 0.9494553565979004, "memory(GiB)": 91.52, "step": 41180, "token_acc": 0.7617042775345234, "train_speed(iter/s)": 0.156991 }, { "epoch": 0.5344033944379746, "grad_norm": 0.8142462968826294, "learning_rate": 8.764724053184861e-05, "loss": 0.8713832855224609, "memory(GiB)": 91.52, "step": 41185, "token_acc": 0.7415170661585264, "train_speed(iter/s)": 0.156984 }, { "epoch": 0.5344682728396303, "grad_norm": 0.6651624441146851, "learning_rate": 8.764371051743039e-05, "loss": 0.8827911376953125, "memory(GiB)": 91.52, "step": 41190, "token_acc": 0.7472333907715872, "train_speed(iter/s)": 0.156978 }, { "epoch": 0.534533151241286, "grad_norm": 0.7221699357032776, "learning_rate": 8.764018006981077e-05, "loss": 0.9274069786071777, "memory(GiB)": 91.52, "step": 41195, "token_acc": 0.7519815807352608, "train_speed(iter/s)": 0.156974 }, { "epoch": 0.5345980296429417, "grad_norm": 0.7573794722557068, "learning_rate": 8.763664918903039e-05, "loss": 0.8980860710144043, "memory(GiB)": 91.52, "step": 41200, "token_acc": 0.7645764927157385, "train_speed(iter/s)": 0.156968 }, { "epoch": 0.5346629080445974, "grad_norm": 0.6270119547843933, "learning_rate": 8.763311787512986e-05, "loss": 0.9514017105102539, "memory(GiB)": 91.52, "step": 41205, "token_acc": 0.7529842012873025, "train_speed(iter/s)": 0.156962 }, { "epoch": 0.5347277864462531, "grad_norm": 0.8315172791481018, "learning_rate": 8.762958612814984e-05, "loss": 0.9068738937377929, "memory(GiB)": 91.52, "step": 41210, "token_acc": 0.7400667377618769, "train_speed(iter/s)": 0.156957 }, { "epoch": 0.5347926648479088, "grad_norm": 0.8757125735282898, "learning_rate": 8.762605394813095e-05, "loss": 0.91903715133667, "memory(GiB)": 91.52, "step": 41215, "token_acc": 0.7548504834475251, "train_speed(iter/s)": 0.156952 }, { "epoch": 0.5348575432495645, "grad_norm": 0.7385910153388977, "learning_rate": 8.762252133511385e-05, "loss": 0.9054059028625489, "memory(GiB)": 91.52, "step": 41220, "token_acc": 0.752540235852555, "train_speed(iter/s)": 0.156947 }, { "epoch": 0.5349224216512202, "grad_norm": 0.774050772190094, "learning_rate": 8.761898828913922e-05, "loss": 0.8970928192138672, "memory(GiB)": 91.52, "step": 41225, "token_acc": 0.7546800484020215, "train_speed(iter/s)": 0.156942 }, { "epoch": 0.5349873000528759, "grad_norm": 0.768514096736908, "learning_rate": 8.761545481024768e-05, "loss": 0.9008060455322265, "memory(GiB)": 91.52, "step": 41230, "token_acc": 0.7450126440011239, "train_speed(iter/s)": 0.156936 }, { "epoch": 0.5350521784545316, "grad_norm": 0.8467762470245361, "learning_rate": 8.761192089847992e-05, "loss": 0.8910959243774415, "memory(GiB)": 91.52, "step": 41235, "token_acc": 0.7706712297807768, "train_speed(iter/s)": 0.156931 }, { "epoch": 0.5351170568561873, "grad_norm": 0.7788923978805542, "learning_rate": 8.76083865538766e-05, "loss": 0.9628202438354492, "memory(GiB)": 91.52, "step": 41240, "token_acc": 0.7400053843047516, "train_speed(iter/s)": 0.156925 }, { "epoch": 0.535181935257843, "grad_norm": 0.8219671845436096, "learning_rate": 8.760485177647837e-05, "loss": 0.924285888671875, "memory(GiB)": 91.52, "step": 41245, "token_acc": 0.7461304133047917, "train_speed(iter/s)": 0.156921 }, { "epoch": 0.5352468136594987, "grad_norm": 0.7456937432289124, "learning_rate": 8.760131656632593e-05, "loss": 0.8951816558837891, "memory(GiB)": 91.52, "step": 41250, "token_acc": 0.7636478606056348, "train_speed(iter/s)": 0.156915 }, { "epoch": 0.5353116920611544, "grad_norm": 0.7342585921287537, "learning_rate": 8.759778092345995e-05, "loss": 0.9188187599182129, "memory(GiB)": 91.52, "step": 41255, "token_acc": 0.7564806678383128, "train_speed(iter/s)": 0.15691 }, { "epoch": 0.5353765704628101, "grad_norm": 0.696077287197113, "learning_rate": 8.759424484792114e-05, "loss": 0.9101436614990235, "memory(GiB)": 91.52, "step": 41260, "token_acc": 0.7481963613550816, "train_speed(iter/s)": 0.156905 }, { "epoch": 0.5354414488644658, "grad_norm": 0.8301834464073181, "learning_rate": 8.759070833975017e-05, "loss": 0.9308535575866699, "memory(GiB)": 91.52, "step": 41265, "token_acc": 0.757444917137596, "train_speed(iter/s)": 0.1569 }, { "epoch": 0.5355063272661215, "grad_norm": 0.7660889029502869, "learning_rate": 8.758717139898775e-05, "loss": 0.9360870361328125, "memory(GiB)": 91.52, "step": 41270, "token_acc": 0.7589685218619986, "train_speed(iter/s)": 0.156894 }, { "epoch": 0.5355712056677772, "grad_norm": 0.7713258266448975, "learning_rate": 8.75836340256746e-05, "loss": 0.8947535514831543, "memory(GiB)": 91.52, "step": 41275, "token_acc": 0.7594812017731079, "train_speed(iter/s)": 0.156887 }, { "epoch": 0.5356360840694329, "grad_norm": 0.8316445350646973, "learning_rate": 8.75800962198514e-05, "loss": 0.9228060722351075, "memory(GiB)": 91.52, "step": 41280, "token_acc": 0.7362453057206403, "train_speed(iter/s)": 0.156882 }, { "epoch": 0.5357009624710886, "grad_norm": 0.9307022094726562, "learning_rate": 8.757655798155886e-05, "loss": 0.9109161376953125, "memory(GiB)": 91.52, "step": 41285, "token_acc": 0.7627921557699676, "train_speed(iter/s)": 0.156876 }, { "epoch": 0.5357658408727443, "grad_norm": 0.6575342416763306, "learning_rate": 8.75730193108377e-05, "loss": 0.8696216583251953, "memory(GiB)": 91.52, "step": 41290, "token_acc": 0.7750122488975992, "train_speed(iter/s)": 0.156869 }, { "epoch": 0.5358307192744, "grad_norm": 0.7742988467216492, "learning_rate": 8.756948020772866e-05, "loss": 0.8954594612121582, "memory(GiB)": 91.52, "step": 41295, "token_acc": 0.7673503499669185, "train_speed(iter/s)": 0.156864 }, { "epoch": 0.5358955976760557, "grad_norm": 0.6982574462890625, "learning_rate": 8.756594067227246e-05, "loss": 0.9511928558349609, "memory(GiB)": 91.52, "step": 41300, "token_acc": 0.7409722222222223, "train_speed(iter/s)": 0.15686 }, { "epoch": 0.5359604760777114, "grad_norm": 0.8970402479171753, "learning_rate": 8.756240070450986e-05, "loss": 0.9148263931274414, "memory(GiB)": 91.52, "step": 41305, "token_acc": 0.7565656878856469, "train_speed(iter/s)": 0.156855 }, { "epoch": 0.536025354479367, "grad_norm": 0.8999758362770081, "learning_rate": 8.755886030448153e-05, "loss": 0.9148548126220704, "memory(GiB)": 91.52, "step": 41310, "token_acc": 0.7482405140758874, "train_speed(iter/s)": 0.15685 }, { "epoch": 0.5360902328810228, "grad_norm": 0.8803977966308594, "learning_rate": 8.755531947222828e-05, "loss": 0.8982089996337891, "memory(GiB)": 91.52, "step": 41315, "token_acc": 0.7590608616366538, "train_speed(iter/s)": 0.156844 }, { "epoch": 0.5361551112826785, "grad_norm": 0.7964062690734863, "learning_rate": 8.755177820779081e-05, "loss": 0.9177257537841796, "memory(GiB)": 91.52, "step": 41320, "token_acc": 0.7469478357380688, "train_speed(iter/s)": 0.156839 }, { "epoch": 0.5362199896843342, "grad_norm": 0.7623639106750488, "learning_rate": 8.754823651120992e-05, "loss": 0.8904641151428223, "memory(GiB)": 91.52, "step": 41325, "token_acc": 0.7588404391646174, "train_speed(iter/s)": 0.156832 }, { "epoch": 0.5362848680859899, "grad_norm": 0.7989148497581482, "learning_rate": 8.754469438252632e-05, "loss": 0.8637216567993165, "memory(GiB)": 91.52, "step": 41330, "token_acc": 0.7790011662207588, "train_speed(iter/s)": 0.156827 }, { "epoch": 0.5363497464876456, "grad_norm": 0.8855605721473694, "learning_rate": 8.75411518217808e-05, "loss": 0.9499561309814453, "memory(GiB)": 91.52, "step": 41335, "token_acc": 0.7437419110451718, "train_speed(iter/s)": 0.156822 }, { "epoch": 0.5364146248893013, "grad_norm": 0.6986954808235168, "learning_rate": 8.753760882901411e-05, "loss": 0.8553987503051758, "memory(GiB)": 91.52, "step": 41340, "token_acc": 0.7659489897316992, "train_speed(iter/s)": 0.156817 }, { "epoch": 0.536479503290957, "grad_norm": 0.8366587162017822, "learning_rate": 8.753406540426705e-05, "loss": 0.9082152366638183, "memory(GiB)": 91.52, "step": 41345, "token_acc": 0.7344238068048231, "train_speed(iter/s)": 0.156811 }, { "epoch": 0.5365443816926126, "grad_norm": 0.6942189931869507, "learning_rate": 8.753052154758036e-05, "loss": 0.8884735107421875, "memory(GiB)": 91.52, "step": 41350, "token_acc": 0.7620196377179618, "train_speed(iter/s)": 0.156807 }, { "epoch": 0.5366092600942683, "grad_norm": 0.8349241018295288, "learning_rate": 8.752697725899487e-05, "loss": 0.9180144309997559, "memory(GiB)": 91.52, "step": 41355, "token_acc": 0.7678222800686269, "train_speed(iter/s)": 0.156801 }, { "epoch": 0.536674138495924, "grad_norm": 0.910422146320343, "learning_rate": 8.752343253855133e-05, "loss": 0.9316112518310546, "memory(GiB)": 91.52, "step": 41360, "token_acc": 0.7446384910932589, "train_speed(iter/s)": 0.156796 }, { "epoch": 0.5367390168975797, "grad_norm": 0.8253819346427917, "learning_rate": 8.751988738629053e-05, "loss": 0.9664968490600586, "memory(GiB)": 91.52, "step": 41365, "token_acc": 0.744362917546152, "train_speed(iter/s)": 0.156791 }, { "epoch": 0.5368038952992354, "grad_norm": 0.7780872583389282, "learning_rate": 8.75163418022533e-05, "loss": 0.9221439361572266, "memory(GiB)": 91.52, "step": 41370, "token_acc": 0.768338070104209, "train_speed(iter/s)": 0.156786 }, { "epoch": 0.5368687737008911, "grad_norm": 0.6444248557090759, "learning_rate": 8.751279578648041e-05, "loss": 0.9137701034545899, "memory(GiB)": 91.52, "step": 41375, "token_acc": 0.7634773193526327, "train_speed(iter/s)": 0.15678 }, { "epoch": 0.5369336521025468, "grad_norm": 0.7215065360069275, "learning_rate": 8.75092493390127e-05, "loss": 0.9477964401245117, "memory(GiB)": 91.52, "step": 41380, "token_acc": 0.7424895572263993, "train_speed(iter/s)": 0.156774 }, { "epoch": 0.5369985305042025, "grad_norm": 0.7942039370536804, "learning_rate": 8.750570245989094e-05, "loss": 0.9084707260131836, "memory(GiB)": 91.52, "step": 41385, "token_acc": 0.7613495848789966, "train_speed(iter/s)": 0.156768 }, { "epoch": 0.5370634089058582, "grad_norm": 0.7915685176849365, "learning_rate": 8.750215514915599e-05, "loss": 0.9686349868774414, "memory(GiB)": 91.52, "step": 41390, "token_acc": 0.7464223443102938, "train_speed(iter/s)": 0.156763 }, { "epoch": 0.5371282873075139, "grad_norm": 0.7928205728530884, "learning_rate": 8.749860740684864e-05, "loss": 0.9035139083862305, "memory(GiB)": 91.52, "step": 41395, "token_acc": 0.750790416637392, "train_speed(iter/s)": 0.156758 }, { "epoch": 0.5371931657091696, "grad_norm": 0.7557339668273926, "learning_rate": 8.749505923300971e-05, "loss": 0.9253811836242676, "memory(GiB)": 91.52, "step": 41400, "token_acc": 0.7703859032028951, "train_speed(iter/s)": 0.156752 }, { "epoch": 0.5372580441108253, "grad_norm": 0.7861363291740417, "learning_rate": 8.749151062768008e-05, "loss": 0.9377026557922363, "memory(GiB)": 91.52, "step": 41405, "token_acc": 0.7377412839784128, "train_speed(iter/s)": 0.156748 }, { "epoch": 0.5373229225124809, "grad_norm": 0.7387170791625977, "learning_rate": 8.748796159090056e-05, "loss": 0.981209659576416, "memory(GiB)": 91.52, "step": 41410, "token_acc": 0.7500853922349995, "train_speed(iter/s)": 0.156742 }, { "epoch": 0.5373878009141366, "grad_norm": 0.7012704610824585, "learning_rate": 8.748441212271197e-05, "loss": 0.901462173461914, "memory(GiB)": 91.52, "step": 41415, "token_acc": 0.7479377087441149, "train_speed(iter/s)": 0.156736 }, { "epoch": 0.5374526793157923, "grad_norm": 0.75471431016922, "learning_rate": 8.748086222315519e-05, "loss": 0.9044347763061523, "memory(GiB)": 91.52, "step": 41420, "token_acc": 0.7369985686497597, "train_speed(iter/s)": 0.15673 }, { "epoch": 0.537517557717448, "grad_norm": 0.8009586930274963, "learning_rate": 8.747731189227106e-05, "loss": 0.9018221855163574, "memory(GiB)": 91.52, "step": 41425, "token_acc": 0.7605012933075886, "train_speed(iter/s)": 0.156724 }, { "epoch": 0.5375824361191037, "grad_norm": 0.7371915578842163, "learning_rate": 8.747376113010044e-05, "loss": 0.9728683471679688, "memory(GiB)": 91.52, "step": 41430, "token_acc": 0.746731529340225, "train_speed(iter/s)": 0.15672 }, { "epoch": 0.5376473145207594, "grad_norm": 0.7807534337043762, "learning_rate": 8.747020993668418e-05, "loss": 0.933465576171875, "memory(GiB)": 91.52, "step": 41435, "token_acc": 0.7635817438198994, "train_speed(iter/s)": 0.156715 }, { "epoch": 0.5377121929224151, "grad_norm": 0.7432830333709717, "learning_rate": 8.746665831206315e-05, "loss": 0.9077709197998047, "memory(GiB)": 91.52, "step": 41440, "token_acc": 0.7559640706995912, "train_speed(iter/s)": 0.156709 }, { "epoch": 0.5377770713240708, "grad_norm": 0.7538564205169678, "learning_rate": 8.746310625627824e-05, "loss": 0.8842418670654297, "memory(GiB)": 91.52, "step": 41445, "token_acc": 0.7611563395145484, "train_speed(iter/s)": 0.156703 }, { "epoch": 0.5378419497257265, "grad_norm": 0.6690747737884521, "learning_rate": 8.745955376937032e-05, "loss": 0.8842583656311035, "memory(GiB)": 91.52, "step": 41450, "token_acc": 0.7597985256346109, "train_speed(iter/s)": 0.156698 }, { "epoch": 0.5379068281273822, "grad_norm": 0.7591288685798645, "learning_rate": 8.745600085138026e-05, "loss": 0.9048006057739257, "memory(GiB)": 91.52, "step": 41455, "token_acc": 0.7649493519224485, "train_speed(iter/s)": 0.156692 }, { "epoch": 0.5379717065290379, "grad_norm": 0.7735596895217896, "learning_rate": 8.745244750234894e-05, "loss": 0.9186506271362305, "memory(GiB)": 91.52, "step": 41460, "token_acc": 0.7481719892506719, "train_speed(iter/s)": 0.156686 }, { "epoch": 0.5380365849306936, "grad_norm": 0.7711177468299866, "learning_rate": 8.744889372231728e-05, "loss": 0.9116428375244141, "memory(GiB)": 91.52, "step": 41465, "token_acc": 0.7591111716064927, "train_speed(iter/s)": 0.15668 }, { "epoch": 0.5381014633323493, "grad_norm": 0.7613226771354675, "learning_rate": 8.744533951132616e-05, "loss": 0.9054398536682129, "memory(GiB)": 91.52, "step": 41470, "token_acc": 0.7397593149733858, "train_speed(iter/s)": 0.156675 }, { "epoch": 0.538166341734005, "grad_norm": 0.7720469832420349, "learning_rate": 8.744178486941648e-05, "loss": 0.9550876617431641, "memory(GiB)": 91.52, "step": 41475, "token_acc": 0.7373444873444873, "train_speed(iter/s)": 0.15667 }, { "epoch": 0.5382312201356607, "grad_norm": 0.7251049876213074, "learning_rate": 8.743822979662914e-05, "loss": 0.9266311645507812, "memory(GiB)": 91.52, "step": 41480, "token_acc": 0.7686860999900738, "train_speed(iter/s)": 0.156666 }, { "epoch": 0.5382960985373164, "grad_norm": 0.6750892996788025, "learning_rate": 8.743467429300508e-05, "loss": 0.889272403717041, "memory(GiB)": 91.52, "step": 41485, "token_acc": 0.7585869210402234, "train_speed(iter/s)": 0.156661 }, { "epoch": 0.5383609769389721, "grad_norm": 0.775240957736969, "learning_rate": 8.743111835858518e-05, "loss": 0.8936382293701172, "memory(GiB)": 91.52, "step": 41490, "token_acc": 0.7643494897959183, "train_speed(iter/s)": 0.156654 }, { "epoch": 0.5384258553406278, "grad_norm": 0.7298294901847839, "learning_rate": 8.742756199341037e-05, "loss": 0.8782676696777344, "memory(GiB)": 91.52, "step": 41495, "token_acc": 0.7690181776739415, "train_speed(iter/s)": 0.156648 }, { "epoch": 0.5384907337422835, "grad_norm": 0.7768063545227051, "learning_rate": 8.742400519752161e-05, "loss": 0.9306472778320313, "memory(GiB)": 91.52, "step": 41500, "token_acc": 0.7474654377880184, "train_speed(iter/s)": 0.156642 }, { "epoch": 0.5385556121439392, "grad_norm": 0.6899733543395996, "learning_rate": 8.74204479709598e-05, "loss": 0.8702302932739258, "memory(GiB)": 91.52, "step": 41505, "token_acc": 0.7558858547451703, "train_speed(iter/s)": 0.156637 }, { "epoch": 0.5386204905455949, "grad_norm": 0.7872180938720703, "learning_rate": 8.741689031376587e-05, "loss": 0.8944025039672852, "memory(GiB)": 91.52, "step": 41510, "token_acc": 0.7516668376243717, "train_speed(iter/s)": 0.156631 }, { "epoch": 0.5386853689472506, "grad_norm": 0.8389766812324524, "learning_rate": 8.74133322259808e-05, "loss": 0.9062763214111328, "memory(GiB)": 91.52, "step": 41515, "token_acc": 0.7536908805853149, "train_speed(iter/s)": 0.156626 }, { "epoch": 0.5387502473489063, "grad_norm": 0.8260330557823181, "learning_rate": 8.74097737076455e-05, "loss": 0.9232124328613281, "memory(GiB)": 91.52, "step": 41520, "token_acc": 0.7560766465485966, "train_speed(iter/s)": 0.156621 }, { "epoch": 0.538815125750562, "grad_norm": 0.7618933916091919, "learning_rate": 8.740621475880094e-05, "loss": 0.9567464828491211, "memory(GiB)": 91.52, "step": 41525, "token_acc": 0.7272475576474311, "train_speed(iter/s)": 0.156616 }, { "epoch": 0.5388800041522177, "grad_norm": 0.7034168839454651, "learning_rate": 8.740265537948804e-05, "loss": 0.9002531051635743, "memory(GiB)": 91.52, "step": 41530, "token_acc": 0.745042165659055, "train_speed(iter/s)": 0.156611 }, { "epoch": 0.5389448825538734, "grad_norm": 0.7584607005119324, "learning_rate": 8.73990955697478e-05, "loss": 0.9420913696289063, "memory(GiB)": 91.52, "step": 41535, "token_acc": 0.7280399339222868, "train_speed(iter/s)": 0.156605 }, { "epoch": 0.5390097609555291, "grad_norm": 0.7217975854873657, "learning_rate": 8.73955353296212e-05, "loss": 0.9444068908691406, "memory(GiB)": 91.52, "step": 41540, "token_acc": 0.7454537800976595, "train_speed(iter/s)": 0.156599 }, { "epoch": 0.5390746393571848, "grad_norm": 0.6798338890075684, "learning_rate": 8.739197465914914e-05, "loss": 0.8953253746032714, "memory(GiB)": 91.52, "step": 41545, "token_acc": 0.756784107946027, "train_speed(iter/s)": 0.156593 }, { "epoch": 0.5391395177588405, "grad_norm": 0.7576937675476074, "learning_rate": 8.738841355837268e-05, "loss": 0.9413884162902832, "memory(GiB)": 91.52, "step": 41550, "token_acc": 0.7586626962642122, "train_speed(iter/s)": 0.156588 }, { "epoch": 0.5392043961604962, "grad_norm": 0.8070975542068481, "learning_rate": 8.738485202733275e-05, "loss": 0.9464822769165039, "memory(GiB)": 91.52, "step": 41555, "token_acc": 0.742168754998667, "train_speed(iter/s)": 0.156582 }, { "epoch": 0.5392692745621519, "grad_norm": 0.7582336664199829, "learning_rate": 8.738129006607035e-05, "loss": 0.8876945495605468, "memory(GiB)": 91.52, "step": 41560, "token_acc": 0.7570386135603527, "train_speed(iter/s)": 0.156575 }, { "epoch": 0.5393341529638076, "grad_norm": 0.7295041084289551, "learning_rate": 8.737772767462647e-05, "loss": 0.9090933799743652, "memory(GiB)": 91.52, "step": 41565, "token_acc": 0.7524062396282775, "train_speed(iter/s)": 0.15657 }, { "epoch": 0.5393990313654633, "grad_norm": 0.7790703177452087, "learning_rate": 8.73741648530421e-05, "loss": 0.9433717727661133, "memory(GiB)": 91.52, "step": 41570, "token_acc": 0.7832368161655988, "train_speed(iter/s)": 0.156564 }, { "epoch": 0.539463909767119, "grad_norm": 0.8296291828155518, "learning_rate": 8.737060160135826e-05, "loss": 0.939216136932373, "memory(GiB)": 91.52, "step": 41575, "token_acc": 0.7462608876704723, "train_speed(iter/s)": 0.156559 }, { "epoch": 0.5395287881687747, "grad_norm": 0.8743883371353149, "learning_rate": 8.736703791961594e-05, "loss": 0.9123661994934082, "memory(GiB)": 91.52, "step": 41580, "token_acc": 0.7368707141662485, "train_speed(iter/s)": 0.156553 }, { "epoch": 0.5395936665704304, "grad_norm": 0.7451575994491577, "learning_rate": 8.736347380785615e-05, "loss": 0.9588373184204102, "memory(GiB)": 91.52, "step": 41585, "token_acc": 0.7331526777437067, "train_speed(iter/s)": 0.156548 }, { "epoch": 0.5396585449720861, "grad_norm": 0.9876304268836975, "learning_rate": 8.73599092661199e-05, "loss": 0.8853237152099609, "memory(GiB)": 91.52, "step": 41590, "token_acc": 0.7671771445688397, "train_speed(iter/s)": 0.156544 }, { "epoch": 0.5397234233737418, "grad_norm": 0.8213366866111755, "learning_rate": 8.735634429444823e-05, "loss": 0.9194710731506348, "memory(GiB)": 91.52, "step": 41595, "token_acc": 0.7639876409931876, "train_speed(iter/s)": 0.156539 }, { "epoch": 0.5397883017753975, "grad_norm": 0.7883407473564148, "learning_rate": 8.735277889288213e-05, "loss": 0.9207687377929688, "memory(GiB)": 91.52, "step": 41600, "token_acc": 0.7592982456140351, "train_speed(iter/s)": 0.156534 }, { "epoch": 0.5398531801770532, "grad_norm": 0.8073774576187134, "learning_rate": 8.734921306146267e-05, "loss": 0.8995401382446289, "memory(GiB)": 91.52, "step": 41605, "token_acc": 0.7535155029339259, "train_speed(iter/s)": 0.156529 }, { "epoch": 0.5399180585787089, "grad_norm": 0.7545701265335083, "learning_rate": 8.734564680023087e-05, "loss": 0.9425673484802246, "memory(GiB)": 91.52, "step": 41610, "token_acc": 0.7531391444006424, "train_speed(iter/s)": 0.156525 }, { "epoch": 0.5399829369803646, "grad_norm": 0.735122799873352, "learning_rate": 8.734208010922777e-05, "loss": 0.9192132949829102, "memory(GiB)": 91.52, "step": 41615, "token_acc": 0.7478750708309723, "train_speed(iter/s)": 0.156519 }, { "epoch": 0.5400478153820203, "grad_norm": 0.6891289949417114, "learning_rate": 8.733851298849443e-05, "loss": 0.9174777030944824, "memory(GiB)": 91.52, "step": 41620, "token_acc": 0.7645687645687645, "train_speed(iter/s)": 0.156512 }, { "epoch": 0.540112693783676, "grad_norm": 0.8870183229446411, "learning_rate": 8.733494543807188e-05, "loss": 0.9507089614868164, "memory(GiB)": 91.52, "step": 41625, "token_acc": 0.7453671205952548, "train_speed(iter/s)": 0.156506 }, { "epoch": 0.5401775721853317, "grad_norm": 0.8373681306838989, "learning_rate": 8.733137745800117e-05, "loss": 0.9366743087768554, "memory(GiB)": 91.52, "step": 41630, "token_acc": 0.7586407302314451, "train_speed(iter/s)": 0.156499 }, { "epoch": 0.5402424505869874, "grad_norm": 0.7501279711723328, "learning_rate": 8.732780904832337e-05, "loss": 0.8997788429260254, "memory(GiB)": 91.52, "step": 41635, "token_acc": 0.7701934763328188, "train_speed(iter/s)": 0.156495 }, { "epoch": 0.5403073289886431, "grad_norm": 0.7836220264434814, "learning_rate": 8.732424020907955e-05, "loss": 0.9227792739868164, "memory(GiB)": 91.52, "step": 41640, "token_acc": 0.763220001475035, "train_speed(iter/s)": 0.15649 }, { "epoch": 0.5403722073902988, "grad_norm": 0.8567848205566406, "learning_rate": 8.732067094031078e-05, "loss": 0.8535757064819336, "memory(GiB)": 91.52, "step": 41645, "token_acc": 0.7528816043707109, "train_speed(iter/s)": 0.156484 }, { "epoch": 0.5404370857919544, "grad_norm": 0.8358217477798462, "learning_rate": 8.731710124205812e-05, "loss": 0.9139888763427735, "memory(GiB)": 91.52, "step": 41650, "token_acc": 0.7608356305423627, "train_speed(iter/s)": 0.156479 }, { "epoch": 0.5405019641936101, "grad_norm": 0.8070389032363892, "learning_rate": 8.731353111436266e-05, "loss": 0.8942958831787109, "memory(GiB)": 91.52, "step": 41655, "token_acc": 0.7782363162467419, "train_speed(iter/s)": 0.156473 }, { "epoch": 0.5405668425952658, "grad_norm": 0.9040306806564331, "learning_rate": 8.73099605572655e-05, "loss": 0.9167821884155274, "memory(GiB)": 91.52, "step": 41660, "token_acc": 0.7634374251257887, "train_speed(iter/s)": 0.156468 }, { "epoch": 0.5406317209969215, "grad_norm": 0.7912569642066956, "learning_rate": 8.730638957080772e-05, "loss": 0.8896015167236329, "memory(GiB)": 91.52, "step": 41665, "token_acc": 0.7411441333104277, "train_speed(iter/s)": 0.156462 }, { "epoch": 0.5406965993985772, "grad_norm": 0.7935519218444824, "learning_rate": 8.730281815503039e-05, "loss": 0.959344482421875, "memory(GiB)": 91.52, "step": 41670, "token_acc": 0.7686798679867987, "train_speed(iter/s)": 0.156458 }, { "epoch": 0.5407614778002329, "grad_norm": 0.8220927715301514, "learning_rate": 8.729924630997463e-05, "loss": 0.9153818130493164, "memory(GiB)": 91.52, "step": 41675, "token_acc": 0.7773409578270193, "train_speed(iter/s)": 0.156453 }, { "epoch": 0.5408263562018886, "grad_norm": 0.7685036659240723, "learning_rate": 8.729567403568157e-05, "loss": 0.9398603439331055, "memory(GiB)": 91.52, "step": 41680, "token_acc": 0.7453864102382805, "train_speed(iter/s)": 0.156448 }, { "epoch": 0.5408912346035443, "grad_norm": 0.7700097560882568, "learning_rate": 8.729210133219226e-05, "loss": 0.9228055000305175, "memory(GiB)": 91.52, "step": 41685, "token_acc": 0.7503435352904435, "train_speed(iter/s)": 0.156442 }, { "epoch": 0.5409561130052, "grad_norm": 0.7742102146148682, "learning_rate": 8.728852819954787e-05, "loss": 0.9065279006958008, "memory(GiB)": 91.52, "step": 41690, "token_acc": 0.7521019986216403, "train_speed(iter/s)": 0.156436 }, { "epoch": 0.5410209914068557, "grad_norm": 0.7046719193458557, "learning_rate": 8.72849546377895e-05, "loss": 0.8983510971069336, "memory(GiB)": 91.52, "step": 41695, "token_acc": 0.7605257074926527, "train_speed(iter/s)": 0.15643 }, { "epoch": 0.5410858698085114, "grad_norm": 0.798229455947876, "learning_rate": 8.728138064695826e-05, "loss": 0.9478431701660156, "memory(GiB)": 91.52, "step": 41700, "token_acc": 0.7712713562718657, "train_speed(iter/s)": 0.156425 }, { "epoch": 0.5411507482101671, "grad_norm": 0.8655489087104797, "learning_rate": 8.72778062270953e-05, "loss": 0.938901424407959, "memory(GiB)": 91.52, "step": 41705, "token_acc": 0.7890425712828071, "train_speed(iter/s)": 0.156419 }, { "epoch": 0.5412156266118228, "grad_norm": 0.8587719798088074, "learning_rate": 8.727423137824174e-05, "loss": 0.8970146179199219, "memory(GiB)": 91.52, "step": 41710, "token_acc": 0.7305808511410223, "train_speed(iter/s)": 0.156414 }, { "epoch": 0.5412805050134785, "grad_norm": 0.8519959449768066, "learning_rate": 8.727065610043869e-05, "loss": 0.9537115097045898, "memory(GiB)": 91.52, "step": 41715, "token_acc": 0.7402879749091168, "train_speed(iter/s)": 0.156409 }, { "epoch": 0.5413453834151342, "grad_norm": 0.6630926728248596, "learning_rate": 8.726708039372738e-05, "loss": 0.8801794052124023, "memory(GiB)": 91.52, "step": 41720, "token_acc": 0.7555980082051924, "train_speed(iter/s)": 0.156404 }, { "epoch": 0.5414102618167899, "grad_norm": 0.7526582479476929, "learning_rate": 8.726350425814889e-05, "loss": 0.9196223258972168, "memory(GiB)": 91.52, "step": 41725, "token_acc": 0.7517985611510791, "train_speed(iter/s)": 0.156399 }, { "epoch": 0.5414751402184456, "grad_norm": 0.8422051072120667, "learning_rate": 8.725992769374439e-05, "loss": 0.9095510482788086, "memory(GiB)": 91.52, "step": 41730, "token_acc": 0.7559291316439084, "train_speed(iter/s)": 0.156392 }, { "epoch": 0.5415400186201013, "grad_norm": 0.7422538995742798, "learning_rate": 8.725635070055503e-05, "loss": 0.894891357421875, "memory(GiB)": 91.52, "step": 41735, "token_acc": 0.7578235412881082, "train_speed(iter/s)": 0.156387 }, { "epoch": 0.541604897021757, "grad_norm": 0.722316324710846, "learning_rate": 8.7252773278622e-05, "loss": 0.909581184387207, "memory(GiB)": 91.52, "step": 41740, "token_acc": 0.7630840983558166, "train_speed(iter/s)": 0.156381 }, { "epoch": 0.5416697754234127, "grad_norm": 0.6832873225212097, "learning_rate": 8.724919542798645e-05, "loss": 0.879453468322754, "memory(GiB)": 91.52, "step": 41745, "token_acc": 0.7605284888521883, "train_speed(iter/s)": 0.156375 }, { "epoch": 0.5417346538250684, "grad_norm": 0.7178525924682617, "learning_rate": 8.724561714868956e-05, "loss": 0.9503183364868164, "memory(GiB)": 91.52, "step": 41750, "token_acc": 0.7669165068851316, "train_speed(iter/s)": 0.15637 }, { "epoch": 0.5417995322267241, "grad_norm": 0.8805281519889832, "learning_rate": 8.724203844077247e-05, "loss": 0.9367210388183593, "memory(GiB)": 91.52, "step": 41755, "token_acc": 0.7355495978552279, "train_speed(iter/s)": 0.156365 }, { "epoch": 0.5418644106283798, "grad_norm": 0.7159448862075806, "learning_rate": 8.723845930427644e-05, "loss": 0.8831820487976074, "memory(GiB)": 91.52, "step": 41760, "token_acc": 0.7519387146400427, "train_speed(iter/s)": 0.156359 }, { "epoch": 0.5419292890300355, "grad_norm": 0.7485131621360779, "learning_rate": 8.723487973924261e-05, "loss": 0.9597841262817383, "memory(GiB)": 91.52, "step": 41765, "token_acc": 0.7495382210855357, "train_speed(iter/s)": 0.156354 }, { "epoch": 0.5419941674316912, "grad_norm": 0.6886774301528931, "learning_rate": 8.723129974571216e-05, "loss": 0.8955802917480469, "memory(GiB)": 91.52, "step": 41770, "token_acc": 0.761174584084987, "train_speed(iter/s)": 0.156348 }, { "epoch": 0.5420590458333469, "grad_norm": 0.8170024752616882, "learning_rate": 8.722771932372633e-05, "loss": 0.9636299133300781, "memory(GiB)": 91.52, "step": 41775, "token_acc": 0.7491491239127694, "train_speed(iter/s)": 0.156343 }, { "epoch": 0.5421239242350026, "grad_norm": 0.6989811062812805, "learning_rate": 8.722413847332629e-05, "loss": 0.9047798156738281, "memory(GiB)": 91.52, "step": 41780, "token_acc": 0.7558507273877293, "train_speed(iter/s)": 0.156338 }, { "epoch": 0.5421888026366583, "grad_norm": 0.7449722290039062, "learning_rate": 8.722055719455327e-05, "loss": 0.9503561019897461, "memory(GiB)": 91.52, "step": 41785, "token_acc": 0.7441717224600413, "train_speed(iter/s)": 0.156331 }, { "epoch": 0.542253681038314, "grad_norm": 0.6677471995353699, "learning_rate": 8.721697548744847e-05, "loss": 0.8907918930053711, "memory(GiB)": 91.52, "step": 41790, "token_acc": 0.7560944601595525, "train_speed(iter/s)": 0.156326 }, { "epoch": 0.5423185594399696, "grad_norm": 0.7790036797523499, "learning_rate": 8.72133933520531e-05, "loss": 0.9017350196838378, "memory(GiB)": 91.52, "step": 41795, "token_acc": 0.7482855990318676, "train_speed(iter/s)": 0.156321 }, { "epoch": 0.5423834378416253, "grad_norm": 0.7643346786499023, "learning_rate": 8.72098107884084e-05, "loss": 0.9018651008605957, "memory(GiB)": 91.52, "step": 41800, "token_acc": 0.7407774903433857, "train_speed(iter/s)": 0.156316 }, { "epoch": 0.542448316243281, "grad_norm": 0.7225425839424133, "learning_rate": 8.72062277965556e-05, "loss": 0.9145304679870605, "memory(GiB)": 91.52, "step": 41805, "token_acc": 0.7579721214435746, "train_speed(iter/s)": 0.156311 }, { "epoch": 0.5425131946449367, "grad_norm": 0.841370165348053, "learning_rate": 8.720264437653593e-05, "loss": 0.9641794204711914, "memory(GiB)": 91.52, "step": 41810, "token_acc": 0.7510414025428334, "train_speed(iter/s)": 0.156306 }, { "epoch": 0.5425780730465924, "grad_norm": 0.649165689945221, "learning_rate": 8.719906052839061e-05, "loss": 0.8991772651672363, "memory(GiB)": 91.52, "step": 41815, "token_acc": 0.7676218652172417, "train_speed(iter/s)": 0.156301 }, { "epoch": 0.5426429514482481, "grad_norm": 0.7809562683105469, "learning_rate": 8.719547625216089e-05, "loss": 0.8940502166748047, "memory(GiB)": 91.52, "step": 41820, "token_acc": 0.7643426294820718, "train_speed(iter/s)": 0.156296 }, { "epoch": 0.5427078298499038, "grad_norm": 0.7259219884872437, "learning_rate": 8.719189154788803e-05, "loss": 0.9165548324584961, "memory(GiB)": 91.52, "step": 41825, "token_acc": 0.7363774632985055, "train_speed(iter/s)": 0.156291 }, { "epoch": 0.5427727082515595, "grad_norm": 0.7285379767417908, "learning_rate": 8.718830641561328e-05, "loss": 0.9092937469482422, "memory(GiB)": 91.52, "step": 41830, "token_acc": 0.7393956793224719, "train_speed(iter/s)": 0.156286 }, { "epoch": 0.5428375866532152, "grad_norm": 0.7099735140800476, "learning_rate": 8.71847208553779e-05, "loss": 0.9132553100585937, "memory(GiB)": 91.52, "step": 41835, "token_acc": 0.7505289741314586, "train_speed(iter/s)": 0.156282 }, { "epoch": 0.5429024650548709, "grad_norm": 0.8178718686103821, "learning_rate": 8.718113486722312e-05, "loss": 0.8466606140136719, "memory(GiB)": 91.52, "step": 41840, "token_acc": 0.759081672667641, "train_speed(iter/s)": 0.156276 }, { "epoch": 0.5429673434565266, "grad_norm": 0.7739339470863342, "learning_rate": 8.717754845119026e-05, "loss": 0.9522102355957032, "memory(GiB)": 91.52, "step": 41845, "token_acc": 0.7584776272929882, "train_speed(iter/s)": 0.156271 }, { "epoch": 0.5430322218581823, "grad_norm": 0.7419053316116333, "learning_rate": 8.717396160732056e-05, "loss": 0.9798439025878907, "memory(GiB)": 91.52, "step": 41850, "token_acc": 0.7496257348376967, "train_speed(iter/s)": 0.156265 }, { "epoch": 0.543097100259838, "grad_norm": 0.8623197078704834, "learning_rate": 8.71703743356553e-05, "loss": 0.9434074401855469, "memory(GiB)": 91.52, "step": 41855, "token_acc": 0.7480066501543786, "train_speed(iter/s)": 0.15626 }, { "epoch": 0.5431619786614937, "grad_norm": 0.8074237108230591, "learning_rate": 8.716678663623575e-05, "loss": 0.9064487457275391, "memory(GiB)": 91.52, "step": 41860, "token_acc": 0.7557746064440194, "train_speed(iter/s)": 0.156255 }, { "epoch": 0.5432268570631494, "grad_norm": 0.7133458852767944, "learning_rate": 8.716319850910322e-05, "loss": 0.9410867691040039, "memory(GiB)": 91.52, "step": 41865, "token_acc": 0.7519785238643099, "train_speed(iter/s)": 0.15625 }, { "epoch": 0.5432917354648051, "grad_norm": 0.7746642827987671, "learning_rate": 8.715960995429901e-05, "loss": 0.9220832824707031, "memory(GiB)": 91.52, "step": 41870, "token_acc": 0.7491977208723557, "train_speed(iter/s)": 0.156245 }, { "epoch": 0.5433566138664608, "grad_norm": 0.7228403091430664, "learning_rate": 8.715602097186437e-05, "loss": 0.8894218444824219, "memory(GiB)": 91.52, "step": 41875, "token_acc": 0.7604125813789316, "train_speed(iter/s)": 0.156239 }, { "epoch": 0.5434214922681165, "grad_norm": 0.7693718075752258, "learning_rate": 8.715243156184065e-05, "loss": 0.9167673110961914, "memory(GiB)": 91.52, "step": 41880, "token_acc": 0.7227713240485206, "train_speed(iter/s)": 0.156232 }, { "epoch": 0.5434863706697722, "grad_norm": 0.7346073389053345, "learning_rate": 8.714884172426914e-05, "loss": 0.8809715270996094, "memory(GiB)": 91.52, "step": 41885, "token_acc": 0.7583704812416621, "train_speed(iter/s)": 0.156227 }, { "epoch": 0.5435512490714278, "grad_norm": 0.7551934719085693, "learning_rate": 8.714525145919115e-05, "loss": 0.9465596199035644, "memory(GiB)": 91.52, "step": 41890, "token_acc": 0.7552706552706553, "train_speed(iter/s)": 0.156222 }, { "epoch": 0.5436161274730835, "grad_norm": 0.6932922005653381, "learning_rate": 8.714166076664799e-05, "loss": 0.8810697555541992, "memory(GiB)": 91.52, "step": 41895, "token_acc": 0.7370890456541994, "train_speed(iter/s)": 0.156216 }, { "epoch": 0.5436810058747392, "grad_norm": 0.8155925869941711, "learning_rate": 8.713806964668099e-05, "loss": 0.910594367980957, "memory(GiB)": 91.52, "step": 41900, "token_acc": 0.7724887556221889, "train_speed(iter/s)": 0.156211 }, { "epoch": 0.5437458842763949, "grad_norm": 0.7722006440162659, "learning_rate": 8.713447809933148e-05, "loss": 0.9076974868774415, "memory(GiB)": 91.52, "step": 41905, "token_acc": 0.734946455700893, "train_speed(iter/s)": 0.156205 }, { "epoch": 0.5438107626780506, "grad_norm": 0.6585862636566162, "learning_rate": 8.713088612464079e-05, "loss": 0.9299848556518555, "memory(GiB)": 91.52, "step": 41910, "token_acc": 0.7459197176885752, "train_speed(iter/s)": 0.156199 }, { "epoch": 0.5438756410797063, "grad_norm": 0.8164676427841187, "learning_rate": 8.712729372265025e-05, "loss": 0.9023845672607422, "memory(GiB)": 91.52, "step": 41915, "token_acc": 0.7586297992250792, "train_speed(iter/s)": 0.156193 }, { "epoch": 0.543940519481362, "grad_norm": 0.7685037851333618, "learning_rate": 8.712370089340119e-05, "loss": 0.883051872253418, "memory(GiB)": 91.52, "step": 41920, "token_acc": 0.7694799730730394, "train_speed(iter/s)": 0.156188 }, { "epoch": 0.5440053978830177, "grad_norm": 0.7588796019554138, "learning_rate": 8.712010763693499e-05, "loss": 0.8834758758544922, "memory(GiB)": 91.52, "step": 41925, "token_acc": 0.761819202695115, "train_speed(iter/s)": 0.156182 }, { "epoch": 0.5440702762846734, "grad_norm": 0.7899359464645386, "learning_rate": 8.711651395329295e-05, "loss": 0.915505313873291, "memory(GiB)": 91.52, "step": 41930, "token_acc": 0.75007608034084, "train_speed(iter/s)": 0.156177 }, { "epoch": 0.5441351546863291, "grad_norm": 0.9003022313117981, "learning_rate": 8.711291984251649e-05, "loss": 0.8875879287719727, "memory(GiB)": 91.52, "step": 41935, "token_acc": 0.7580064370160288, "train_speed(iter/s)": 0.156172 }, { "epoch": 0.5442000330879848, "grad_norm": 0.8448260426521301, "learning_rate": 8.71093253046469e-05, "loss": 0.9371549606323242, "memory(GiB)": 91.52, "step": 41940, "token_acc": 0.7269525661249495, "train_speed(iter/s)": 0.156168 }, { "epoch": 0.5442649114896405, "grad_norm": 0.7677699327468872, "learning_rate": 8.710573033972563e-05, "loss": 0.9094511032104492, "memory(GiB)": 91.52, "step": 41945, "token_acc": 0.757601827405144, "train_speed(iter/s)": 0.156162 }, { "epoch": 0.5443297898912962, "grad_norm": 0.796262264251709, "learning_rate": 8.710213494779397e-05, "loss": 0.9361047744750977, "memory(GiB)": 91.52, "step": 41950, "token_acc": 0.7561359965331127, "train_speed(iter/s)": 0.156157 }, { "epoch": 0.5443946682929519, "grad_norm": 0.7983325719833374, "learning_rate": 8.709853912889334e-05, "loss": 0.9200109481811524, "memory(GiB)": 91.52, "step": 41955, "token_acc": 0.7476663916988668, "train_speed(iter/s)": 0.156153 }, { "epoch": 0.5444595466946076, "grad_norm": 0.7839791774749756, "learning_rate": 8.70949428830651e-05, "loss": 0.9307348251342773, "memory(GiB)": 91.52, "step": 41960, "token_acc": 0.7523849025300705, "train_speed(iter/s)": 0.156148 }, { "epoch": 0.5445244250962633, "grad_norm": 0.7178915739059448, "learning_rate": 8.709134621035064e-05, "loss": 0.925542163848877, "memory(GiB)": 91.52, "step": 41965, "token_acc": 0.7509503513420113, "train_speed(iter/s)": 0.156143 }, { "epoch": 0.544589303497919, "grad_norm": 0.817427396774292, "learning_rate": 8.708774911079138e-05, "loss": 0.9197607040405273, "memory(GiB)": 91.52, "step": 41970, "token_acc": 0.7295158460562171, "train_speed(iter/s)": 0.156139 }, { "epoch": 0.5446541818995747, "grad_norm": 0.8732618093490601, "learning_rate": 8.708415158442867e-05, "loss": 0.9714221000671387, "memory(GiB)": 91.52, "step": 41975, "token_acc": 0.7461254087871463, "train_speed(iter/s)": 0.156132 }, { "epoch": 0.5447190603012304, "grad_norm": 0.7575745582580566, "learning_rate": 8.708055363130394e-05, "loss": 0.9659646034240723, "memory(GiB)": 91.52, "step": 41980, "token_acc": 0.7547594323295258, "train_speed(iter/s)": 0.156126 }, { "epoch": 0.5447839387028861, "grad_norm": 0.7100415229797363, "learning_rate": 8.707695525145859e-05, "loss": 0.9123112678527832, "memory(GiB)": 91.52, "step": 41985, "token_acc": 0.7487275195113675, "train_speed(iter/s)": 0.15612 }, { "epoch": 0.5448488171045418, "grad_norm": 0.7961637377738953, "learning_rate": 8.7073356444934e-05, "loss": 0.9635392189025879, "memory(GiB)": 91.52, "step": 41990, "token_acc": 0.7678751258811681, "train_speed(iter/s)": 0.156115 }, { "epoch": 0.5449136955061975, "grad_norm": 0.7560272216796875, "learning_rate": 8.706975721177164e-05, "loss": 0.939600658416748, "memory(GiB)": 91.52, "step": 41995, "token_acc": 0.7520163831127914, "train_speed(iter/s)": 0.156109 }, { "epoch": 0.5449785739078532, "grad_norm": 0.7244036793708801, "learning_rate": 8.706615755201288e-05, "loss": 0.9639554977416992, "memory(GiB)": 91.52, "step": 42000, "token_acc": 0.7601384395455572, "train_speed(iter/s)": 0.156104 }, { "epoch": 0.5450434523095089, "grad_norm": 0.693385124206543, "learning_rate": 8.706255746569917e-05, "loss": 0.9113238334655762, "memory(GiB)": 91.52, "step": 42005, "token_acc": 0.7661818181818182, "train_speed(iter/s)": 0.1561 }, { "epoch": 0.5451083307111646, "grad_norm": 0.8549776673316956, "learning_rate": 8.705895695287194e-05, "loss": 0.9616841316223145, "memory(GiB)": 91.52, "step": 42010, "token_acc": 0.736168455821635, "train_speed(iter/s)": 0.156095 }, { "epoch": 0.5451732091128203, "grad_norm": 0.7750365138053894, "learning_rate": 8.705535601357261e-05, "loss": 0.8960248947143554, "memory(GiB)": 91.52, "step": 42015, "token_acc": 0.7523078676442655, "train_speed(iter/s)": 0.156089 }, { "epoch": 0.545238087514476, "grad_norm": 0.8015558123588562, "learning_rate": 8.705175464784263e-05, "loss": 0.9276597976684571, "memory(GiB)": 91.52, "step": 42020, "token_acc": 0.7410618547225881, "train_speed(iter/s)": 0.156086 }, { "epoch": 0.5453029659161317, "grad_norm": 0.7399624586105347, "learning_rate": 8.704815285572345e-05, "loss": 0.881925392150879, "memory(GiB)": 91.52, "step": 42025, "token_acc": 0.7630585972181119, "train_speed(iter/s)": 0.156081 }, { "epoch": 0.5453678443177874, "grad_norm": 0.7543346285820007, "learning_rate": 8.70445506372565e-05, "loss": 0.895112419128418, "memory(GiB)": 91.52, "step": 42030, "token_acc": 0.7779958746423581, "train_speed(iter/s)": 0.156074 }, { "epoch": 0.5454327227194431, "grad_norm": 0.7603600025177002, "learning_rate": 8.704094799248325e-05, "loss": 0.8929689407348633, "memory(GiB)": 91.52, "step": 42035, "token_acc": 0.7399319909321243, "train_speed(iter/s)": 0.156069 }, { "epoch": 0.5454976011210988, "grad_norm": 0.694257378578186, "learning_rate": 8.703734492144515e-05, "loss": 0.9302677154541016, "memory(GiB)": 91.52, "step": 42040, "token_acc": 0.7474295131300828, "train_speed(iter/s)": 0.156064 }, { "epoch": 0.5455624795227545, "grad_norm": 0.7434925436973572, "learning_rate": 8.703374142418368e-05, "loss": 0.9470470428466797, "memory(GiB)": 91.52, "step": 42045, "token_acc": 0.7652219579253036, "train_speed(iter/s)": 0.156059 }, { "epoch": 0.5456273579244102, "grad_norm": 0.7189685702323914, "learning_rate": 8.70301375007403e-05, "loss": 0.9615992546081543, "memory(GiB)": 91.52, "step": 42050, "token_acc": 0.7509982974771707, "train_speed(iter/s)": 0.156055 }, { "epoch": 0.5456922363260659, "grad_norm": 0.9397421479225159, "learning_rate": 8.702653315115646e-05, "loss": 0.9542388916015625, "memory(GiB)": 91.52, "step": 42055, "token_acc": 0.7427721363430567, "train_speed(iter/s)": 0.156049 }, { "epoch": 0.5457571147277216, "grad_norm": 0.6873374581336975, "learning_rate": 8.702292837547367e-05, "loss": 0.8809345245361329, "memory(GiB)": 91.52, "step": 42060, "token_acc": 0.7648988285410011, "train_speed(iter/s)": 0.156044 }, { "epoch": 0.5458219931293773, "grad_norm": 0.8170542120933533, "learning_rate": 8.701932317373341e-05, "loss": 0.8999994277954102, "memory(GiB)": 91.52, "step": 42065, "token_acc": 0.7494703580020826, "train_speed(iter/s)": 0.156039 }, { "epoch": 0.545886871531033, "grad_norm": 0.818130373954773, "learning_rate": 8.701571754597715e-05, "loss": 0.9464337348937988, "memory(GiB)": 91.52, "step": 42070, "token_acc": 0.7385972618786149, "train_speed(iter/s)": 0.156034 }, { "epoch": 0.5459517499326887, "grad_norm": 0.7764229774475098, "learning_rate": 8.70121114922464e-05, "loss": 0.9140754699707031, "memory(GiB)": 91.52, "step": 42075, "token_acc": 0.7380604867451885, "train_speed(iter/s)": 0.156028 }, { "epoch": 0.5460166283343444, "grad_norm": 0.7688162922859192, "learning_rate": 8.700850501258265e-05, "loss": 0.8944573402404785, "memory(GiB)": 91.52, "step": 42080, "token_acc": 0.7390860746970072, "train_speed(iter/s)": 0.156023 }, { "epoch": 0.5460815067360001, "grad_norm": 0.7894706726074219, "learning_rate": 8.700489810702741e-05, "loss": 0.876658821105957, "memory(GiB)": 91.52, "step": 42085, "token_acc": 0.7558890371920205, "train_speed(iter/s)": 0.156016 }, { "epoch": 0.5461463851376558, "grad_norm": 0.7984351515769958, "learning_rate": 8.700129077562218e-05, "loss": 0.8975141525268555, "memory(GiB)": 91.52, "step": 42090, "token_acc": 0.7567660946677416, "train_speed(iter/s)": 0.156012 }, { "epoch": 0.5462112635393115, "grad_norm": 0.813226044178009, "learning_rate": 8.69976830184085e-05, "loss": 0.9282087326049805, "memory(GiB)": 91.52, "step": 42095, "token_acc": 0.7535213669383718, "train_speed(iter/s)": 0.156007 }, { "epoch": 0.5462761419409672, "grad_norm": 0.6796795725822449, "learning_rate": 8.699407483542783e-05, "loss": 0.9326092720031738, "memory(GiB)": 91.52, "step": 42100, "token_acc": 0.7456460105305792, "train_speed(iter/s)": 0.156002 }, { "epoch": 0.5463410203426229, "grad_norm": 0.818612813949585, "learning_rate": 8.699046622672175e-05, "loss": 0.9201747894287109, "memory(GiB)": 91.52, "step": 42105, "token_acc": 0.7445323890856071, "train_speed(iter/s)": 0.155997 }, { "epoch": 0.5464058987442786, "grad_norm": 0.7809326648712158, "learning_rate": 8.698685719233175e-05, "loss": 0.9254066467285156, "memory(GiB)": 91.52, "step": 42110, "token_acc": 0.7518832563048146, "train_speed(iter/s)": 0.155991 }, { "epoch": 0.5464707771459343, "grad_norm": 0.6074424386024475, "learning_rate": 8.698324773229939e-05, "loss": 0.8573503494262695, "memory(GiB)": 91.52, "step": 42115, "token_acc": 0.7854666383641123, "train_speed(iter/s)": 0.155986 }, { "epoch": 0.54653565554759, "grad_norm": 0.8424140810966492, "learning_rate": 8.697963784666619e-05, "loss": 0.9397872924804688, "memory(GiB)": 91.52, "step": 42120, "token_acc": 0.7494406214379196, "train_speed(iter/s)": 0.155981 }, { "epoch": 0.5466005339492457, "grad_norm": 0.7109355926513672, "learning_rate": 8.697602753547369e-05, "loss": 0.8969110488891602, "memory(GiB)": 91.52, "step": 42125, "token_acc": 0.7579241145591069, "train_speed(iter/s)": 0.155976 }, { "epoch": 0.5466654123509013, "grad_norm": 0.7585212588310242, "learning_rate": 8.697241679876346e-05, "loss": 0.9235477447509766, "memory(GiB)": 91.52, "step": 42130, "token_acc": 0.7693086857432261, "train_speed(iter/s)": 0.155971 }, { "epoch": 0.546730290752557, "grad_norm": 0.8338141441345215, "learning_rate": 8.696880563657703e-05, "loss": 0.9725584030151367, "memory(GiB)": 91.52, "step": 42135, "token_acc": 0.7413316868683586, "train_speed(iter/s)": 0.155966 }, { "epoch": 0.5467951691542127, "grad_norm": 0.6995148658752441, "learning_rate": 8.696519404895597e-05, "loss": 0.8946442604064941, "memory(GiB)": 91.52, "step": 42140, "token_acc": 0.7558526037385181, "train_speed(iter/s)": 0.155961 }, { "epoch": 0.5468600475558684, "grad_norm": 0.7447941303253174, "learning_rate": 8.696158203594182e-05, "loss": 0.9058601379394531, "memory(GiB)": 91.52, "step": 42145, "token_acc": 0.7554966021005197, "train_speed(iter/s)": 0.155955 }, { "epoch": 0.5469249259575241, "grad_norm": 0.7479994893074036, "learning_rate": 8.695796959757618e-05, "loss": 0.9361742019653321, "memory(GiB)": 91.52, "step": 42150, "token_acc": 0.7544825934684347, "train_speed(iter/s)": 0.15595 }, { "epoch": 0.5469898043591798, "grad_norm": 0.7509341239929199, "learning_rate": 8.695435673390059e-05, "loss": 0.9288570404052734, "memory(GiB)": 91.52, "step": 42155, "token_acc": 0.7459090744167483, "train_speed(iter/s)": 0.155944 }, { "epoch": 0.5470546827608355, "grad_norm": 0.7570593953132629, "learning_rate": 8.695074344495666e-05, "loss": 0.9725290298461914, "memory(GiB)": 91.52, "step": 42160, "token_acc": 0.7396616240765024, "train_speed(iter/s)": 0.155938 }, { "epoch": 0.5471195611624912, "grad_norm": 0.7644161581993103, "learning_rate": 8.694712973078595e-05, "loss": 0.8843165397644043, "memory(GiB)": 91.52, "step": 42165, "token_acc": 0.766495303621816, "train_speed(iter/s)": 0.155934 }, { "epoch": 0.5471844395641469, "grad_norm": 0.7449244856834412, "learning_rate": 8.694351559143004e-05, "loss": 0.8826231002807617, "memory(GiB)": 91.52, "step": 42170, "token_acc": 0.7592186449103723, "train_speed(iter/s)": 0.155928 }, { "epoch": 0.5472493179658026, "grad_norm": 0.740123450756073, "learning_rate": 8.693990102693053e-05, "loss": 0.937466812133789, "memory(GiB)": 91.52, "step": 42175, "token_acc": 0.7406741328101584, "train_speed(iter/s)": 0.155923 }, { "epoch": 0.5473141963674583, "grad_norm": 0.8710167407989502, "learning_rate": 8.693628603732902e-05, "loss": 0.9020599365234375, "memory(GiB)": 91.52, "step": 42180, "token_acc": 0.772094027862774, "train_speed(iter/s)": 0.155917 }, { "epoch": 0.547379074769114, "grad_norm": 0.6926396489143372, "learning_rate": 8.69326706226671e-05, "loss": 0.8978931427001953, "memory(GiB)": 91.52, "step": 42185, "token_acc": 0.7370954356846473, "train_speed(iter/s)": 0.155912 }, { "epoch": 0.5474439531707697, "grad_norm": 0.777953565120697, "learning_rate": 8.692905478298638e-05, "loss": 0.9463797569274902, "memory(GiB)": 91.52, "step": 42190, "token_acc": 0.7364135440845953, "train_speed(iter/s)": 0.155908 }, { "epoch": 0.5475088315724254, "grad_norm": 0.7496326565742493, "learning_rate": 8.69254385183285e-05, "loss": 0.9306513786315918, "memory(GiB)": 91.52, "step": 42195, "token_acc": 0.7652328589909444, "train_speed(iter/s)": 0.155902 }, { "epoch": 0.5475737099740811, "grad_norm": 0.770799994468689, "learning_rate": 8.692182182873503e-05, "loss": 0.9291436195373535, "memory(GiB)": 91.52, "step": 42200, "token_acc": 0.7277607638263376, "train_speed(iter/s)": 0.155896 }, { "epoch": 0.5476385883757368, "grad_norm": 0.8092264533042908, "learning_rate": 8.691820471424761e-05, "loss": 0.9019899368286133, "memory(GiB)": 91.52, "step": 42205, "token_acc": 0.7509497824134834, "train_speed(iter/s)": 0.155891 }, { "epoch": 0.5477034667773925, "grad_norm": 0.7256063222885132, "learning_rate": 8.691458717490787e-05, "loss": 0.9138572692871094, "memory(GiB)": 91.52, "step": 42210, "token_acc": 0.7726868044515103, "train_speed(iter/s)": 0.155886 }, { "epoch": 0.5477683451790482, "grad_norm": 0.7298921346664429, "learning_rate": 8.691096921075743e-05, "loss": 0.9458889961242676, "memory(GiB)": 91.52, "step": 42215, "token_acc": 0.753935705781188, "train_speed(iter/s)": 0.15588 }, { "epoch": 0.5478332235807039, "grad_norm": 0.759156346321106, "learning_rate": 8.690735082183794e-05, "loss": 0.9586289405822754, "memory(GiB)": 91.52, "step": 42220, "token_acc": 0.7442641982097578, "train_speed(iter/s)": 0.155875 }, { "epoch": 0.5478981019823596, "grad_norm": 0.7313628792762756, "learning_rate": 8.690373200819105e-05, "loss": 0.9036964416503906, "memory(GiB)": 91.52, "step": 42225, "token_acc": 0.7826875684556407, "train_speed(iter/s)": 0.155868 }, { "epoch": 0.5479629803840153, "grad_norm": 0.8245922327041626, "learning_rate": 8.690011276985835e-05, "loss": 0.9252904891967774, "memory(GiB)": 91.52, "step": 42230, "token_acc": 0.7555323925593329, "train_speed(iter/s)": 0.155862 }, { "epoch": 0.548027858785671, "grad_norm": 0.8076310753822327, "learning_rate": 8.689649310688154e-05, "loss": 0.9363461494445801, "memory(GiB)": 91.52, "step": 42235, "token_acc": 0.7264297900749203, "train_speed(iter/s)": 0.155858 }, { "epoch": 0.5480927371873267, "grad_norm": 0.7362058758735657, "learning_rate": 8.689287301930228e-05, "loss": 0.9120313644409179, "memory(GiB)": 91.52, "step": 42240, "token_acc": 0.7251112909753137, "train_speed(iter/s)": 0.155853 }, { "epoch": 0.5481576155889823, "grad_norm": 0.7829142808914185, "learning_rate": 8.68892525071622e-05, "loss": 0.9520990371704101, "memory(GiB)": 91.52, "step": 42245, "token_acc": 0.7476657486397528, "train_speed(iter/s)": 0.15585 }, { "epoch": 0.548222493990638, "grad_norm": 0.725864827632904, "learning_rate": 8.688563157050297e-05, "loss": 0.9166107177734375, "memory(GiB)": 91.52, "step": 42250, "token_acc": 0.7419813717188823, "train_speed(iter/s)": 0.155845 }, { "epoch": 0.5482873723922937, "grad_norm": 0.6974787712097168, "learning_rate": 8.688201020936626e-05, "loss": 0.8639863967895508, "memory(GiB)": 91.52, "step": 42255, "token_acc": 0.7659782963742605, "train_speed(iter/s)": 0.15584 }, { "epoch": 0.5483522507939494, "grad_norm": 0.7669858336448669, "learning_rate": 8.687838842379377e-05, "loss": 0.8874512672424316, "memory(GiB)": 91.52, "step": 42260, "token_acc": 0.770313851962558, "train_speed(iter/s)": 0.155835 }, { "epoch": 0.5484171291956051, "grad_norm": 0.7293767929077148, "learning_rate": 8.687476621382715e-05, "loss": 0.894223403930664, "memory(GiB)": 91.52, "step": 42265, "token_acc": 0.7697996720151773, "train_speed(iter/s)": 0.155828 }, { "epoch": 0.5484820075972608, "grad_norm": 0.7878575921058655, "learning_rate": 8.68711435795081e-05, "loss": 0.9086524963378906, "memory(GiB)": 91.52, "step": 42270, "token_acc": 0.7551989781662874, "train_speed(iter/s)": 0.155824 }, { "epoch": 0.5485468859989165, "grad_norm": 0.7621117234230042, "learning_rate": 8.686752052087829e-05, "loss": 0.8773857116699219, "memory(GiB)": 91.52, "step": 42275, "token_acc": 0.7601945653366492, "train_speed(iter/s)": 0.155819 }, { "epoch": 0.5486117644005722, "grad_norm": 0.732651948928833, "learning_rate": 8.686389703797943e-05, "loss": 0.9217350006103515, "memory(GiB)": 91.52, "step": 42280, "token_acc": 0.7849234838465898, "train_speed(iter/s)": 0.155813 }, { "epoch": 0.5486766428022279, "grad_norm": 0.8016971945762634, "learning_rate": 8.686027313085323e-05, "loss": 0.9505487442016601, "memory(GiB)": 91.52, "step": 42285, "token_acc": 0.7449868569282764, "train_speed(iter/s)": 0.155807 }, { "epoch": 0.5487415212038836, "grad_norm": 0.8405091166496277, "learning_rate": 8.685664879954136e-05, "loss": 0.9401847839355468, "memory(GiB)": 91.52, "step": 42290, "token_acc": 0.7505722364346304, "train_speed(iter/s)": 0.155803 }, { "epoch": 0.5488063996055393, "grad_norm": 0.7489477396011353, "learning_rate": 8.685302404408555e-05, "loss": 0.9413939476013183, "memory(GiB)": 91.52, "step": 42295, "token_acc": 0.737405418120516, "train_speed(iter/s)": 0.155798 }, { "epoch": 0.548871278007195, "grad_norm": 0.7167496085166931, "learning_rate": 8.684939886452752e-05, "loss": 0.9216163635253907, "memory(GiB)": 91.52, "step": 42300, "token_acc": 0.7667106916212995, "train_speed(iter/s)": 0.155792 }, { "epoch": 0.5489361564088507, "grad_norm": 0.7278420925140381, "learning_rate": 8.684577326090897e-05, "loss": 0.9085830688476563, "memory(GiB)": 91.52, "step": 42305, "token_acc": 0.7556250424852151, "train_speed(iter/s)": 0.155788 }, { "epoch": 0.5490010348105064, "grad_norm": 0.7357620596885681, "learning_rate": 8.684214723327164e-05, "loss": 0.9146259307861329, "memory(GiB)": 91.52, "step": 42310, "token_acc": 0.750934873560665, "train_speed(iter/s)": 0.155781 }, { "epoch": 0.5490659132121621, "grad_norm": 0.7375692129135132, "learning_rate": 8.683852078165726e-05, "loss": 0.949951171875, "memory(GiB)": 91.52, "step": 42315, "token_acc": 0.750596553247901, "train_speed(iter/s)": 0.155775 }, { "epoch": 0.5491307916138178, "grad_norm": 0.6547554135322571, "learning_rate": 8.683489390610754e-05, "loss": 0.9737152099609375, "memory(GiB)": 91.52, "step": 42320, "token_acc": 0.7426121033020686, "train_speed(iter/s)": 0.155769 }, { "epoch": 0.5491956700154735, "grad_norm": 0.823460578918457, "learning_rate": 8.683126660666423e-05, "loss": 0.9498958587646484, "memory(GiB)": 91.52, "step": 42325, "token_acc": 0.7480325784391216, "train_speed(iter/s)": 0.155764 }, { "epoch": 0.5492605484171292, "grad_norm": 0.8324088454246521, "learning_rate": 8.682763888336909e-05, "loss": 0.919986629486084, "memory(GiB)": 91.52, "step": 42330, "token_acc": 0.7547196694450381, "train_speed(iter/s)": 0.15576 }, { "epoch": 0.5493254268187849, "grad_norm": 0.7432861924171448, "learning_rate": 8.682401073626383e-05, "loss": 0.9213554382324218, "memory(GiB)": 91.52, "step": 42335, "token_acc": 0.741978924687742, "train_speed(iter/s)": 0.155754 }, { "epoch": 0.5493903052204406, "grad_norm": 0.7463334202766418, "learning_rate": 8.682038216539023e-05, "loss": 0.9110239982604981, "memory(GiB)": 91.52, "step": 42340, "token_acc": 0.7549924165824065, "train_speed(iter/s)": 0.155748 }, { "epoch": 0.5494551836220963, "grad_norm": 0.7534124851226807, "learning_rate": 8.681675317079005e-05, "loss": 0.887841796875, "memory(GiB)": 91.52, "step": 42345, "token_acc": 0.7662564066803808, "train_speed(iter/s)": 0.155744 }, { "epoch": 0.549520062023752, "grad_norm": 0.8944858908653259, "learning_rate": 8.681312375250505e-05, "loss": 0.9375984191894531, "memory(GiB)": 91.52, "step": 42350, "token_acc": 0.7355139402082633, "train_speed(iter/s)": 0.15574 }, { "epoch": 0.5495849404254077, "grad_norm": 0.7644066214561462, "learning_rate": 8.680949391057698e-05, "loss": 0.9340556144714356, "memory(GiB)": 91.52, "step": 42355, "token_acc": 0.7357142857142858, "train_speed(iter/s)": 0.155735 }, { "epoch": 0.5496498188270634, "grad_norm": 0.7639716267585754, "learning_rate": 8.680586364504763e-05, "loss": 0.9031728744506836, "memory(GiB)": 91.52, "step": 42360, "token_acc": 0.7614304976709978, "train_speed(iter/s)": 0.155729 }, { "epoch": 0.549714697228719, "grad_norm": 1.3163889646530151, "learning_rate": 8.680223295595876e-05, "loss": 0.9790382385253906, "memory(GiB)": 91.52, "step": 42365, "token_acc": 0.750627645075676, "train_speed(iter/s)": 0.155723 }, { "epoch": 0.5497795756303747, "grad_norm": 0.778437077999115, "learning_rate": 8.679860184335216e-05, "loss": 0.898832130432129, "memory(GiB)": 91.52, "step": 42370, "token_acc": 0.7589229179858034, "train_speed(iter/s)": 0.155717 }, { "epoch": 0.5498444540320304, "grad_norm": 0.7712852358818054, "learning_rate": 8.679497030726964e-05, "loss": 0.8894623756408692, "memory(GiB)": 91.52, "step": 42375, "token_acc": 0.751861345050653, "train_speed(iter/s)": 0.155712 }, { "epoch": 0.5499093324336861, "grad_norm": 0.7307358384132385, "learning_rate": 8.679133834775295e-05, "loss": 0.9570052146911621, "memory(GiB)": 91.52, "step": 42380, "token_acc": 0.7467697934466528, "train_speed(iter/s)": 0.155707 }, { "epoch": 0.5499742108353418, "grad_norm": 0.7534210085868835, "learning_rate": 8.678770596484391e-05, "loss": 0.9820499420166016, "memory(GiB)": 91.52, "step": 42385, "token_acc": 0.7363422853058307, "train_speed(iter/s)": 0.155703 }, { "epoch": 0.5500390892369975, "grad_norm": 0.742485523223877, "learning_rate": 8.678407315858432e-05, "loss": 0.901606559753418, "memory(GiB)": 91.52, "step": 42390, "token_acc": 0.7430295849224275, "train_speed(iter/s)": 0.155698 }, { "epoch": 0.5501039676386532, "grad_norm": 0.8477941751480103, "learning_rate": 8.678043992901599e-05, "loss": 0.9533401489257812, "memory(GiB)": 91.52, "step": 42395, "token_acc": 0.7527915632754343, "train_speed(iter/s)": 0.155694 }, { "epoch": 0.5501688460403089, "grad_norm": 0.8142568469047546, "learning_rate": 8.67768062761807e-05, "loss": 0.9479765892028809, "memory(GiB)": 91.52, "step": 42400, "token_acc": 0.741929927309057, "train_speed(iter/s)": 0.155689 }, { "epoch": 0.5502337244419646, "grad_norm": 0.7279124855995178, "learning_rate": 8.677317220012031e-05, "loss": 0.9408382415771485, "memory(GiB)": 91.52, "step": 42405, "token_acc": 0.7455019756780604, "train_speed(iter/s)": 0.155685 }, { "epoch": 0.5502986028436203, "grad_norm": 0.8116828203201294, "learning_rate": 8.676953770087663e-05, "loss": 0.9417396545410156, "memory(GiB)": 91.52, "step": 42410, "token_acc": 0.7356520486786203, "train_speed(iter/s)": 0.155681 }, { "epoch": 0.550363481245276, "grad_norm": 0.717896044254303, "learning_rate": 8.676590277849146e-05, "loss": 0.9249275207519532, "memory(GiB)": 91.52, "step": 42415, "token_acc": 0.7375896751865605, "train_speed(iter/s)": 0.155675 }, { "epoch": 0.5504283596469317, "grad_norm": 0.7351647615432739, "learning_rate": 8.676226743300666e-05, "loss": 0.8952153205871582, "memory(GiB)": 91.52, "step": 42420, "token_acc": 0.7296393513358024, "train_speed(iter/s)": 0.15567 }, { "epoch": 0.5504932380485874, "grad_norm": 0.7947661876678467, "learning_rate": 8.675863166446403e-05, "loss": 0.9012493133544922, "memory(GiB)": 91.52, "step": 42425, "token_acc": 0.7774019458802067, "train_speed(iter/s)": 0.155665 }, { "epoch": 0.5505581164502431, "grad_norm": 0.7652872204780579, "learning_rate": 8.675499547290546e-05, "loss": 0.9422157287597657, "memory(GiB)": 91.52, "step": 42430, "token_acc": 0.7422922252010724, "train_speed(iter/s)": 0.15566 }, { "epoch": 0.5506229948518988, "grad_norm": 0.8230770826339722, "learning_rate": 8.675135885837274e-05, "loss": 0.9146700859069824, "memory(GiB)": 91.52, "step": 42435, "token_acc": 0.7473507536365431, "train_speed(iter/s)": 0.155654 }, { "epoch": 0.5506878732535545, "grad_norm": 0.7405735850334167, "learning_rate": 8.674772182090776e-05, "loss": 0.9293529510498046, "memory(GiB)": 91.52, "step": 42440, "token_acc": 0.766362691908115, "train_speed(iter/s)": 0.15565 }, { "epoch": 0.5507527516552102, "grad_norm": 0.8006726503372192, "learning_rate": 8.674408436055237e-05, "loss": 0.9134931564331055, "memory(GiB)": 91.52, "step": 42445, "token_acc": 0.7821024958175424, "train_speed(iter/s)": 0.155646 }, { "epoch": 0.5508176300568659, "grad_norm": 0.7331537008285522, "learning_rate": 8.674044647734842e-05, "loss": 0.8773429870605469, "memory(GiB)": 91.52, "step": 42450, "token_acc": 0.7723725389216812, "train_speed(iter/s)": 0.155641 }, { "epoch": 0.5508825084585216, "grad_norm": 0.8064594864845276, "learning_rate": 8.673680817133779e-05, "loss": 0.9650007247924804, "memory(GiB)": 91.52, "step": 42455, "token_acc": 0.7374698426721941, "train_speed(iter/s)": 0.155635 }, { "epoch": 0.5509473868601773, "grad_norm": 0.909172773361206, "learning_rate": 8.673316944256231e-05, "loss": 0.8834124565124511, "memory(GiB)": 91.52, "step": 42460, "token_acc": 0.7789617147813158, "train_speed(iter/s)": 0.15563 }, { "epoch": 0.551012265261833, "grad_norm": 0.6927471160888672, "learning_rate": 8.672953029106387e-05, "loss": 0.9155673980712891, "memory(GiB)": 91.52, "step": 42465, "token_acc": 0.7505840944260703, "train_speed(iter/s)": 0.155624 }, { "epoch": 0.5510771436634887, "grad_norm": 0.8458043932914734, "learning_rate": 8.67258907168844e-05, "loss": 0.9262384414672852, "memory(GiB)": 91.52, "step": 42470, "token_acc": 0.755736596819524, "train_speed(iter/s)": 0.155619 }, { "epoch": 0.5511420220651444, "grad_norm": 0.7192775011062622, "learning_rate": 8.672225072006571e-05, "loss": 0.9201641082763672, "memory(GiB)": 91.52, "step": 42475, "token_acc": 0.7391728171563849, "train_speed(iter/s)": 0.155614 }, { "epoch": 0.5512069004668001, "grad_norm": 0.7487998604774475, "learning_rate": 8.671861030064974e-05, "loss": 0.8532032012939453, "memory(GiB)": 91.52, "step": 42480, "token_acc": 0.7882185236554169, "train_speed(iter/s)": 0.155609 }, { "epoch": 0.5512717788684558, "grad_norm": 0.7336389422416687, "learning_rate": 8.671496945867836e-05, "loss": 0.9548490524291993, "memory(GiB)": 91.52, "step": 42485, "token_acc": 0.7540851406986121, "train_speed(iter/s)": 0.155604 }, { "epoch": 0.5513366572701115, "grad_norm": 0.8212345838546753, "learning_rate": 8.671132819419348e-05, "loss": 0.8819875717163086, "memory(GiB)": 91.52, "step": 42490, "token_acc": 0.7683784902416505, "train_speed(iter/s)": 0.155599 }, { "epoch": 0.5514015356717672, "grad_norm": 0.7066261172294617, "learning_rate": 8.670768650723698e-05, "loss": 0.8555068969726562, "memory(GiB)": 91.52, "step": 42495, "token_acc": 0.7668013719563962, "train_speed(iter/s)": 0.155594 }, { "epoch": 0.5514664140734229, "grad_norm": 0.7470434904098511, "learning_rate": 8.670404439785081e-05, "loss": 0.8957403182983399, "memory(GiB)": 91.52, "step": 42500, "token_acc": 0.7751653911013101, "train_speed(iter/s)": 0.155589 }, { "epoch": 0.5515312924750786, "grad_norm": 0.6701933145523071, "learning_rate": 8.670040186607684e-05, "loss": 0.9277320861816406, "memory(GiB)": 91.52, "step": 42505, "token_acc": 0.7548058736545273, "train_speed(iter/s)": 0.155584 }, { "epoch": 0.5515961708767343, "grad_norm": 0.6710187196731567, "learning_rate": 8.669675891195701e-05, "loss": 0.8804750442504883, "memory(GiB)": 91.52, "step": 42510, "token_acc": 0.7764540995094604, "train_speed(iter/s)": 0.155579 }, { "epoch": 0.55166104927839, "grad_norm": 0.7293879389762878, "learning_rate": 8.669311553553325e-05, "loss": 0.9094575881958008, "memory(GiB)": 91.52, "step": 42515, "token_acc": 0.749868108678449, "train_speed(iter/s)": 0.155573 }, { "epoch": 0.5517259276800457, "grad_norm": 0.783500611782074, "learning_rate": 8.668947173684747e-05, "loss": 0.8955050468444824, "memory(GiB)": 91.52, "step": 42520, "token_acc": 0.7617547504943195, "train_speed(iter/s)": 0.155567 }, { "epoch": 0.5517908060817014, "grad_norm": 0.7444303035736084, "learning_rate": 8.668582751594161e-05, "loss": 0.9250870704650879, "memory(GiB)": 91.52, "step": 42525, "token_acc": 0.7534289768248463, "train_speed(iter/s)": 0.155561 }, { "epoch": 0.5518556844833571, "grad_norm": 0.8147947192192078, "learning_rate": 8.668218287285763e-05, "loss": 0.9588445663452149, "memory(GiB)": 91.52, "step": 42530, "token_acc": 0.7294344473007712, "train_speed(iter/s)": 0.155556 }, { "epoch": 0.5519205628850128, "grad_norm": 0.749536395072937, "learning_rate": 8.667853780763741e-05, "loss": 0.9567107200622559, "memory(GiB)": 91.52, "step": 42535, "token_acc": 0.7394782608695653, "train_speed(iter/s)": 0.155551 }, { "epoch": 0.5519854412866685, "grad_norm": 0.7240094542503357, "learning_rate": 8.667489232032298e-05, "loss": 0.9208377838134766, "memory(GiB)": 91.52, "step": 42540, "token_acc": 0.7459223137716495, "train_speed(iter/s)": 0.155545 }, { "epoch": 0.5520503196883242, "grad_norm": 0.7401596307754517, "learning_rate": 8.667124641095623e-05, "loss": 0.8920125007629395, "memory(GiB)": 91.52, "step": 42545, "token_acc": 0.7612010838742201, "train_speed(iter/s)": 0.15554 }, { "epoch": 0.5521151980899799, "grad_norm": 0.8327234983444214, "learning_rate": 8.666760007957913e-05, "loss": 0.9452959060668945, "memory(GiB)": 91.52, "step": 42550, "token_acc": 0.7634133368791969, "train_speed(iter/s)": 0.155534 }, { "epoch": 0.5521800764916356, "grad_norm": 0.7429541349411011, "learning_rate": 8.666395332623365e-05, "loss": 0.9488525390625, "memory(GiB)": 91.52, "step": 42555, "token_acc": 0.7493852917070263, "train_speed(iter/s)": 0.15553 }, { "epoch": 0.5522449548932913, "grad_norm": 0.7283389568328857, "learning_rate": 8.666030615096176e-05, "loss": 0.9205481529235839, "memory(GiB)": 91.52, "step": 42560, "token_acc": 0.7506050826946349, "train_speed(iter/s)": 0.155524 }, { "epoch": 0.552309833294947, "grad_norm": 0.6219580769538879, "learning_rate": 8.665665855380542e-05, "loss": 0.9032660484313965, "memory(GiB)": 91.52, "step": 42565, "token_acc": 0.7428418803418804, "train_speed(iter/s)": 0.155519 }, { "epoch": 0.5523747116966027, "grad_norm": 0.7787117958068848, "learning_rate": 8.665301053480661e-05, "loss": 0.9087470054626465, "memory(GiB)": 91.52, "step": 42570, "token_acc": 0.7541745317264411, "train_speed(iter/s)": 0.155514 }, { "epoch": 0.5524395900982584, "grad_norm": 0.7669621109962463, "learning_rate": 8.664936209400734e-05, "loss": 0.9665790557861328, "memory(GiB)": 91.52, "step": 42575, "token_acc": 0.7426182912556428, "train_speed(iter/s)": 0.155509 }, { "epoch": 0.5525044684999141, "grad_norm": 0.7847237586975098, "learning_rate": 8.664571323144955e-05, "loss": 0.8709030151367188, "memory(GiB)": 91.52, "step": 42580, "token_acc": 0.7375303643724697, "train_speed(iter/s)": 0.155505 }, { "epoch": 0.5525693469015698, "grad_norm": 0.7993110418319702, "learning_rate": 8.664206394717525e-05, "loss": 0.9250623703002929, "memory(GiB)": 91.52, "step": 42585, "token_acc": 0.7760192362905541, "train_speed(iter/s)": 0.1555 }, { "epoch": 0.5526342253032255, "grad_norm": 0.8239527940750122, "learning_rate": 8.663841424122642e-05, "loss": 0.9150773048400879, "memory(GiB)": 91.52, "step": 42590, "token_acc": 0.7658552604572882, "train_speed(iter/s)": 0.155495 }, { "epoch": 0.5526991037048812, "grad_norm": 0.8714773058891296, "learning_rate": 8.66347641136451e-05, "loss": 0.9059698104858398, "memory(GiB)": 91.52, "step": 42595, "token_acc": 0.7606525044171426, "train_speed(iter/s)": 0.155493 }, { "epoch": 0.5527639821065369, "grad_norm": 0.7203782200813293, "learning_rate": 8.663111356447326e-05, "loss": 0.9304267883300781, "memory(GiB)": 91.52, "step": 42600, "token_acc": 0.7439879408259132, "train_speed(iter/s)": 0.155488 }, { "epoch": 0.5528288605081925, "grad_norm": 0.8101024627685547, "learning_rate": 8.662746259375294e-05, "loss": 0.9266353607177734, "memory(GiB)": 91.52, "step": 42605, "token_acc": 0.7591320864991233, "train_speed(iter/s)": 0.155484 }, { "epoch": 0.5528937389098482, "grad_norm": 0.7398126721382141, "learning_rate": 8.662381120152613e-05, "loss": 0.877919864654541, "memory(GiB)": 91.52, "step": 42610, "token_acc": 0.7375467210329596, "train_speed(iter/s)": 0.155478 }, { "epoch": 0.5529586173115039, "grad_norm": 0.736000657081604, "learning_rate": 8.662015938783485e-05, "loss": 0.8691143989562988, "memory(GiB)": 91.52, "step": 42615, "token_acc": 0.7680737506065017, "train_speed(iter/s)": 0.155473 }, { "epoch": 0.5530234957131596, "grad_norm": 0.7628391981124878, "learning_rate": 8.661650715272114e-05, "loss": 0.9633936882019043, "memory(GiB)": 91.52, "step": 42620, "token_acc": 0.7385966217724533, "train_speed(iter/s)": 0.155468 }, { "epoch": 0.5530883741148153, "grad_norm": 0.7155345678329468, "learning_rate": 8.661285449622702e-05, "loss": 0.8927543640136719, "memory(GiB)": 91.52, "step": 42625, "token_acc": 0.766340901490097, "train_speed(iter/s)": 0.155463 }, { "epoch": 0.553153252516471, "grad_norm": 0.7234572768211365, "learning_rate": 8.660920141839452e-05, "loss": 0.8910257339477539, "memory(GiB)": 91.52, "step": 42630, "token_acc": 0.7604010366934933, "train_speed(iter/s)": 0.155458 }, { "epoch": 0.5532181309181267, "grad_norm": 0.7571569085121155, "learning_rate": 8.66055479192657e-05, "loss": 0.9187139511108399, "memory(GiB)": 91.52, "step": 42635, "token_acc": 0.7458658988070753, "train_speed(iter/s)": 0.155452 }, { "epoch": 0.5532830093197824, "grad_norm": 0.7430616617202759, "learning_rate": 8.660189399888259e-05, "loss": 0.901552391052246, "memory(GiB)": 91.52, "step": 42640, "token_acc": 0.7573529411764706, "train_speed(iter/s)": 0.155446 }, { "epoch": 0.5533478877214381, "grad_norm": 0.7143846750259399, "learning_rate": 8.659823965728723e-05, "loss": 0.9380971908569335, "memory(GiB)": 91.52, "step": 42645, "token_acc": 0.7519131680843928, "train_speed(iter/s)": 0.155439 }, { "epoch": 0.5534127661230938, "grad_norm": 0.8115499019622803, "learning_rate": 8.659458489452168e-05, "loss": 0.9380413055419922, "memory(GiB)": 91.52, "step": 42650, "token_acc": 0.7524608953613808, "train_speed(iter/s)": 0.155435 }, { "epoch": 0.5534776445247495, "grad_norm": 0.9719724655151367, "learning_rate": 8.6590929710628e-05, "loss": 0.9344426155090332, "memory(GiB)": 91.52, "step": 42655, "token_acc": 0.7467291681242566, "train_speed(iter/s)": 0.15543 }, { "epoch": 0.5535425229264052, "grad_norm": 0.6813554763793945, "learning_rate": 8.658727410564828e-05, "loss": 0.8927074432373047, "memory(GiB)": 91.52, "step": 42660, "token_acc": 0.7847908281497744, "train_speed(iter/s)": 0.155424 }, { "epoch": 0.5536074013280609, "grad_norm": 0.7907741069793701, "learning_rate": 8.658361807962456e-05, "loss": 0.922302532196045, "memory(GiB)": 91.52, "step": 42665, "token_acc": 0.7412484422793701, "train_speed(iter/s)": 0.155418 }, { "epoch": 0.5536722797297166, "grad_norm": 0.9284861087799072, "learning_rate": 8.65799616325989e-05, "loss": 0.9458211898803711, "memory(GiB)": 91.52, "step": 42670, "token_acc": 0.745165878599162, "train_speed(iter/s)": 0.155414 }, { "epoch": 0.5537371581313723, "grad_norm": 0.7597628235816956, "learning_rate": 8.65763047646134e-05, "loss": 0.9164331436157227, "memory(GiB)": 91.52, "step": 42675, "token_acc": 0.7668334771354616, "train_speed(iter/s)": 0.15541 }, { "epoch": 0.553802036533028, "grad_norm": 0.8228163719177246, "learning_rate": 8.657264747571014e-05, "loss": 0.9556789398193359, "memory(GiB)": 91.52, "step": 42680, "token_acc": 0.7670078266104756, "train_speed(iter/s)": 0.155406 }, { "epoch": 0.5538669149346837, "grad_norm": 0.7691299915313721, "learning_rate": 8.65689897659312e-05, "loss": 0.883700180053711, "memory(GiB)": 91.52, "step": 42685, "token_acc": 0.7595757414549807, "train_speed(iter/s)": 0.155402 }, { "epoch": 0.5539317933363394, "grad_norm": 0.7901089787483215, "learning_rate": 8.65653316353187e-05, "loss": 0.9290867805480957, "memory(GiB)": 91.52, "step": 42690, "token_acc": 0.757859438312142, "train_speed(iter/s)": 0.155398 }, { "epoch": 0.553996671737995, "grad_norm": 0.8387861847877502, "learning_rate": 8.65616730839147e-05, "loss": 0.9525327682495117, "memory(GiB)": 91.52, "step": 42695, "token_acc": 0.7482536692567302, "train_speed(iter/s)": 0.155393 }, { "epoch": 0.5540615501396507, "grad_norm": 0.7271916270256042, "learning_rate": 8.655801411176133e-05, "loss": 0.83980712890625, "memory(GiB)": 91.52, "step": 42700, "token_acc": 0.7599114064230343, "train_speed(iter/s)": 0.155387 }, { "epoch": 0.5541264285413064, "grad_norm": 0.8832514882087708, "learning_rate": 8.655435471890068e-05, "loss": 0.8815510749816895, "memory(GiB)": 91.52, "step": 42705, "token_acc": 0.7708961726657283, "train_speed(iter/s)": 0.155382 }, { "epoch": 0.5541913069429621, "grad_norm": 0.7500200867652893, "learning_rate": 8.655069490537486e-05, "loss": 0.9201038360595704, "memory(GiB)": 91.52, "step": 42710, "token_acc": 0.7324721948472477, "train_speed(iter/s)": 0.155376 }, { "epoch": 0.5542561853446178, "grad_norm": 0.7369035482406616, "learning_rate": 8.6547034671226e-05, "loss": 0.8414477348327637, "memory(GiB)": 91.52, "step": 42715, "token_acc": 0.7873514431239389, "train_speed(iter/s)": 0.155371 }, { "epoch": 0.5543210637462735, "grad_norm": 0.7665390372276306, "learning_rate": 8.654337401649623e-05, "loss": 0.9006891250610352, "memory(GiB)": 91.52, "step": 42720, "token_acc": 0.7785983862571577, "train_speed(iter/s)": 0.155365 }, { "epoch": 0.5543859421479292, "grad_norm": 0.8290761709213257, "learning_rate": 8.653971294122765e-05, "loss": 0.8849489212036132, "memory(GiB)": 91.52, "step": 42725, "token_acc": 0.7456247777821673, "train_speed(iter/s)": 0.15536 }, { "epoch": 0.5544508205495849, "grad_norm": 0.8003376722335815, "learning_rate": 8.65360514454624e-05, "loss": 0.9382740020751953, "memory(GiB)": 91.52, "step": 42730, "token_acc": 0.7454026191139593, "train_speed(iter/s)": 0.155355 }, { "epoch": 0.5545156989512406, "grad_norm": 0.8313468098640442, "learning_rate": 8.653238952924265e-05, "loss": 0.8850080490112304, "memory(GiB)": 91.52, "step": 42735, "token_acc": 0.7724596467336885, "train_speed(iter/s)": 0.155351 }, { "epoch": 0.5545805773528963, "grad_norm": 0.7537244558334351, "learning_rate": 8.65287271926105e-05, "loss": 0.8861024856567383, "memory(GiB)": 91.52, "step": 42740, "token_acc": 0.7649546634199481, "train_speed(iter/s)": 0.155345 }, { "epoch": 0.554645455754552, "grad_norm": 0.7150239944458008, "learning_rate": 8.652506443560808e-05, "loss": 0.9062201499938964, "memory(GiB)": 91.52, "step": 42745, "token_acc": 0.7444466680008005, "train_speed(iter/s)": 0.155341 }, { "epoch": 0.5547103341562077, "grad_norm": 0.817270815372467, "learning_rate": 8.65214012582776e-05, "loss": 0.8970535278320313, "memory(GiB)": 91.52, "step": 42750, "token_acc": 0.747430950006504, "train_speed(iter/s)": 0.155336 }, { "epoch": 0.5547752125578634, "grad_norm": 0.8471723794937134, "learning_rate": 8.651773766066118e-05, "loss": 0.9241829872131347, "memory(GiB)": 91.52, "step": 42755, "token_acc": 0.7505339598462195, "train_speed(iter/s)": 0.155331 }, { "epoch": 0.5548400909595191, "grad_norm": 0.6699353456497192, "learning_rate": 8.651407364280097e-05, "loss": 0.9313895225524902, "memory(GiB)": 91.52, "step": 42760, "token_acc": 0.737115678613895, "train_speed(iter/s)": 0.155326 }, { "epoch": 0.5549049693611748, "grad_norm": 0.7814653515815735, "learning_rate": 8.651040920473917e-05, "loss": 0.872719669342041, "memory(GiB)": 91.52, "step": 42765, "token_acc": 0.7683421689901172, "train_speed(iter/s)": 0.155321 }, { "epoch": 0.5549698477628305, "grad_norm": 0.7128492593765259, "learning_rate": 8.650674434651794e-05, "loss": 0.9659778594970703, "memory(GiB)": 91.52, "step": 42770, "token_acc": 0.7348265624412793, "train_speed(iter/s)": 0.155316 }, { "epoch": 0.5550347261644862, "grad_norm": 0.7168795466423035, "learning_rate": 8.650307906817942e-05, "loss": 0.8833250999450684, "memory(GiB)": 91.52, "step": 42775, "token_acc": 0.7448390908459025, "train_speed(iter/s)": 0.15531 }, { "epoch": 0.5550996045661419, "grad_norm": 0.7671772837638855, "learning_rate": 8.649941336976581e-05, "loss": 0.9215032577514648, "memory(GiB)": 91.52, "step": 42780, "token_acc": 0.7442198633124909, "train_speed(iter/s)": 0.155305 }, { "epoch": 0.5551644829677976, "grad_norm": 0.6870409846305847, "learning_rate": 8.649574725131933e-05, "loss": 0.9119589805603028, "memory(GiB)": 91.52, "step": 42785, "token_acc": 0.7698547107585346, "train_speed(iter/s)": 0.155299 }, { "epoch": 0.5552293613694533, "grad_norm": 0.7468801736831665, "learning_rate": 8.649208071288211e-05, "loss": 0.9296468734741211, "memory(GiB)": 91.52, "step": 42790, "token_acc": 0.7572085411471322, "train_speed(iter/s)": 0.155295 }, { "epoch": 0.555294239771109, "grad_norm": 0.7144754528999329, "learning_rate": 8.648841375449639e-05, "loss": 0.8931316375732422, "memory(GiB)": 91.52, "step": 42795, "token_acc": 0.7662895174708818, "train_speed(iter/s)": 0.15529 }, { "epoch": 0.5553591181727647, "grad_norm": 0.8413100838661194, "learning_rate": 8.648474637620435e-05, "loss": 0.9355517387390136, "memory(GiB)": 91.52, "step": 42800, "token_acc": 0.7605345022624435, "train_speed(iter/s)": 0.155284 }, { "epoch": 0.5554239965744204, "grad_norm": 0.7725657224655151, "learning_rate": 8.648107857804819e-05, "loss": 0.9178549766540527, "memory(GiB)": 91.52, "step": 42805, "token_acc": 0.7543026706231454, "train_speed(iter/s)": 0.155279 }, { "epoch": 0.5554888749760761, "grad_norm": 0.709382176399231, "learning_rate": 8.647741036007015e-05, "loss": 0.9148201942443848, "memory(GiB)": 91.52, "step": 42810, "token_acc": 0.7617414796342478, "train_speed(iter/s)": 0.155273 }, { "epoch": 0.5555537533777318, "grad_norm": 0.7548172473907471, "learning_rate": 8.647374172231239e-05, "loss": 0.9299155235290527, "memory(GiB)": 91.52, "step": 42815, "token_acc": 0.7646939766839378, "train_speed(iter/s)": 0.155268 }, { "epoch": 0.5556186317793875, "grad_norm": 0.7777789831161499, "learning_rate": 8.647007266481717e-05, "loss": 0.8880982398986816, "memory(GiB)": 91.52, "step": 42820, "token_acc": 0.7422634227031498, "train_speed(iter/s)": 0.155262 }, { "epoch": 0.5556835101810432, "grad_norm": 0.8369085192680359, "learning_rate": 8.646640318762671e-05, "loss": 0.8939159393310547, "memory(GiB)": 91.52, "step": 42825, "token_acc": 0.7632158179444134, "train_speed(iter/s)": 0.155258 }, { "epoch": 0.5557483885826989, "grad_norm": 0.8131716847419739, "learning_rate": 8.646273329078321e-05, "loss": 0.8793668746948242, "memory(GiB)": 91.52, "step": 42830, "token_acc": 0.7548692480794599, "train_speed(iter/s)": 0.155254 }, { "epoch": 0.5558132669843546, "grad_norm": 0.7459666728973389, "learning_rate": 8.645906297432892e-05, "loss": 0.9331117630004883, "memory(GiB)": 91.52, "step": 42835, "token_acc": 0.7621812585260065, "train_speed(iter/s)": 0.155249 }, { "epoch": 0.5558781453860103, "grad_norm": 0.7676913738250732, "learning_rate": 8.64553922383061e-05, "loss": 0.9374841690063477, "memory(GiB)": 91.52, "step": 42840, "token_acc": 0.7452913478516775, "train_speed(iter/s)": 0.155244 }, { "epoch": 0.5559430237876659, "grad_norm": 0.7019056081771851, "learning_rate": 8.645172108275697e-05, "loss": 0.9209638595581054, "memory(GiB)": 91.52, "step": 42845, "token_acc": 0.7380801771171028, "train_speed(iter/s)": 0.155239 }, { "epoch": 0.5560079021893216, "grad_norm": 0.7419697046279907, "learning_rate": 8.644804950772378e-05, "loss": 0.9361333847045898, "memory(GiB)": 91.52, "step": 42850, "token_acc": 0.7473355176439573, "train_speed(iter/s)": 0.155234 }, { "epoch": 0.5560727805909773, "grad_norm": 0.7023546099662781, "learning_rate": 8.644437751324876e-05, "loss": 0.8763363838195801, "memory(GiB)": 91.52, "step": 42855, "token_acc": 0.7705494213999138, "train_speed(iter/s)": 0.155229 }, { "epoch": 0.556137658992633, "grad_norm": 0.8238818049430847, "learning_rate": 8.644070509937421e-05, "loss": 0.8408880233764648, "memory(GiB)": 91.52, "step": 42860, "token_acc": 0.7761278944074643, "train_speed(iter/s)": 0.15522 }, { "epoch": 0.5562025373942887, "grad_norm": 0.7063319683074951, "learning_rate": 8.643703226614237e-05, "loss": 0.8944129943847656, "memory(GiB)": 91.52, "step": 42865, "token_acc": 0.740610577126646, "train_speed(iter/s)": 0.155213 }, { "epoch": 0.5562674157959444, "grad_norm": 0.7672173976898193, "learning_rate": 8.643335901359551e-05, "loss": 0.8953241348266602, "memory(GiB)": 91.52, "step": 42870, "token_acc": 0.7388864436619719, "train_speed(iter/s)": 0.155208 }, { "epoch": 0.5563322941976001, "grad_norm": 0.7458106875419617, "learning_rate": 8.642968534177589e-05, "loss": 0.9237936973571778, "memory(GiB)": 91.52, "step": 42875, "token_acc": 0.7572254335260116, "train_speed(iter/s)": 0.155203 }, { "epoch": 0.5563971725992558, "grad_norm": 0.7701706290245056, "learning_rate": 8.64260112507258e-05, "loss": 0.9255494117736817, "memory(GiB)": 91.52, "step": 42880, "token_acc": 0.7705686992844181, "train_speed(iter/s)": 0.155198 }, { "epoch": 0.5564620510009115, "grad_norm": 0.7690160274505615, "learning_rate": 8.642233674048751e-05, "loss": 0.8696468353271485, "memory(GiB)": 91.52, "step": 42885, "token_acc": 0.7688439994707361, "train_speed(iter/s)": 0.155192 }, { "epoch": 0.5565269294025672, "grad_norm": 0.7691358923912048, "learning_rate": 8.641866181110331e-05, "loss": 0.8854539871215821, "memory(GiB)": 91.52, "step": 42890, "token_acc": 0.7685030395136778, "train_speed(iter/s)": 0.155186 }, { "epoch": 0.5565918078042229, "grad_norm": 0.73355633020401, "learning_rate": 8.64149864626155e-05, "loss": 0.8958877563476563, "memory(GiB)": 91.52, "step": 42895, "token_acc": 0.7514820205175535, "train_speed(iter/s)": 0.155181 }, { "epoch": 0.5566566862058786, "grad_norm": 0.7722933888435364, "learning_rate": 8.641131069506638e-05, "loss": 0.9225833892822266, "memory(GiB)": 91.52, "step": 42900, "token_acc": 0.7473648059598975, "train_speed(iter/s)": 0.155175 }, { "epoch": 0.5567215646075343, "grad_norm": 0.7730564475059509, "learning_rate": 8.640763450849822e-05, "loss": 0.9279170989990234, "memory(GiB)": 91.52, "step": 42905, "token_acc": 0.7556940344739623, "train_speed(iter/s)": 0.155171 }, { "epoch": 0.55678644300919, "grad_norm": 0.6641159057617188, "learning_rate": 8.640395790295338e-05, "loss": 0.8990989685058594, "memory(GiB)": 91.52, "step": 42910, "token_acc": 0.7439098769418434, "train_speed(iter/s)": 0.155165 }, { "epoch": 0.5568513214108457, "grad_norm": 0.7977612614631653, "learning_rate": 8.64002808784741e-05, "loss": 0.8971075057983399, "memory(GiB)": 91.52, "step": 42915, "token_acc": 0.7621906574901157, "train_speed(iter/s)": 0.155161 }, { "epoch": 0.5569161998125014, "grad_norm": 0.8416189551353455, "learning_rate": 8.639660343510274e-05, "loss": 0.8921646118164063, "memory(GiB)": 91.52, "step": 42920, "token_acc": 0.753373253493014, "train_speed(iter/s)": 0.155157 }, { "epoch": 0.5569810782141571, "grad_norm": 0.7328159809112549, "learning_rate": 8.639292557288161e-05, "loss": 0.9285043716430664, "memory(GiB)": 91.52, "step": 42925, "token_acc": 0.7565173213094973, "train_speed(iter/s)": 0.155151 }, { "epoch": 0.5570459566158128, "grad_norm": 0.9004622101783752, "learning_rate": 8.638924729185304e-05, "loss": 0.9059389114379883, "memory(GiB)": 91.52, "step": 42930, "token_acc": 0.755278310940499, "train_speed(iter/s)": 0.155147 }, { "epoch": 0.5571108350174685, "grad_norm": 0.7660062313079834, "learning_rate": 8.638556859205935e-05, "loss": 0.9258089065551758, "memory(GiB)": 91.52, "step": 42935, "token_acc": 0.7805040829145728, "train_speed(iter/s)": 0.155142 }, { "epoch": 0.5571757134191242, "grad_norm": 0.8424397706985474, "learning_rate": 8.638188947354287e-05, "loss": 0.9708773612976074, "memory(GiB)": 91.52, "step": 42940, "token_acc": 0.7520895119978431, "train_speed(iter/s)": 0.155138 }, { "epoch": 0.5572405918207799, "grad_norm": 0.7963996529579163, "learning_rate": 8.637820993634597e-05, "loss": 0.9227794647216797, "memory(GiB)": 91.52, "step": 42945, "token_acc": 0.7744306297931676, "train_speed(iter/s)": 0.155132 }, { "epoch": 0.5573054702224356, "grad_norm": 0.7602019906044006, "learning_rate": 8.637452998051096e-05, "loss": 0.8906902313232422, "memory(GiB)": 91.52, "step": 42950, "token_acc": 0.7590886139863671, "train_speed(iter/s)": 0.155127 }, { "epoch": 0.5573703486240913, "grad_norm": 0.6599910259246826, "learning_rate": 8.637084960608019e-05, "loss": 0.9227320671081543, "memory(GiB)": 91.52, "step": 42955, "token_acc": 0.7419027459622466, "train_speed(iter/s)": 0.155122 }, { "epoch": 0.557435227025747, "grad_norm": 0.6589388847351074, "learning_rate": 8.636716881309604e-05, "loss": 0.8741450309753418, "memory(GiB)": 91.52, "step": 42960, "token_acc": 0.7685302910552902, "train_speed(iter/s)": 0.155117 }, { "epoch": 0.5575001054274027, "grad_norm": 0.7285135984420776, "learning_rate": 8.636348760160085e-05, "loss": 0.9374870300292969, "memory(GiB)": 91.52, "step": 42965, "token_acc": 0.7545652766421368, "train_speed(iter/s)": 0.155113 }, { "epoch": 0.5575649838290584, "grad_norm": 0.8366227149963379, "learning_rate": 8.635980597163698e-05, "loss": 0.9015065193176269, "memory(GiB)": 91.52, "step": 42970, "token_acc": 0.7543795083173855, "train_speed(iter/s)": 0.155108 }, { "epoch": 0.5576298622307141, "grad_norm": 0.7825611233711243, "learning_rate": 8.635612392324678e-05, "loss": 0.8913003921508789, "memory(GiB)": 91.52, "step": 42975, "token_acc": 0.7350255112978605, "train_speed(iter/s)": 0.155104 }, { "epoch": 0.5576947406323698, "grad_norm": 0.7574639320373535, "learning_rate": 8.635244145647267e-05, "loss": 0.8866096496582031, "memory(GiB)": 91.52, "step": 42980, "token_acc": 0.733356539900792, "train_speed(iter/s)": 0.155099 }, { "epoch": 0.5577596190340255, "grad_norm": 0.7424061298370361, "learning_rate": 8.634875857135701e-05, "loss": 0.8971909523010254, "memory(GiB)": 91.52, "step": 42985, "token_acc": 0.7494900859330591, "train_speed(iter/s)": 0.155094 }, { "epoch": 0.5578244974356812, "grad_norm": 0.8140097856521606, "learning_rate": 8.634507526794218e-05, "loss": 0.8935447692871094, "memory(GiB)": 91.52, "step": 42990, "token_acc": 0.7646351724966943, "train_speed(iter/s)": 0.155089 }, { "epoch": 0.5578893758373369, "grad_norm": 0.7728177905082703, "learning_rate": 8.634139154627055e-05, "loss": 0.8854549407958985, "memory(GiB)": 91.52, "step": 42995, "token_acc": 0.7711570606079367, "train_speed(iter/s)": 0.155083 }, { "epoch": 0.5579542542389926, "grad_norm": 0.8322543501853943, "learning_rate": 8.633770740638451e-05, "loss": 0.898838996887207, "memory(GiB)": 91.52, "step": 43000, "token_acc": 0.7423358908780904, "train_speed(iter/s)": 0.155078 }, { "epoch": 0.5580191326406483, "grad_norm": 0.788000226020813, "learning_rate": 8.633402284832649e-05, "loss": 0.8783828735351562, "memory(GiB)": 91.52, "step": 43005, "token_acc": 0.7663288288288288, "train_speed(iter/s)": 0.155072 }, { "epoch": 0.558084011042304, "grad_norm": 0.7398161292076111, "learning_rate": 8.633033787213887e-05, "loss": 0.9088994979858398, "memory(GiB)": 91.52, "step": 43010, "token_acc": 0.7406221408966148, "train_speed(iter/s)": 0.155067 }, { "epoch": 0.5581488894439597, "grad_norm": 0.7929509878158569, "learning_rate": 8.632665247786407e-05, "loss": 0.8866833686828614, "memory(GiB)": 91.52, "step": 43015, "token_acc": 0.7384309911662209, "train_speed(iter/s)": 0.155062 }, { "epoch": 0.5582137678456154, "grad_norm": 0.7839638590812683, "learning_rate": 8.632296666554446e-05, "loss": 0.907776927947998, "memory(GiB)": 91.52, "step": 43020, "token_acc": 0.7631471516051216, "train_speed(iter/s)": 0.155057 }, { "epoch": 0.5582786462472711, "grad_norm": 0.7696208357810974, "learning_rate": 8.631928043522253e-05, "loss": 0.9440340042114258, "memory(GiB)": 91.52, "step": 43025, "token_acc": 0.7548152004164498, "train_speed(iter/s)": 0.155052 }, { "epoch": 0.5583435246489268, "grad_norm": 0.7519962191581726, "learning_rate": 8.631559378694063e-05, "loss": 0.9060348510742188, "memory(GiB)": 91.52, "step": 43030, "token_acc": 0.764980875478113, "train_speed(iter/s)": 0.155048 }, { "epoch": 0.5584084030505825, "grad_norm": 0.8522667288780212, "learning_rate": 8.631190672074122e-05, "loss": 0.8938037872314453, "memory(GiB)": 91.52, "step": 43035, "token_acc": 0.7450251297348098, "train_speed(iter/s)": 0.155043 }, { "epoch": 0.5584732814522382, "grad_norm": 0.7198073267936707, "learning_rate": 8.630821923666672e-05, "loss": 0.9082059860229492, "memory(GiB)": 91.52, "step": 43040, "token_acc": 0.7459977983959742, "train_speed(iter/s)": 0.155038 }, { "epoch": 0.5585381598538939, "grad_norm": 0.697343111038208, "learning_rate": 8.630453133475958e-05, "loss": 0.9202910423278808, "memory(GiB)": 91.52, "step": 43045, "token_acc": 0.7424382512217673, "train_speed(iter/s)": 0.155032 }, { "epoch": 0.5586030382555496, "grad_norm": 0.6888030767440796, "learning_rate": 8.630084301506223e-05, "loss": 0.8968608856201172, "memory(GiB)": 91.52, "step": 43050, "token_acc": 0.7660876215448487, "train_speed(iter/s)": 0.155027 }, { "epoch": 0.5586679166572053, "grad_norm": 0.7742286324501038, "learning_rate": 8.629715427761711e-05, "loss": 0.947795295715332, "memory(GiB)": 91.52, "step": 43055, "token_acc": 0.7673269112818848, "train_speed(iter/s)": 0.155024 }, { "epoch": 0.558732795058861, "grad_norm": 0.7309435606002808, "learning_rate": 8.629346512246667e-05, "loss": 0.9357829093933105, "memory(GiB)": 91.52, "step": 43060, "token_acc": 0.7592719904475884, "train_speed(iter/s)": 0.15502 }, { "epoch": 0.5587976734605167, "grad_norm": 0.7661548852920532, "learning_rate": 8.628977554965337e-05, "loss": 0.8921314239501953, "memory(GiB)": 91.52, "step": 43065, "token_acc": 0.750498401354147, "train_speed(iter/s)": 0.155015 }, { "epoch": 0.5588625518621724, "grad_norm": 0.8170413374900818, "learning_rate": 8.628608555921968e-05, "loss": 0.9308192253112793, "memory(GiB)": 91.52, "step": 43070, "token_acc": 0.7501298532658096, "train_speed(iter/s)": 0.155009 }, { "epoch": 0.5589274302638281, "grad_norm": 0.8032975792884827, "learning_rate": 8.628239515120804e-05, "loss": 0.9492579460144043, "memory(GiB)": 91.52, "step": 43075, "token_acc": 0.7484648086915446, "train_speed(iter/s)": 0.155004 }, { "epoch": 0.5589923086654838, "grad_norm": 0.7355442047119141, "learning_rate": 8.627870432566095e-05, "loss": 0.9022657394409179, "memory(GiB)": 91.52, "step": 43080, "token_acc": 0.7635360303866413, "train_speed(iter/s)": 0.154998 }, { "epoch": 0.5590571870671394, "grad_norm": 0.8365391492843628, "learning_rate": 8.627501308262084e-05, "loss": 0.9196894645690918, "memory(GiB)": 91.52, "step": 43085, "token_acc": 0.752233602091959, "train_speed(iter/s)": 0.154992 }, { "epoch": 0.5591220654687951, "grad_norm": 0.8079766631126404, "learning_rate": 8.627132142213022e-05, "loss": 0.9032732963562011, "memory(GiB)": 91.52, "step": 43090, "token_acc": 0.7653592375366569, "train_speed(iter/s)": 0.154988 }, { "epoch": 0.5591869438704508, "grad_norm": 0.7185434103012085, "learning_rate": 8.626762934423159e-05, "loss": 0.9099149703979492, "memory(GiB)": 91.52, "step": 43095, "token_acc": 0.7465698143664246, "train_speed(iter/s)": 0.154983 }, { "epoch": 0.5592518222721065, "grad_norm": 0.6984708309173584, "learning_rate": 8.626393684896739e-05, "loss": 0.8931419372558593, "memory(GiB)": 91.52, "step": 43100, "token_acc": 0.7791432584269663, "train_speed(iter/s)": 0.154979 }, { "epoch": 0.5593167006737622, "grad_norm": 0.8450888395309448, "learning_rate": 8.626024393638015e-05, "loss": 0.9499107360839844, "memory(GiB)": 91.52, "step": 43105, "token_acc": 0.7289818566787135, "train_speed(iter/s)": 0.154975 }, { "epoch": 0.5593815790754179, "grad_norm": 0.804741621017456, "learning_rate": 8.625655060651234e-05, "loss": 0.8754140853881835, "memory(GiB)": 91.52, "step": 43110, "token_acc": 0.7681616799174606, "train_speed(iter/s)": 0.154969 }, { "epoch": 0.5594464574770736, "grad_norm": 0.723255455493927, "learning_rate": 8.625285685940651e-05, "loss": 0.8865328788757324, "memory(GiB)": 91.52, "step": 43115, "token_acc": 0.762020529443544, "train_speed(iter/s)": 0.154964 }, { "epoch": 0.5595113358787293, "grad_norm": 0.8184428215026855, "learning_rate": 8.624916269510512e-05, "loss": 0.9089006423950196, "memory(GiB)": 91.52, "step": 43120, "token_acc": 0.7235127689673144, "train_speed(iter/s)": 0.154958 }, { "epoch": 0.559576214280385, "grad_norm": 0.8374295830726624, "learning_rate": 8.624546811365069e-05, "loss": 0.9655412673950196, "memory(GiB)": 91.52, "step": 43125, "token_acc": 0.7535322440603047, "train_speed(iter/s)": 0.154953 }, { "epoch": 0.5596410926820407, "grad_norm": 0.7914408445358276, "learning_rate": 8.624177311508576e-05, "loss": 0.9843376159667969, "memory(GiB)": 91.52, "step": 43130, "token_acc": 0.7352421214450423, "train_speed(iter/s)": 0.154948 }, { "epoch": 0.5597059710836964, "grad_norm": 0.7497194409370422, "learning_rate": 8.623807769945284e-05, "loss": 0.9532299041748047, "memory(GiB)": 91.52, "step": 43135, "token_acc": 0.7594711332388143, "train_speed(iter/s)": 0.154943 }, { "epoch": 0.559770849485352, "grad_norm": 0.7689712643623352, "learning_rate": 8.623438186679444e-05, "loss": 0.9146842956542969, "memory(GiB)": 91.52, "step": 43140, "token_acc": 0.7596958877123862, "train_speed(iter/s)": 0.154939 }, { "epoch": 0.5598357278870078, "grad_norm": 0.7847437858581543, "learning_rate": 8.62306856171531e-05, "loss": 0.9466512680053711, "memory(GiB)": 91.52, "step": 43145, "token_acc": 0.7398280389970141, "train_speed(iter/s)": 0.154934 }, { "epoch": 0.5599006062886634, "grad_norm": 0.8279219269752502, "learning_rate": 8.622698895057139e-05, "loss": 0.9354770660400391, "memory(GiB)": 91.52, "step": 43150, "token_acc": 0.7601158713218479, "train_speed(iter/s)": 0.154929 }, { "epoch": 0.5599654846903191, "grad_norm": 0.7815384268760681, "learning_rate": 8.622329186709179e-05, "loss": 0.8860263824462891, "memory(GiB)": 91.52, "step": 43155, "token_acc": 0.7483941824474989, "train_speed(iter/s)": 0.154924 }, { "epoch": 0.5600303630919748, "grad_norm": 0.7560608386993408, "learning_rate": 8.621959436675691e-05, "loss": 0.8822382926940918, "memory(GiB)": 91.52, "step": 43160, "token_acc": 0.7354934764722015, "train_speed(iter/s)": 0.154919 }, { "epoch": 0.5600952414936305, "grad_norm": 0.8173374533653259, "learning_rate": 8.621589644960925e-05, "loss": 0.8855359077453613, "memory(GiB)": 91.52, "step": 43165, "token_acc": 0.7636415443795412, "train_speed(iter/s)": 0.154914 }, { "epoch": 0.5601601198952862, "grad_norm": 0.8705917596817017, "learning_rate": 8.62121981156914e-05, "loss": 0.9044641494750977, "memory(GiB)": 91.52, "step": 43170, "token_acc": 0.7402111563721008, "train_speed(iter/s)": 0.154909 }, { "epoch": 0.560224998296942, "grad_norm": 0.6908371448516846, "learning_rate": 8.620849936504588e-05, "loss": 0.9597156524658204, "memory(GiB)": 91.52, "step": 43175, "token_acc": 0.7529473978909385, "train_speed(iter/s)": 0.154904 }, { "epoch": 0.5602898766985976, "grad_norm": 0.7159442901611328, "learning_rate": 8.620480019771529e-05, "loss": 0.9343774795532227, "memory(GiB)": 91.52, "step": 43180, "token_acc": 0.7467830882352942, "train_speed(iter/s)": 0.154898 }, { "epoch": 0.5603547551002533, "grad_norm": 0.6991665363311768, "learning_rate": 8.62011006137422e-05, "loss": 0.9364486694335937, "memory(GiB)": 91.52, "step": 43185, "token_acc": 0.7362782506604051, "train_speed(iter/s)": 0.154894 }, { "epoch": 0.560419633501909, "grad_norm": 0.8159373998641968, "learning_rate": 8.619740061316917e-05, "loss": 0.9043262481689454, "memory(GiB)": 91.52, "step": 43190, "token_acc": 0.7687241593794593, "train_speed(iter/s)": 0.154889 }, { "epoch": 0.5604845119035647, "grad_norm": 0.7460184693336487, "learning_rate": 8.619370019603876e-05, "loss": 0.9296953201293945, "memory(GiB)": 91.52, "step": 43195, "token_acc": 0.7684734814584225, "train_speed(iter/s)": 0.154884 }, { "epoch": 0.5605493903052204, "grad_norm": 0.768706738948822, "learning_rate": 8.61899993623936e-05, "loss": 0.896335506439209, "memory(GiB)": 91.52, "step": 43200, "token_acc": 0.7621396180199265, "train_speed(iter/s)": 0.15488 }, { "epoch": 0.5606142687068761, "grad_norm": 0.7739071249961853, "learning_rate": 8.618629811227627e-05, "loss": 0.9317378997802734, "memory(GiB)": 91.52, "step": 43205, "token_acc": 0.7451226468910439, "train_speed(iter/s)": 0.154876 }, { "epoch": 0.5606791471085318, "grad_norm": 0.7025591135025024, "learning_rate": 8.618259644572932e-05, "loss": 0.8817262649536133, "memory(GiB)": 91.52, "step": 43210, "token_acc": 0.7564670240037288, "train_speed(iter/s)": 0.154872 }, { "epoch": 0.5607440255101875, "grad_norm": 0.781732439994812, "learning_rate": 8.61788943627954e-05, "loss": 0.9376798629760742, "memory(GiB)": 91.52, "step": 43215, "token_acc": 0.7375652753601718, "train_speed(iter/s)": 0.154867 }, { "epoch": 0.5608089039118432, "grad_norm": 0.7265490293502808, "learning_rate": 8.617519186351708e-05, "loss": 0.9015676498413085, "memory(GiB)": 91.52, "step": 43220, "token_acc": 0.7581845803266236, "train_speed(iter/s)": 0.154862 }, { "epoch": 0.5608737823134989, "grad_norm": 0.7215536236763, "learning_rate": 8.617148894793699e-05, "loss": 0.9391942977905273, "memory(GiB)": 91.52, "step": 43225, "token_acc": 0.7483463396506875, "train_speed(iter/s)": 0.154858 }, { "epoch": 0.5609386607151546, "grad_norm": 0.6904349327087402, "learning_rate": 8.616778561609773e-05, "loss": 0.902800178527832, "memory(GiB)": 91.52, "step": 43230, "token_acc": 0.7686354535653781, "train_speed(iter/s)": 0.154853 }, { "epoch": 0.5610035391168103, "grad_norm": 0.6836498975753784, "learning_rate": 8.616408186804191e-05, "loss": 0.884986400604248, "memory(GiB)": 91.52, "step": 43235, "token_acc": 0.7545757987015369, "train_speed(iter/s)": 0.154849 }, { "epoch": 0.561068417518466, "grad_norm": 0.7318691611289978, "learning_rate": 8.616037770381218e-05, "loss": 0.9053142547607422, "memory(GiB)": 91.52, "step": 43240, "token_acc": 0.7739674763557188, "train_speed(iter/s)": 0.154845 }, { "epoch": 0.5611332959201217, "grad_norm": 0.7258700132369995, "learning_rate": 8.615667312345114e-05, "loss": 0.9244464874267578, "memory(GiB)": 91.52, "step": 43245, "token_acc": 0.7616708497669415, "train_speed(iter/s)": 0.15484 }, { "epoch": 0.5611981743217774, "grad_norm": 0.7651862502098083, "learning_rate": 8.615296812700144e-05, "loss": 0.9028258323669434, "memory(GiB)": 91.52, "step": 43250, "token_acc": 0.7405069501226492, "train_speed(iter/s)": 0.154835 }, { "epoch": 0.5612630527234331, "grad_norm": 0.817071795463562, "learning_rate": 8.61492627145057e-05, "loss": 0.8855428695678711, "memory(GiB)": 91.52, "step": 43255, "token_acc": 0.7521370331404524, "train_speed(iter/s)": 0.15483 }, { "epoch": 0.5613279311250888, "grad_norm": 0.7715451121330261, "learning_rate": 8.614555688600658e-05, "loss": 0.9513195037841797, "memory(GiB)": 91.52, "step": 43260, "token_acc": 0.746612375675856, "train_speed(iter/s)": 0.154825 }, { "epoch": 0.5613928095267445, "grad_norm": 0.6786371469497681, "learning_rate": 8.614185064154672e-05, "loss": 0.910763931274414, "memory(GiB)": 91.52, "step": 43265, "token_acc": 0.7680569377846135, "train_speed(iter/s)": 0.15482 }, { "epoch": 0.5614576879284002, "grad_norm": 0.7748084664344788, "learning_rate": 8.613814398116875e-05, "loss": 0.8621286392211914, "memory(GiB)": 91.52, "step": 43270, "token_acc": 0.7618359939112447, "train_speed(iter/s)": 0.154815 }, { "epoch": 0.5615225663300559, "grad_norm": 0.8103447556495667, "learning_rate": 8.613443690491536e-05, "loss": 0.9338621139526367, "memory(GiB)": 91.52, "step": 43275, "token_acc": 0.7615084615920329, "train_speed(iter/s)": 0.15481 }, { "epoch": 0.5615874447317116, "grad_norm": 0.7622418999671936, "learning_rate": 8.61307294128292e-05, "loss": 0.8885312080383301, "memory(GiB)": 91.52, "step": 43280, "token_acc": 0.7670709793351302, "train_speed(iter/s)": 0.154805 }, { "epoch": 0.5616523231333673, "grad_norm": 0.8258056640625, "learning_rate": 8.612702150495293e-05, "loss": 0.9036053657531739, "memory(GiB)": 91.52, "step": 43285, "token_acc": 0.7600062417102286, "train_speed(iter/s)": 0.154801 }, { "epoch": 0.561717201535023, "grad_norm": 0.7179465293884277, "learning_rate": 8.61233131813292e-05, "loss": 0.9175420761108398, "memory(GiB)": 91.52, "step": 43290, "token_acc": 0.7382701016020397, "train_speed(iter/s)": 0.154795 }, { "epoch": 0.5617820799366787, "grad_norm": 0.7946605682373047, "learning_rate": 8.611960444200073e-05, "loss": 0.9239932060241699, "memory(GiB)": 91.52, "step": 43295, "token_acc": 0.7407581566953628, "train_speed(iter/s)": 0.154791 }, { "epoch": 0.5618469583383344, "grad_norm": 0.6814990639686584, "learning_rate": 8.611589528701016e-05, "loss": 0.885126781463623, "memory(GiB)": 91.52, "step": 43300, "token_acc": 0.7456808255710586, "train_speed(iter/s)": 0.154786 }, { "epoch": 0.5619118367399901, "grad_norm": 0.7598323822021484, "learning_rate": 8.611218571640022e-05, "loss": 0.9334399223327636, "memory(GiB)": 91.52, "step": 43305, "token_acc": 0.756708935417281, "train_speed(iter/s)": 0.154781 }, { "epoch": 0.5619767151416458, "grad_norm": 0.7142242789268494, "learning_rate": 8.610847573021355e-05, "loss": 0.9167560577392578, "memory(GiB)": 91.52, "step": 43310, "token_acc": 0.7629801604052343, "train_speed(iter/s)": 0.154777 }, { "epoch": 0.5620415935433015, "grad_norm": 0.7346672415733337, "learning_rate": 8.610476532849287e-05, "loss": 0.9103179931640625, "memory(GiB)": 91.52, "step": 43315, "token_acc": 0.743997202579843, "train_speed(iter/s)": 0.154772 }, { "epoch": 0.5621064719449571, "grad_norm": 0.775081992149353, "learning_rate": 8.610105451128086e-05, "loss": 0.9229848861694336, "memory(GiB)": 91.52, "step": 43320, "token_acc": 0.761626679800183, "train_speed(iter/s)": 0.154767 }, { "epoch": 0.5621713503466128, "grad_norm": 0.6755570769309998, "learning_rate": 8.609734327862026e-05, "loss": 0.9417415618896484, "memory(GiB)": 91.52, "step": 43325, "token_acc": 0.7525276354812618, "train_speed(iter/s)": 0.154762 }, { "epoch": 0.5622362287482685, "grad_norm": 0.8072201609611511, "learning_rate": 8.609363163055375e-05, "loss": 0.9808039665222168, "memory(GiB)": 91.52, "step": 43330, "token_acc": 0.7315988514601863, "train_speed(iter/s)": 0.154756 }, { "epoch": 0.5623011071499242, "grad_norm": 0.8239874243736267, "learning_rate": 8.608991956712405e-05, "loss": 0.9519304275512696, "memory(GiB)": 91.52, "step": 43335, "token_acc": 0.7629994526546251, "train_speed(iter/s)": 0.154751 }, { "epoch": 0.5623659855515799, "grad_norm": 0.7430393099784851, "learning_rate": 8.608620708837388e-05, "loss": 0.9073638916015625, "memory(GiB)": 91.52, "step": 43340, "token_acc": 0.7682962013143132, "train_speed(iter/s)": 0.154746 }, { "epoch": 0.5624308639532356, "grad_norm": 0.7010467052459717, "learning_rate": 8.608249419434595e-05, "loss": 0.9469257354736328, "memory(GiB)": 91.52, "step": 43345, "token_acc": 0.7341967910303498, "train_speed(iter/s)": 0.154741 }, { "epoch": 0.5624957423548913, "grad_norm": 0.8403250575065613, "learning_rate": 8.6078780885083e-05, "loss": 0.9456053733825683, "memory(GiB)": 91.52, "step": 43350, "token_acc": 0.7527459284509176, "train_speed(iter/s)": 0.154736 }, { "epoch": 0.562560620756547, "grad_norm": 0.682287871837616, "learning_rate": 8.607506716062776e-05, "loss": 0.9043268203735352, "memory(GiB)": 91.52, "step": 43355, "token_acc": 0.7430484366991287, "train_speed(iter/s)": 0.15473 }, { "epoch": 0.5626254991582027, "grad_norm": 0.7429761290550232, "learning_rate": 8.607135302102299e-05, "loss": 0.9690332412719727, "memory(GiB)": 91.52, "step": 43360, "token_acc": 0.7553659878921298, "train_speed(iter/s)": 0.154726 }, { "epoch": 0.5626903775598584, "grad_norm": 0.7370619773864746, "learning_rate": 8.606763846631138e-05, "loss": 0.9534290313720704, "memory(GiB)": 91.52, "step": 43365, "token_acc": 0.7404698926404232, "train_speed(iter/s)": 0.154721 }, { "epoch": 0.5627552559615141, "grad_norm": 0.7764297127723694, "learning_rate": 8.606392349653573e-05, "loss": 0.9172463417053223, "memory(GiB)": 91.52, "step": 43370, "token_acc": 0.7681058495821727, "train_speed(iter/s)": 0.154717 }, { "epoch": 0.5628201343631698, "grad_norm": 0.8895671963691711, "learning_rate": 8.606020811173877e-05, "loss": 0.8718215942382812, "memory(GiB)": 91.52, "step": 43375, "token_acc": 0.7686918193480478, "train_speed(iter/s)": 0.154712 }, { "epoch": 0.5628850127648255, "grad_norm": 0.8663222193717957, "learning_rate": 8.605649231196324e-05, "loss": 0.9429088592529297, "memory(GiB)": 91.52, "step": 43380, "token_acc": 0.738099966898378, "train_speed(iter/s)": 0.154708 }, { "epoch": 0.5629498911664812, "grad_norm": 0.7460789084434509, "learning_rate": 8.605277609725193e-05, "loss": 0.8809167861938476, "memory(GiB)": 91.52, "step": 43385, "token_acc": 0.769606948651509, "train_speed(iter/s)": 0.154702 }, { "epoch": 0.5630147695681369, "grad_norm": 0.7725846767425537, "learning_rate": 8.604905946764759e-05, "loss": 0.9633962631225585, "memory(GiB)": 91.52, "step": 43390, "token_acc": 0.7445712554287446, "train_speed(iter/s)": 0.154698 }, { "epoch": 0.5630796479697926, "grad_norm": 0.8553091287612915, "learning_rate": 8.6045342423193e-05, "loss": 0.9252202987670899, "memory(GiB)": 91.52, "step": 43395, "token_acc": 0.7468214483139857, "train_speed(iter/s)": 0.154693 }, { "epoch": 0.5631445263714483, "grad_norm": 0.7743967771530151, "learning_rate": 8.604162496393093e-05, "loss": 0.9320442199707031, "memory(GiB)": 91.52, "step": 43400, "token_acc": 0.7517974366989685, "train_speed(iter/s)": 0.154688 }, { "epoch": 0.563209404773104, "grad_norm": 0.7441828846931458, "learning_rate": 8.603790708990414e-05, "loss": 0.9202987670898437, "memory(GiB)": 91.52, "step": 43405, "token_acc": 0.7493388918271474, "train_speed(iter/s)": 0.154683 }, { "epoch": 0.5632742831747597, "grad_norm": 0.7957321405410767, "learning_rate": 8.603418880115545e-05, "loss": 0.8944190979003906, "memory(GiB)": 91.52, "step": 43410, "token_acc": 0.7592913030394222, "train_speed(iter/s)": 0.154678 }, { "epoch": 0.5633391615764154, "grad_norm": 0.7560481429100037, "learning_rate": 8.603047009772765e-05, "loss": 0.9311102867126465, "memory(GiB)": 91.52, "step": 43415, "token_acc": 0.7630657611675876, "train_speed(iter/s)": 0.154673 }, { "epoch": 0.5634040399780711, "grad_norm": 0.7727047204971313, "learning_rate": 8.602675097966348e-05, "loss": 0.9401701927185059, "memory(GiB)": 91.52, "step": 43420, "token_acc": 0.7524526737598453, "train_speed(iter/s)": 0.154667 }, { "epoch": 0.5634689183797268, "grad_norm": 0.6857116222381592, "learning_rate": 8.60230314470058e-05, "loss": 0.9282886505126953, "memory(GiB)": 91.52, "step": 43425, "token_acc": 0.7628747559061148, "train_speed(iter/s)": 0.154663 }, { "epoch": 0.5635337967813825, "grad_norm": 0.7640033960342407, "learning_rate": 8.601931149979741e-05, "loss": 0.9127236366271972, "memory(GiB)": 91.52, "step": 43430, "token_acc": 0.7496988941202234, "train_speed(iter/s)": 0.154659 }, { "epoch": 0.5635986751830382, "grad_norm": 0.7587569952011108, "learning_rate": 8.601559113808107e-05, "loss": 0.9290637969970703, "memory(GiB)": 91.52, "step": 43435, "token_acc": 0.7553956834532374, "train_speed(iter/s)": 0.154654 }, { "epoch": 0.5636635535846939, "grad_norm": 0.9083098769187927, "learning_rate": 8.601187036189963e-05, "loss": 0.9344377517700195, "memory(GiB)": 91.52, "step": 43440, "token_acc": 0.7433801591248135, "train_speed(iter/s)": 0.154649 }, { "epoch": 0.5637284319863496, "grad_norm": 0.7642106413841248, "learning_rate": 8.600814917129591e-05, "loss": 0.9097524642944336, "memory(GiB)": 91.52, "step": 43445, "token_acc": 0.7564329425643295, "train_speed(iter/s)": 0.154645 }, { "epoch": 0.5637933103880053, "grad_norm": 0.8042303919792175, "learning_rate": 8.600442756631276e-05, "loss": 0.931916332244873, "memory(GiB)": 91.52, "step": 43450, "token_acc": 0.749936628643853, "train_speed(iter/s)": 0.15464 }, { "epoch": 0.563858188789661, "grad_norm": 0.8090401291847229, "learning_rate": 8.600070554699293e-05, "loss": 0.9290252685546875, "memory(GiB)": 91.52, "step": 43455, "token_acc": 0.7454896696244393, "train_speed(iter/s)": 0.154634 }, { "epoch": 0.5639230671913167, "grad_norm": 0.7124835848808289, "learning_rate": 8.599698311337931e-05, "loss": 0.9158819198608399, "memory(GiB)": 91.52, "step": 43460, "token_acc": 0.7448479262672811, "train_speed(iter/s)": 0.154629 }, { "epoch": 0.5639879455929724, "grad_norm": 0.7062462568283081, "learning_rate": 8.599326026551473e-05, "loss": 0.8936642646789551, "memory(GiB)": 91.52, "step": 43465, "token_acc": 0.7501024890190336, "train_speed(iter/s)": 0.154625 }, { "epoch": 0.5640528239946281, "grad_norm": 0.9385372996330261, "learning_rate": 8.5989537003442e-05, "loss": 0.935446834564209, "memory(GiB)": 91.52, "step": 43470, "token_acc": 0.7546561079337146, "train_speed(iter/s)": 0.15462 }, { "epoch": 0.5641177023962838, "grad_norm": 0.7456152439117432, "learning_rate": 8.598581332720403e-05, "loss": 0.9288119316101074, "memory(GiB)": 91.52, "step": 43475, "token_acc": 0.7525737817433081, "train_speed(iter/s)": 0.154615 }, { "epoch": 0.5641825807979395, "grad_norm": 0.7781728506088257, "learning_rate": 8.598208923684363e-05, "loss": 0.8766325950622559, "memory(GiB)": 91.52, "step": 43480, "token_acc": 0.7652191090771409, "train_speed(iter/s)": 0.154611 }, { "epoch": 0.5642474591995952, "grad_norm": 0.774590790271759, "learning_rate": 8.597836473240366e-05, "loss": 0.8944848060607911, "memory(GiB)": 91.52, "step": 43485, "token_acc": 0.7400593471810089, "train_speed(iter/s)": 0.154606 }, { "epoch": 0.5643123376012509, "grad_norm": 0.759193480014801, "learning_rate": 8.597463981392698e-05, "loss": 0.9550457000732422, "memory(GiB)": 91.52, "step": 43490, "token_acc": 0.7325266953265844, "train_speed(iter/s)": 0.154602 }, { "epoch": 0.5643772160029066, "grad_norm": 0.761455774307251, "learning_rate": 8.597091448145647e-05, "loss": 0.8853340148925781, "memory(GiB)": 91.52, "step": 43495, "token_acc": 0.7560473494595985, "train_speed(iter/s)": 0.154598 }, { "epoch": 0.5644420944045623, "grad_norm": 0.7349299788475037, "learning_rate": 8.596718873503498e-05, "loss": 0.8886348724365234, "memory(GiB)": 91.52, "step": 43500, "token_acc": 0.7427448706799369, "train_speed(iter/s)": 0.154593 }, { "epoch": 0.564506972806218, "grad_norm": 0.7067155838012695, "learning_rate": 8.59634625747054e-05, "loss": 0.9159297943115234, "memory(GiB)": 91.52, "step": 43505, "token_acc": 0.7640292676877286, "train_speed(iter/s)": 0.154587 }, { "epoch": 0.5645718512078737, "grad_norm": 0.8257154226303101, "learning_rate": 8.59597360005106e-05, "loss": 0.8932354927062989, "memory(GiB)": 91.52, "step": 43510, "token_acc": 0.7519420630744947, "train_speed(iter/s)": 0.154582 }, { "epoch": 0.5646367296095294, "grad_norm": 0.8629721403121948, "learning_rate": 8.595600901249347e-05, "loss": 0.9223530769348145, "memory(GiB)": 91.52, "step": 43515, "token_acc": 0.7624594302199784, "train_speed(iter/s)": 0.154577 }, { "epoch": 0.5647016080111851, "grad_norm": 0.7436215877532959, "learning_rate": 8.595228161069693e-05, "loss": 0.8946688652038575, "memory(GiB)": 91.52, "step": 43520, "token_acc": 0.7501421585351985, "train_speed(iter/s)": 0.154572 }, { "epoch": 0.5647664864128408, "grad_norm": 0.7596104145050049, "learning_rate": 8.594855379516382e-05, "loss": 0.9269328117370605, "memory(GiB)": 91.52, "step": 43525, "token_acc": 0.7449250845819236, "train_speed(iter/s)": 0.154567 }, { "epoch": 0.5648313648144965, "grad_norm": 0.7002005577087402, "learning_rate": 8.594482556593708e-05, "loss": 0.8958518981933594, "memory(GiB)": 91.52, "step": 43530, "token_acc": 0.7626700500458166, "train_speed(iter/s)": 0.154563 }, { "epoch": 0.5648962432161522, "grad_norm": 0.8338125944137573, "learning_rate": 8.59410969230596e-05, "loss": 0.9109331130981445, "memory(GiB)": 91.52, "step": 43535, "token_acc": 0.7573855243722304, "train_speed(iter/s)": 0.154557 }, { "epoch": 0.5649611216178079, "grad_norm": 0.9194005131721497, "learning_rate": 8.593736786657428e-05, "loss": 0.9387794494628906, "memory(GiB)": 91.52, "step": 43540, "token_acc": 0.7447385264109032, "train_speed(iter/s)": 0.154552 }, { "epoch": 0.5650260000194636, "grad_norm": 0.7063965797424316, "learning_rate": 8.593363839652406e-05, "loss": 0.9019445419311524, "memory(GiB)": 91.52, "step": 43545, "token_acc": 0.7480763850027979, "train_speed(iter/s)": 0.154547 }, { "epoch": 0.5650908784211193, "grad_norm": 0.7892757058143616, "learning_rate": 8.592990851295182e-05, "loss": 0.9109891891479492, "memory(GiB)": 91.52, "step": 43550, "token_acc": 0.7513374485596708, "train_speed(iter/s)": 0.154542 }, { "epoch": 0.565155756822775, "grad_norm": 0.678448498249054, "learning_rate": 8.592617821590052e-05, "loss": 0.8963203430175781, "memory(GiB)": 91.52, "step": 43555, "token_acc": 0.7621256565932192, "train_speed(iter/s)": 0.154537 }, { "epoch": 0.5652206352244306, "grad_norm": 0.754082202911377, "learning_rate": 8.592244750541308e-05, "loss": 0.8944764137268066, "memory(GiB)": 91.52, "step": 43560, "token_acc": 0.7499264813997941, "train_speed(iter/s)": 0.154532 }, { "epoch": 0.5652855136260863, "grad_norm": 0.7192801237106323, "learning_rate": 8.591871638153241e-05, "loss": 0.9341645240783691, "memory(GiB)": 91.52, "step": 43565, "token_acc": 0.7601386953122344, "train_speed(iter/s)": 0.154527 }, { "epoch": 0.565350392027742, "grad_norm": 0.7320101857185364, "learning_rate": 8.591498484430147e-05, "loss": 0.9178455352783204, "memory(GiB)": 91.52, "step": 43570, "token_acc": 0.7499495784760598, "train_speed(iter/s)": 0.154522 }, { "epoch": 0.5654152704293977, "grad_norm": 0.8250970840454102, "learning_rate": 8.591125289376318e-05, "loss": 0.9449204444885254, "memory(GiB)": 91.52, "step": 43575, "token_acc": 0.7436091581034794, "train_speed(iter/s)": 0.154518 }, { "epoch": 0.5654801488310534, "grad_norm": 0.7680573463439941, "learning_rate": 8.590752052996053e-05, "loss": 0.9274446487426757, "memory(GiB)": 91.52, "step": 43580, "token_acc": 0.7515940941794251, "train_speed(iter/s)": 0.154513 }, { "epoch": 0.565545027232709, "grad_norm": 0.748016893863678, "learning_rate": 8.590378775293644e-05, "loss": 0.8906848907470704, "memory(GiB)": 91.52, "step": 43585, "token_acc": 0.7658342918630107, "train_speed(iter/s)": 0.15451 }, { "epoch": 0.5656099056343648, "grad_norm": 0.8912942409515381, "learning_rate": 8.590005456273386e-05, "loss": 0.9330732345581054, "memory(GiB)": 91.52, "step": 43590, "token_acc": 0.7585406078556763, "train_speed(iter/s)": 0.154506 }, { "epoch": 0.5656747840360205, "grad_norm": 0.7706290483474731, "learning_rate": 8.589632095939577e-05, "loss": 0.915824031829834, "memory(GiB)": 91.52, "step": 43595, "token_acc": 0.7481496502078475, "train_speed(iter/s)": 0.154501 }, { "epoch": 0.5657396624376761, "grad_norm": 0.7126200795173645, "learning_rate": 8.589258694296512e-05, "loss": 0.8955400466918946, "memory(GiB)": 91.52, "step": 43600, "token_acc": 0.7484992321652939, "train_speed(iter/s)": 0.154496 }, { "epoch": 0.5658045408393318, "grad_norm": 0.728805422782898, "learning_rate": 8.58888525134849e-05, "loss": 0.8461600303649902, "memory(GiB)": 91.52, "step": 43605, "token_acc": 0.7721355129817294, "train_speed(iter/s)": 0.154492 }, { "epoch": 0.5658694192409875, "grad_norm": 0.7312848567962646, "learning_rate": 8.588511767099808e-05, "loss": 0.9165973663330078, "memory(GiB)": 91.52, "step": 43610, "token_acc": 0.7407596227377007, "train_speed(iter/s)": 0.154486 }, { "epoch": 0.5659342976426432, "grad_norm": 0.8271673917770386, "learning_rate": 8.588138241554762e-05, "loss": 0.9275257110595703, "memory(GiB)": 91.52, "step": 43615, "token_acc": 0.7468876690704266, "train_speed(iter/s)": 0.154481 }, { "epoch": 0.565999176044299, "grad_norm": 0.7288817167282104, "learning_rate": 8.587764674717652e-05, "loss": 0.9224594116210938, "memory(GiB)": 91.52, "step": 43620, "token_acc": 0.7594486601415008, "train_speed(iter/s)": 0.154476 }, { "epoch": 0.5660640544459546, "grad_norm": 0.7400733828544617, "learning_rate": 8.587391066592778e-05, "loss": 0.9350105285644531, "memory(GiB)": 91.52, "step": 43625, "token_acc": 0.7478545618789522, "train_speed(iter/s)": 0.154471 }, { "epoch": 0.5661289328476103, "grad_norm": 0.7399243712425232, "learning_rate": 8.587017417184438e-05, "loss": 0.925076961517334, "memory(GiB)": 91.52, "step": 43630, "token_acc": 0.7542398894942829, "train_speed(iter/s)": 0.154466 }, { "epoch": 0.566193811249266, "grad_norm": 0.7643298506736755, "learning_rate": 8.586643726496932e-05, "loss": 0.9088029861450195, "memory(GiB)": 91.52, "step": 43635, "token_acc": 0.7574380612355296, "train_speed(iter/s)": 0.154462 }, { "epoch": 0.5662586896509217, "grad_norm": 0.8213031888008118, "learning_rate": 8.586269994534562e-05, "loss": 0.9179512023925781, "memory(GiB)": 91.52, "step": 43640, "token_acc": 0.7436568792469297, "train_speed(iter/s)": 0.154457 }, { "epoch": 0.5663235680525774, "grad_norm": 0.7973725199699402, "learning_rate": 8.585896221301627e-05, "loss": 0.9261849403381348, "memory(GiB)": 91.52, "step": 43645, "token_acc": 0.7394563516611476, "train_speed(iter/s)": 0.154452 }, { "epoch": 0.5663884464542331, "grad_norm": 0.7433379292488098, "learning_rate": 8.585522406802429e-05, "loss": 0.9281151771545411, "memory(GiB)": 91.52, "step": 43650, "token_acc": 0.7367512567379323, "train_speed(iter/s)": 0.154447 }, { "epoch": 0.5664533248558888, "grad_norm": 0.7168590426445007, "learning_rate": 8.58514855104127e-05, "loss": 0.9322968482971191, "memory(GiB)": 91.52, "step": 43655, "token_acc": 0.7496540271242734, "train_speed(iter/s)": 0.154442 }, { "epoch": 0.5665182032575445, "grad_norm": 0.7640960812568665, "learning_rate": 8.584774654022453e-05, "loss": 0.9095611572265625, "memory(GiB)": 91.52, "step": 43660, "token_acc": 0.7563074444322384, "train_speed(iter/s)": 0.154439 }, { "epoch": 0.5665830816592002, "grad_norm": 0.8008313775062561, "learning_rate": 8.584400715750278e-05, "loss": 0.938775634765625, "memory(GiB)": 91.52, "step": 43665, "token_acc": 0.7368275000867183, "train_speed(iter/s)": 0.154434 }, { "epoch": 0.5666479600608559, "grad_norm": 0.9272781610488892, "learning_rate": 8.584026736229053e-05, "loss": 0.9483707427978516, "memory(GiB)": 91.52, "step": 43670, "token_acc": 0.7696318018898664, "train_speed(iter/s)": 0.15443 }, { "epoch": 0.5667128384625116, "grad_norm": 0.75736403465271, "learning_rate": 8.583652715463078e-05, "loss": 0.8913893699645996, "memory(GiB)": 91.52, "step": 43675, "token_acc": 0.7693211983314372, "train_speed(iter/s)": 0.154424 }, { "epoch": 0.5667777168641673, "grad_norm": 0.8239261507987976, "learning_rate": 8.583278653456656e-05, "loss": 0.901385498046875, "memory(GiB)": 91.52, "step": 43680, "token_acc": 0.729560482302606, "train_speed(iter/s)": 0.15442 }, { "epoch": 0.566842595265823, "grad_norm": 0.853055477142334, "learning_rate": 8.582904550214096e-05, "loss": 0.9553153991699219, "memory(GiB)": 91.52, "step": 43685, "token_acc": 0.7475314416380834, "train_speed(iter/s)": 0.154415 }, { "epoch": 0.5669074736674787, "grad_norm": 0.7227967977523804, "learning_rate": 8.582530405739702e-05, "loss": 0.8896180152893066, "memory(GiB)": 91.52, "step": 43690, "token_acc": 0.7536873156342183, "train_speed(iter/s)": 0.154409 }, { "epoch": 0.5669723520691344, "grad_norm": 0.8225414156913757, "learning_rate": 8.582156220037775e-05, "loss": 0.970036792755127, "memory(GiB)": 91.52, "step": 43695, "token_acc": 0.7364839175728082, "train_speed(iter/s)": 0.154404 }, { "epoch": 0.5670372304707901, "grad_norm": 0.6857432723045349, "learning_rate": 8.581781993112629e-05, "loss": 0.8859535217285156, "memory(GiB)": 91.52, "step": 43700, "token_acc": 0.7549084249084249, "train_speed(iter/s)": 0.1544 }, { "epoch": 0.5671021088724458, "grad_norm": 0.7618523240089417, "learning_rate": 8.581407724968564e-05, "loss": 0.9110061645507812, "memory(GiB)": 91.52, "step": 43705, "token_acc": 0.7572776913897227, "train_speed(iter/s)": 0.154395 }, { "epoch": 0.5671669872741015, "grad_norm": 0.7493224143981934, "learning_rate": 8.581033415609889e-05, "loss": 0.9427465438842774, "memory(GiB)": 91.52, "step": 43710, "token_acc": 0.748405677844065, "train_speed(iter/s)": 0.154389 }, { "epoch": 0.5672318656757572, "grad_norm": 0.7267886996269226, "learning_rate": 8.580659065040914e-05, "loss": 0.9070043563842773, "memory(GiB)": 91.52, "step": 43715, "token_acc": 0.7677127912505944, "train_speed(iter/s)": 0.154385 }, { "epoch": 0.5672967440774129, "grad_norm": 0.6606224775314331, "learning_rate": 8.580284673265944e-05, "loss": 0.9346794128417969, "memory(GiB)": 91.52, "step": 43720, "token_acc": 0.7416745265907277, "train_speed(iter/s)": 0.15438 }, { "epoch": 0.5673616224790686, "grad_norm": 0.7970084547996521, "learning_rate": 8.579910240289287e-05, "loss": 0.8943936347961425, "memory(GiB)": 91.52, "step": 43725, "token_acc": 0.7402915281481752, "train_speed(iter/s)": 0.154375 }, { "epoch": 0.5674265008807243, "grad_norm": 0.7702183723449707, "learning_rate": 8.579535766115254e-05, "loss": 0.9454433441162109, "memory(GiB)": 91.52, "step": 43730, "token_acc": 0.7547056326068498, "train_speed(iter/s)": 0.154371 }, { "epoch": 0.56749137928238, "grad_norm": 0.7005813121795654, "learning_rate": 8.579161250748155e-05, "loss": 0.9123184204101562, "memory(GiB)": 91.52, "step": 43735, "token_acc": 0.7407685995623632, "train_speed(iter/s)": 0.154366 }, { "epoch": 0.5675562576840357, "grad_norm": 0.7569767832756042, "learning_rate": 8.578786694192298e-05, "loss": 0.9197386741638184, "memory(GiB)": 91.52, "step": 43740, "token_acc": 0.7467543359388105, "train_speed(iter/s)": 0.154362 }, { "epoch": 0.5676211360856914, "grad_norm": 0.6933754682540894, "learning_rate": 8.578412096451993e-05, "loss": 0.9606342315673828, "memory(GiB)": 91.52, "step": 43745, "token_acc": 0.7514395393474088, "train_speed(iter/s)": 0.154357 }, { "epoch": 0.5676860144873471, "grad_norm": 0.7893430590629578, "learning_rate": 8.578037457531553e-05, "loss": 0.9466302871704102, "memory(GiB)": 91.52, "step": 43750, "token_acc": 0.7635557764798475, "train_speed(iter/s)": 0.154352 }, { "epoch": 0.5677508928890028, "grad_norm": 0.7871921062469482, "learning_rate": 8.577662777435289e-05, "loss": 0.8775595664978028, "memory(GiB)": 91.52, "step": 43755, "token_acc": 0.7483429394812681, "train_speed(iter/s)": 0.154348 }, { "epoch": 0.5678157712906585, "grad_norm": 0.8290160298347473, "learning_rate": 8.577288056167511e-05, "loss": 0.9020798683166504, "memory(GiB)": 91.52, "step": 43760, "token_acc": 0.7705854355069015, "train_speed(iter/s)": 0.154342 }, { "epoch": 0.5678806496923142, "grad_norm": 0.7602367997169495, "learning_rate": 8.576913293732532e-05, "loss": 0.9175470352172852, "memory(GiB)": 91.52, "step": 43765, "token_acc": 0.7375468012371805, "train_speed(iter/s)": 0.154337 }, { "epoch": 0.5679455280939699, "grad_norm": 0.7938994765281677, "learning_rate": 8.576538490134665e-05, "loss": 0.9717892646789551, "memory(GiB)": 91.52, "step": 43770, "token_acc": 0.7357250781826488, "train_speed(iter/s)": 0.154334 }, { "epoch": 0.5680104064956256, "grad_norm": 0.7256351709365845, "learning_rate": 8.576163645378224e-05, "loss": 0.9030717849731446, "memory(GiB)": 91.52, "step": 43775, "token_acc": 0.7491749174917491, "train_speed(iter/s)": 0.154328 }, { "epoch": 0.5680752848972813, "grad_norm": 0.8271375298500061, "learning_rate": 8.575788759467522e-05, "loss": 0.9156285285949707, "memory(GiB)": 91.52, "step": 43780, "token_acc": 0.7489102924634421, "train_speed(iter/s)": 0.154323 }, { "epoch": 0.568140163298937, "grad_norm": 0.8009006381034851, "learning_rate": 8.575413832406872e-05, "loss": 0.930760383605957, "memory(GiB)": 91.52, "step": 43785, "token_acc": 0.7383370020592204, "train_speed(iter/s)": 0.154319 }, { "epoch": 0.5682050417005927, "grad_norm": 0.7379192113876343, "learning_rate": 8.57503886420059e-05, "loss": 0.8988612174987793, "memory(GiB)": 91.52, "step": 43790, "token_acc": 0.7642432522246427, "train_speed(iter/s)": 0.154314 }, { "epoch": 0.5682699201022484, "grad_norm": 0.747654378414154, "learning_rate": 8.574663854852992e-05, "loss": 0.8781879425048829, "memory(GiB)": 91.52, "step": 43795, "token_acc": 0.8006016573958452, "train_speed(iter/s)": 0.154308 }, { "epoch": 0.568334798503904, "grad_norm": 0.7740858793258667, "learning_rate": 8.574288804368391e-05, "loss": 0.9496757507324218, "memory(GiB)": 91.52, "step": 43800, "token_acc": 0.7416247457989938, "train_speed(iter/s)": 0.154303 }, { "epoch": 0.5683996769055597, "grad_norm": 0.7490834593772888, "learning_rate": 8.573913712751105e-05, "loss": 0.9245425224304199, "memory(GiB)": 91.52, "step": 43805, "token_acc": 0.7612919414969889, "train_speed(iter/s)": 0.154299 }, { "epoch": 0.5684645553072154, "grad_norm": 0.7011309862136841, "learning_rate": 8.573538580005452e-05, "loss": 0.9307966232299805, "memory(GiB)": 91.52, "step": 43810, "token_acc": 0.7465008327132278, "train_speed(iter/s)": 0.154295 }, { "epoch": 0.5685294337088711, "grad_norm": 0.7663331627845764, "learning_rate": 8.573163406135745e-05, "loss": 0.9582984924316407, "memory(GiB)": 91.52, "step": 43815, "token_acc": 0.725701428425323, "train_speed(iter/s)": 0.15429 }, { "epoch": 0.5685943121105268, "grad_norm": 0.8919214010238647, "learning_rate": 8.572788191146302e-05, "loss": 0.9197410583496094, "memory(GiB)": 91.52, "step": 43820, "token_acc": 0.7754705213339692, "train_speed(iter/s)": 0.154286 }, { "epoch": 0.5686591905121825, "grad_norm": 0.8023800253868103, "learning_rate": 8.572412935041445e-05, "loss": 0.9466917037963867, "memory(GiB)": 91.52, "step": 43825, "token_acc": 0.7407055344317701, "train_speed(iter/s)": 0.154282 }, { "epoch": 0.5687240689138382, "grad_norm": 0.799128532409668, "learning_rate": 8.572037637825487e-05, "loss": 0.8999614715576172, "memory(GiB)": 91.52, "step": 43830, "token_acc": 0.7387291595780878, "train_speed(iter/s)": 0.154277 }, { "epoch": 0.5687889473154939, "grad_norm": 0.7904698848724365, "learning_rate": 8.571662299502753e-05, "loss": 0.8969794273376465, "memory(GiB)": 91.52, "step": 43835, "token_acc": 0.7554418768742283, "train_speed(iter/s)": 0.154272 }, { "epoch": 0.5688538257171496, "grad_norm": 0.8053157329559326, "learning_rate": 8.571286920077559e-05, "loss": 0.8923650741577148, "memory(GiB)": 91.52, "step": 43840, "token_acc": 0.7399290336046754, "train_speed(iter/s)": 0.154267 }, { "epoch": 0.5689187041188053, "grad_norm": 0.703599750995636, "learning_rate": 8.570911499554222e-05, "loss": 0.9064450263977051, "memory(GiB)": 91.52, "step": 43845, "token_acc": 0.7607640760995148, "train_speed(iter/s)": 0.154262 }, { "epoch": 0.568983582520461, "grad_norm": 0.7800384163856506, "learning_rate": 8.570536037937068e-05, "loss": 0.8870513916015625, "memory(GiB)": 91.52, "step": 43850, "token_acc": 0.7623265538870937, "train_speed(iter/s)": 0.154258 }, { "epoch": 0.5690484609221167, "grad_norm": 0.8046353459358215, "learning_rate": 8.570160535230414e-05, "loss": 0.9409196853637696, "memory(GiB)": 91.52, "step": 43855, "token_acc": 0.7437577901974187, "train_speed(iter/s)": 0.154254 }, { "epoch": 0.5691133393237724, "grad_norm": 0.7449632883071899, "learning_rate": 8.569784991438583e-05, "loss": 0.9286191940307618, "memory(GiB)": 91.52, "step": 43860, "token_acc": 0.7525266926464531, "train_speed(iter/s)": 0.154249 }, { "epoch": 0.5691782177254281, "grad_norm": 0.7443687319755554, "learning_rate": 8.569409406565895e-05, "loss": 0.924166488647461, "memory(GiB)": 91.52, "step": 43865, "token_acc": 0.7478286689926509, "train_speed(iter/s)": 0.154243 }, { "epoch": 0.5692430961270838, "grad_norm": 0.7515287399291992, "learning_rate": 8.569033780616673e-05, "loss": 0.8708775520324707, "memory(GiB)": 91.52, "step": 43870, "token_acc": 0.7516081147946561, "train_speed(iter/s)": 0.15424 }, { "epoch": 0.5693079745287395, "grad_norm": 0.6846035718917847, "learning_rate": 8.568658113595241e-05, "loss": 0.8903367042541503, "memory(GiB)": 91.52, "step": 43875, "token_acc": 0.7445387702909104, "train_speed(iter/s)": 0.154236 }, { "epoch": 0.5693728529303952, "grad_norm": 0.6329512596130371, "learning_rate": 8.568282405505921e-05, "loss": 0.8581023216247559, "memory(GiB)": 91.52, "step": 43880, "token_acc": 0.7725444633121816, "train_speed(iter/s)": 0.15423 }, { "epoch": 0.5694377313320509, "grad_norm": 0.7242279052734375, "learning_rate": 8.567906656353038e-05, "loss": 0.9194930076599122, "memory(GiB)": 91.52, "step": 43885, "token_acc": 0.7454948301329395, "train_speed(iter/s)": 0.154226 }, { "epoch": 0.5695026097337066, "grad_norm": 0.8283220529556274, "learning_rate": 8.567530866140913e-05, "loss": 0.9110853195190429, "memory(GiB)": 91.52, "step": 43890, "token_acc": 0.7407323432717379, "train_speed(iter/s)": 0.154222 }, { "epoch": 0.5695674881353623, "grad_norm": 0.7242886424064636, "learning_rate": 8.567155034873872e-05, "loss": 0.9060588836669922, "memory(GiB)": 91.52, "step": 43895, "token_acc": 0.7410981025433993, "train_speed(iter/s)": 0.154217 }, { "epoch": 0.569632366537018, "grad_norm": 0.8364166021347046, "learning_rate": 8.566779162556242e-05, "loss": 0.8955392837524414, "memory(GiB)": 91.52, "step": 43900, "token_acc": 0.7844949775066248, "train_speed(iter/s)": 0.154212 }, { "epoch": 0.5696972449386737, "grad_norm": 0.8840537667274475, "learning_rate": 8.566403249192346e-05, "loss": 0.8923969268798828, "memory(GiB)": 91.52, "step": 43905, "token_acc": 0.7765325182543725, "train_speed(iter/s)": 0.154207 }, { "epoch": 0.5697621233403294, "grad_norm": 0.8403375148773193, "learning_rate": 8.56602729478651e-05, "loss": 0.8775421142578125, "memory(GiB)": 91.52, "step": 43910, "token_acc": 0.7693696763202725, "train_speed(iter/s)": 0.154203 }, { "epoch": 0.5698270017419851, "grad_norm": 0.9331811666488647, "learning_rate": 8.565651299343063e-05, "loss": 0.9214249610900879, "memory(GiB)": 91.52, "step": 43915, "token_acc": 0.7545730183587113, "train_speed(iter/s)": 0.154198 }, { "epoch": 0.5698918801436408, "grad_norm": 0.8239490985870361, "learning_rate": 8.56527526286633e-05, "loss": 0.9135526657104492, "memory(GiB)": 91.52, "step": 43920, "token_acc": 0.7539174744262672, "train_speed(iter/s)": 0.154194 }, { "epoch": 0.5699567585452965, "grad_norm": 0.8287410736083984, "learning_rate": 8.56489918536064e-05, "loss": 0.9520803451538086, "memory(GiB)": 91.52, "step": 43925, "token_acc": 0.7485565819861432, "train_speed(iter/s)": 0.154189 }, { "epoch": 0.5700216369469522, "grad_norm": 0.7596904039382935, "learning_rate": 8.564523066830317e-05, "loss": 0.960689926147461, "memory(GiB)": 91.52, "step": 43930, "token_acc": 0.7464982778415614, "train_speed(iter/s)": 0.154185 }, { "epoch": 0.5700865153486079, "grad_norm": 0.6671377420425415, "learning_rate": 8.564146907279694e-05, "loss": 0.8420818328857422, "memory(GiB)": 91.52, "step": 43935, "token_acc": 0.7616746037387163, "train_speed(iter/s)": 0.154179 }, { "epoch": 0.5701513937502636, "grad_norm": 0.7516713738441467, "learning_rate": 8.563770706713097e-05, "loss": 0.8832836151123047, "memory(GiB)": 91.52, "step": 43940, "token_acc": 0.769803241920685, "train_speed(iter/s)": 0.154174 }, { "epoch": 0.5702162721519193, "grad_norm": 0.6815600395202637, "learning_rate": 8.563394465134856e-05, "loss": 0.9359771728515625, "memory(GiB)": 91.52, "step": 43945, "token_acc": 0.7531499439276997, "train_speed(iter/s)": 0.154168 }, { "epoch": 0.570281150553575, "grad_norm": 0.7129833698272705, "learning_rate": 8.563018182549301e-05, "loss": 0.914396858215332, "memory(GiB)": 91.52, "step": 43950, "token_acc": 0.7516373608646556, "train_speed(iter/s)": 0.154164 }, { "epoch": 0.5703460289552307, "grad_norm": 0.686015784740448, "learning_rate": 8.56264185896076e-05, "loss": 0.8888575553894043, "memory(GiB)": 91.52, "step": 43955, "token_acc": 0.7480913201393522, "train_speed(iter/s)": 0.154159 }, { "epoch": 0.5704109073568864, "grad_norm": 0.8068249225616455, "learning_rate": 8.56226549437357e-05, "loss": 0.9571470260620117, "memory(GiB)": 91.52, "step": 43960, "token_acc": 0.7459738172565038, "train_speed(iter/s)": 0.154155 }, { "epoch": 0.5704757857585421, "grad_norm": 0.801662802696228, "learning_rate": 8.561889088792054e-05, "loss": 0.9037666320800781, "memory(GiB)": 91.52, "step": 43965, "token_acc": 0.7731116732923744, "train_speed(iter/s)": 0.15415 }, { "epoch": 0.5705406641601978, "grad_norm": 0.8431240320205688, "learning_rate": 8.561512642220551e-05, "loss": 0.9516681671142578, "memory(GiB)": 91.52, "step": 43970, "token_acc": 0.757230915125652, "train_speed(iter/s)": 0.154146 }, { "epoch": 0.5706055425618535, "grad_norm": 0.8572371602058411, "learning_rate": 8.561136154663388e-05, "loss": 0.8444992065429687, "memory(GiB)": 91.52, "step": 43975, "token_acc": 0.7678589800747625, "train_speed(iter/s)": 0.154141 }, { "epoch": 0.5706704209635092, "grad_norm": 0.8900598287582397, "learning_rate": 8.560759626124898e-05, "loss": 0.9170310974121094, "memory(GiB)": 91.52, "step": 43980, "token_acc": 0.7476007677543186, "train_speed(iter/s)": 0.154137 }, { "epoch": 0.5707352993651649, "grad_norm": 0.772415816783905, "learning_rate": 8.560383056609417e-05, "loss": 0.9009620666503906, "memory(GiB)": 91.52, "step": 43985, "token_acc": 0.7485095880007594, "train_speed(iter/s)": 0.154133 }, { "epoch": 0.5708001777668206, "grad_norm": 0.7196182012557983, "learning_rate": 8.560006446121276e-05, "loss": 0.927702808380127, "memory(GiB)": 91.52, "step": 43990, "token_acc": 0.7523928024502297, "train_speed(iter/s)": 0.154128 }, { "epoch": 0.5708650561684763, "grad_norm": 0.6576011180877686, "learning_rate": 8.559629794664812e-05, "loss": 0.8886501312255859, "memory(GiB)": 91.52, "step": 43995, "token_acc": 0.7590244555217289, "train_speed(iter/s)": 0.154123 }, { "epoch": 0.570929934570132, "grad_norm": 0.8157511949539185, "learning_rate": 8.559253102244355e-05, "loss": 0.8950329780578613, "memory(GiB)": 91.52, "step": 44000, "token_acc": 0.7641193621539901, "train_speed(iter/s)": 0.154119 }, { "epoch": 0.5709948129717877, "grad_norm": 0.6656694412231445, "learning_rate": 8.558876368864244e-05, "loss": 0.904631519317627, "memory(GiB)": 91.52, "step": 44005, "token_acc": 0.7609453700116234, "train_speed(iter/s)": 0.154115 }, { "epoch": 0.5710596913734434, "grad_norm": 0.7685797214508057, "learning_rate": 8.558499594528811e-05, "loss": 0.9126178741455078, "memory(GiB)": 91.52, "step": 44010, "token_acc": 0.7596447669977082, "train_speed(iter/s)": 0.154109 }, { "epoch": 0.5711245697750991, "grad_norm": 0.7207536697387695, "learning_rate": 8.558122779242395e-05, "loss": 0.8612977027893066, "memory(GiB)": 91.52, "step": 44015, "token_acc": 0.7723785166240409, "train_speed(iter/s)": 0.154105 }, { "epoch": 0.5711894481767548, "grad_norm": 0.806618332862854, "learning_rate": 8.557745923009332e-05, "loss": 0.9549514770507812, "memory(GiB)": 91.52, "step": 44020, "token_acc": 0.7620185346075876, "train_speed(iter/s)": 0.1541 }, { "epoch": 0.5712543265784105, "grad_norm": 0.8359271883964539, "learning_rate": 8.557369025833957e-05, "loss": 0.9562770843505859, "memory(GiB)": 91.52, "step": 44025, "token_acc": 0.7511285574092247, "train_speed(iter/s)": 0.154096 }, { "epoch": 0.5713192049800662, "grad_norm": 0.7343225479125977, "learning_rate": 8.556992087720608e-05, "loss": 0.9159065246582031, "memory(GiB)": 91.52, "step": 44030, "token_acc": 0.7510790781008225, "train_speed(iter/s)": 0.154092 }, { "epoch": 0.5713840833817219, "grad_norm": 0.8049405813217163, "learning_rate": 8.556615108673622e-05, "loss": 0.9440395355224609, "memory(GiB)": 91.52, "step": 44035, "token_acc": 0.7236040771844349, "train_speed(iter/s)": 0.154087 }, { "epoch": 0.5714489617833775, "grad_norm": 0.8903355002403259, "learning_rate": 8.556238088697339e-05, "loss": 0.897929573059082, "memory(GiB)": 91.52, "step": 44040, "token_acc": 0.760480377166157, "train_speed(iter/s)": 0.154082 }, { "epoch": 0.5715138401850332, "grad_norm": 0.8851019144058228, "learning_rate": 8.555861027796098e-05, "loss": 0.9428341865539551, "memory(GiB)": 91.52, "step": 44045, "token_acc": 0.7606837606837606, "train_speed(iter/s)": 0.154077 }, { "epoch": 0.5715787185866888, "grad_norm": 0.7473590970039368, "learning_rate": 8.555483925974236e-05, "loss": 0.9680817604064942, "memory(GiB)": 91.52, "step": 44050, "token_acc": 0.7435149038636945, "train_speed(iter/s)": 0.154073 }, { "epoch": 0.5716435969883445, "grad_norm": 0.6935569047927856, "learning_rate": 8.555106783236094e-05, "loss": 0.8791355133056641, "memory(GiB)": 91.52, "step": 44055, "token_acc": 0.7707535121328225, "train_speed(iter/s)": 0.154068 }, { "epoch": 0.5717084753900002, "grad_norm": 0.7581482529640198, "learning_rate": 8.554729599586013e-05, "loss": 0.9172736167907715, "memory(GiB)": 91.52, "step": 44060, "token_acc": 0.7624250214224507, "train_speed(iter/s)": 0.154064 }, { "epoch": 0.571773353791656, "grad_norm": 0.7126219868659973, "learning_rate": 8.554352375028331e-05, "loss": 0.8721264839172364, "memory(GiB)": 91.52, "step": 44065, "token_acc": 0.7762268108342675, "train_speed(iter/s)": 0.154059 }, { "epoch": 0.5718382321933116, "grad_norm": 0.6834098100662231, "learning_rate": 8.553975109567392e-05, "loss": 0.9157662391662598, "memory(GiB)": 91.52, "step": 44070, "token_acc": 0.7616014897579143, "train_speed(iter/s)": 0.154055 }, { "epoch": 0.5719031105949673, "grad_norm": 0.799528181552887, "learning_rate": 8.553597803207534e-05, "loss": 0.9129706382751465, "memory(GiB)": 91.52, "step": 44075, "token_acc": 0.7650630703166935, "train_speed(iter/s)": 0.15405 }, { "epoch": 0.571967988996623, "grad_norm": 0.7321304082870483, "learning_rate": 8.553220455953104e-05, "loss": 0.8932443618774414, "memory(GiB)": 91.52, "step": 44080, "token_acc": 0.7740434332988625, "train_speed(iter/s)": 0.154044 }, { "epoch": 0.5720328673982787, "grad_norm": 0.7537894248962402, "learning_rate": 8.552843067808441e-05, "loss": 0.9008808135986328, "memory(GiB)": 91.52, "step": 44085, "token_acc": 0.745697896749522, "train_speed(iter/s)": 0.154039 }, { "epoch": 0.5720977457999344, "grad_norm": 0.7975993752479553, "learning_rate": 8.552465638777886e-05, "loss": 0.934353256225586, "memory(GiB)": 91.52, "step": 44090, "token_acc": 0.7406722054380664, "train_speed(iter/s)": 0.154034 }, { "epoch": 0.5721626242015901, "grad_norm": 0.8067458868026733, "learning_rate": 8.552088168865788e-05, "loss": 0.921700382232666, "memory(GiB)": 91.52, "step": 44095, "token_acc": 0.7600748873516532, "train_speed(iter/s)": 0.154029 }, { "epoch": 0.5722275026032458, "grad_norm": 0.7091606259346008, "learning_rate": 8.551710658076486e-05, "loss": 0.903846549987793, "memory(GiB)": 91.52, "step": 44100, "token_acc": 0.7322071653981808, "train_speed(iter/s)": 0.154024 }, { "epoch": 0.5722923810049015, "grad_norm": 0.7009967565536499, "learning_rate": 8.551333106414328e-05, "loss": 0.9189184188842774, "memory(GiB)": 91.52, "step": 44105, "token_acc": 0.7312376097422827, "train_speed(iter/s)": 0.15402 }, { "epoch": 0.5723572594065572, "grad_norm": 0.7694882750511169, "learning_rate": 8.550955513883656e-05, "loss": 0.8846412658691406, "memory(GiB)": 91.52, "step": 44110, "token_acc": 0.7742584519708563, "train_speed(iter/s)": 0.154014 }, { "epoch": 0.5724221378082129, "grad_norm": 0.7729194760322571, "learning_rate": 8.550577880488818e-05, "loss": 0.9144983291625977, "memory(GiB)": 91.52, "step": 44115, "token_acc": 0.7678205376136144, "train_speed(iter/s)": 0.154011 }, { "epoch": 0.5724870162098686, "grad_norm": 0.7736478447914124, "learning_rate": 8.550200206234156e-05, "loss": 0.9321495056152344, "memory(GiB)": 91.52, "step": 44120, "token_acc": 0.7614028301582697, "train_speed(iter/s)": 0.154006 }, { "epoch": 0.5725518946115243, "grad_norm": 0.754311203956604, "learning_rate": 8.54982249112402e-05, "loss": 0.8921092987060547, "memory(GiB)": 91.52, "step": 44125, "token_acc": 0.7614614928703414, "train_speed(iter/s)": 0.154 }, { "epoch": 0.57261677301318, "grad_norm": 0.7833017706871033, "learning_rate": 8.549444735162755e-05, "loss": 0.914910888671875, "memory(GiB)": 91.52, "step": 44130, "token_acc": 0.7406383314899668, "train_speed(iter/s)": 0.153997 }, { "epoch": 0.5726816514148357, "grad_norm": 0.6904562711715698, "learning_rate": 8.549066938354708e-05, "loss": 0.9020313262939453, "memory(GiB)": 91.52, "step": 44135, "token_acc": 0.7454145002888504, "train_speed(iter/s)": 0.153992 }, { "epoch": 0.5727465298164914, "grad_norm": 0.7739303112030029, "learning_rate": 8.548689100704228e-05, "loss": 0.9232782363891602, "memory(GiB)": 91.52, "step": 44140, "token_acc": 0.750471424427038, "train_speed(iter/s)": 0.153987 }, { "epoch": 0.5728114082181471, "grad_norm": 0.7802118062973022, "learning_rate": 8.548311222215661e-05, "loss": 0.915587043762207, "memory(GiB)": 91.52, "step": 44145, "token_acc": 0.7477911956479802, "train_speed(iter/s)": 0.153983 }, { "epoch": 0.5728762866198028, "grad_norm": 0.7658572196960449, "learning_rate": 8.547933302893359e-05, "loss": 0.9252721786499023, "memory(GiB)": 91.52, "step": 44150, "token_acc": 0.7547819433817904, "train_speed(iter/s)": 0.153979 }, { "epoch": 0.5729411650214585, "grad_norm": 0.7233152389526367, "learning_rate": 8.547555342741666e-05, "loss": 0.9290227890014648, "memory(GiB)": 91.52, "step": 44155, "token_acc": 0.7425206403693072, "train_speed(iter/s)": 0.153974 }, { "epoch": 0.5730060434231142, "grad_norm": 0.8118253946304321, "learning_rate": 8.547177341764936e-05, "loss": 0.8988743782043457, "memory(GiB)": 91.52, "step": 44160, "token_acc": 0.760174468523543, "train_speed(iter/s)": 0.153969 }, { "epoch": 0.5730709218247699, "grad_norm": 0.839516282081604, "learning_rate": 8.546799299967519e-05, "loss": 0.9143062591552734, "memory(GiB)": 91.52, "step": 44165, "token_acc": 0.7356130108423686, "train_speed(iter/s)": 0.153965 }, { "epoch": 0.5731358002264256, "grad_norm": 0.7820832133293152, "learning_rate": 8.546421217353762e-05, "loss": 0.9174610137939453, "memory(GiB)": 91.52, "step": 44170, "token_acc": 0.7670898106034892, "train_speed(iter/s)": 0.153961 }, { "epoch": 0.5732006786280813, "grad_norm": 0.804857611656189, "learning_rate": 8.546043093928018e-05, "loss": 0.9135271072387695, "memory(GiB)": 91.52, "step": 44175, "token_acc": 0.7763387378076352, "train_speed(iter/s)": 0.153956 }, { "epoch": 0.573265557029737, "grad_norm": 0.7790645956993103, "learning_rate": 8.545664929694639e-05, "loss": 0.9237357139587402, "memory(GiB)": 91.52, "step": 44180, "token_acc": 0.7566229718580988, "train_speed(iter/s)": 0.153952 }, { "epoch": 0.5733304354313927, "grad_norm": 0.6665612459182739, "learning_rate": 8.545286724657975e-05, "loss": 0.8600239753723145, "memory(GiB)": 91.52, "step": 44185, "token_acc": 0.7681788511749347, "train_speed(iter/s)": 0.153947 }, { "epoch": 0.5733953138330484, "grad_norm": 0.7552574276924133, "learning_rate": 8.544908478822379e-05, "loss": 0.8526693344116211, "memory(GiB)": 91.52, "step": 44190, "token_acc": 0.7621895897039551, "train_speed(iter/s)": 0.153942 }, { "epoch": 0.5734601922347041, "grad_norm": 0.750300407409668, "learning_rate": 8.544530192192207e-05, "loss": 0.881597900390625, "memory(GiB)": 91.52, "step": 44195, "token_acc": 0.770911097406602, "train_speed(iter/s)": 0.153937 }, { "epoch": 0.5735250706363598, "grad_norm": 0.825406551361084, "learning_rate": 8.544151864771807e-05, "loss": 0.9096166610717773, "memory(GiB)": 91.52, "step": 44200, "token_acc": 0.7580916197851977, "train_speed(iter/s)": 0.153933 }, { "epoch": 0.5735899490380155, "grad_norm": 0.7359864711761475, "learning_rate": 8.543773496565537e-05, "loss": 0.9766885757446289, "memory(GiB)": 91.52, "step": 44205, "token_acc": 0.7235999238916725, "train_speed(iter/s)": 0.153929 }, { "epoch": 0.5736548274396712, "grad_norm": 0.7496774196624756, "learning_rate": 8.54339508757775e-05, "loss": 0.9466279983520508, "memory(GiB)": 91.52, "step": 44210, "token_acc": 0.7293852801278095, "train_speed(iter/s)": 0.153924 }, { "epoch": 0.5737197058413269, "grad_norm": 0.88010174036026, "learning_rate": 8.543016637812799e-05, "loss": 0.9510819435119628, "memory(GiB)": 91.52, "step": 44215, "token_acc": 0.7468154471817968, "train_speed(iter/s)": 0.15392 }, { "epoch": 0.5737845842429826, "grad_norm": 0.7529430389404297, "learning_rate": 8.542638147275042e-05, "loss": 0.8135978698730468, "memory(GiB)": 91.52, "step": 44220, "token_acc": 0.7731002633487272, "train_speed(iter/s)": 0.153916 }, { "epoch": 0.5738494626446383, "grad_norm": 0.8677375912666321, "learning_rate": 8.54225961596883e-05, "loss": 0.8796028137207031, "memory(GiB)": 91.52, "step": 44225, "token_acc": 0.7738155883851248, "train_speed(iter/s)": 0.15391 }, { "epoch": 0.573914341046294, "grad_norm": 0.7950977683067322, "learning_rate": 8.541881043898525e-05, "loss": 0.933889102935791, "memory(GiB)": 91.52, "step": 44230, "token_acc": 0.7565142819447193, "train_speed(iter/s)": 0.153906 }, { "epoch": 0.5739792194479497, "grad_norm": 0.8666817545890808, "learning_rate": 8.541502431068482e-05, "loss": 0.9220006942749024, "memory(GiB)": 91.52, "step": 44235, "token_acc": 0.7689852477521311, "train_speed(iter/s)": 0.153901 }, { "epoch": 0.5740440978496054, "grad_norm": 0.6819759011268616, "learning_rate": 8.541123777483056e-05, "loss": 0.9357687950134277, "memory(GiB)": 91.52, "step": 44240, "token_acc": 0.768469549982155, "train_speed(iter/s)": 0.153897 }, { "epoch": 0.5741089762512611, "grad_norm": 0.757911205291748, "learning_rate": 8.540745083146604e-05, "loss": 0.9203033447265625, "memory(GiB)": 91.52, "step": 44245, "token_acc": 0.7566081383442679, "train_speed(iter/s)": 0.153893 }, { "epoch": 0.5741738546529168, "grad_norm": 0.7524455189704895, "learning_rate": 8.540366348063486e-05, "loss": 0.8945600509643554, "memory(GiB)": 91.52, "step": 44250, "token_acc": 0.7633743274404304, "train_speed(iter/s)": 0.153889 }, { "epoch": 0.5742387330545725, "grad_norm": 0.820406436920166, "learning_rate": 8.539987572238062e-05, "loss": 0.8862221717834473, "memory(GiB)": 91.52, "step": 44255, "token_acc": 0.7501375490591644, "train_speed(iter/s)": 0.153884 }, { "epoch": 0.5743036114562282, "grad_norm": 0.9542938470840454, "learning_rate": 8.539608755674687e-05, "loss": 0.9128377914428711, "memory(GiB)": 91.52, "step": 44260, "token_acc": 0.7507803632236095, "train_speed(iter/s)": 0.153879 }, { "epoch": 0.5743684898578839, "grad_norm": 0.8936229348182678, "learning_rate": 8.539229898377723e-05, "loss": 0.9480062484741211, "memory(GiB)": 91.52, "step": 44265, "token_acc": 0.7453392024857587, "train_speed(iter/s)": 0.153875 }, { "epoch": 0.5744333682595396, "grad_norm": 0.8047939538955688, "learning_rate": 8.53885100035153e-05, "loss": 0.9821059226989746, "memory(GiB)": 91.52, "step": 44270, "token_acc": 0.7492137401462239, "train_speed(iter/s)": 0.153871 }, { "epoch": 0.5744982466611952, "grad_norm": 0.7267900109291077, "learning_rate": 8.538472061600465e-05, "loss": 0.9081556320190429, "memory(GiB)": 91.52, "step": 44275, "token_acc": 0.7391680532445923, "train_speed(iter/s)": 0.153866 }, { "epoch": 0.5745631250628509, "grad_norm": 0.7598186731338501, "learning_rate": 8.538093082128892e-05, "loss": 0.9371268272399902, "memory(GiB)": 91.52, "step": 44280, "token_acc": 0.7483411499044641, "train_speed(iter/s)": 0.153862 }, { "epoch": 0.5746280034645066, "grad_norm": 0.7263488173484802, "learning_rate": 8.537714061941171e-05, "loss": 0.863248634338379, "memory(GiB)": 91.52, "step": 44285, "token_acc": 0.7770446781432112, "train_speed(iter/s)": 0.153857 }, { "epoch": 0.5746928818661623, "grad_norm": 0.7850039601325989, "learning_rate": 8.537335001041666e-05, "loss": 0.9517239570617676, "memory(GiB)": 91.52, "step": 44290, "token_acc": 0.7281102392942187, "train_speed(iter/s)": 0.153854 }, { "epoch": 0.574757760267818, "grad_norm": 0.7401846647262573, "learning_rate": 8.536955899434737e-05, "loss": 0.8748970031738281, "memory(GiB)": 91.52, "step": 44295, "token_acc": 0.7736967847787917, "train_speed(iter/s)": 0.153848 }, { "epoch": 0.5748226386694737, "grad_norm": 0.7716494798660278, "learning_rate": 8.536576757124745e-05, "loss": 0.9324739456176758, "memory(GiB)": 91.52, "step": 44300, "token_acc": 0.7521051616635598, "train_speed(iter/s)": 0.153843 }, { "epoch": 0.5748875170711294, "grad_norm": 0.8497592806816101, "learning_rate": 8.536197574116058e-05, "loss": 0.8976434707641602, "memory(GiB)": 91.52, "step": 44305, "token_acc": 0.7708446177734446, "train_speed(iter/s)": 0.153839 }, { "epoch": 0.5749523954727851, "grad_norm": 0.7158365249633789, "learning_rate": 8.535818350413037e-05, "loss": 0.9075760841369629, "memory(GiB)": 91.52, "step": 44310, "token_acc": 0.7657865998586754, "train_speed(iter/s)": 0.153834 }, { "epoch": 0.5750172738744408, "grad_norm": 0.7831395864486694, "learning_rate": 8.535439086020044e-05, "loss": 0.9682558059692383, "memory(GiB)": 91.52, "step": 44315, "token_acc": 0.7567908564643017, "train_speed(iter/s)": 0.15383 }, { "epoch": 0.5750821522760965, "grad_norm": 0.8079591393470764, "learning_rate": 8.535059780941445e-05, "loss": 0.8942630767822266, "memory(GiB)": 91.52, "step": 44320, "token_acc": 0.7564008522826504, "train_speed(iter/s)": 0.153825 }, { "epoch": 0.5751470306777522, "grad_norm": 0.8385806083679199, "learning_rate": 8.534680435181609e-05, "loss": 0.8973526000976563, "memory(GiB)": 91.52, "step": 44325, "token_acc": 0.7387665672928415, "train_speed(iter/s)": 0.153821 }, { "epoch": 0.5752119090794079, "grad_norm": 0.819815456867218, "learning_rate": 8.534301048744896e-05, "loss": 0.9107393264770508, "memory(GiB)": 91.52, "step": 44330, "token_acc": 0.7594138160730379, "train_speed(iter/s)": 0.153817 }, { "epoch": 0.5752767874810636, "grad_norm": 0.7396105527877808, "learning_rate": 8.533921621635674e-05, "loss": 0.9312749862670898, "memory(GiB)": 91.52, "step": 44335, "token_acc": 0.753037895909076, "train_speed(iter/s)": 0.153812 }, { "epoch": 0.5753416658827193, "grad_norm": 0.8187403678894043, "learning_rate": 8.533542153858309e-05, "loss": 0.8778749465942383, "memory(GiB)": 91.52, "step": 44340, "token_acc": 0.7563919859166737, "train_speed(iter/s)": 0.153809 }, { "epoch": 0.575406544284375, "grad_norm": 0.7572181820869446, "learning_rate": 8.53316264541717e-05, "loss": 0.8469581604003906, "memory(GiB)": 91.52, "step": 44345, "token_acc": 0.7620238984316654, "train_speed(iter/s)": 0.153804 }, { "epoch": 0.5754714226860307, "grad_norm": 0.805940568447113, "learning_rate": 8.532783096316621e-05, "loss": 0.9229881286621093, "memory(GiB)": 91.52, "step": 44350, "token_acc": 0.7597360980207352, "train_speed(iter/s)": 0.153801 }, { "epoch": 0.5755363010876864, "grad_norm": 0.8135092258453369, "learning_rate": 8.532403506561031e-05, "loss": 0.9186086654663086, "memory(GiB)": 91.52, "step": 44355, "token_acc": 0.7620346217690301, "train_speed(iter/s)": 0.153796 }, { "epoch": 0.5756011794893421, "grad_norm": 0.7358550429344177, "learning_rate": 8.532023876154771e-05, "loss": 0.9443243980407715, "memory(GiB)": 91.52, "step": 44360, "token_acc": 0.737198269847914, "train_speed(iter/s)": 0.15379 }, { "epoch": 0.5756660578909978, "grad_norm": 0.9053144454956055, "learning_rate": 8.531644205102206e-05, "loss": 0.9371942520141602, "memory(GiB)": 91.52, "step": 44365, "token_acc": 0.7684489440797279, "train_speed(iter/s)": 0.153787 }, { "epoch": 0.5757309362926535, "grad_norm": 0.7058401703834534, "learning_rate": 8.531264493407706e-05, "loss": 0.8788329124450683, "memory(GiB)": 91.52, "step": 44370, "token_acc": 0.7700569219203093, "train_speed(iter/s)": 0.153782 }, { "epoch": 0.5757958146943092, "grad_norm": 0.8053334355354309, "learning_rate": 8.530884741075644e-05, "loss": 0.9029230117797852, "memory(GiB)": 91.52, "step": 44375, "token_acc": 0.7728511530398323, "train_speed(iter/s)": 0.153778 }, { "epoch": 0.5758606930959649, "grad_norm": 0.7176382541656494, "learning_rate": 8.530504948110385e-05, "loss": 0.8970108985900879, "memory(GiB)": 91.52, "step": 44380, "token_acc": 0.7345501363223266, "train_speed(iter/s)": 0.153773 }, { "epoch": 0.5759255714976206, "grad_norm": 0.8391297459602356, "learning_rate": 8.530125114516304e-05, "loss": 0.9244503021240235, "memory(GiB)": 91.52, "step": 44385, "token_acc": 0.7519391870927707, "train_speed(iter/s)": 0.153768 }, { "epoch": 0.5759904498992763, "grad_norm": 0.7337354421615601, "learning_rate": 8.52974524029777e-05, "loss": 0.9034743309020996, "memory(GiB)": 91.52, "step": 44390, "token_acc": 0.7589065990232082, "train_speed(iter/s)": 0.153763 }, { "epoch": 0.576055328300932, "grad_norm": 0.7452526092529297, "learning_rate": 8.529365325459156e-05, "loss": 0.8876535415649414, "memory(GiB)": 91.52, "step": 44395, "token_acc": 0.7550465329663127, "train_speed(iter/s)": 0.153758 }, { "epoch": 0.5761202067025877, "grad_norm": 0.7917050719261169, "learning_rate": 8.528985370004832e-05, "loss": 0.8807485580444336, "memory(GiB)": 91.52, "step": 44400, "token_acc": 0.7517698545501351, "train_speed(iter/s)": 0.153753 }, { "epoch": 0.5761850851042434, "grad_norm": 0.7209444046020508, "learning_rate": 8.528605373939169e-05, "loss": 0.9020071983337402, "memory(GiB)": 91.52, "step": 44405, "token_acc": 0.7503125500881551, "train_speed(iter/s)": 0.153749 }, { "epoch": 0.5762499635058991, "grad_norm": 0.8093211650848389, "learning_rate": 8.528225337266546e-05, "loss": 0.9539762496948242, "memory(GiB)": 91.52, "step": 44410, "token_acc": 0.7433537394580247, "train_speed(iter/s)": 0.153745 }, { "epoch": 0.5763148419075548, "grad_norm": 0.7699163556098938, "learning_rate": 8.527845259991332e-05, "loss": 0.8986346244812011, "memory(GiB)": 91.52, "step": 44415, "token_acc": 0.7479619871188579, "train_speed(iter/s)": 0.15374 }, { "epoch": 0.5763797203092105, "grad_norm": 0.7746731638908386, "learning_rate": 8.527465142117901e-05, "loss": 0.8988645553588868, "memory(GiB)": 91.52, "step": 44420, "token_acc": 0.7690015809315335, "train_speed(iter/s)": 0.153736 }, { "epoch": 0.5764445987108662, "grad_norm": 0.7043126821517944, "learning_rate": 8.527084983650628e-05, "loss": 0.8534071922302247, "memory(GiB)": 91.52, "step": 44425, "token_acc": 0.7867529420130124, "train_speed(iter/s)": 0.153732 }, { "epoch": 0.5765094771125219, "grad_norm": 0.7041481137275696, "learning_rate": 8.526704784593889e-05, "loss": 0.9023626327514649, "memory(GiB)": 91.52, "step": 44430, "token_acc": 0.7279173419773096, "train_speed(iter/s)": 0.153727 }, { "epoch": 0.5765743555141776, "grad_norm": 0.8284013867378235, "learning_rate": 8.526324544952059e-05, "loss": 0.9036840438842774, "memory(GiB)": 91.52, "step": 44435, "token_acc": 0.7562160820061717, "train_speed(iter/s)": 0.153723 }, { "epoch": 0.5766392339158333, "grad_norm": 0.7858742475509644, "learning_rate": 8.525944264729511e-05, "loss": 0.9341228485107422, "memory(GiB)": 91.52, "step": 44440, "token_acc": 0.7460446614230178, "train_speed(iter/s)": 0.153719 }, { "epoch": 0.576704112317489, "grad_norm": 0.6927683353424072, "learning_rate": 8.525563943930624e-05, "loss": 0.8594337463378906, "memory(GiB)": 91.52, "step": 44445, "token_acc": 0.7586915297092288, "train_speed(iter/s)": 0.153714 }, { "epoch": 0.5767689907191447, "grad_norm": 0.7184839248657227, "learning_rate": 8.525183582559775e-05, "loss": 0.9274323463439942, "memory(GiB)": 91.52, "step": 44450, "token_acc": 0.7590986106404609, "train_speed(iter/s)": 0.15371 }, { "epoch": 0.5768338691208004, "grad_norm": 0.7759063839912415, "learning_rate": 8.52480318062134e-05, "loss": 0.923564338684082, "memory(GiB)": 91.52, "step": 44455, "token_acc": 0.738024256385934, "train_speed(iter/s)": 0.153706 }, { "epoch": 0.5768987475224561, "grad_norm": 0.8211904168128967, "learning_rate": 8.524422738119696e-05, "loss": 0.9008336067199707, "memory(GiB)": 91.52, "step": 44460, "token_acc": 0.7696166109339762, "train_speed(iter/s)": 0.153701 }, { "epoch": 0.5769636259241118, "grad_norm": 0.7642634510993958, "learning_rate": 8.524042255059223e-05, "loss": 0.9497209548950195, "memory(GiB)": 91.52, "step": 44465, "token_acc": 0.7354112049713565, "train_speed(iter/s)": 0.153696 }, { "epoch": 0.5770285043257675, "grad_norm": 0.8023788928985596, "learning_rate": 8.523661731444298e-05, "loss": 0.9134859085083008, "memory(GiB)": 91.52, "step": 44470, "token_acc": 0.7637103273592508, "train_speed(iter/s)": 0.153691 }, { "epoch": 0.5770933827274232, "grad_norm": 0.8291397094726562, "learning_rate": 8.5232811672793e-05, "loss": 0.9274814605712891, "memory(GiB)": 91.52, "step": 44475, "token_acc": 0.7539936102236422, "train_speed(iter/s)": 0.153688 }, { "epoch": 0.5771582611290789, "grad_norm": 0.8424213528633118, "learning_rate": 8.52290056256861e-05, "loss": 0.9220414161682129, "memory(GiB)": 91.52, "step": 44480, "token_acc": 0.7539243144662281, "train_speed(iter/s)": 0.153683 }, { "epoch": 0.5772231395307346, "grad_norm": 0.7679651379585266, "learning_rate": 8.522519917316608e-05, "loss": 0.8642747879028321, "memory(GiB)": 91.52, "step": 44485, "token_acc": 0.7766502562041595, "train_speed(iter/s)": 0.153678 }, { "epoch": 0.5772880179323903, "grad_norm": 0.7234194874763489, "learning_rate": 8.52213923152767e-05, "loss": 0.8680444717407226, "memory(GiB)": 91.52, "step": 44490, "token_acc": 0.7476810351547276, "train_speed(iter/s)": 0.153675 }, { "epoch": 0.577352896334046, "grad_norm": 0.7180025577545166, "learning_rate": 8.521758505206183e-05, "loss": 0.8731039047241211, "memory(GiB)": 91.52, "step": 44495, "token_acc": 0.7574350969438055, "train_speed(iter/s)": 0.15367 }, { "epoch": 0.5774177747357017, "grad_norm": 0.753547728061676, "learning_rate": 8.521377738356525e-05, "loss": 0.9066349029541015, "memory(GiB)": 91.52, "step": 44500, "token_acc": 0.7557805087594786, "train_speed(iter/s)": 0.153666 }, { "epoch": 0.5774826531373574, "grad_norm": 0.7842033505439758, "learning_rate": 8.520996930983081e-05, "loss": 0.9210588455200195, "memory(GiB)": 91.52, "step": 44505, "token_acc": 0.7534225852809924, "train_speed(iter/s)": 0.153661 }, { "epoch": 0.5775475315390131, "grad_norm": 0.8795467615127563, "learning_rate": 8.520616083090228e-05, "loss": 0.9371474266052247, "memory(GiB)": 91.52, "step": 44510, "token_acc": 0.7482633977683483, "train_speed(iter/s)": 0.153656 }, { "epoch": 0.5776124099406686, "grad_norm": 0.7242360711097717, "learning_rate": 8.52023519468235e-05, "loss": 0.8675403594970703, "memory(GiB)": 91.52, "step": 44515, "token_acc": 0.7568870303571998, "train_speed(iter/s)": 0.153652 }, { "epoch": 0.5776772883423243, "grad_norm": 0.7827286720275879, "learning_rate": 8.519854265763835e-05, "loss": 0.9027471542358398, "memory(GiB)": 91.52, "step": 44520, "token_acc": 0.7463276440962507, "train_speed(iter/s)": 0.153647 }, { "epoch": 0.57774216674398, "grad_norm": 0.6872838139533997, "learning_rate": 8.519473296339065e-05, "loss": 0.8804948806762696, "memory(GiB)": 91.52, "step": 44525, "token_acc": 0.7674418604651163, "train_speed(iter/s)": 0.153642 }, { "epoch": 0.5778070451456357, "grad_norm": 0.7705199718475342, "learning_rate": 8.51909228641242e-05, "loss": 0.9484268188476562, "memory(GiB)": 91.52, "step": 44530, "token_acc": 0.7273193409448281, "train_speed(iter/s)": 0.153637 }, { "epoch": 0.5778719235472914, "grad_norm": 0.8338192701339722, "learning_rate": 8.51871123598829e-05, "loss": 0.9222972869873047, "memory(GiB)": 91.52, "step": 44535, "token_acc": 0.7427106696571611, "train_speed(iter/s)": 0.153634 }, { "epoch": 0.5779368019489471, "grad_norm": 0.8282834887504578, "learning_rate": 8.518330145071056e-05, "loss": 0.9453376770019531, "memory(GiB)": 91.52, "step": 44540, "token_acc": 0.7414472244510906, "train_speed(iter/s)": 0.15363 }, { "epoch": 0.5780016803506028, "grad_norm": 0.7231586575508118, "learning_rate": 8.517949013665105e-05, "loss": 0.8956794738769531, "memory(GiB)": 91.52, "step": 44545, "token_acc": 0.7720080394664718, "train_speed(iter/s)": 0.153625 }, { "epoch": 0.5780665587522585, "grad_norm": 0.7898216843605042, "learning_rate": 8.517567841774823e-05, "loss": 0.9278985977172851, "memory(GiB)": 91.52, "step": 44550, "token_acc": 0.7676690950523081, "train_speed(iter/s)": 0.153621 }, { "epoch": 0.5781314371539142, "grad_norm": 0.8693126440048218, "learning_rate": 8.517186629404597e-05, "loss": 0.8792856216430665, "memory(GiB)": 91.52, "step": 44555, "token_acc": 0.7683080808080808, "train_speed(iter/s)": 0.153617 }, { "epoch": 0.5781963155555699, "grad_norm": 0.7387945055961609, "learning_rate": 8.516805376558814e-05, "loss": 0.9267499923706055, "memory(GiB)": 91.52, "step": 44560, "token_acc": 0.7550827742000742, "train_speed(iter/s)": 0.153611 }, { "epoch": 0.5782611939572256, "grad_norm": 0.6776882410049438, "learning_rate": 8.516424083241861e-05, "loss": 0.8742183685302735, "memory(GiB)": 91.52, "step": 44565, "token_acc": 0.7493278618933069, "train_speed(iter/s)": 0.153607 }, { "epoch": 0.5783260723588813, "grad_norm": 0.6650400757789612, "learning_rate": 8.516042749458124e-05, "loss": 0.8780887603759766, "memory(GiB)": 91.52, "step": 44570, "token_acc": 0.7812940793518753, "train_speed(iter/s)": 0.153602 }, { "epoch": 0.578390950760537, "grad_norm": 0.7110337615013123, "learning_rate": 8.515661375211996e-05, "loss": 0.903896713256836, "memory(GiB)": 91.52, "step": 44575, "token_acc": 0.7387458973916048, "train_speed(iter/s)": 0.153597 }, { "epoch": 0.5784558291621927, "grad_norm": 0.787395179271698, "learning_rate": 8.515279960507863e-05, "loss": 0.8911496162414551, "memory(GiB)": 91.52, "step": 44580, "token_acc": 0.7562741791488939, "train_speed(iter/s)": 0.153591 }, { "epoch": 0.5785207075638484, "grad_norm": 0.675797700881958, "learning_rate": 8.514898505350114e-05, "loss": 0.8863678932189941, "memory(GiB)": 91.52, "step": 44585, "token_acc": 0.7410140617542313, "train_speed(iter/s)": 0.153587 }, { "epoch": 0.5785855859655041, "grad_norm": 0.818820059299469, "learning_rate": 8.514517009743136e-05, "loss": 0.8810685157775879, "memory(GiB)": 91.52, "step": 44590, "token_acc": 0.7550331169257001, "train_speed(iter/s)": 0.153582 }, { "epoch": 0.5786504643671598, "grad_norm": 0.7085621953010559, "learning_rate": 8.514135473691327e-05, "loss": 0.9056033134460449, "memory(GiB)": 91.52, "step": 44595, "token_acc": 0.7510931718802556, "train_speed(iter/s)": 0.153579 }, { "epoch": 0.5787153427688155, "grad_norm": 0.7720783948898315, "learning_rate": 8.513753897199072e-05, "loss": 0.9038408279418946, "memory(GiB)": 91.52, "step": 44600, "token_acc": 0.7481481481481481, "train_speed(iter/s)": 0.153574 }, { "epoch": 0.5787802211704712, "grad_norm": 0.8013516068458557, "learning_rate": 8.513372280270762e-05, "loss": 0.917724323272705, "memory(GiB)": 91.52, "step": 44605, "token_acc": 0.7431598603556061, "train_speed(iter/s)": 0.153571 }, { "epoch": 0.5788450995721269, "grad_norm": 0.7729446887969971, "learning_rate": 8.51299062291079e-05, "loss": 0.9405421257019043, "memory(GiB)": 91.52, "step": 44610, "token_acc": 0.7464545140089934, "train_speed(iter/s)": 0.153566 }, { "epoch": 0.5789099779737826, "grad_norm": 0.6669530272483826, "learning_rate": 8.512608925123548e-05, "loss": 0.916778564453125, "memory(GiB)": 91.52, "step": 44615, "token_acc": 0.759310296092149, "train_speed(iter/s)": 0.153561 }, { "epoch": 0.5789748563754383, "grad_norm": 0.8818715810775757, "learning_rate": 8.51222718691343e-05, "loss": 0.9241943359375, "memory(GiB)": 91.52, "step": 44620, "token_acc": 0.7402931100114198, "train_speed(iter/s)": 0.153557 }, { "epoch": 0.579039734777094, "grad_norm": 0.6977535486221313, "learning_rate": 8.511845408284826e-05, "loss": 0.9667598724365234, "memory(GiB)": 91.52, "step": 44625, "token_acc": 0.7596217551044991, "train_speed(iter/s)": 0.153552 }, { "epoch": 0.5791046131787497, "grad_norm": 0.7622299194335938, "learning_rate": 8.511463589242132e-05, "loss": 0.9172510147094727, "memory(GiB)": 91.52, "step": 44630, "token_acc": 0.7492874815997995, "train_speed(iter/s)": 0.153548 }, { "epoch": 0.5791694915804054, "grad_norm": 0.7770572304725647, "learning_rate": 8.511081729789743e-05, "loss": 0.9456059455871582, "memory(GiB)": 91.52, "step": 44635, "token_acc": 0.7597786828556059, "train_speed(iter/s)": 0.153542 }, { "epoch": 0.5792343699820611, "grad_norm": 0.7508944869041443, "learning_rate": 8.510699829932047e-05, "loss": 0.9138370513916015, "memory(GiB)": 91.52, "step": 44640, "token_acc": 0.7520240573675688, "train_speed(iter/s)": 0.153538 }, { "epoch": 0.5792992483837168, "grad_norm": 0.6923722624778748, "learning_rate": 8.510317889673446e-05, "loss": 0.8805131912231445, "memory(GiB)": 91.52, "step": 44645, "token_acc": 0.7444301999237726, "train_speed(iter/s)": 0.153533 }, { "epoch": 0.5793641267853725, "grad_norm": 0.7430931925773621, "learning_rate": 8.509935909018332e-05, "loss": 0.9235373497009277, "memory(GiB)": 91.52, "step": 44650, "token_acc": 0.7278983589200635, "train_speed(iter/s)": 0.153528 }, { "epoch": 0.5794290051870282, "grad_norm": 0.7667968273162842, "learning_rate": 8.509553887971102e-05, "loss": 0.9109970092773437, "memory(GiB)": 91.52, "step": 44655, "token_acc": 0.7489810156795065, "train_speed(iter/s)": 0.153524 }, { "epoch": 0.5794938835886839, "grad_norm": 0.81017005443573, "learning_rate": 8.509171826536152e-05, "loss": 0.8958213806152344, "memory(GiB)": 91.52, "step": 44660, "token_acc": 0.7482666666666666, "train_speed(iter/s)": 0.15352 }, { "epoch": 0.5795587619903396, "grad_norm": 0.6856555938720703, "learning_rate": 8.508789724717878e-05, "loss": 0.8681867599487305, "memory(GiB)": 91.52, "step": 44665, "token_acc": 0.7442159383033419, "train_speed(iter/s)": 0.153515 }, { "epoch": 0.5796236403919953, "grad_norm": 0.6758288741111755, "learning_rate": 8.508407582520678e-05, "loss": 0.8926112174987793, "memory(GiB)": 91.52, "step": 44670, "token_acc": 0.7671259089169536, "train_speed(iter/s)": 0.15351 }, { "epoch": 0.579688518793651, "grad_norm": 0.7765833139419556, "learning_rate": 8.508025399948948e-05, "loss": 0.8951045989990234, "memory(GiB)": 91.52, "step": 44675, "token_acc": 0.7600150268091936, "train_speed(iter/s)": 0.153505 }, { "epoch": 0.5797533971953067, "grad_norm": 0.7790072560310364, "learning_rate": 8.507643177007089e-05, "loss": 0.9562945365905762, "memory(GiB)": 91.52, "step": 44680, "token_acc": 0.7541539621090136, "train_speed(iter/s)": 0.153501 }, { "epoch": 0.5798182755969624, "grad_norm": 0.81392902135849, "learning_rate": 8.507260913699497e-05, "loss": 0.9325730323791503, "memory(GiB)": 91.52, "step": 44685, "token_acc": 0.7597954334957516, "train_speed(iter/s)": 0.153498 }, { "epoch": 0.5798831539986181, "grad_norm": 0.7511690258979797, "learning_rate": 8.506878610030573e-05, "loss": 0.9345239639282227, "memory(GiB)": 91.52, "step": 44690, "token_acc": 0.7431563997869697, "train_speed(iter/s)": 0.153493 }, { "epoch": 0.5799480324002738, "grad_norm": 0.7875550389289856, "learning_rate": 8.506496266004715e-05, "loss": 0.9539590835571289, "memory(GiB)": 91.52, "step": 44695, "token_acc": 0.7246381758601856, "train_speed(iter/s)": 0.153489 }, { "epoch": 0.5800129108019295, "grad_norm": 0.7953363656997681, "learning_rate": 8.506113881626323e-05, "loss": 0.9257984161376953, "memory(GiB)": 91.52, "step": 44700, "token_acc": 0.7595266272189349, "train_speed(iter/s)": 0.153485 }, { "epoch": 0.5800777892035852, "grad_norm": 0.8113934397697449, "learning_rate": 8.505731456899799e-05, "loss": 0.8924321174621582, "memory(GiB)": 91.52, "step": 44705, "token_acc": 0.7589566310496543, "train_speed(iter/s)": 0.153481 }, { "epoch": 0.5801426676052409, "grad_norm": 0.7748269438743591, "learning_rate": 8.505348991829544e-05, "loss": 0.9255308151245117, "memory(GiB)": 91.52, "step": 44710, "token_acc": 0.7482603777465464, "train_speed(iter/s)": 0.153476 }, { "epoch": 0.5802075460068966, "grad_norm": 0.8759264945983887, "learning_rate": 8.504966486419956e-05, "loss": 0.8765986442565918, "memory(GiB)": 91.52, "step": 44715, "token_acc": 0.7409999327097773, "train_speed(iter/s)": 0.153471 }, { "epoch": 0.5802724244085523, "grad_norm": 0.8221396803855896, "learning_rate": 8.504583940675441e-05, "loss": 0.9636024475097656, "memory(GiB)": 91.52, "step": 44720, "token_acc": 0.7450263181372112, "train_speed(iter/s)": 0.153468 }, { "epoch": 0.580337302810208, "grad_norm": 0.7244228720664978, "learning_rate": 8.504201354600399e-05, "loss": 0.8970331192016602, "memory(GiB)": 91.52, "step": 44725, "token_acc": 0.7634605523404879, "train_speed(iter/s)": 0.153464 }, { "epoch": 0.5804021812118637, "grad_norm": 0.7890056371688843, "learning_rate": 8.503818728199234e-05, "loss": 0.9062627792358399, "memory(GiB)": 91.52, "step": 44730, "token_acc": 0.7514692698475944, "train_speed(iter/s)": 0.15346 }, { "epoch": 0.5804670596135194, "grad_norm": 0.7390502095222473, "learning_rate": 8.503436061476349e-05, "loss": 0.9547314643859863, "memory(GiB)": 91.52, "step": 44735, "token_acc": 0.7343553459119497, "train_speed(iter/s)": 0.153455 }, { "epoch": 0.5805319380151751, "grad_norm": 0.6884648203849792, "learning_rate": 8.503053354436146e-05, "loss": 0.9087043762207031, "memory(GiB)": 91.52, "step": 44740, "token_acc": 0.7626769787313665, "train_speed(iter/s)": 0.153451 }, { "epoch": 0.5805968164168308, "grad_norm": 0.7177262902259827, "learning_rate": 8.502670607083031e-05, "loss": 0.9009507179260254, "memory(GiB)": 91.52, "step": 44745, "token_acc": 0.7524585072978848, "train_speed(iter/s)": 0.153446 }, { "epoch": 0.5806616948184865, "grad_norm": 0.7426280379295349, "learning_rate": 8.502287819421407e-05, "loss": 0.8962457656860352, "memory(GiB)": 91.52, "step": 44750, "token_acc": 0.7493399928647877, "train_speed(iter/s)": 0.153441 }, { "epoch": 0.5807265732201421, "grad_norm": 0.7878442406654358, "learning_rate": 8.501904991455682e-05, "loss": 0.8635762214660645, "memory(GiB)": 91.52, "step": 44755, "token_acc": 0.7765368715828778, "train_speed(iter/s)": 0.153436 }, { "epoch": 0.5807914516217978, "grad_norm": 0.7266294956207275, "learning_rate": 8.501522123190259e-05, "loss": 0.8624994277954101, "memory(GiB)": 91.52, "step": 44760, "token_acc": 0.7643488255154084, "train_speed(iter/s)": 0.153431 }, { "epoch": 0.5808563300234535, "grad_norm": 0.8101352453231812, "learning_rate": 8.501139214629545e-05, "loss": 0.9572643280029297, "memory(GiB)": 91.52, "step": 44765, "token_acc": 0.7457444105691057, "train_speed(iter/s)": 0.153427 }, { "epoch": 0.5809212084251092, "grad_norm": 0.6927368640899658, "learning_rate": 8.500756265777945e-05, "loss": 0.8615095138549804, "memory(GiB)": 91.52, "step": 44770, "token_acc": 0.7700244366005974, "train_speed(iter/s)": 0.153422 }, { "epoch": 0.5809860868267649, "grad_norm": 0.7712875008583069, "learning_rate": 8.500373276639868e-05, "loss": 0.917481517791748, "memory(GiB)": 91.52, "step": 44775, "token_acc": 0.7643536606081196, "train_speed(iter/s)": 0.153417 }, { "epoch": 0.5810509652284206, "grad_norm": 0.7926610112190247, "learning_rate": 8.499990247219721e-05, "loss": 0.8805230140686036, "memory(GiB)": 91.52, "step": 44780, "token_acc": 0.7478000078923484, "train_speed(iter/s)": 0.153413 }, { "epoch": 0.5811158436300763, "grad_norm": 0.7662168145179749, "learning_rate": 8.499607177521911e-05, "loss": 0.9411605834960938, "memory(GiB)": 91.52, "step": 44785, "token_acc": 0.7559834638816362, "train_speed(iter/s)": 0.153409 }, { "epoch": 0.581180722031732, "grad_norm": 0.8032063841819763, "learning_rate": 8.499224067550846e-05, "loss": 0.8969042778015137, "memory(GiB)": 91.52, "step": 44790, "token_acc": 0.7614018048642305, "train_speed(iter/s)": 0.153405 }, { "epoch": 0.5812456004333877, "grad_norm": 0.7152096033096313, "learning_rate": 8.498840917310937e-05, "loss": 0.8478434562683106, "memory(GiB)": 91.52, "step": 44795, "token_acc": 0.7692807010426387, "train_speed(iter/s)": 0.1534 }, { "epoch": 0.5813104788350434, "grad_norm": 0.7606576681137085, "learning_rate": 8.498457726806592e-05, "loss": 0.8999557495117188, "memory(GiB)": 91.52, "step": 44800, "token_acc": 0.767427146525736, "train_speed(iter/s)": 0.153395 }, { "epoch": 0.5813753572366991, "grad_norm": 0.7968512773513794, "learning_rate": 8.498074496042221e-05, "loss": 0.9474311828613281, "memory(GiB)": 91.52, "step": 44805, "token_acc": 0.7353233099290519, "train_speed(iter/s)": 0.153391 }, { "epoch": 0.5814402356383548, "grad_norm": 0.8211795687675476, "learning_rate": 8.497691225022232e-05, "loss": 0.9455076217651367, "memory(GiB)": 91.52, "step": 44810, "token_acc": 0.7434128952262864, "train_speed(iter/s)": 0.153387 }, { "epoch": 0.5815051140400105, "grad_norm": 0.7073356509208679, "learning_rate": 8.497307913751039e-05, "loss": 0.914691162109375, "memory(GiB)": 91.52, "step": 44815, "token_acc": 0.7614214682655748, "train_speed(iter/s)": 0.153381 }, { "epoch": 0.5815699924416662, "grad_norm": 0.7801418304443359, "learning_rate": 8.49692456223305e-05, "loss": 0.8966626167297364, "memory(GiB)": 91.52, "step": 44820, "token_acc": 0.7527037872129018, "train_speed(iter/s)": 0.153376 }, { "epoch": 0.5816348708433219, "grad_norm": 0.803364634513855, "learning_rate": 8.49654117047268e-05, "loss": 0.9231002807617188, "memory(GiB)": 91.52, "step": 44825, "token_acc": 0.7558390578999019, "train_speed(iter/s)": 0.153372 }, { "epoch": 0.5816997492449776, "grad_norm": 0.6521673202514648, "learning_rate": 8.49615773847434e-05, "loss": 0.9144600868225098, "memory(GiB)": 91.52, "step": 44830, "token_acc": 0.7531580037274798, "train_speed(iter/s)": 0.153369 }, { "epoch": 0.5817646276466333, "grad_norm": 0.7539169788360596, "learning_rate": 8.49577426624244e-05, "loss": 0.91202392578125, "memory(GiB)": 91.52, "step": 44835, "token_acc": 0.7657136738408823, "train_speed(iter/s)": 0.153364 }, { "epoch": 0.581829506048289, "grad_norm": 0.7188588976860046, "learning_rate": 8.495390753781395e-05, "loss": 0.8825021743774414, "memory(GiB)": 91.52, "step": 44840, "token_acc": 0.775623854504441, "train_speed(iter/s)": 0.153359 }, { "epoch": 0.5818943844499447, "grad_norm": 0.7922303080558777, "learning_rate": 8.495007201095618e-05, "loss": 0.9373649597167969, "memory(GiB)": 91.52, "step": 44845, "token_acc": 0.7529169259489732, "train_speed(iter/s)": 0.153356 }, { "epoch": 0.5819592628516004, "grad_norm": 0.6885001063346863, "learning_rate": 8.494623608189524e-05, "loss": 0.8948243141174317, "memory(GiB)": 91.52, "step": 44850, "token_acc": 0.7752361369915826, "train_speed(iter/s)": 0.15335 }, { "epoch": 0.5820241412532561, "grad_norm": 0.704111635684967, "learning_rate": 8.494239975067524e-05, "loss": 0.8937517166137695, "memory(GiB)": 91.52, "step": 44855, "token_acc": 0.7557376513989096, "train_speed(iter/s)": 0.153345 }, { "epoch": 0.5820890196549118, "grad_norm": 0.6796494126319885, "learning_rate": 8.493856301734038e-05, "loss": 0.9139715194702148, "memory(GiB)": 91.52, "step": 44860, "token_acc": 0.7542497657609423, "train_speed(iter/s)": 0.153341 }, { "epoch": 0.5821538980565675, "grad_norm": 0.7069922089576721, "learning_rate": 8.493472588193477e-05, "loss": 0.8808972358703613, "memory(GiB)": 91.52, "step": 44865, "token_acc": 0.7353691395724807, "train_speed(iter/s)": 0.153337 }, { "epoch": 0.5822187764582232, "grad_norm": 0.8545217514038086, "learning_rate": 8.493088834450259e-05, "loss": 0.9081811904907227, "memory(GiB)": 91.52, "step": 44870, "token_acc": 0.7337007420214682, "train_speed(iter/s)": 0.15333 }, { "epoch": 0.5822836548598789, "grad_norm": 0.7731820940971375, "learning_rate": 8.492705040508799e-05, "loss": 0.8858092308044434, "memory(GiB)": 91.52, "step": 44875, "token_acc": 0.7524882567421082, "train_speed(iter/s)": 0.153325 }, { "epoch": 0.5823485332615346, "grad_norm": 0.7668782472610474, "learning_rate": 8.492321206373514e-05, "loss": 0.893122673034668, "memory(GiB)": 91.52, "step": 44880, "token_acc": 0.7633428896830695, "train_speed(iter/s)": 0.153321 }, { "epoch": 0.5824134116631903, "grad_norm": 0.7887907028198242, "learning_rate": 8.491937332048823e-05, "loss": 0.9197952270507812, "memory(GiB)": 91.52, "step": 44885, "token_acc": 0.7643433760991784, "train_speed(iter/s)": 0.153317 }, { "epoch": 0.582478290064846, "grad_norm": 0.8034054040908813, "learning_rate": 8.49155341753914e-05, "loss": 0.8676875114440918, "memory(GiB)": 91.52, "step": 44890, "token_acc": 0.7599274339850837, "train_speed(iter/s)": 0.153313 }, { "epoch": 0.5825431684665017, "grad_norm": 0.7574629783630371, "learning_rate": 8.491169462848887e-05, "loss": 0.9181609153747559, "memory(GiB)": 91.52, "step": 44895, "token_acc": 0.7521885297739906, "train_speed(iter/s)": 0.153308 }, { "epoch": 0.5826080468681574, "grad_norm": 0.7725316286087036, "learning_rate": 8.490785467982479e-05, "loss": 0.9657857894897461, "memory(GiB)": 91.52, "step": 44900, "token_acc": 0.7370578013835611, "train_speed(iter/s)": 0.153304 }, { "epoch": 0.5826729252698131, "grad_norm": 0.8512711524963379, "learning_rate": 8.490401432944337e-05, "loss": 0.9418207168579101, "memory(GiB)": 91.52, "step": 44905, "token_acc": 0.7486942955979109, "train_speed(iter/s)": 0.1533 }, { "epoch": 0.5827378036714688, "grad_norm": 0.7746474742889404, "learning_rate": 8.490017357738878e-05, "loss": 0.9144561767578125, "memory(GiB)": 91.52, "step": 44910, "token_acc": 0.7484092644438789, "train_speed(iter/s)": 0.153295 }, { "epoch": 0.5828026820731245, "grad_norm": 0.810603141784668, "learning_rate": 8.489633242370527e-05, "loss": 0.9327029228210449, "memory(GiB)": 91.52, "step": 44915, "token_acc": 0.7461217620371725, "train_speed(iter/s)": 0.153291 }, { "epoch": 0.5828675604747802, "grad_norm": 0.6744030117988586, "learning_rate": 8.4892490868437e-05, "loss": 0.8566975593566895, "memory(GiB)": 91.52, "step": 44920, "token_acc": 0.7535380696292103, "train_speed(iter/s)": 0.153286 }, { "epoch": 0.5829324388764359, "grad_norm": 0.8204644322395325, "learning_rate": 8.488864891162817e-05, "loss": 0.9265409469604492, "memory(GiB)": 91.52, "step": 44925, "token_acc": 0.7589955394019494, "train_speed(iter/s)": 0.153282 }, { "epoch": 0.5829973172780916, "grad_norm": 0.7031214237213135, "learning_rate": 8.488480655332305e-05, "loss": 0.9213449478149414, "memory(GiB)": 91.52, "step": 44930, "token_acc": 0.751047448522829, "train_speed(iter/s)": 0.153278 }, { "epoch": 0.5830621956797473, "grad_norm": 0.7672505974769592, "learning_rate": 8.488096379356579e-05, "loss": 0.8971296310424804, "memory(GiB)": 91.52, "step": 44935, "token_acc": 0.747225989397115, "train_speed(iter/s)": 0.153273 }, { "epoch": 0.583127074081403, "grad_norm": 0.7646816372871399, "learning_rate": 8.487712063240067e-05, "loss": 0.8582748413085938, "memory(GiB)": 91.52, "step": 44940, "token_acc": 0.7537941609037143, "train_speed(iter/s)": 0.153268 }, { "epoch": 0.5831919524830587, "grad_norm": 0.7210651636123657, "learning_rate": 8.487327706987187e-05, "loss": 0.9024515151977539, "memory(GiB)": 91.52, "step": 44945, "token_acc": 0.7322526749848921, "train_speed(iter/s)": 0.153263 }, { "epoch": 0.5832568308847144, "grad_norm": 0.7253261804580688, "learning_rate": 8.486943310602365e-05, "loss": 0.9213800430297852, "memory(GiB)": 91.52, "step": 44950, "token_acc": 0.7453170528652698, "train_speed(iter/s)": 0.153259 }, { "epoch": 0.5833217092863701, "grad_norm": 0.704887330532074, "learning_rate": 8.486558874090025e-05, "loss": 0.8712948799133301, "memory(GiB)": 91.52, "step": 44955, "token_acc": 0.7555739914964222, "train_speed(iter/s)": 0.153254 }, { "epoch": 0.5833865876880258, "grad_norm": 0.6768651604652405, "learning_rate": 8.486174397454588e-05, "loss": 0.9031239509582519, "memory(GiB)": 91.52, "step": 44960, "token_acc": 0.7711255156157926, "train_speed(iter/s)": 0.153251 }, { "epoch": 0.5834514660896815, "grad_norm": 0.7284184694290161, "learning_rate": 8.485789880700481e-05, "loss": 0.8708681106567383, "memory(GiB)": 91.52, "step": 44965, "token_acc": 0.7736494660076729, "train_speed(iter/s)": 0.153247 }, { "epoch": 0.5835163444913372, "grad_norm": 0.7236838936805725, "learning_rate": 8.485405323832127e-05, "loss": 0.8819039344787598, "memory(GiB)": 91.52, "step": 44970, "token_acc": 0.7737947296306917, "train_speed(iter/s)": 0.153242 }, { "epoch": 0.5835812228929929, "grad_norm": 0.7450270056724548, "learning_rate": 8.485020726853956e-05, "loss": 0.9224412918090821, "memory(GiB)": 91.52, "step": 44975, "token_acc": 0.75, "train_speed(iter/s)": 0.153237 }, { "epoch": 0.5836461012946486, "grad_norm": 0.7219679355621338, "learning_rate": 8.484636089770389e-05, "loss": 0.8837374687194824, "memory(GiB)": 91.52, "step": 44980, "token_acc": 0.7521640697829272, "train_speed(iter/s)": 0.153233 }, { "epoch": 0.5837109796963043, "grad_norm": 0.668454647064209, "learning_rate": 8.484251412585854e-05, "loss": 0.8924186706542969, "memory(GiB)": 91.52, "step": 44985, "token_acc": 0.7375427107061503, "train_speed(iter/s)": 0.153227 }, { "epoch": 0.5837758580979598, "grad_norm": 0.7540942430496216, "learning_rate": 8.483866695304778e-05, "loss": 0.9061751365661621, "memory(GiB)": 91.52, "step": 44990, "token_acc": 0.7599835937208695, "train_speed(iter/s)": 0.153222 }, { "epoch": 0.5838407364996155, "grad_norm": 0.8445993661880493, "learning_rate": 8.483481937931588e-05, "loss": 0.9558387756347656, "memory(GiB)": 91.52, "step": 44995, "token_acc": 0.7351832088224831, "train_speed(iter/s)": 0.153218 }, { "epoch": 0.5839056149012712, "grad_norm": 0.9307457804679871, "learning_rate": 8.483097140470713e-05, "loss": 0.8969178199768066, "memory(GiB)": 91.52, "step": 45000, "token_acc": 0.7520079903449998, "train_speed(iter/s)": 0.153214 }, { "epoch": 0.5839704933029269, "grad_norm": 0.8111497759819031, "learning_rate": 8.48271230292658e-05, "loss": 0.9405743598937988, "memory(GiB)": 91.52, "step": 45005, "token_acc": 0.7398642770140943, "train_speed(iter/s)": 0.15321 }, { "epoch": 0.5840353717045826, "grad_norm": 0.7113521695137024, "learning_rate": 8.482327425303617e-05, "loss": 0.8971519470214844, "memory(GiB)": 91.52, "step": 45010, "token_acc": 0.7542673767581592, "train_speed(iter/s)": 0.153205 }, { "epoch": 0.5841002501062383, "grad_norm": 0.7348881363868713, "learning_rate": 8.481942507606255e-05, "loss": 0.8711529731750488, "memory(GiB)": 91.52, "step": 45015, "token_acc": 0.7517630031102762, "train_speed(iter/s)": 0.153201 }, { "epoch": 0.584165128507894, "grad_norm": 0.783745288848877, "learning_rate": 8.481557549838924e-05, "loss": 0.905550479888916, "memory(GiB)": 91.52, "step": 45020, "token_acc": 0.7506573859242073, "train_speed(iter/s)": 0.153197 }, { "epoch": 0.5842300069095497, "grad_norm": 0.741883397102356, "learning_rate": 8.48117255200605e-05, "loss": 0.8887754440307617, "memory(GiB)": 91.52, "step": 45025, "token_acc": 0.755593863127047, "train_speed(iter/s)": 0.153192 }, { "epoch": 0.5842948853112054, "grad_norm": 0.7927882671356201, "learning_rate": 8.480787514112069e-05, "loss": 0.8524301528930665, "memory(GiB)": 91.52, "step": 45030, "token_acc": 0.7649376759147567, "train_speed(iter/s)": 0.153187 }, { "epoch": 0.5843597637128611, "grad_norm": 0.7354751825332642, "learning_rate": 8.480402436161408e-05, "loss": 0.87216796875, "memory(GiB)": 91.52, "step": 45035, "token_acc": 0.7504292332113123, "train_speed(iter/s)": 0.15318 }, { "epoch": 0.5844246421145168, "grad_norm": 0.7767350673675537, "learning_rate": 8.4800173181585e-05, "loss": 0.9017786979675293, "memory(GiB)": 91.52, "step": 45040, "token_acc": 0.740864050733254, "train_speed(iter/s)": 0.153177 }, { "epoch": 0.5844895205161725, "grad_norm": 0.6970803141593933, "learning_rate": 8.479632160107775e-05, "loss": 0.8607096672058105, "memory(GiB)": 91.52, "step": 45045, "token_acc": 0.7551205651367457, "train_speed(iter/s)": 0.153173 }, { "epoch": 0.5845543989178282, "grad_norm": 0.8632600903511047, "learning_rate": 8.47924696201367e-05, "loss": 0.9114337921142578, "memory(GiB)": 91.52, "step": 45050, "token_acc": 0.7618831100696023, "train_speed(iter/s)": 0.153168 }, { "epoch": 0.5846192773194839, "grad_norm": 0.7768083810806274, "learning_rate": 8.478861723880613e-05, "loss": 0.8813892364501953, "memory(GiB)": 91.52, "step": 45055, "token_acc": 0.7544675642594859, "train_speed(iter/s)": 0.153164 }, { "epoch": 0.5846841557211396, "grad_norm": 0.7739286422729492, "learning_rate": 8.478476445713039e-05, "loss": 0.9191757202148437, "memory(GiB)": 91.52, "step": 45060, "token_acc": 0.7582138319114177, "train_speed(iter/s)": 0.15316 }, { "epoch": 0.5847490341227953, "grad_norm": 0.6542857885360718, "learning_rate": 8.478091127515383e-05, "loss": 0.8932533264160156, "memory(GiB)": 91.52, "step": 45065, "token_acc": 0.750206633844827, "train_speed(iter/s)": 0.153156 }, { "epoch": 0.584813912524451, "grad_norm": 0.7800121307373047, "learning_rate": 8.477705769292077e-05, "loss": 0.8768040657043457, "memory(GiB)": 91.52, "step": 45070, "token_acc": 0.7729104550964909, "train_speed(iter/s)": 0.15315 }, { "epoch": 0.5848787909261067, "grad_norm": 0.8025170564651489, "learning_rate": 8.477320371047558e-05, "loss": 0.9591009140014648, "memory(GiB)": 91.52, "step": 45075, "token_acc": 0.7326480544082482, "train_speed(iter/s)": 0.153146 }, { "epoch": 0.5849436693277624, "grad_norm": 0.7937532663345337, "learning_rate": 8.47693493278626e-05, "loss": 0.8627232551574707, "memory(GiB)": 91.52, "step": 45080, "token_acc": 0.7737774289321913, "train_speed(iter/s)": 0.153141 }, { "epoch": 0.5850085477294181, "grad_norm": 0.7494316101074219, "learning_rate": 8.476549454512618e-05, "loss": 0.9214303970336915, "memory(GiB)": 91.52, "step": 45085, "token_acc": 0.7383630115588878, "train_speed(iter/s)": 0.153136 }, { "epoch": 0.5850734261310738, "grad_norm": 0.7649753093719482, "learning_rate": 8.476163936231069e-05, "loss": 0.9509033203125, "memory(GiB)": 91.52, "step": 45090, "token_acc": 0.7661047685776697, "train_speed(iter/s)": 0.153132 }, { "epoch": 0.5851383045327295, "grad_norm": 0.7293195128440857, "learning_rate": 8.475778377946048e-05, "loss": 0.9396598815917969, "memory(GiB)": 91.52, "step": 45095, "token_acc": 0.7419892706974047, "train_speed(iter/s)": 0.153128 }, { "epoch": 0.5852031829343852, "grad_norm": 0.7241893410682678, "learning_rate": 8.475392779661995e-05, "loss": 0.9476593971252442, "memory(GiB)": 91.52, "step": 45100, "token_acc": 0.7223730611990138, "train_speed(iter/s)": 0.153123 }, { "epoch": 0.5852680613360409, "grad_norm": 0.7522896528244019, "learning_rate": 8.475007141383344e-05, "loss": 0.90849609375, "memory(GiB)": 91.52, "step": 45105, "token_acc": 0.7538838638403645, "train_speed(iter/s)": 0.153119 }, { "epoch": 0.5853329397376966, "grad_norm": 0.7935114502906799, "learning_rate": 8.474621463114534e-05, "loss": 0.9018796920776367, "memory(GiB)": 91.52, "step": 45110, "token_acc": 0.7601062260253763, "train_speed(iter/s)": 0.153115 }, { "epoch": 0.5853978181393523, "grad_norm": 0.8503714203834534, "learning_rate": 8.474235744860004e-05, "loss": 0.9125252723693847, "memory(GiB)": 91.52, "step": 45115, "token_acc": 0.7711430660492322, "train_speed(iter/s)": 0.15311 }, { "epoch": 0.585462696541008, "grad_norm": 0.7845509052276611, "learning_rate": 8.473849986624195e-05, "loss": 0.8889471054077148, "memory(GiB)": 91.52, "step": 45120, "token_acc": 0.7635885347272044, "train_speed(iter/s)": 0.153105 }, { "epoch": 0.5855275749426637, "grad_norm": 0.751238226890564, "learning_rate": 8.473464188411542e-05, "loss": 0.8808588027954102, "memory(GiB)": 91.52, "step": 45125, "token_acc": 0.7632823456508268, "train_speed(iter/s)": 0.153101 }, { "epoch": 0.5855924533443194, "grad_norm": 0.8148629665374756, "learning_rate": 8.473078350226487e-05, "loss": 0.9442127227783204, "memory(GiB)": 91.52, "step": 45130, "token_acc": 0.75359477124183, "train_speed(iter/s)": 0.153096 }, { "epoch": 0.5856573317459751, "grad_norm": 0.9114114046096802, "learning_rate": 8.47269247207347e-05, "loss": 0.9357192993164063, "memory(GiB)": 91.52, "step": 45135, "token_acc": 0.764399320568252, "train_speed(iter/s)": 0.153093 }, { "epoch": 0.5857222101476308, "grad_norm": 0.7536181807518005, "learning_rate": 8.472306553956932e-05, "loss": 0.9514713287353516, "memory(GiB)": 91.52, "step": 45140, "token_acc": 0.7643818367828756, "train_speed(iter/s)": 0.153088 }, { "epoch": 0.5857870885492865, "grad_norm": 0.7945564985275269, "learning_rate": 8.471920595881313e-05, "loss": 0.9094881057739258, "memory(GiB)": 91.52, "step": 45145, "token_acc": 0.7429846233268188, "train_speed(iter/s)": 0.153084 }, { "epoch": 0.5858519669509422, "grad_norm": 0.7358911037445068, "learning_rate": 8.471534597851053e-05, "loss": 0.950390625, "memory(GiB)": 91.52, "step": 45150, "token_acc": 0.7539996499212323, "train_speed(iter/s)": 0.15308 }, { "epoch": 0.5859168453525979, "grad_norm": 0.7311059236526489, "learning_rate": 8.4711485598706e-05, "loss": 0.9155127525329589, "memory(GiB)": 91.52, "step": 45155, "token_acc": 0.7346334603855974, "train_speed(iter/s)": 0.153076 }, { "epoch": 0.5859817237542536, "grad_norm": 0.7485836744308472, "learning_rate": 8.470762481944392e-05, "loss": 0.9658184051513672, "memory(GiB)": 91.52, "step": 45160, "token_acc": 0.7331678808742295, "train_speed(iter/s)": 0.153072 }, { "epoch": 0.5860466021559093, "grad_norm": 0.8343965411186218, "learning_rate": 8.470376364076872e-05, "loss": 0.910737133026123, "memory(GiB)": 91.52, "step": 45165, "token_acc": 0.76446404544775, "train_speed(iter/s)": 0.153069 }, { "epoch": 0.586111480557565, "grad_norm": 0.7957420945167542, "learning_rate": 8.469990206272485e-05, "loss": 0.8982103347778321, "memory(GiB)": 91.52, "step": 45170, "token_acc": 0.7527578796561605, "train_speed(iter/s)": 0.153064 }, { "epoch": 0.5861763589592207, "grad_norm": 0.7017624974250793, "learning_rate": 8.469604008535672e-05, "loss": 0.9339564323425293, "memory(GiB)": 91.52, "step": 45175, "token_acc": 0.7697416242648516, "train_speed(iter/s)": 0.153059 }, { "epoch": 0.5862412373608764, "grad_norm": 0.725362241268158, "learning_rate": 8.469217770870881e-05, "loss": 0.9126609802246094, "memory(GiB)": 91.52, "step": 45180, "token_acc": 0.7551255092653437, "train_speed(iter/s)": 0.153055 }, { "epoch": 0.5863061157625321, "grad_norm": 0.7268421053886414, "learning_rate": 8.468831493282555e-05, "loss": 0.8969556808471679, "memory(GiB)": 91.52, "step": 45185, "token_acc": 0.7714337568058076, "train_speed(iter/s)": 0.15305 }, { "epoch": 0.5863709941641878, "grad_norm": 0.7877702116966248, "learning_rate": 8.46844517577514e-05, "loss": 0.9342855453491211, "memory(GiB)": 91.52, "step": 45190, "token_acc": 0.7419176466251336, "train_speed(iter/s)": 0.153045 }, { "epoch": 0.5864358725658435, "grad_norm": 0.8383650183677673, "learning_rate": 8.46805881835308e-05, "loss": 0.9544597625732422, "memory(GiB)": 91.52, "step": 45195, "token_acc": 0.7474906958385024, "train_speed(iter/s)": 0.153041 }, { "epoch": 0.5865007509674992, "grad_norm": 0.7392706871032715, "learning_rate": 8.467672421020823e-05, "loss": 0.8969203948974609, "memory(GiB)": 91.52, "step": 45200, "token_acc": 0.769546505140671, "train_speed(iter/s)": 0.153036 }, { "epoch": 0.5865656293691549, "grad_norm": 0.7819551229476929, "learning_rate": 8.467285983782814e-05, "loss": 0.9502323150634766, "memory(GiB)": 91.52, "step": 45205, "token_acc": 0.7430828785378856, "train_speed(iter/s)": 0.153032 }, { "epoch": 0.5866305077708106, "grad_norm": 0.7631805539131165, "learning_rate": 8.466899506643502e-05, "loss": 0.975578784942627, "memory(GiB)": 91.52, "step": 45210, "token_acc": 0.7296289905090595, "train_speed(iter/s)": 0.153028 }, { "epoch": 0.5866953861724663, "grad_norm": 0.6603520512580872, "learning_rate": 8.466512989607334e-05, "loss": 0.9160255432128906, "memory(GiB)": 91.52, "step": 45215, "token_acc": 0.7574840592240355, "train_speed(iter/s)": 0.153023 }, { "epoch": 0.586760264574122, "grad_norm": 0.8927873969078064, "learning_rate": 8.466126432678756e-05, "loss": 0.9482870101928711, "memory(GiB)": 91.52, "step": 45220, "token_acc": 0.7554844765845009, "train_speed(iter/s)": 0.153019 }, { "epoch": 0.5868251429757777, "grad_norm": 0.7006145119667053, "learning_rate": 8.46573983586222e-05, "loss": 0.9086694717407227, "memory(GiB)": 91.52, "step": 45225, "token_acc": 0.7790019930056782, "train_speed(iter/s)": 0.153015 }, { "epoch": 0.5868900213774333, "grad_norm": 0.7913771867752075, "learning_rate": 8.465353199162171e-05, "loss": 0.9258630752563477, "memory(GiB)": 91.52, "step": 45230, "token_acc": 0.7552522025365476, "train_speed(iter/s)": 0.153011 }, { "epoch": 0.586954899779089, "grad_norm": 0.7605026960372925, "learning_rate": 8.464966522583062e-05, "loss": 0.9162490844726563, "memory(GiB)": 91.52, "step": 45235, "token_acc": 0.748858589309563, "train_speed(iter/s)": 0.153005 }, { "epoch": 0.5870197781807447, "grad_norm": 0.7933992147445679, "learning_rate": 8.46457980612934e-05, "loss": 0.9245864868164062, "memory(GiB)": 91.52, "step": 45240, "token_acc": 0.7502397851525033, "train_speed(iter/s)": 0.153001 }, { "epoch": 0.5870846565824004, "grad_norm": 0.7939669489860535, "learning_rate": 8.464193049805458e-05, "loss": 0.8980849266052247, "memory(GiB)": 91.52, "step": 45245, "token_acc": 0.743302312464749, "train_speed(iter/s)": 0.152995 }, { "epoch": 0.5871495349840561, "grad_norm": 0.8203541040420532, "learning_rate": 8.463806253615865e-05, "loss": 0.9335216522216797, "memory(GiB)": 91.52, "step": 45250, "token_acc": 0.7542851170568562, "train_speed(iter/s)": 0.15299 }, { "epoch": 0.5872144133857118, "grad_norm": 0.8349524736404419, "learning_rate": 8.463419417565012e-05, "loss": 0.9163166046142578, "memory(GiB)": 91.52, "step": 45255, "token_acc": 0.7525342251018915, "train_speed(iter/s)": 0.152986 }, { "epoch": 0.5872792917873675, "grad_norm": 0.8083001971244812, "learning_rate": 8.463032541657352e-05, "loss": 0.9209296226501464, "memory(GiB)": 91.52, "step": 45260, "token_acc": 0.7416690101237345, "train_speed(iter/s)": 0.152981 }, { "epoch": 0.5873441701890232, "grad_norm": 0.807628870010376, "learning_rate": 8.462645625897335e-05, "loss": 0.9234825134277344, "memory(GiB)": 91.52, "step": 45265, "token_acc": 0.7527970842515681, "train_speed(iter/s)": 0.152977 }, { "epoch": 0.5874090485906789, "grad_norm": 0.6972952485084534, "learning_rate": 8.462258670289416e-05, "loss": 0.9007444381713867, "memory(GiB)": 91.52, "step": 45270, "token_acc": 0.7525762642462295, "train_speed(iter/s)": 0.152971 }, { "epoch": 0.5874739269923346, "grad_norm": 0.7980510592460632, "learning_rate": 8.461871674838046e-05, "loss": 0.9723408699035645, "memory(GiB)": 91.52, "step": 45275, "token_acc": 0.7698952631079328, "train_speed(iter/s)": 0.152967 }, { "epoch": 0.5875388053939903, "grad_norm": 0.7977967858314514, "learning_rate": 8.461484639547682e-05, "loss": 0.9015155792236328, "memory(GiB)": 91.52, "step": 45280, "token_acc": 0.7494157668983878, "train_speed(iter/s)": 0.152962 }, { "epoch": 0.587603683795646, "grad_norm": 0.8058083057403564, "learning_rate": 8.461097564422773e-05, "loss": 0.8606297492980957, "memory(GiB)": 91.52, "step": 45285, "token_acc": 0.7637191157347204, "train_speed(iter/s)": 0.152957 }, { "epoch": 0.5876685621973017, "grad_norm": 0.8268173933029175, "learning_rate": 8.460710449467777e-05, "loss": 0.8838491439819336, "memory(GiB)": 91.52, "step": 45290, "token_acc": 0.7654791592447453, "train_speed(iter/s)": 0.152952 }, { "epoch": 0.5877334405989574, "grad_norm": 0.7151963710784912, "learning_rate": 8.460323294687148e-05, "loss": 0.8746583938598633, "memory(GiB)": 91.52, "step": 45295, "token_acc": 0.7839051383399209, "train_speed(iter/s)": 0.152947 }, { "epoch": 0.5877983190006131, "grad_norm": 0.7232065200805664, "learning_rate": 8.459936100085342e-05, "loss": 0.928831672668457, "memory(GiB)": 91.52, "step": 45300, "token_acc": 0.7362391123640175, "train_speed(iter/s)": 0.152942 }, { "epoch": 0.5878631974022688, "grad_norm": 0.8246002197265625, "learning_rate": 8.459548865666813e-05, "loss": 0.931512451171875, "memory(GiB)": 91.52, "step": 45305, "token_acc": 0.747326671073609, "train_speed(iter/s)": 0.152938 }, { "epoch": 0.5879280758039245, "grad_norm": 0.7520503401756287, "learning_rate": 8.459161591436018e-05, "loss": 0.849034309387207, "memory(GiB)": 91.52, "step": 45310, "token_acc": 0.7681095266702501, "train_speed(iter/s)": 0.152934 }, { "epoch": 0.5879929542055802, "grad_norm": 0.6987949013710022, "learning_rate": 8.458774277397416e-05, "loss": 0.8869487762451171, "memory(GiB)": 91.52, "step": 45315, "token_acc": 0.7444199559206571, "train_speed(iter/s)": 0.15293 }, { "epoch": 0.5880578326072359, "grad_norm": 0.7804885506629944, "learning_rate": 8.458386923555459e-05, "loss": 0.9264389991760253, "memory(GiB)": 91.52, "step": 45320, "token_acc": 0.7485585272664119, "train_speed(iter/s)": 0.152925 }, { "epoch": 0.5881227110088916, "grad_norm": 0.7397280335426331, "learning_rate": 8.45799952991461e-05, "loss": 0.87645263671875, "memory(GiB)": 91.52, "step": 45325, "token_acc": 0.7465114358847329, "train_speed(iter/s)": 0.152921 }, { "epoch": 0.5881875894105473, "grad_norm": 0.7877868413925171, "learning_rate": 8.457612096479323e-05, "loss": 0.9405702590942383, "memory(GiB)": 91.52, "step": 45330, "token_acc": 0.7567949170490645, "train_speed(iter/s)": 0.152917 }, { "epoch": 0.588252467812203, "grad_norm": 0.8568789958953857, "learning_rate": 8.457224623254063e-05, "loss": 0.9530820846557617, "memory(GiB)": 91.52, "step": 45335, "token_acc": 0.76347864280224, "train_speed(iter/s)": 0.152913 }, { "epoch": 0.5883173462138587, "grad_norm": 0.7467566728591919, "learning_rate": 8.456837110243281e-05, "loss": 0.9035434722900391, "memory(GiB)": 91.52, "step": 45340, "token_acc": 0.7509989102796949, "train_speed(iter/s)": 0.152908 }, { "epoch": 0.5883822246155144, "grad_norm": 0.725630521774292, "learning_rate": 8.456449557451441e-05, "loss": 0.9348075866699219, "memory(GiB)": 91.52, "step": 45345, "token_acc": 0.7627694556083303, "train_speed(iter/s)": 0.152905 }, { "epoch": 0.5884471030171701, "grad_norm": 0.7717567086219788, "learning_rate": 8.456061964883001e-05, "loss": 0.8612634658813476, "memory(GiB)": 91.52, "step": 45350, "token_acc": 0.7642098487286771, "train_speed(iter/s)": 0.1529 }, { "epoch": 0.5885119814188258, "grad_norm": 0.7991069555282593, "learning_rate": 8.455674332542422e-05, "loss": 0.91250581741333, "memory(GiB)": 91.52, "step": 45355, "token_acc": 0.7816388983339, "train_speed(iter/s)": 0.152895 }, { "epoch": 0.5885768598204815, "grad_norm": 0.7710584402084351, "learning_rate": 8.455286660434166e-05, "loss": 0.9043203353881836, "memory(GiB)": 91.52, "step": 45360, "token_acc": 0.7592109176221788, "train_speed(iter/s)": 0.152892 }, { "epoch": 0.5886417382221372, "grad_norm": 0.853279173374176, "learning_rate": 8.454898948562694e-05, "loss": 0.8963934898376464, "memory(GiB)": 91.52, "step": 45365, "token_acc": 0.7626574385833645, "train_speed(iter/s)": 0.152887 }, { "epoch": 0.5887066166237929, "grad_norm": 0.8212559819221497, "learning_rate": 8.454511196932467e-05, "loss": 0.9154548645019531, "memory(GiB)": 91.52, "step": 45370, "token_acc": 0.7574282417423914, "train_speed(iter/s)": 0.152883 }, { "epoch": 0.5887714950254486, "grad_norm": 0.8439857363700867, "learning_rate": 8.454123405547947e-05, "loss": 0.912127685546875, "memory(GiB)": 91.52, "step": 45375, "token_acc": 0.734764606655421, "train_speed(iter/s)": 0.152879 }, { "epoch": 0.5888363734271043, "grad_norm": 0.7120808362960815, "learning_rate": 8.453735574413597e-05, "loss": 0.8901384353637696, "memory(GiB)": 91.52, "step": 45380, "token_acc": 0.769912456946039, "train_speed(iter/s)": 0.152874 }, { "epoch": 0.58890125182876, "grad_norm": 0.8781533241271973, "learning_rate": 8.453347703533881e-05, "loss": 0.8955453872680664, "memory(GiB)": 91.52, "step": 45385, "token_acc": 0.7525928938729264, "train_speed(iter/s)": 0.152869 }, { "epoch": 0.5889661302304157, "grad_norm": 0.7586499452590942, "learning_rate": 8.452959792913261e-05, "loss": 0.8465192794799805, "memory(GiB)": 91.52, "step": 45390, "token_acc": 0.7599535861918921, "train_speed(iter/s)": 0.152864 }, { "epoch": 0.5890310086320714, "grad_norm": 0.680721640586853, "learning_rate": 8.452571842556204e-05, "loss": 0.8764734268188477, "memory(GiB)": 91.52, "step": 45395, "token_acc": 0.7684442990348438, "train_speed(iter/s)": 0.15286 }, { "epoch": 0.5890958870337271, "grad_norm": 0.7966756224632263, "learning_rate": 8.452183852467171e-05, "loss": 0.867645263671875, "memory(GiB)": 91.52, "step": 45400, "token_acc": 0.7861825849174711, "train_speed(iter/s)": 0.152856 }, { "epoch": 0.5891607654353828, "grad_norm": 0.8601071834564209, "learning_rate": 8.451795822650627e-05, "loss": 0.8899576187133789, "memory(GiB)": 91.52, "step": 45405, "token_acc": 0.7560079163132598, "train_speed(iter/s)": 0.152853 }, { "epoch": 0.5892256438370385, "grad_norm": 0.8019266128540039, "learning_rate": 8.451407753111041e-05, "loss": 0.9446506500244141, "memory(GiB)": 91.52, "step": 45410, "token_acc": 0.7624139061987537, "train_speed(iter/s)": 0.15285 }, { "epoch": 0.5892905222386942, "grad_norm": 0.7656005620956421, "learning_rate": 8.451019643852876e-05, "loss": 0.8973719596862793, "memory(GiB)": 91.52, "step": 45415, "token_acc": 0.7411867561622201, "train_speed(iter/s)": 0.152846 }, { "epoch": 0.5893554006403499, "grad_norm": 0.7884216904640198, "learning_rate": 8.450631494880599e-05, "loss": 0.9127389907836914, "memory(GiB)": 91.52, "step": 45420, "token_acc": 0.7464202713846615, "train_speed(iter/s)": 0.152843 }, { "epoch": 0.5894202790420056, "grad_norm": 0.7839286923408508, "learning_rate": 8.450243306198677e-05, "loss": 0.9259045600891114, "memory(GiB)": 91.52, "step": 45425, "token_acc": 0.7486071521119461, "train_speed(iter/s)": 0.152838 }, { "epoch": 0.5894851574436613, "grad_norm": 0.6775681376457214, "learning_rate": 8.449855077811577e-05, "loss": 0.8796440124511719, "memory(GiB)": 91.52, "step": 45430, "token_acc": 0.7719492400767302, "train_speed(iter/s)": 0.152834 }, { "epoch": 0.589550035845317, "grad_norm": 0.7982516288757324, "learning_rate": 8.449466809723767e-05, "loss": 0.9301640510559082, "memory(GiB)": 91.52, "step": 45435, "token_acc": 0.7531093060096397, "train_speed(iter/s)": 0.152829 }, { "epoch": 0.5896149142469727, "grad_norm": 0.7477866411209106, "learning_rate": 8.449078501939716e-05, "loss": 0.9315653800964355, "memory(GiB)": 91.52, "step": 45440, "token_acc": 0.7346410194426218, "train_speed(iter/s)": 0.152825 }, { "epoch": 0.5896797926486284, "grad_norm": 0.66178297996521, "learning_rate": 8.44869015446389e-05, "loss": 0.8994308471679687, "memory(GiB)": 91.52, "step": 45445, "token_acc": 0.7759396506087877, "train_speed(iter/s)": 0.152821 }, { "epoch": 0.589744671050284, "grad_norm": 0.7299962043762207, "learning_rate": 8.448301767300759e-05, "loss": 0.9337065696716309, "memory(GiB)": 91.52, "step": 45450, "token_acc": 0.7733805457196418, "train_speed(iter/s)": 0.152817 }, { "epoch": 0.5898095494519398, "grad_norm": 0.7697514891624451, "learning_rate": 8.447913340454794e-05, "loss": 0.8835629463195801, "memory(GiB)": 91.52, "step": 45455, "token_acc": 0.7384471105649516, "train_speed(iter/s)": 0.152813 }, { "epoch": 0.5898744278535955, "grad_norm": 0.6713480353355408, "learning_rate": 8.447524873930465e-05, "loss": 0.9083226203918457, "memory(GiB)": 91.52, "step": 45460, "token_acc": 0.7645614035087719, "train_speed(iter/s)": 0.152808 }, { "epoch": 0.5899393062552512, "grad_norm": 0.9222580790519714, "learning_rate": 8.44713636773224e-05, "loss": 0.9014143943786621, "memory(GiB)": 91.52, "step": 45465, "token_acc": 0.750950709375457, "train_speed(iter/s)": 0.152804 }, { "epoch": 0.5900041846569067, "grad_norm": 0.8339338302612305, "learning_rate": 8.446747821864593e-05, "loss": 0.9795495986938476, "memory(GiB)": 91.52, "step": 45470, "token_acc": 0.7346016325979896, "train_speed(iter/s)": 0.1528 }, { "epoch": 0.5900690630585624, "grad_norm": 0.7343423962593079, "learning_rate": 8.446359236331993e-05, "loss": 0.8791309356689453, "memory(GiB)": 91.52, "step": 45475, "token_acc": 0.7719645293315143, "train_speed(iter/s)": 0.152796 }, { "epoch": 0.5901339414602181, "grad_norm": 0.7789555788040161, "learning_rate": 8.445970611138914e-05, "loss": 0.9447595596313476, "memory(GiB)": 91.52, "step": 45480, "token_acc": 0.7361933946203609, "train_speed(iter/s)": 0.152792 }, { "epoch": 0.5901988198618738, "grad_norm": 0.8605543971061707, "learning_rate": 8.445581946289826e-05, "loss": 0.9228363037109375, "memory(GiB)": 91.52, "step": 45485, "token_acc": 0.7620394491998511, "train_speed(iter/s)": 0.152786 }, { "epoch": 0.5902636982635295, "grad_norm": 0.7807661294937134, "learning_rate": 8.445193241789202e-05, "loss": 0.9037563323974609, "memory(GiB)": 91.52, "step": 45490, "token_acc": 0.7528523663438006, "train_speed(iter/s)": 0.152782 }, { "epoch": 0.5903285766651852, "grad_norm": 0.7449665665626526, "learning_rate": 8.444804497641517e-05, "loss": 0.9129596710205078, "memory(GiB)": 91.52, "step": 45495, "token_acc": 0.738764153495382, "train_speed(iter/s)": 0.152778 }, { "epoch": 0.5903934550668409, "grad_norm": 0.7041008472442627, "learning_rate": 8.444415713851244e-05, "loss": 0.8868366241455078, "memory(GiB)": 91.52, "step": 45500, "token_acc": 0.7511046144646357, "train_speed(iter/s)": 0.152773 }, { "epoch": 0.5904583334684966, "grad_norm": 0.7785696983337402, "learning_rate": 8.444026890422855e-05, "loss": 0.8968361854553223, "memory(GiB)": 91.52, "step": 45505, "token_acc": 0.7679724625319849, "train_speed(iter/s)": 0.152769 }, { "epoch": 0.5905232118701523, "grad_norm": 0.7866855263710022, "learning_rate": 8.443638027360827e-05, "loss": 0.9425360679626464, "memory(GiB)": 91.52, "step": 45510, "token_acc": 0.7516110955449706, "train_speed(iter/s)": 0.152765 }, { "epoch": 0.590588090271808, "grad_norm": 0.7766964435577393, "learning_rate": 8.443249124669634e-05, "loss": 0.865924072265625, "memory(GiB)": 91.52, "step": 45515, "token_acc": 0.7573502172229145, "train_speed(iter/s)": 0.152761 }, { "epoch": 0.5906529686734637, "grad_norm": 0.7889199256896973, "learning_rate": 8.442860182353754e-05, "loss": 0.90950927734375, "memory(GiB)": 91.52, "step": 45520, "token_acc": 0.7516520198923632, "train_speed(iter/s)": 0.152757 }, { "epoch": 0.5907178470751194, "grad_norm": 0.786521852016449, "learning_rate": 8.442471200417657e-05, "loss": 0.9097177505493164, "memory(GiB)": 91.52, "step": 45525, "token_acc": 0.7675728220427044, "train_speed(iter/s)": 0.152753 }, { "epoch": 0.5907827254767751, "grad_norm": 0.7954641580581665, "learning_rate": 8.442082178865825e-05, "loss": 0.9424171447753906, "memory(GiB)": 91.52, "step": 45530, "token_acc": 0.7383625088311484, "train_speed(iter/s)": 0.152748 }, { "epoch": 0.5908476038784308, "grad_norm": 0.750665009021759, "learning_rate": 8.441693117702733e-05, "loss": 0.8999631881713868, "memory(GiB)": 91.52, "step": 45535, "token_acc": 0.7462585034013606, "train_speed(iter/s)": 0.152744 }, { "epoch": 0.5909124822800865, "grad_norm": 0.8781236410140991, "learning_rate": 8.441304016932856e-05, "loss": 0.9152923583984375, "memory(GiB)": 91.52, "step": 45540, "token_acc": 0.7663961738994081, "train_speed(iter/s)": 0.15274 }, { "epoch": 0.5909773606817422, "grad_norm": 0.6871755123138428, "learning_rate": 8.440914876560675e-05, "loss": 0.8885078430175781, "memory(GiB)": 91.52, "step": 45545, "token_acc": 0.779878393195335, "train_speed(iter/s)": 0.152736 }, { "epoch": 0.5910422390833979, "grad_norm": 0.7395609021186829, "learning_rate": 8.440525696590669e-05, "loss": 0.9476523399353027, "memory(GiB)": 91.52, "step": 45550, "token_acc": 0.7600966256510908, "train_speed(iter/s)": 0.152732 }, { "epoch": 0.5911071174850536, "grad_norm": 0.704962968826294, "learning_rate": 8.440136477027312e-05, "loss": 0.9436095237731934, "memory(GiB)": 91.52, "step": 45555, "token_acc": 0.7481083719794972, "train_speed(iter/s)": 0.152727 }, { "epoch": 0.5911719958867093, "grad_norm": 0.6651776432991028, "learning_rate": 8.439747217875086e-05, "loss": 0.8666715621948242, "memory(GiB)": 91.52, "step": 45560, "token_acc": 0.7739763587618815, "train_speed(iter/s)": 0.152722 }, { "epoch": 0.591236874288365, "grad_norm": 0.8334172368049622, "learning_rate": 8.439357919138473e-05, "loss": 0.8966341018676758, "memory(GiB)": 91.52, "step": 45565, "token_acc": 0.7567435017165277, "train_speed(iter/s)": 0.152718 }, { "epoch": 0.5913017526900207, "grad_norm": 0.7724868059158325, "learning_rate": 8.43896858082195e-05, "loss": 0.9349943161010742, "memory(GiB)": 91.52, "step": 45570, "token_acc": 0.7458820114650606, "train_speed(iter/s)": 0.152714 }, { "epoch": 0.5913666310916764, "grad_norm": 0.7946611642837524, "learning_rate": 8.438579202929995e-05, "loss": 0.8703108787536621, "memory(GiB)": 91.52, "step": 45575, "token_acc": 0.7625685849176981, "train_speed(iter/s)": 0.152709 }, { "epoch": 0.5914315094933321, "grad_norm": 0.7879191040992737, "learning_rate": 8.438189785467093e-05, "loss": 0.9284714698791504, "memory(GiB)": 91.52, "step": 45580, "token_acc": 0.7416081438051672, "train_speed(iter/s)": 0.152706 }, { "epoch": 0.5914963878949878, "grad_norm": 0.6731768846511841, "learning_rate": 8.437800328437726e-05, "loss": 0.8453067779541016, "memory(GiB)": 91.52, "step": 45585, "token_acc": 0.8030432483013955, "train_speed(iter/s)": 0.1527 }, { "epoch": 0.5915612662966435, "grad_norm": 0.7571609020233154, "learning_rate": 8.437410831846372e-05, "loss": 0.9183337211608886, "memory(GiB)": 91.52, "step": 45590, "token_acc": 0.7604678937754672, "train_speed(iter/s)": 0.152695 }, { "epoch": 0.5916261446982992, "grad_norm": 0.7092158794403076, "learning_rate": 8.437021295697515e-05, "loss": 0.8726909637451172, "memory(GiB)": 91.52, "step": 45595, "token_acc": 0.7558612943984276, "train_speed(iter/s)": 0.15269 }, { "epoch": 0.5916910230999549, "grad_norm": 0.7972362637519836, "learning_rate": 8.43663171999564e-05, "loss": 0.8900158882141114, "memory(GiB)": 91.52, "step": 45600, "token_acc": 0.7696803144675967, "train_speed(iter/s)": 0.152686 }, { "epoch": 0.5917559015016106, "grad_norm": 0.6848744750022888, "learning_rate": 8.436242104745229e-05, "loss": 0.9172824859619141, "memory(GiB)": 91.52, "step": 45605, "token_acc": 0.7577434805747738, "train_speed(iter/s)": 0.152682 }, { "epoch": 0.5918207799032663, "grad_norm": 0.7073135375976562, "learning_rate": 8.435852449950764e-05, "loss": 0.8993122100830078, "memory(GiB)": 91.52, "step": 45610, "token_acc": 0.7595089951856753, "train_speed(iter/s)": 0.152677 }, { "epoch": 0.591885658304922, "grad_norm": 0.7314444780349731, "learning_rate": 8.435462755616728e-05, "loss": 0.9150553703308105, "memory(GiB)": 91.52, "step": 45615, "token_acc": 0.7799561329509026, "train_speed(iter/s)": 0.152673 }, { "epoch": 0.5919505367065777, "grad_norm": 0.7222590446472168, "learning_rate": 8.43507302174761e-05, "loss": 0.8882028579711914, "memory(GiB)": 91.52, "step": 45620, "token_acc": 0.7639289401545089, "train_speed(iter/s)": 0.152668 }, { "epoch": 0.5920154151082334, "grad_norm": 0.7796160578727722, "learning_rate": 8.434683248347892e-05, "loss": 0.9319114685058594, "memory(GiB)": 91.52, "step": 45625, "token_acc": 0.7580266497461929, "train_speed(iter/s)": 0.152664 }, { "epoch": 0.5920802935098891, "grad_norm": 0.8212881088256836, "learning_rate": 8.434293435422062e-05, "loss": 0.8765625953674316, "memory(GiB)": 91.52, "step": 45630, "token_acc": 0.7642378889326507, "train_speed(iter/s)": 0.152661 }, { "epoch": 0.5921451719115448, "grad_norm": 0.7777549624443054, "learning_rate": 8.433903582974602e-05, "loss": 0.9087708473205567, "memory(GiB)": 91.52, "step": 45635, "token_acc": 0.7511908097506305, "train_speed(iter/s)": 0.152656 }, { "epoch": 0.5922100503132005, "grad_norm": 0.7799917459487915, "learning_rate": 8.433513691010002e-05, "loss": 0.9404487609863281, "memory(GiB)": 91.52, "step": 45640, "token_acc": 0.7448206941615098, "train_speed(iter/s)": 0.152652 }, { "epoch": 0.5922749287148562, "grad_norm": 0.8230609893798828, "learning_rate": 8.433123759532746e-05, "loss": 0.9044338226318359, "memory(GiB)": 91.52, "step": 45645, "token_acc": 0.7414300815463606, "train_speed(iter/s)": 0.152648 }, { "epoch": 0.5923398071165119, "grad_norm": 0.7526905536651611, "learning_rate": 8.432733788547323e-05, "loss": 0.881612491607666, "memory(GiB)": 91.52, "step": 45650, "token_acc": 0.7486845153390476, "train_speed(iter/s)": 0.152643 }, { "epoch": 0.5924046855181676, "grad_norm": 0.7406185865402222, "learning_rate": 8.432343778058221e-05, "loss": 0.8695821762084961, "memory(GiB)": 91.52, "step": 45655, "token_acc": 0.7601813373630525, "train_speed(iter/s)": 0.152639 }, { "epoch": 0.5924695639198233, "grad_norm": 0.7558847665786743, "learning_rate": 8.431953728069928e-05, "loss": 0.9198361396789551, "memory(GiB)": 91.52, "step": 45660, "token_acc": 0.7672343876723439, "train_speed(iter/s)": 0.152634 }, { "epoch": 0.592534442321479, "grad_norm": 0.7555331587791443, "learning_rate": 8.431563638586933e-05, "loss": 0.8815277099609375, "memory(GiB)": 91.52, "step": 45665, "token_acc": 0.7654631760644419, "train_speed(iter/s)": 0.152631 }, { "epoch": 0.5925993207231347, "grad_norm": 0.69865483045578, "learning_rate": 8.431173509613723e-05, "loss": 0.890389347076416, "memory(GiB)": 91.52, "step": 45670, "token_acc": 0.7517102771228694, "train_speed(iter/s)": 0.152627 }, { "epoch": 0.5926641991247904, "grad_norm": 0.9482477903366089, "learning_rate": 8.430783341154788e-05, "loss": 0.9063840866088867, "memory(GiB)": 91.52, "step": 45675, "token_acc": 0.7488067821501285, "train_speed(iter/s)": 0.152622 }, { "epoch": 0.5927290775264461, "grad_norm": 0.7831933498382568, "learning_rate": 8.430393133214622e-05, "loss": 0.8780292510986328, "memory(GiB)": 91.52, "step": 45680, "token_acc": 0.7772178194968069, "train_speed(iter/s)": 0.152618 }, { "epoch": 0.5927939559281018, "grad_norm": 0.6934492588043213, "learning_rate": 8.430002885797711e-05, "loss": 0.9082904815673828, "memory(GiB)": 91.52, "step": 45685, "token_acc": 0.7345281811137688, "train_speed(iter/s)": 0.152612 }, { "epoch": 0.5928588343297575, "grad_norm": 0.813049852848053, "learning_rate": 8.429612598908548e-05, "loss": 0.9253062248229981, "memory(GiB)": 91.52, "step": 45690, "token_acc": 0.7398387362185289, "train_speed(iter/s)": 0.152609 }, { "epoch": 0.5929237127314132, "grad_norm": 0.7941802740097046, "learning_rate": 8.429222272551624e-05, "loss": 0.9675946235656738, "memory(GiB)": 91.52, "step": 45695, "token_acc": 0.7789436070686071, "train_speed(iter/s)": 0.152605 }, { "epoch": 0.5929885911330689, "grad_norm": 0.8134929537773132, "learning_rate": 8.428831906731431e-05, "loss": 0.9229780197143554, "memory(GiB)": 91.52, "step": 45700, "token_acc": 0.763790783720148, "train_speed(iter/s)": 0.152601 }, { "epoch": 0.5930534695347246, "grad_norm": 0.773173987865448, "learning_rate": 8.42844150145246e-05, "loss": 0.8930602073669434, "memory(GiB)": 91.52, "step": 45705, "token_acc": 0.758494031221304, "train_speed(iter/s)": 0.152597 }, { "epoch": 0.5931183479363802, "grad_norm": 0.8540878891944885, "learning_rate": 8.428051056719207e-05, "loss": 0.9480159759521485, "memory(GiB)": 91.52, "step": 45710, "token_acc": 0.7608818541548897, "train_speed(iter/s)": 0.152593 }, { "epoch": 0.5931832263380359, "grad_norm": 0.8306303024291992, "learning_rate": 8.42766057253616e-05, "loss": 0.8761795997619629, "memory(GiB)": 91.52, "step": 45715, "token_acc": 0.757330384533067, "train_speed(iter/s)": 0.152589 }, { "epoch": 0.5932481047396916, "grad_norm": 0.7261715531349182, "learning_rate": 8.427270048907817e-05, "loss": 0.9017198562622071, "memory(GiB)": 91.52, "step": 45720, "token_acc": 0.7527333333333334, "train_speed(iter/s)": 0.152585 }, { "epoch": 0.5933129831413473, "grad_norm": 0.6815939545631409, "learning_rate": 8.42687948583867e-05, "loss": 0.8723920822143555, "memory(GiB)": 91.52, "step": 45725, "token_acc": 0.7779609708632899, "train_speed(iter/s)": 0.152579 }, { "epoch": 0.593377861543003, "grad_norm": 0.7241215109825134, "learning_rate": 8.426488883333216e-05, "loss": 0.9251983642578125, "memory(GiB)": 91.52, "step": 45730, "token_acc": 0.7654265410463359, "train_speed(iter/s)": 0.152575 }, { "epoch": 0.5934427399446587, "grad_norm": 0.7381876707077026, "learning_rate": 8.42609824139595e-05, "loss": 0.9645614624023438, "memory(GiB)": 91.52, "step": 45735, "token_acc": 0.7391288077239727, "train_speed(iter/s)": 0.152571 }, { "epoch": 0.5935076183463144, "grad_norm": 0.7350744009017944, "learning_rate": 8.425707560031362e-05, "loss": 0.9036643981933594, "memory(GiB)": 91.52, "step": 45740, "token_acc": 0.7575727348295926, "train_speed(iter/s)": 0.152566 }, { "epoch": 0.5935724967479701, "grad_norm": 0.7640547156333923, "learning_rate": 8.425316839243955e-05, "loss": 0.9207897186279297, "memory(GiB)": 91.52, "step": 45745, "token_acc": 0.7482941894706601, "train_speed(iter/s)": 0.152562 }, { "epoch": 0.5936373751496258, "grad_norm": 0.7320500612258911, "learning_rate": 8.424926079038221e-05, "loss": 0.9052179336547852, "memory(GiB)": 91.52, "step": 45750, "token_acc": 0.7558009994588917, "train_speed(iter/s)": 0.152559 }, { "epoch": 0.5937022535512815, "grad_norm": 0.785553514957428, "learning_rate": 8.424535279418658e-05, "loss": 0.8978919982910156, "memory(GiB)": 91.52, "step": 45755, "token_acc": 0.7512742852921622, "train_speed(iter/s)": 0.152554 }, { "epoch": 0.5937671319529372, "grad_norm": 0.7915628552436829, "learning_rate": 8.424144440389763e-05, "loss": 0.9063790321350098, "memory(GiB)": 91.52, "step": 45760, "token_acc": 0.760871839581517, "train_speed(iter/s)": 0.15255 }, { "epoch": 0.5938320103545929, "grad_norm": 0.853029191493988, "learning_rate": 8.423753561956036e-05, "loss": 0.9604575157165527, "memory(GiB)": 91.52, "step": 45765, "token_acc": 0.7273144478284451, "train_speed(iter/s)": 0.152545 }, { "epoch": 0.5938968887562486, "grad_norm": 0.8355787992477417, "learning_rate": 8.423362644121972e-05, "loss": 0.9472055435180664, "memory(GiB)": 91.52, "step": 45770, "token_acc": 0.7416833513014015, "train_speed(iter/s)": 0.152541 }, { "epoch": 0.5939617671579043, "grad_norm": 0.7335947751998901, "learning_rate": 8.422971686892072e-05, "loss": 0.9456034660339355, "memory(GiB)": 91.52, "step": 45775, "token_acc": 0.7513484358144552, "train_speed(iter/s)": 0.152537 }, { "epoch": 0.59402664555956, "grad_norm": 0.7470160722732544, "learning_rate": 8.422580690270832e-05, "loss": 0.8931086540222168, "memory(GiB)": 91.52, "step": 45780, "token_acc": 0.7833131893035201, "train_speed(iter/s)": 0.152532 }, { "epoch": 0.5940915239612157, "grad_norm": 0.8295378088951111, "learning_rate": 8.422189654262755e-05, "loss": 0.9482977867126465, "memory(GiB)": 91.52, "step": 45785, "token_acc": 0.7512258064516129, "train_speed(iter/s)": 0.152528 }, { "epoch": 0.5941564023628714, "grad_norm": 0.802522599697113, "learning_rate": 8.42179857887234e-05, "loss": 0.8897043228149414, "memory(GiB)": 91.52, "step": 45790, "token_acc": 0.765098093974726, "train_speed(iter/s)": 0.152524 }, { "epoch": 0.5942212807645271, "grad_norm": 0.8149679899215698, "learning_rate": 8.421407464104087e-05, "loss": 0.9090267181396484, "memory(GiB)": 91.52, "step": 45795, "token_acc": 0.7344085568703416, "train_speed(iter/s)": 0.152521 }, { "epoch": 0.5942861591661828, "grad_norm": 0.7562265396118164, "learning_rate": 8.421016309962497e-05, "loss": 0.8587902069091797, "memory(GiB)": 91.52, "step": 45800, "token_acc": 0.7606832239630757, "train_speed(iter/s)": 0.152517 }, { "epoch": 0.5943510375678385, "grad_norm": 0.803094208240509, "learning_rate": 8.420625116452071e-05, "loss": 0.900969409942627, "memory(GiB)": 91.52, "step": 45805, "token_acc": 0.7562038601796673, "train_speed(iter/s)": 0.152512 }, { "epoch": 0.5944159159694942, "grad_norm": 0.7519320845603943, "learning_rate": 8.420233883577312e-05, "loss": 0.9315990447998047, "memory(GiB)": 91.52, "step": 45810, "token_acc": 0.7321304189193671, "train_speed(iter/s)": 0.152507 }, { "epoch": 0.5944807943711499, "grad_norm": 0.9522486329078674, "learning_rate": 8.419842611342721e-05, "loss": 0.93069429397583, "memory(GiB)": 91.52, "step": 45815, "token_acc": 0.7185190350020918, "train_speed(iter/s)": 0.152503 }, { "epoch": 0.5945456727728056, "grad_norm": 0.8785051703453064, "learning_rate": 8.419451299752803e-05, "loss": 0.9232572555541992, "memory(GiB)": 91.52, "step": 45820, "token_acc": 0.7422884914206368, "train_speed(iter/s)": 0.152499 }, { "epoch": 0.5946105511744613, "grad_norm": 0.7676977515220642, "learning_rate": 8.419059948812058e-05, "loss": 0.9116870880126953, "memory(GiB)": 91.52, "step": 45825, "token_acc": 0.7452689411924303, "train_speed(iter/s)": 0.152495 }, { "epoch": 0.594675429576117, "grad_norm": 0.7433050274848938, "learning_rate": 8.41866855852499e-05, "loss": 0.9160167694091796, "memory(GiB)": 91.52, "step": 45830, "token_acc": 0.7548063718758583, "train_speed(iter/s)": 0.152491 }, { "epoch": 0.5947403079777727, "grad_norm": 0.7429766654968262, "learning_rate": 8.418277128896106e-05, "loss": 0.8929713249206543, "memory(GiB)": 91.52, "step": 45835, "token_acc": 0.7486830553116769, "train_speed(iter/s)": 0.152486 }, { "epoch": 0.5948051863794284, "grad_norm": 0.7487028241157532, "learning_rate": 8.41788565992991e-05, "loss": 0.8594978332519532, "memory(GiB)": 91.52, "step": 45840, "token_acc": 0.7788439980269943, "train_speed(iter/s)": 0.152482 }, { "epoch": 0.5948700647810841, "grad_norm": 0.9405953288078308, "learning_rate": 8.417494151630902e-05, "loss": 0.9295928955078125, "memory(GiB)": 91.52, "step": 45845, "token_acc": 0.7452460946971009, "train_speed(iter/s)": 0.152478 }, { "epoch": 0.5949349431827398, "grad_norm": 0.7856045961380005, "learning_rate": 8.417102604003595e-05, "loss": 0.8978407859802247, "memory(GiB)": 91.52, "step": 45850, "token_acc": 0.7501806239737274, "train_speed(iter/s)": 0.152475 }, { "epoch": 0.5949998215843955, "grad_norm": 0.7344446778297424, "learning_rate": 8.416711017052489e-05, "loss": 0.9105754852294922, "memory(GiB)": 91.52, "step": 45855, "token_acc": 0.7599732209664081, "train_speed(iter/s)": 0.152471 }, { "epoch": 0.5950646999860512, "grad_norm": 0.7631356120109558, "learning_rate": 8.416319390782093e-05, "loss": 0.8737197875976562, "memory(GiB)": 91.52, "step": 45860, "token_acc": 0.758771777306395, "train_speed(iter/s)": 0.152466 }, { "epoch": 0.5951295783877069, "grad_norm": 0.6965010762214661, "learning_rate": 8.415927725196912e-05, "loss": 0.9429232597351074, "memory(GiB)": 91.52, "step": 45865, "token_acc": 0.7608983957219251, "train_speed(iter/s)": 0.152462 }, { "epoch": 0.5951944567893626, "grad_norm": 0.731550931930542, "learning_rate": 8.415536020301457e-05, "loss": 0.8546571731567383, "memory(GiB)": 91.52, "step": 45870, "token_acc": 0.7639160839160839, "train_speed(iter/s)": 0.152457 }, { "epoch": 0.5952593351910183, "grad_norm": 0.7641667127609253, "learning_rate": 8.415144276100233e-05, "loss": 0.9144577026367188, "memory(GiB)": 91.52, "step": 45875, "token_acc": 0.7576872210282691, "train_speed(iter/s)": 0.152454 }, { "epoch": 0.595324213592674, "grad_norm": 0.8268030285835266, "learning_rate": 8.414752492597749e-05, "loss": 0.9187955856323242, "memory(GiB)": 91.52, "step": 45880, "token_acc": 0.7618450215364028, "train_speed(iter/s)": 0.152449 }, { "epoch": 0.5953890919943297, "grad_norm": 0.819828450679779, "learning_rate": 8.414360669798511e-05, "loss": 0.9308794975280762, "memory(GiB)": 91.52, "step": 45885, "token_acc": 0.7387857619402692, "train_speed(iter/s)": 0.152445 }, { "epoch": 0.5954539703959854, "grad_norm": 0.7846664786338806, "learning_rate": 8.413968807707031e-05, "loss": 0.8851617813110352, "memory(GiB)": 91.52, "step": 45890, "token_acc": 0.7464391637069313, "train_speed(iter/s)": 0.152441 }, { "epoch": 0.595518848797641, "grad_norm": 0.7853789329528809, "learning_rate": 8.413576906327818e-05, "loss": 0.9368083000183105, "memory(GiB)": 91.52, "step": 45895, "token_acc": 0.7437714867789992, "train_speed(iter/s)": 0.152437 }, { "epoch": 0.5955837271992968, "grad_norm": 0.8309641480445862, "learning_rate": 8.413184965665382e-05, "loss": 0.9389861106872559, "memory(GiB)": 91.52, "step": 45900, "token_acc": 0.7529302695185749, "train_speed(iter/s)": 0.152434 }, { "epoch": 0.5956486056009525, "grad_norm": 0.81950843334198, "learning_rate": 8.412792985724231e-05, "loss": 0.972206974029541, "memory(GiB)": 91.52, "step": 45905, "token_acc": 0.7434323858742463, "train_speed(iter/s)": 0.15243 }, { "epoch": 0.5957134840026082, "grad_norm": 0.7922891974449158, "learning_rate": 8.41240096650888e-05, "loss": 0.9166969299316406, "memory(GiB)": 91.52, "step": 45910, "token_acc": 0.7497981393715991, "train_speed(iter/s)": 0.152426 }, { "epoch": 0.5957783624042639, "grad_norm": 0.6950787305831909, "learning_rate": 8.412008908023839e-05, "loss": 0.9456780433654786, "memory(GiB)": 91.52, "step": 45915, "token_acc": 0.7532420866951949, "train_speed(iter/s)": 0.152422 }, { "epoch": 0.5958432408059195, "grad_norm": 0.8044103980064392, "learning_rate": 8.411616810273616e-05, "loss": 0.9043542861938476, "memory(GiB)": 91.52, "step": 45920, "token_acc": 0.7645430427928794, "train_speed(iter/s)": 0.152419 }, { "epoch": 0.5959081192075752, "grad_norm": 0.6973428726196289, "learning_rate": 8.411224673262727e-05, "loss": 0.9313417434692383, "memory(GiB)": 91.52, "step": 45925, "token_acc": 0.7623654745080987, "train_speed(iter/s)": 0.152414 }, { "epoch": 0.595972997609231, "grad_norm": 0.721063494682312, "learning_rate": 8.410832496995687e-05, "loss": 0.8955325126647949, "memory(GiB)": 91.52, "step": 45930, "token_acc": 0.7780335179457671, "train_speed(iter/s)": 0.15241 }, { "epoch": 0.5960378760108866, "grad_norm": 0.7804429531097412, "learning_rate": 8.410440281477003e-05, "loss": 0.8932619094848633, "memory(GiB)": 91.52, "step": 45935, "token_acc": 0.7404604995525181, "train_speed(iter/s)": 0.152406 }, { "epoch": 0.5961027544125423, "grad_norm": 0.7862731218338013, "learning_rate": 8.410048026711194e-05, "loss": 0.8985831260681152, "memory(GiB)": 91.52, "step": 45940, "token_acc": 0.7646913808511294, "train_speed(iter/s)": 0.152401 }, { "epoch": 0.5961676328141979, "grad_norm": 0.8010521531105042, "learning_rate": 8.409655732702771e-05, "loss": 0.9322652816772461, "memory(GiB)": 91.52, "step": 45945, "token_acc": 0.7526533206228154, "train_speed(iter/s)": 0.152397 }, { "epoch": 0.5962325112158536, "grad_norm": 0.6264523863792419, "learning_rate": 8.409263399456249e-05, "loss": 0.9165149688720703, "memory(GiB)": 91.52, "step": 45950, "token_acc": 0.7623219442551673, "train_speed(iter/s)": 0.152392 }, { "epoch": 0.5962973896175093, "grad_norm": 0.8256183862686157, "learning_rate": 8.408871026976143e-05, "loss": 0.929744815826416, "memory(GiB)": 91.52, "step": 45955, "token_acc": 0.7639787472440092, "train_speed(iter/s)": 0.152388 }, { "epoch": 0.596362268019165, "grad_norm": 0.7115943431854248, "learning_rate": 8.408478615266969e-05, "loss": 0.9571126937866211, "memory(GiB)": 91.52, "step": 45960, "token_acc": 0.7618045210182006, "train_speed(iter/s)": 0.152385 }, { "epoch": 0.5964271464208207, "grad_norm": 0.7856202721595764, "learning_rate": 8.408086164333243e-05, "loss": 0.9462444305419921, "memory(GiB)": 91.52, "step": 45965, "token_acc": 0.7471723443530793, "train_speed(iter/s)": 0.152381 }, { "epoch": 0.5964920248224764, "grad_norm": 0.7200335264205933, "learning_rate": 8.40769367417948e-05, "loss": 0.9026341438293457, "memory(GiB)": 91.52, "step": 45970, "token_acc": 0.7269077660076918, "train_speed(iter/s)": 0.152376 }, { "epoch": 0.5965569032241321, "grad_norm": 0.8216629028320312, "learning_rate": 8.407301144810198e-05, "loss": 0.9204755783081054, "memory(GiB)": 91.52, "step": 45975, "token_acc": 0.746959437249359, "train_speed(iter/s)": 0.152373 }, { "epoch": 0.5966217816257878, "grad_norm": 0.7735400795936584, "learning_rate": 8.406908576229913e-05, "loss": 0.913572120666504, "memory(GiB)": 91.52, "step": 45980, "token_acc": 0.7499482865614011, "train_speed(iter/s)": 0.152369 }, { "epoch": 0.5966866600274435, "grad_norm": 0.7387486696243286, "learning_rate": 8.406515968443144e-05, "loss": 0.9061453819274903, "memory(GiB)": 91.52, "step": 45985, "token_acc": 0.7525990305866622, "train_speed(iter/s)": 0.152365 }, { "epoch": 0.5967515384290992, "grad_norm": 0.705735981464386, "learning_rate": 8.406123321454408e-05, "loss": 0.9006041526794434, "memory(GiB)": 91.52, "step": 45990, "token_acc": 0.7647428643279266, "train_speed(iter/s)": 0.15236 }, { "epoch": 0.5968164168307549, "grad_norm": 0.7985436320304871, "learning_rate": 8.405730635268225e-05, "loss": 0.9574530601501465, "memory(GiB)": 91.52, "step": 45995, "token_acc": 0.7492188476440445, "train_speed(iter/s)": 0.152356 }, { "epoch": 0.5968812952324106, "grad_norm": 0.7955454587936401, "learning_rate": 8.405337909889112e-05, "loss": 0.9142970085144043, "memory(GiB)": 91.52, "step": 46000, "token_acc": 0.7362232421024422, "train_speed(iter/s)": 0.152352 }, { "epoch": 0.5969461736340663, "grad_norm": 0.6877115368843079, "learning_rate": 8.40494514532159e-05, "loss": 0.9274906158447266, "memory(GiB)": 91.52, "step": 46005, "token_acc": 0.7503987240829346, "train_speed(iter/s)": 0.152347 }, { "epoch": 0.597011052035722, "grad_norm": 0.7613318562507629, "learning_rate": 8.404552341570177e-05, "loss": 0.9178353309631347, "memory(GiB)": 91.52, "step": 46010, "token_acc": 0.7632132483055658, "train_speed(iter/s)": 0.152343 }, { "epoch": 0.5970759304373777, "grad_norm": 0.7172138094902039, "learning_rate": 8.404159498639397e-05, "loss": 0.9008735656738281, "memory(GiB)": 91.52, "step": 46015, "token_acc": 0.7586536927739671, "train_speed(iter/s)": 0.15234 }, { "epoch": 0.5971408088390334, "grad_norm": 0.7310563325881958, "learning_rate": 8.403766616533766e-05, "loss": 0.9449907302856445, "memory(GiB)": 91.52, "step": 46020, "token_acc": 0.7531544647809207, "train_speed(iter/s)": 0.152336 }, { "epoch": 0.5972056872406891, "grad_norm": 0.694624662399292, "learning_rate": 8.40337369525781e-05, "loss": 0.8981328964233398, "memory(GiB)": 91.52, "step": 46025, "token_acc": 0.7548735697132364, "train_speed(iter/s)": 0.152332 }, { "epoch": 0.5972705656423448, "grad_norm": 0.7077838778495789, "learning_rate": 8.402980734816048e-05, "loss": 0.8834254264831543, "memory(GiB)": 91.52, "step": 46030, "token_acc": 0.7614989082200465, "train_speed(iter/s)": 0.152328 }, { "epoch": 0.5973354440440005, "grad_norm": 0.7308592796325684, "learning_rate": 8.402587735213003e-05, "loss": 0.8928964614868165, "memory(GiB)": 91.52, "step": 46035, "token_acc": 0.757799058594384, "train_speed(iter/s)": 0.152324 }, { "epoch": 0.5974003224456562, "grad_norm": 0.8565403819084167, "learning_rate": 8.402194696453198e-05, "loss": 0.9034027099609375, "memory(GiB)": 91.52, "step": 46040, "token_acc": 0.7540735711640538, "train_speed(iter/s)": 0.15232 }, { "epoch": 0.5974652008473119, "grad_norm": 0.7013031244277954, "learning_rate": 8.401801618541153e-05, "loss": 0.9017154693603515, "memory(GiB)": 91.52, "step": 46045, "token_acc": 0.7320970537261698, "train_speed(iter/s)": 0.152315 }, { "epoch": 0.5975300792489676, "grad_norm": 0.868590772151947, "learning_rate": 8.401408501481395e-05, "loss": 0.9278741836547851, "memory(GiB)": 91.52, "step": 46050, "token_acc": 0.7584015996903825, "train_speed(iter/s)": 0.15231 }, { "epoch": 0.5975949576506233, "grad_norm": 0.6941534876823425, "learning_rate": 8.401015345278447e-05, "loss": 0.8934467315673829, "memory(GiB)": 91.52, "step": 46055, "token_acc": 0.7792194453954125, "train_speed(iter/s)": 0.152307 }, { "epoch": 0.597659836052279, "grad_norm": 0.7387004494667053, "learning_rate": 8.400622149936835e-05, "loss": 0.9782526016235351, "memory(GiB)": 91.52, "step": 46060, "token_acc": 0.7434614509893109, "train_speed(iter/s)": 0.152303 }, { "epoch": 0.5977247144539347, "grad_norm": 0.7221342921257019, "learning_rate": 8.400228915461079e-05, "loss": 0.9010480880737305, "memory(GiB)": 91.52, "step": 46065, "token_acc": 0.741724458425681, "train_speed(iter/s)": 0.152298 }, { "epoch": 0.5977895928555904, "grad_norm": 0.7374585270881653, "learning_rate": 8.399835641855711e-05, "loss": 0.9177781105041504, "memory(GiB)": 91.52, "step": 46070, "token_acc": 0.7647848560112774, "train_speed(iter/s)": 0.152294 }, { "epoch": 0.5978544712572461, "grad_norm": 0.7038484811782837, "learning_rate": 8.399442329125251e-05, "loss": 0.8900444984436036, "memory(GiB)": 91.52, "step": 46075, "token_acc": 0.7825128581925055, "train_speed(iter/s)": 0.152289 }, { "epoch": 0.5979193496589018, "grad_norm": 0.822608470916748, "learning_rate": 8.399048977274226e-05, "loss": 0.9396065711975098, "memory(GiB)": 91.52, "step": 46080, "token_acc": 0.7419492779842768, "train_speed(iter/s)": 0.152285 }, { "epoch": 0.5979842280605575, "grad_norm": 0.8533024191856384, "learning_rate": 8.398655586307167e-05, "loss": 0.8971995353698731, "memory(GiB)": 91.52, "step": 46085, "token_acc": 0.7547156016961544, "train_speed(iter/s)": 0.152281 }, { "epoch": 0.5980491064622132, "grad_norm": 0.8019109964370728, "learning_rate": 8.398262156228599e-05, "loss": 0.9399358749389648, "memory(GiB)": 91.52, "step": 46090, "token_acc": 0.7429259090613413, "train_speed(iter/s)": 0.152277 }, { "epoch": 0.5981139848638689, "grad_norm": 0.7577924132347107, "learning_rate": 8.397868687043046e-05, "loss": 0.9321057319641113, "memory(GiB)": 91.52, "step": 46095, "token_acc": 0.7469830803682508, "train_speed(iter/s)": 0.152271 }, { "epoch": 0.5981788632655246, "grad_norm": 0.716733455657959, "learning_rate": 8.397475178755039e-05, "loss": 0.8503314971923828, "memory(GiB)": 91.52, "step": 46100, "token_acc": 0.7714871580996072, "train_speed(iter/s)": 0.152267 }, { "epoch": 0.5982437416671803, "grad_norm": 0.7386760711669922, "learning_rate": 8.397081631369108e-05, "loss": 0.9378482818603515, "memory(GiB)": 91.52, "step": 46105, "token_acc": 0.7470043236565781, "train_speed(iter/s)": 0.152264 }, { "epoch": 0.598308620068836, "grad_norm": 0.7216939926147461, "learning_rate": 8.39668804488978e-05, "loss": 0.8917478561401367, "memory(GiB)": 91.52, "step": 46110, "token_acc": 0.7827402020722054, "train_speed(iter/s)": 0.152259 }, { "epoch": 0.5983734984704917, "grad_norm": 0.6814588904380798, "learning_rate": 8.396294419321583e-05, "loss": 0.8669681549072266, "memory(GiB)": 91.52, "step": 46115, "token_acc": 0.7613513147194362, "train_speed(iter/s)": 0.152255 }, { "epoch": 0.5984383768721474, "grad_norm": 0.8231400847434998, "learning_rate": 8.395900754669049e-05, "loss": 0.8902428627014161, "memory(GiB)": 91.52, "step": 46120, "token_acc": 0.7438464481571229, "train_speed(iter/s)": 0.152251 }, { "epoch": 0.5985032552738031, "grad_norm": 0.7097775340080261, "learning_rate": 8.395507050936708e-05, "loss": 0.9142333984375, "memory(GiB)": 91.52, "step": 46125, "token_acc": 0.7571054668780182, "train_speed(iter/s)": 0.152247 }, { "epoch": 0.5985681336754588, "grad_norm": 0.7683021426200867, "learning_rate": 8.395113308129091e-05, "loss": 0.8978811264038086, "memory(GiB)": 91.52, "step": 46130, "token_acc": 0.7729835061590925, "train_speed(iter/s)": 0.152244 }, { "epoch": 0.5986330120771145, "grad_norm": 0.8625382781028748, "learning_rate": 8.394719526250727e-05, "loss": 0.913296890258789, "memory(GiB)": 91.52, "step": 46135, "token_acc": 0.7554491106902825, "train_speed(iter/s)": 0.152239 }, { "epoch": 0.5986978904787702, "grad_norm": 0.775884747505188, "learning_rate": 8.394325705306151e-05, "loss": 0.925382423400879, "memory(GiB)": 91.52, "step": 46140, "token_acc": 0.7634399902916549, "train_speed(iter/s)": 0.152235 }, { "epoch": 0.5987627688804259, "grad_norm": 0.7426758408546448, "learning_rate": 8.393931845299893e-05, "loss": 0.8935966491699219, "memory(GiB)": 91.52, "step": 46145, "token_acc": 0.7357971014492753, "train_speed(iter/s)": 0.15223 }, { "epoch": 0.5988276472820816, "grad_norm": 0.8619611263275146, "learning_rate": 8.393537946236485e-05, "loss": 0.9599161148071289, "memory(GiB)": 91.52, "step": 46150, "token_acc": 0.7391656683813684, "train_speed(iter/s)": 0.152227 }, { "epoch": 0.5988925256837373, "grad_norm": 0.7979872822761536, "learning_rate": 8.39314400812046e-05, "loss": 0.9773092269897461, "memory(GiB)": 91.52, "step": 46155, "token_acc": 0.7535174570088587, "train_speed(iter/s)": 0.152223 }, { "epoch": 0.598957404085393, "grad_norm": 0.7609697580337524, "learning_rate": 8.392750030956354e-05, "loss": 0.9378361701965332, "memory(GiB)": 91.52, "step": 46160, "token_acc": 0.7677326701460646, "train_speed(iter/s)": 0.152219 }, { "epoch": 0.5990222824870487, "grad_norm": 0.706294059753418, "learning_rate": 8.392356014748696e-05, "loss": 0.9328338623046875, "memory(GiB)": 91.52, "step": 46165, "token_acc": 0.7363128491620111, "train_speed(iter/s)": 0.152215 }, { "epoch": 0.5990871608887044, "grad_norm": 0.8230454921722412, "learning_rate": 8.391961959502028e-05, "loss": 0.9908750534057618, "memory(GiB)": 91.52, "step": 46170, "token_acc": 0.7281688232504573, "train_speed(iter/s)": 0.152211 }, { "epoch": 0.5991520392903601, "grad_norm": 0.7992243766784668, "learning_rate": 8.391567865220876e-05, "loss": 0.9347582817077636, "memory(GiB)": 91.52, "step": 46175, "token_acc": 0.7298901021733788, "train_speed(iter/s)": 0.152207 }, { "epoch": 0.5992169176920158, "grad_norm": 0.7324425578117371, "learning_rate": 8.391173731909782e-05, "loss": 0.9035434722900391, "memory(GiB)": 91.52, "step": 46180, "token_acc": 0.7616077414089597, "train_speed(iter/s)": 0.152203 }, { "epoch": 0.5992817960936714, "grad_norm": 0.8617393970489502, "learning_rate": 8.390779559573276e-05, "loss": 0.9051849365234375, "memory(GiB)": 91.52, "step": 46185, "token_acc": 0.7451817822919283, "train_speed(iter/s)": 0.152199 }, { "epoch": 0.5993466744953271, "grad_norm": 0.804501473903656, "learning_rate": 8.390385348215898e-05, "loss": 0.8830678939819336, "memory(GiB)": 91.52, "step": 46190, "token_acc": 0.7436005477720425, "train_speed(iter/s)": 0.152196 }, { "epoch": 0.5994115528969828, "grad_norm": 0.7312988042831421, "learning_rate": 8.389991097842185e-05, "loss": 0.9017868041992188, "memory(GiB)": 91.52, "step": 46195, "token_acc": 0.7501762052438681, "train_speed(iter/s)": 0.152192 }, { "epoch": 0.5994764312986385, "grad_norm": 0.8608386516571045, "learning_rate": 8.389596808456673e-05, "loss": 0.9331735610961914, "memory(GiB)": 91.52, "step": 46200, "token_acc": 0.7275898867181086, "train_speed(iter/s)": 0.152188 }, { "epoch": 0.5995413097002942, "grad_norm": 0.7077175378799438, "learning_rate": 8.389202480063898e-05, "loss": 0.8947185516357422, "memory(GiB)": 91.52, "step": 46205, "token_acc": 0.7639451783559694, "train_speed(iter/s)": 0.152184 }, { "epoch": 0.5996061881019499, "grad_norm": 0.7327923774719238, "learning_rate": 8.388808112668398e-05, "loss": 0.8938755035400391, "memory(GiB)": 91.52, "step": 46210, "token_acc": 0.7460915492957747, "train_speed(iter/s)": 0.15218 }, { "epoch": 0.5996710665036056, "grad_norm": 0.7461970448493958, "learning_rate": 8.388413706274714e-05, "loss": 0.8997047424316407, "memory(GiB)": 91.52, "step": 46215, "token_acc": 0.7477365654205608, "train_speed(iter/s)": 0.152175 }, { "epoch": 0.5997359449052613, "grad_norm": 0.7825918793678284, "learning_rate": 8.388019260887383e-05, "loss": 0.8970204353332519, "memory(GiB)": 91.52, "step": 46220, "token_acc": 0.7745889582584251, "train_speed(iter/s)": 0.152171 }, { "epoch": 0.599800823306917, "grad_norm": 0.7718614935874939, "learning_rate": 8.387624776510942e-05, "loss": 0.8633297920227051, "memory(GiB)": 91.52, "step": 46225, "token_acc": 0.7650819017179384, "train_speed(iter/s)": 0.152168 }, { "epoch": 0.5998657017085727, "grad_norm": 0.7553175091743469, "learning_rate": 8.387230253149936e-05, "loss": 0.9089344024658204, "memory(GiB)": 91.52, "step": 46230, "token_acc": 0.7736729929970567, "train_speed(iter/s)": 0.152164 }, { "epoch": 0.5999305801102284, "grad_norm": 0.7562875747680664, "learning_rate": 8.386835690808902e-05, "loss": 0.9505908966064454, "memory(GiB)": 91.52, "step": 46235, "token_acc": 0.7366334549536907, "train_speed(iter/s)": 0.15216 }, { "epoch": 0.5999954585118841, "grad_norm": 0.7843017578125, "learning_rate": 8.386441089492381e-05, "loss": 0.8864540100097656, "memory(GiB)": 91.52, "step": 46240, "token_acc": 0.784750830564784, "train_speed(iter/s)": 0.152155 }, { "epoch": 0.6000603369135398, "grad_norm": 0.8598990440368652, "learning_rate": 8.386046449204911e-05, "loss": 0.9452104568481445, "memory(GiB)": 91.52, "step": 46245, "token_acc": 0.7382533197139939, "train_speed(iter/s)": 0.152151 }, { "epoch": 0.6001252153151955, "grad_norm": 0.7747818827629089, "learning_rate": 8.38565176995104e-05, "loss": 0.8830820083618164, "memory(GiB)": 91.52, "step": 46250, "token_acc": 0.7540220549186217, "train_speed(iter/s)": 0.152147 }, { "epoch": 0.6001900937168512, "grad_norm": 0.808914840221405, "learning_rate": 8.385257051735304e-05, "loss": 0.9117303848266601, "memory(GiB)": 91.52, "step": 46255, "token_acc": 0.7615646485548361, "train_speed(iter/s)": 0.152143 }, { "epoch": 0.6002549721185069, "grad_norm": 0.9128750562667847, "learning_rate": 8.384862294562248e-05, "loss": 0.9300414085388183, "memory(GiB)": 91.52, "step": 46260, "token_acc": 0.7495627043881664, "train_speed(iter/s)": 0.152138 }, { "epoch": 0.6003198505201626, "grad_norm": 0.7393701076507568, "learning_rate": 8.384467498436414e-05, "loss": 0.8682760238647461, "memory(GiB)": 91.52, "step": 46265, "token_acc": 0.763380713382629, "train_speed(iter/s)": 0.152134 }, { "epoch": 0.6003847289218183, "grad_norm": 0.7756372690200806, "learning_rate": 8.384072663362348e-05, "loss": 0.9141830444335938, "memory(GiB)": 91.52, "step": 46270, "token_acc": 0.7727030767592888, "train_speed(iter/s)": 0.152129 }, { "epoch": 0.600449607323474, "grad_norm": 0.8049975633621216, "learning_rate": 8.383677789344591e-05, "loss": 0.9115936279296875, "memory(GiB)": 91.52, "step": 46275, "token_acc": 0.7699474185693617, "train_speed(iter/s)": 0.152126 }, { "epoch": 0.6005144857251297, "grad_norm": 0.7062103748321533, "learning_rate": 8.383282876387687e-05, "loss": 0.8647975921630859, "memory(GiB)": 91.52, "step": 46280, "token_acc": 0.7624238823704144, "train_speed(iter/s)": 0.152121 }, { "epoch": 0.6005793641267854, "grad_norm": 0.769268810749054, "learning_rate": 8.382887924496182e-05, "loss": 0.8953577995300293, "memory(GiB)": 91.52, "step": 46285, "token_acc": 0.746786224408824, "train_speed(iter/s)": 0.152117 }, { "epoch": 0.6006442425284411, "grad_norm": 0.8181671500205994, "learning_rate": 8.382492933674619e-05, "loss": 0.8860631942749023, "memory(GiB)": 91.52, "step": 46290, "token_acc": 0.7689248309691105, "train_speed(iter/s)": 0.152112 }, { "epoch": 0.6007091209300968, "grad_norm": 0.7447044849395752, "learning_rate": 8.382097903927547e-05, "loss": 0.8965547561645508, "memory(GiB)": 91.52, "step": 46295, "token_acc": 0.7461317160407434, "train_speed(iter/s)": 0.152108 }, { "epoch": 0.6007739993317525, "grad_norm": 0.7507146596908569, "learning_rate": 8.381702835259508e-05, "loss": 0.9284152984619141, "memory(GiB)": 91.52, "step": 46300, "token_acc": 0.7546201576387724, "train_speed(iter/s)": 0.152104 }, { "epoch": 0.6008388777334082, "grad_norm": 0.6984331607818604, "learning_rate": 8.381307727675052e-05, "loss": 0.9472638130187988, "memory(GiB)": 91.52, "step": 46305, "token_acc": 0.7196593093391662, "train_speed(iter/s)": 0.1521 }, { "epoch": 0.6009037561350639, "grad_norm": 0.7282285690307617, "learning_rate": 8.380912581178724e-05, "loss": 0.9049066543579102, "memory(GiB)": 91.52, "step": 46310, "token_acc": 0.7451524518757904, "train_speed(iter/s)": 0.152096 }, { "epoch": 0.6009686345367196, "grad_norm": 0.7418222427368164, "learning_rate": 8.380517395775071e-05, "loss": 0.9160204887390136, "memory(GiB)": 91.52, "step": 46315, "token_acc": 0.7587042082954889, "train_speed(iter/s)": 0.152092 }, { "epoch": 0.6010335129383753, "grad_norm": 0.7870514988899231, "learning_rate": 8.380122171468641e-05, "loss": 0.9202852249145508, "memory(GiB)": 91.52, "step": 46320, "token_acc": 0.7444376246739297, "train_speed(iter/s)": 0.152089 }, { "epoch": 0.601098391340031, "grad_norm": 0.7282603979110718, "learning_rate": 8.379726908263985e-05, "loss": 0.8882230758666992, "memory(GiB)": 91.52, "step": 46325, "token_acc": 0.7499447106524143, "train_speed(iter/s)": 0.152085 }, { "epoch": 0.6011632697416867, "grad_norm": 0.7569155693054199, "learning_rate": 8.379331606165648e-05, "loss": 0.9084822654724121, "memory(GiB)": 91.52, "step": 46330, "token_acc": 0.7511513048121204, "train_speed(iter/s)": 0.152081 }, { "epoch": 0.6012281481433424, "grad_norm": 0.7501115798950195, "learning_rate": 8.37893626517818e-05, "loss": 0.9229809761047363, "memory(GiB)": 91.52, "step": 46335, "token_acc": 0.764581088362069, "train_speed(iter/s)": 0.152077 }, { "epoch": 0.601293026544998, "grad_norm": 0.7420656085014343, "learning_rate": 8.37854088530613e-05, "loss": 0.8777042388916015, "memory(GiB)": 91.52, "step": 46340, "token_acc": 0.7491564608847788, "train_speed(iter/s)": 0.152072 }, { "epoch": 0.6013579049466538, "grad_norm": 0.694568932056427, "learning_rate": 8.37814546655405e-05, "loss": 0.8810680389404297, "memory(GiB)": 91.52, "step": 46345, "token_acc": 0.7483274370380749, "train_speed(iter/s)": 0.152069 }, { "epoch": 0.6014227833483095, "grad_norm": 0.6423575282096863, "learning_rate": 8.377750008926489e-05, "loss": 0.8844390869140625, "memory(GiB)": 91.52, "step": 46350, "token_acc": 0.795274828131556, "train_speed(iter/s)": 0.152064 }, { "epoch": 0.6014876617499652, "grad_norm": 0.7653166055679321, "learning_rate": 8.377354512427998e-05, "loss": 0.9350974082946777, "memory(GiB)": 91.52, "step": 46355, "token_acc": 0.7642195893176883, "train_speed(iter/s)": 0.152059 }, { "epoch": 0.6015525401516209, "grad_norm": 0.8167422413825989, "learning_rate": 8.37695897706313e-05, "loss": 0.8950857162475586, "memory(GiB)": 91.52, "step": 46360, "token_acc": 0.7706536736805795, "train_speed(iter/s)": 0.152055 }, { "epoch": 0.6016174185532766, "grad_norm": 0.733182430267334, "learning_rate": 8.376563402836435e-05, "loss": 0.8955005645751953, "memory(GiB)": 91.52, "step": 46365, "token_acc": 0.748072733366851, "train_speed(iter/s)": 0.152052 }, { "epoch": 0.6016822969549322, "grad_norm": 0.7451302409172058, "learning_rate": 8.376167789752465e-05, "loss": 0.863868236541748, "memory(GiB)": 91.52, "step": 46370, "token_acc": 0.7528026110401589, "train_speed(iter/s)": 0.152048 }, { "epoch": 0.601747175356588, "grad_norm": 0.6746030449867249, "learning_rate": 8.375772137815773e-05, "loss": 0.8997112274169922, "memory(GiB)": 91.52, "step": 46375, "token_acc": 0.7719637223974763, "train_speed(iter/s)": 0.152045 }, { "epoch": 0.6018120537582436, "grad_norm": 0.6851014494895935, "learning_rate": 8.375376447030914e-05, "loss": 0.8734216690063477, "memory(GiB)": 91.52, "step": 46380, "token_acc": 0.7703132424430718, "train_speed(iter/s)": 0.152041 }, { "epoch": 0.6018769321598993, "grad_norm": 0.7672955989837646, "learning_rate": 8.374980717402438e-05, "loss": 0.8924724578857421, "memory(GiB)": 91.52, "step": 46385, "token_acc": 0.7438210911895122, "train_speed(iter/s)": 0.152036 }, { "epoch": 0.601941810561555, "grad_norm": 0.7344194650650024, "learning_rate": 8.374584948934903e-05, "loss": 0.8877164840698242, "memory(GiB)": 91.52, "step": 46390, "token_acc": 0.7474666970283502, "train_speed(iter/s)": 0.152032 }, { "epoch": 0.6020066889632107, "grad_norm": 0.7554643750190735, "learning_rate": 8.374189141632862e-05, "loss": 0.8954179763793946, "memory(GiB)": 91.52, "step": 46395, "token_acc": 0.7606512201312593, "train_speed(iter/s)": 0.152027 }, { "epoch": 0.6020715673648664, "grad_norm": 0.7365559935569763, "learning_rate": 8.373793295500867e-05, "loss": 0.8920246124267578, "memory(GiB)": 91.52, "step": 46400, "token_acc": 0.7673656171026569, "train_speed(iter/s)": 0.152023 }, { "epoch": 0.6021364457665221, "grad_norm": 0.6737845540046692, "learning_rate": 8.373397410543478e-05, "loss": 0.8910789489746094, "memory(GiB)": 91.52, "step": 46405, "token_acc": 0.7610600781332488, "train_speed(iter/s)": 0.152019 }, { "epoch": 0.6022013241681778, "grad_norm": 0.8121669292449951, "learning_rate": 8.373001486765248e-05, "loss": 0.93126220703125, "memory(GiB)": 91.52, "step": 46410, "token_acc": 0.7730853391684902, "train_speed(iter/s)": 0.152015 }, { "epoch": 0.6022662025698335, "grad_norm": 0.8017638921737671, "learning_rate": 8.372605524170733e-05, "loss": 0.918109130859375, "memory(GiB)": 91.52, "step": 46415, "token_acc": 0.7623826714801444, "train_speed(iter/s)": 0.152011 }, { "epoch": 0.6023310809714892, "grad_norm": 0.7494528293609619, "learning_rate": 8.372209522764494e-05, "loss": 0.8763521194458008, "memory(GiB)": 91.52, "step": 46420, "token_acc": 0.7519450945663863, "train_speed(iter/s)": 0.152006 }, { "epoch": 0.6023959593731448, "grad_norm": 0.7121830582618713, "learning_rate": 8.371813482551081e-05, "loss": 0.810889720916748, "memory(GiB)": 91.52, "step": 46425, "token_acc": 0.769240538481077, "train_speed(iter/s)": 0.152002 }, { "epoch": 0.6024608377748005, "grad_norm": 1.7046476602554321, "learning_rate": 8.371417403535056e-05, "loss": 0.916139793395996, "memory(GiB)": 91.52, "step": 46430, "token_acc": 0.7560687432867884, "train_speed(iter/s)": 0.151998 }, { "epoch": 0.6025257161764562, "grad_norm": 0.8512828350067139, "learning_rate": 8.371021285720979e-05, "loss": 0.9235841751098632, "memory(GiB)": 91.52, "step": 46435, "token_acc": 0.7623188405797101, "train_speed(iter/s)": 0.151995 }, { "epoch": 0.6025905945781119, "grad_norm": 0.7444579005241394, "learning_rate": 8.370625129113405e-05, "loss": 0.9155220985412598, "memory(GiB)": 91.52, "step": 46440, "token_acc": 0.738810858016156, "train_speed(iter/s)": 0.15199 }, { "epoch": 0.6026554729797676, "grad_norm": 0.7530880570411682, "learning_rate": 8.370228933716892e-05, "loss": 0.8861687660217286, "memory(GiB)": 91.52, "step": 46445, "token_acc": 0.7667108084247698, "train_speed(iter/s)": 0.151985 }, { "epoch": 0.6027203513814233, "grad_norm": 0.7656809687614441, "learning_rate": 8.369832699536004e-05, "loss": 0.9015807151794434, "memory(GiB)": 91.52, "step": 46450, "token_acc": 0.7650600301025587, "train_speed(iter/s)": 0.151983 }, { "epoch": 0.602785229783079, "grad_norm": 0.7552958726882935, "learning_rate": 8.369436426575297e-05, "loss": 0.9157541275024415, "memory(GiB)": 91.52, "step": 46455, "token_acc": 0.762273553774665, "train_speed(iter/s)": 0.151979 }, { "epoch": 0.6028501081847347, "grad_norm": 0.7950041890144348, "learning_rate": 8.369040114839333e-05, "loss": 0.9093582153320312, "memory(GiB)": 91.52, "step": 46460, "token_acc": 0.7573806644376467, "train_speed(iter/s)": 0.151976 }, { "epoch": 0.6029149865863904, "grad_norm": 0.7901896834373474, "learning_rate": 8.368643764332671e-05, "loss": 0.8771697998046875, "memory(GiB)": 91.52, "step": 46465, "token_acc": 0.7826801517067004, "train_speed(iter/s)": 0.151972 }, { "epoch": 0.6029798649880461, "grad_norm": 0.7612332701683044, "learning_rate": 8.368247375059874e-05, "loss": 0.9273235321044921, "memory(GiB)": 91.52, "step": 46470, "token_acc": 0.7389984360145972, "train_speed(iter/s)": 0.151968 }, { "epoch": 0.6030447433897018, "grad_norm": 0.8193848729133606, "learning_rate": 8.367850947025501e-05, "loss": 0.8875467300415039, "memory(GiB)": 91.52, "step": 46475, "token_acc": 0.7562154504291786, "train_speed(iter/s)": 0.151965 }, { "epoch": 0.6031096217913575, "grad_norm": 0.7838100790977478, "learning_rate": 8.36745448023412e-05, "loss": 0.9377300262451171, "memory(GiB)": 91.52, "step": 46480, "token_acc": 0.7619472694232793, "train_speed(iter/s)": 0.15196 }, { "epoch": 0.6031745001930132, "grad_norm": 0.7941482663154602, "learning_rate": 8.367057974690287e-05, "loss": 0.8664125442504883, "memory(GiB)": 91.52, "step": 46485, "token_acc": 0.783475591241861, "train_speed(iter/s)": 0.151955 }, { "epoch": 0.6032393785946689, "grad_norm": 0.7655596733093262, "learning_rate": 8.366661430398569e-05, "loss": 0.887600326538086, "memory(GiB)": 91.52, "step": 46490, "token_acc": 0.7560859036451695, "train_speed(iter/s)": 0.15195 }, { "epoch": 0.6033042569963246, "grad_norm": 0.730607807636261, "learning_rate": 8.366264847363526e-05, "loss": 0.8667227745056152, "memory(GiB)": 91.52, "step": 46495, "token_acc": 0.7646346228995702, "train_speed(iter/s)": 0.151946 }, { "epoch": 0.6033691353979803, "grad_norm": 0.7607686519622803, "learning_rate": 8.365868225589726e-05, "loss": 0.895022201538086, "memory(GiB)": 91.52, "step": 46500, "token_acc": 0.7695373366894402, "train_speed(iter/s)": 0.151942 }, { "epoch": 0.603434013799636, "grad_norm": 0.790462851524353, "learning_rate": 8.365471565081729e-05, "loss": 0.9044933319091797, "memory(GiB)": 91.52, "step": 46505, "token_acc": 0.7549443757725587, "train_speed(iter/s)": 0.151939 }, { "epoch": 0.6034988922012917, "grad_norm": 0.7726203203201294, "learning_rate": 8.365074865844103e-05, "loss": 0.952032470703125, "memory(GiB)": 91.52, "step": 46510, "token_acc": 0.7638000556173526, "train_speed(iter/s)": 0.151934 }, { "epoch": 0.6035637706029474, "grad_norm": 0.7351187467575073, "learning_rate": 8.364678127881411e-05, "loss": 0.8954530715942383, "memory(GiB)": 91.52, "step": 46515, "token_acc": 0.7542510982003684, "train_speed(iter/s)": 0.151931 }, { "epoch": 0.6036286490046031, "grad_norm": 0.7112710475921631, "learning_rate": 8.364281351198221e-05, "loss": 0.9206689834594727, "memory(GiB)": 91.52, "step": 46520, "token_acc": 0.7290490100616683, "train_speed(iter/s)": 0.151927 }, { "epoch": 0.6036935274062588, "grad_norm": 0.7969520092010498, "learning_rate": 8.363884535799098e-05, "loss": 0.9276664733886719, "memory(GiB)": 91.52, "step": 46525, "token_acc": 0.7776278001148765, "train_speed(iter/s)": 0.151922 }, { "epoch": 0.6037584058079145, "grad_norm": 0.816272497177124, "learning_rate": 8.363487681688607e-05, "loss": 0.8637290000915527, "memory(GiB)": 91.52, "step": 46530, "token_acc": 0.749289924564029, "train_speed(iter/s)": 0.151919 }, { "epoch": 0.6038232842095702, "grad_norm": 0.7326134443283081, "learning_rate": 8.363090788871317e-05, "loss": 0.8894610404968262, "memory(GiB)": 91.52, "step": 46535, "token_acc": 0.764681607516983, "train_speed(iter/s)": 0.151914 }, { "epoch": 0.6038881626112259, "grad_norm": 0.7437499761581421, "learning_rate": 8.362693857351794e-05, "loss": 0.9422468185424805, "memory(GiB)": 91.52, "step": 46540, "token_acc": 0.7310048176187199, "train_speed(iter/s)": 0.151911 }, { "epoch": 0.6039530410128816, "grad_norm": 0.8405824303627014, "learning_rate": 8.362296887134606e-05, "loss": 0.8666452407836914, "memory(GiB)": 91.52, "step": 46545, "token_acc": 0.7697303749473389, "train_speed(iter/s)": 0.151908 }, { "epoch": 0.6040179194145373, "grad_norm": 0.9225948452949524, "learning_rate": 8.361899878224323e-05, "loss": 0.896827220916748, "memory(GiB)": 91.52, "step": 46550, "token_acc": 0.7443365695792881, "train_speed(iter/s)": 0.151905 }, { "epoch": 0.604082797816193, "grad_norm": 0.7654286623001099, "learning_rate": 8.361502830625512e-05, "loss": 0.9419198989868164, "memory(GiB)": 91.52, "step": 46555, "token_acc": 0.7620263648284906, "train_speed(iter/s)": 0.1519 }, { "epoch": 0.6041476762178487, "grad_norm": 0.6996784806251526, "learning_rate": 8.361105744342743e-05, "loss": 0.9046060562133789, "memory(GiB)": 91.52, "step": 46560, "token_acc": 0.7541979792168566, "train_speed(iter/s)": 0.151896 }, { "epoch": 0.6042125546195044, "grad_norm": 0.8108406066894531, "learning_rate": 8.360708619380586e-05, "loss": 0.9535726547241211, "memory(GiB)": 91.52, "step": 46565, "token_acc": 0.7708535559438296, "train_speed(iter/s)": 0.151892 }, { "epoch": 0.6042774330211601, "grad_norm": 0.8107397556304932, "learning_rate": 8.360311455743611e-05, "loss": 0.9585731506347657, "memory(GiB)": 91.52, "step": 46570, "token_acc": 0.7473285074112375, "train_speed(iter/s)": 0.151888 }, { "epoch": 0.6043423114228158, "grad_norm": 0.812034010887146, "learning_rate": 8.359914253436387e-05, "loss": 0.9128190994262695, "memory(GiB)": 91.52, "step": 46575, "token_acc": 0.7637171829420752, "train_speed(iter/s)": 0.151882 }, { "epoch": 0.6044071898244715, "grad_norm": 0.7028534412384033, "learning_rate": 8.359517012463485e-05, "loss": 0.8723603248596191, "memory(GiB)": 91.52, "step": 46580, "token_acc": 0.7578843626806833, "train_speed(iter/s)": 0.151878 }, { "epoch": 0.6044720682261272, "grad_norm": 0.7625970840454102, "learning_rate": 8.359119732829478e-05, "loss": 0.867900276184082, "memory(GiB)": 91.52, "step": 46585, "token_acc": 0.7808794015690568, "train_speed(iter/s)": 0.151874 }, { "epoch": 0.6045369466277829, "grad_norm": 0.794472336769104, "learning_rate": 8.358722414538938e-05, "loss": 0.9122762680053711, "memory(GiB)": 91.52, "step": 46590, "token_acc": 0.7560537774043236, "train_speed(iter/s)": 0.151871 }, { "epoch": 0.6046018250294386, "grad_norm": 0.7771356701850891, "learning_rate": 8.358325057596437e-05, "loss": 0.9446950912475586, "memory(GiB)": 91.52, "step": 46595, "token_acc": 0.7555809959931311, "train_speed(iter/s)": 0.151866 }, { "epoch": 0.6046667034310943, "grad_norm": 0.9020977020263672, "learning_rate": 8.357927662006546e-05, "loss": 0.9270925521850586, "memory(GiB)": 91.52, "step": 46600, "token_acc": 0.7311944829654061, "train_speed(iter/s)": 0.151862 }, { "epoch": 0.60473158183275, "grad_norm": 0.7140486240386963, "learning_rate": 8.35753022777384e-05, "loss": 0.8593700408935547, "memory(GiB)": 91.52, "step": 46605, "token_acc": 0.7803524701283142, "train_speed(iter/s)": 0.151857 }, { "epoch": 0.6047964602344057, "grad_norm": 0.7486796975135803, "learning_rate": 8.357132754902892e-05, "loss": 0.8676728248596192, "memory(GiB)": 91.52, "step": 46610, "token_acc": 0.7451428571428571, "train_speed(iter/s)": 0.151854 }, { "epoch": 0.6048613386360614, "grad_norm": 0.7637696862220764, "learning_rate": 8.356735243398278e-05, "loss": 0.9466677665710449, "memory(GiB)": 91.52, "step": 46615, "token_acc": 0.7479348840424429, "train_speed(iter/s)": 0.151851 }, { "epoch": 0.6049262170377171, "grad_norm": 0.712661623954773, "learning_rate": 8.356337693264569e-05, "loss": 0.9238831520080566, "memory(GiB)": 91.52, "step": 46620, "token_acc": 0.7453749751342749, "train_speed(iter/s)": 0.151848 }, { "epoch": 0.6049910954393728, "grad_norm": 0.7978736758232117, "learning_rate": 8.355940104506343e-05, "loss": 0.9602596282958984, "memory(GiB)": 91.52, "step": 46625, "token_acc": 0.7250848896434635, "train_speed(iter/s)": 0.151844 }, { "epoch": 0.6050559738410285, "grad_norm": 0.7076209783554077, "learning_rate": 8.355542477128173e-05, "loss": 0.8837839126586914, "memory(GiB)": 91.52, "step": 46630, "token_acc": 0.7606953162723322, "train_speed(iter/s)": 0.15184 }, { "epoch": 0.6051208522426842, "grad_norm": 0.7412877082824707, "learning_rate": 8.355144811134637e-05, "loss": 0.9613115310668945, "memory(GiB)": 91.52, "step": 46635, "token_acc": 0.7461709211986681, "train_speed(iter/s)": 0.151836 }, { "epoch": 0.6051857306443399, "grad_norm": 0.789263129234314, "learning_rate": 8.35474710653031e-05, "loss": 0.9281793594360351, "memory(GiB)": 91.52, "step": 46640, "token_acc": 0.7638219652353928, "train_speed(iter/s)": 0.151832 }, { "epoch": 0.6052506090459956, "grad_norm": 0.7697061896324158, "learning_rate": 8.35434936331977e-05, "loss": 0.9226079940795898, "memory(GiB)": 91.52, "step": 46645, "token_acc": 0.7559603922322631, "train_speed(iter/s)": 0.151828 }, { "epoch": 0.6053154874476513, "grad_norm": 0.7410221695899963, "learning_rate": 8.353951581507593e-05, "loss": 0.8820626258850097, "memory(GiB)": 91.52, "step": 46650, "token_acc": 0.7549124574883487, "train_speed(iter/s)": 0.151824 }, { "epoch": 0.605380365849307, "grad_norm": 0.7800716161727905, "learning_rate": 8.353553761098357e-05, "loss": 0.9268113136291504, "memory(GiB)": 91.52, "step": 46655, "token_acc": 0.7492999561418305, "train_speed(iter/s)": 0.151821 }, { "epoch": 0.6054452442509627, "grad_norm": 0.8146663904190063, "learning_rate": 8.353155902096641e-05, "loss": 0.9020851135253907, "memory(GiB)": 91.52, "step": 46660, "token_acc": 0.7611129513083803, "train_speed(iter/s)": 0.151817 }, { "epoch": 0.6055101226526183, "grad_norm": 0.6891317963600159, "learning_rate": 8.352758004507022e-05, "loss": 0.9186108589172364, "memory(GiB)": 91.52, "step": 46665, "token_acc": 0.7654239867201549, "train_speed(iter/s)": 0.151813 }, { "epoch": 0.605575001054274, "grad_norm": 0.7919310927391052, "learning_rate": 8.35236006833408e-05, "loss": 0.9183393478393554, "memory(GiB)": 91.52, "step": 46670, "token_acc": 0.7304556354916067, "train_speed(iter/s)": 0.15181 }, { "epoch": 0.6056398794559297, "grad_norm": 0.7922415137290955, "learning_rate": 8.351962093582393e-05, "loss": 0.8943081855773926, "memory(GiB)": 91.52, "step": 46675, "token_acc": 0.74870252223067, "train_speed(iter/s)": 0.151806 }, { "epoch": 0.6057047578575854, "grad_norm": 0.7863512635231018, "learning_rate": 8.351564080256542e-05, "loss": 0.914959716796875, "memory(GiB)": 91.52, "step": 46680, "token_acc": 0.7530881189140832, "train_speed(iter/s)": 0.151803 }, { "epoch": 0.6057696362592411, "grad_norm": 0.7874813079833984, "learning_rate": 8.351166028361108e-05, "loss": 0.9460237503051758, "memory(GiB)": 91.52, "step": 46685, "token_acc": 0.7633464224693505, "train_speed(iter/s)": 0.151797 }, { "epoch": 0.6058345146608968, "grad_norm": 0.6475919485092163, "learning_rate": 8.35076793790067e-05, "loss": 0.8536787033081055, "memory(GiB)": 91.52, "step": 46690, "token_acc": 0.7677444925190738, "train_speed(iter/s)": 0.151793 }, { "epoch": 0.6058993930625525, "grad_norm": 0.7743303179740906, "learning_rate": 8.350369808879812e-05, "loss": 0.8853100776672364, "memory(GiB)": 91.52, "step": 46695, "token_acc": 0.755264583134219, "train_speed(iter/s)": 0.151788 }, { "epoch": 0.6059642714642082, "grad_norm": 0.7608923316001892, "learning_rate": 8.349971641303112e-05, "loss": 0.9301957130432129, "memory(GiB)": 91.52, "step": 46700, "token_acc": 0.7534631087780694, "train_speed(iter/s)": 0.151784 }, { "epoch": 0.6060291498658639, "grad_norm": 0.8340263962745667, "learning_rate": 8.349573435175157e-05, "loss": 0.8976299285888671, "memory(GiB)": 91.52, "step": 46705, "token_acc": 0.7441785697882699, "train_speed(iter/s)": 0.151781 }, { "epoch": 0.6060940282675196, "grad_norm": 0.8393217921257019, "learning_rate": 8.349175190500524e-05, "loss": 0.9165737152099609, "memory(GiB)": 91.52, "step": 46710, "token_acc": 0.7458021370938431, "train_speed(iter/s)": 0.151777 }, { "epoch": 0.6061589066691753, "grad_norm": 0.8454082012176514, "learning_rate": 8.348776907283797e-05, "loss": 0.8956145286560059, "memory(GiB)": 91.52, "step": 46715, "token_acc": 0.7583959416939575, "train_speed(iter/s)": 0.151774 }, { "epoch": 0.606223785070831, "grad_norm": 0.6462608575820923, "learning_rate": 8.348378585529563e-05, "loss": 0.8670372009277344, "memory(GiB)": 91.52, "step": 46720, "token_acc": 0.7582968665154758, "train_speed(iter/s)": 0.15177 }, { "epoch": 0.6062886634724867, "grad_norm": 0.6392116546630859, "learning_rate": 8.347980225242404e-05, "loss": 0.8849761962890625, "memory(GiB)": 91.52, "step": 46725, "token_acc": 0.7558097372845041, "train_speed(iter/s)": 0.151765 }, { "epoch": 0.6063535418741424, "grad_norm": 0.7968322038650513, "learning_rate": 8.347581826426903e-05, "loss": 0.8721713066101074, "memory(GiB)": 91.52, "step": 46730, "token_acc": 0.7509177091129405, "train_speed(iter/s)": 0.151762 }, { "epoch": 0.6064184202757981, "grad_norm": 0.8061937689781189, "learning_rate": 8.347183389087648e-05, "loss": 0.917563533782959, "memory(GiB)": 91.52, "step": 46735, "token_acc": 0.7404709294296368, "train_speed(iter/s)": 0.151757 }, { "epoch": 0.6064832986774538, "grad_norm": 0.8053085207939148, "learning_rate": 8.34678491322922e-05, "loss": 0.9074283599853515, "memory(GiB)": 91.52, "step": 46740, "token_acc": 0.7374137166740549, "train_speed(iter/s)": 0.151753 }, { "epoch": 0.6065481770791095, "grad_norm": 0.7200352549552917, "learning_rate": 8.346386398856206e-05, "loss": 0.8569971084594726, "memory(GiB)": 91.52, "step": 46745, "token_acc": 0.7616761382193481, "train_speed(iter/s)": 0.151748 }, { "epoch": 0.6066130554807652, "grad_norm": 0.8120097517967224, "learning_rate": 8.345987845973193e-05, "loss": 0.9395137786865234, "memory(GiB)": 91.52, "step": 46750, "token_acc": 0.7283668660503897, "train_speed(iter/s)": 0.151744 }, { "epoch": 0.6066779338824209, "grad_norm": 0.7359198927879333, "learning_rate": 8.345589254584769e-05, "loss": 0.867152214050293, "memory(GiB)": 91.52, "step": 46755, "token_acc": 0.758991843829389, "train_speed(iter/s)": 0.15174 }, { "epoch": 0.6067428122840766, "grad_norm": 0.7303672432899475, "learning_rate": 8.345190624695518e-05, "loss": 0.8570102691650391, "memory(GiB)": 91.52, "step": 46760, "token_acc": 0.7653673657828782, "train_speed(iter/s)": 0.151736 }, { "epoch": 0.6068076906857323, "grad_norm": 0.8311952352523804, "learning_rate": 8.344791956310027e-05, "loss": 0.8945719718933105, "memory(GiB)": 91.52, "step": 46765, "token_acc": 0.7482921497443105, "train_speed(iter/s)": 0.151732 }, { "epoch": 0.606872569087388, "grad_norm": 0.8239821791648865, "learning_rate": 8.344393249432889e-05, "loss": 0.9456392288208008, "memory(GiB)": 91.52, "step": 46770, "token_acc": 0.7597249729083367, "train_speed(iter/s)": 0.151729 }, { "epoch": 0.6069374474890437, "grad_norm": 0.6834131479263306, "learning_rate": 8.343994504068686e-05, "loss": 0.9065986633300781, "memory(GiB)": 91.52, "step": 46775, "token_acc": 0.7494562125623264, "train_speed(iter/s)": 0.151725 }, { "epoch": 0.6070023258906994, "grad_norm": 0.776839554309845, "learning_rate": 8.34359572022201e-05, "loss": 0.8959983825683594, "memory(GiB)": 91.52, "step": 46780, "token_acc": 0.7615604324089807, "train_speed(iter/s)": 0.151722 }, { "epoch": 0.6070672042923551, "grad_norm": 0.7538840174674988, "learning_rate": 8.34319689789745e-05, "loss": 0.8713927268981934, "memory(GiB)": 91.52, "step": 46785, "token_acc": 0.7738652085117597, "train_speed(iter/s)": 0.151716 }, { "epoch": 0.6071320826940108, "grad_norm": 0.7130113244056702, "learning_rate": 8.342798037099595e-05, "loss": 0.895649528503418, "memory(GiB)": 91.52, "step": 46790, "token_acc": 0.773919780258478, "train_speed(iter/s)": 0.151711 }, { "epoch": 0.6071969610956665, "grad_norm": 0.7387165427207947, "learning_rate": 8.342399137833037e-05, "loss": 0.95052490234375, "memory(GiB)": 91.52, "step": 46795, "token_acc": 0.7616270145817344, "train_speed(iter/s)": 0.151707 }, { "epoch": 0.6072618394973222, "grad_norm": 0.7592119574546814, "learning_rate": 8.342000200102363e-05, "loss": 0.8949151992797851, "memory(GiB)": 91.52, "step": 46800, "token_acc": 0.7359069275515601, "train_speed(iter/s)": 0.151704 }, { "epoch": 0.6073267178989779, "grad_norm": 0.7822607159614563, "learning_rate": 8.341601223912165e-05, "loss": 0.9331060409545898, "memory(GiB)": 91.52, "step": 46805, "token_acc": 0.7558878010055571, "train_speed(iter/s)": 0.1517 }, { "epoch": 0.6073915963006336, "grad_norm": 0.7161049246788025, "learning_rate": 8.341202209267037e-05, "loss": 0.9201648712158204, "memory(GiB)": 91.52, "step": 46810, "token_acc": 0.7543925449352298, "train_speed(iter/s)": 0.151696 }, { "epoch": 0.6074564747022893, "grad_norm": 0.8742667436599731, "learning_rate": 8.340803156171568e-05, "loss": 0.9289156913757324, "memory(GiB)": 91.52, "step": 46815, "token_acc": 0.7475240854274743, "train_speed(iter/s)": 0.151693 }, { "epoch": 0.607521353103945, "grad_norm": 0.7584534287452698, "learning_rate": 8.340404064630352e-05, "loss": 0.9056708335876464, "memory(GiB)": 91.52, "step": 46820, "token_acc": 0.7726662189388852, "train_speed(iter/s)": 0.15169 }, { "epoch": 0.6075862315056006, "grad_norm": 0.7453998327255249, "learning_rate": 8.34000493464798e-05, "loss": 0.8905954360961914, "memory(GiB)": 91.52, "step": 46825, "token_acc": 0.7734172528693076, "train_speed(iter/s)": 0.151685 }, { "epoch": 0.6076511099072563, "grad_norm": 0.8443477749824524, "learning_rate": 8.339605766229046e-05, "loss": 0.9235604286193848, "memory(GiB)": 91.52, "step": 46830, "token_acc": 0.736536117275654, "train_speed(iter/s)": 0.151681 }, { "epoch": 0.607715988308912, "grad_norm": 0.8347136378288269, "learning_rate": 8.339206559378145e-05, "loss": 0.8874897003173828, "memory(GiB)": 91.52, "step": 46835, "token_acc": 0.7596929528412524, "train_speed(iter/s)": 0.151677 }, { "epoch": 0.6077808667105677, "grad_norm": 0.8324294090270996, "learning_rate": 8.33880731409987e-05, "loss": 0.8945772171020507, "memory(GiB)": 91.52, "step": 46840, "token_acc": 0.748802495265679, "train_speed(iter/s)": 0.151673 }, { "epoch": 0.6078457451122234, "grad_norm": 0.7580892443656921, "learning_rate": 8.338408030398815e-05, "loss": 0.9255526542663575, "memory(GiB)": 91.52, "step": 46845, "token_acc": 0.7289942395550553, "train_speed(iter/s)": 0.151669 }, { "epoch": 0.6079106235138791, "grad_norm": 0.7994038462638855, "learning_rate": 8.338008708279575e-05, "loss": 0.9386476516723633, "memory(GiB)": 91.52, "step": 46850, "token_acc": 0.7493122990753323, "train_speed(iter/s)": 0.151665 }, { "epoch": 0.6079755019155348, "grad_norm": 0.8015425801277161, "learning_rate": 8.337609347746745e-05, "loss": 0.9251148223876953, "memory(GiB)": 91.52, "step": 46855, "token_acc": 0.7410650114341718, "train_speed(iter/s)": 0.151661 }, { "epoch": 0.6080403803171905, "grad_norm": 0.6562846302986145, "learning_rate": 8.337209948804922e-05, "loss": 0.9129690170288086, "memory(GiB)": 91.52, "step": 46860, "token_acc": 0.7500465636058856, "train_speed(iter/s)": 0.151658 }, { "epoch": 0.6081052587188462, "grad_norm": 0.8615440726280212, "learning_rate": 8.336810511458702e-05, "loss": 0.9262654304504394, "memory(GiB)": 91.52, "step": 46865, "token_acc": 0.7480548069988993, "train_speed(iter/s)": 0.151654 }, { "epoch": 0.6081701371205019, "grad_norm": 0.7902463674545288, "learning_rate": 8.33641103571268e-05, "loss": 0.9818351745605469, "memory(GiB)": 91.52, "step": 46870, "token_acc": 0.7439947478180273, "train_speed(iter/s)": 0.151651 }, { "epoch": 0.6082350155221576, "grad_norm": 0.8111397624015808, "learning_rate": 8.336011521571456e-05, "loss": 0.92697172164917, "memory(GiB)": 91.52, "step": 46875, "token_acc": 0.7626971444452164, "train_speed(iter/s)": 0.151647 }, { "epoch": 0.6082998939238133, "grad_norm": 0.7298490405082703, "learning_rate": 8.335611969039625e-05, "loss": 0.9377738952636718, "memory(GiB)": 91.52, "step": 46880, "token_acc": 0.7430091481217155, "train_speed(iter/s)": 0.151642 }, { "epoch": 0.608364772325469, "grad_norm": 0.7499546408653259, "learning_rate": 8.335212378121788e-05, "loss": 0.881527042388916, "memory(GiB)": 91.52, "step": 46885, "token_acc": 0.7566418703506907, "train_speed(iter/s)": 0.151638 }, { "epoch": 0.6084296507271247, "grad_norm": 0.6655128598213196, "learning_rate": 8.33481274882254e-05, "loss": 0.9430047988891601, "memory(GiB)": 91.52, "step": 46890, "token_acc": 0.7399017364887672, "train_speed(iter/s)": 0.151634 }, { "epoch": 0.6084945291287804, "grad_norm": 0.7137559652328491, "learning_rate": 8.334413081146482e-05, "loss": 0.9083724021911621, "memory(GiB)": 91.52, "step": 46895, "token_acc": 0.7683275563258232, "train_speed(iter/s)": 0.15163 }, { "epoch": 0.608559407530436, "grad_norm": 0.848639190196991, "learning_rate": 8.334013375098213e-05, "loss": 0.9519130706787109, "memory(GiB)": 91.52, "step": 46900, "token_acc": 0.7466087558618953, "train_speed(iter/s)": 0.151627 }, { "epoch": 0.6086242859320917, "grad_norm": 0.7729840874671936, "learning_rate": 8.333613630682332e-05, "loss": 0.8629806518554688, "memory(GiB)": 91.52, "step": 46905, "token_acc": 0.7851555166155579, "train_speed(iter/s)": 0.151622 }, { "epoch": 0.6086891643337474, "grad_norm": 0.8544286489486694, "learning_rate": 8.33321384790344e-05, "loss": 0.8607315063476563, "memory(GiB)": 91.52, "step": 46910, "token_acc": 0.766997427416391, "train_speed(iter/s)": 0.151617 }, { "epoch": 0.6087540427354031, "grad_norm": 0.7780249714851379, "learning_rate": 8.332814026766138e-05, "loss": 0.868659496307373, "memory(GiB)": 91.52, "step": 46915, "token_acc": 0.792240041386446, "train_speed(iter/s)": 0.151613 }, { "epoch": 0.6088189211370588, "grad_norm": 0.7441797256469727, "learning_rate": 8.332414167275026e-05, "loss": 0.8816448211669922, "memory(GiB)": 91.52, "step": 46920, "token_acc": 0.7485136552952024, "train_speed(iter/s)": 0.151609 }, { "epoch": 0.6088837995387145, "grad_norm": 0.8359254598617554, "learning_rate": 8.332014269434708e-05, "loss": 0.8921339035034179, "memory(GiB)": 91.52, "step": 46925, "token_acc": 0.7569547637972146, "train_speed(iter/s)": 0.151605 }, { "epoch": 0.6089486779403702, "grad_norm": 0.7984007596969604, "learning_rate": 8.331614333249782e-05, "loss": 0.8889480590820312, "memory(GiB)": 91.52, "step": 46930, "token_acc": 0.7570533470211335, "train_speed(iter/s)": 0.151602 }, { "epoch": 0.6090135563420259, "grad_norm": 0.7470531463623047, "learning_rate": 8.331214358724854e-05, "loss": 0.9265956878662109, "memory(GiB)": 91.52, "step": 46935, "token_acc": 0.7547247483252872, "train_speed(iter/s)": 0.151599 }, { "epoch": 0.6090784347436816, "grad_norm": 0.7602896690368652, "learning_rate": 8.330814345864525e-05, "loss": 0.9523017883300782, "memory(GiB)": 91.52, "step": 46940, "token_acc": 0.7463199048104708, "train_speed(iter/s)": 0.151595 }, { "epoch": 0.6091433131453373, "grad_norm": 0.7634224891662598, "learning_rate": 8.3304142946734e-05, "loss": 0.8925273895263672, "memory(GiB)": 91.52, "step": 46945, "token_acc": 0.7423957526822255, "train_speed(iter/s)": 0.151592 }, { "epoch": 0.609208191546993, "grad_norm": 0.7157328128814697, "learning_rate": 8.33001420515608e-05, "loss": 0.8673513412475586, "memory(GiB)": 91.52, "step": 46950, "token_acc": 0.7628660481298927, "train_speed(iter/s)": 0.151588 }, { "epoch": 0.6092730699486487, "grad_norm": 0.6408298015594482, "learning_rate": 8.329614077317172e-05, "loss": 0.9111650466918946, "memory(GiB)": 91.52, "step": 46955, "token_acc": 0.764106273839274, "train_speed(iter/s)": 0.151584 }, { "epoch": 0.6093379483503044, "grad_norm": 0.6850001811981201, "learning_rate": 8.32921391116128e-05, "loss": 0.9080670356750489, "memory(GiB)": 91.52, "step": 46960, "token_acc": 0.7434719687912917, "train_speed(iter/s)": 0.15158 }, { "epoch": 0.6094028267519601, "grad_norm": 0.7600319385528564, "learning_rate": 8.328813706693008e-05, "loss": 0.9286558151245117, "memory(GiB)": 91.52, "step": 46965, "token_acc": 0.7355972832600879, "train_speed(iter/s)": 0.151577 }, { "epoch": 0.6094677051536158, "grad_norm": 0.7734828591346741, "learning_rate": 8.32841346391696e-05, "loss": 0.8646663665771485, "memory(GiB)": 91.52, "step": 46970, "token_acc": 0.7868350554917719, "train_speed(iter/s)": 0.151572 }, { "epoch": 0.6095325835552715, "grad_norm": 0.7680752277374268, "learning_rate": 8.328013182837747e-05, "loss": 0.9539945602416993, "memory(GiB)": 91.52, "step": 46975, "token_acc": 0.7295388143052628, "train_speed(iter/s)": 0.151568 }, { "epoch": 0.6095974619569272, "grad_norm": 0.7400915622711182, "learning_rate": 8.327612863459971e-05, "loss": 0.8914009094238281, "memory(GiB)": 91.52, "step": 46980, "token_acc": 0.7738658121996014, "train_speed(iter/s)": 0.151564 }, { "epoch": 0.6096623403585829, "grad_norm": 0.7800717353820801, "learning_rate": 8.327212505788241e-05, "loss": 0.9186983108520508, "memory(GiB)": 91.52, "step": 46985, "token_acc": 0.7569341811307785, "train_speed(iter/s)": 0.15156 }, { "epoch": 0.6097272187602386, "grad_norm": 0.7768157124519348, "learning_rate": 8.326812109827165e-05, "loss": 0.8472575187683106, "memory(GiB)": 91.52, "step": 46990, "token_acc": 0.7577762003097573, "train_speed(iter/s)": 0.151556 }, { "epoch": 0.6097920971618943, "grad_norm": 0.7617335915565491, "learning_rate": 8.326411675581347e-05, "loss": 0.8902985572814941, "memory(GiB)": 91.52, "step": 46995, "token_acc": 0.7425667265938322, "train_speed(iter/s)": 0.151552 }, { "epoch": 0.60985697556355, "grad_norm": 0.71113520860672, "learning_rate": 8.326011203055397e-05, "loss": 0.8864970207214355, "memory(GiB)": 91.52, "step": 47000, "token_acc": 0.7741019761982749, "train_speed(iter/s)": 0.151547 }, { "epoch": 0.6099218539652057, "grad_norm": 0.8569291234016418, "learning_rate": 8.325610692253924e-05, "loss": 0.9291150093078613, "memory(GiB)": 91.52, "step": 47005, "token_acc": 0.7343651072810412, "train_speed(iter/s)": 0.151543 }, { "epoch": 0.6099867323668614, "grad_norm": 0.7453956604003906, "learning_rate": 8.325210143181542e-05, "loss": 0.9297935485839843, "memory(GiB)": 91.52, "step": 47010, "token_acc": 0.7399862825788752, "train_speed(iter/s)": 0.15154 }, { "epoch": 0.6100516107685171, "grad_norm": 0.7532526850700378, "learning_rate": 8.324809555842849e-05, "loss": 0.863153076171875, "memory(GiB)": 91.52, "step": 47015, "token_acc": 0.7716697327129098, "train_speed(iter/s)": 0.151536 }, { "epoch": 0.6101164891701728, "grad_norm": 0.901455283164978, "learning_rate": 8.324408930242467e-05, "loss": 0.9049196243286133, "memory(GiB)": 91.52, "step": 47020, "token_acc": 0.7401924249610408, "train_speed(iter/s)": 0.151531 }, { "epoch": 0.6101813675718285, "grad_norm": 0.809992790222168, "learning_rate": 8.324008266384998e-05, "loss": 0.9477021217346191, "memory(GiB)": 91.52, "step": 47025, "token_acc": 0.7436815350468963, "train_speed(iter/s)": 0.151528 }, { "epoch": 0.6102462459734842, "grad_norm": 0.7340131402015686, "learning_rate": 8.323607564275058e-05, "loss": 0.8785724639892578, "memory(GiB)": 91.52, "step": 47030, "token_acc": 0.7625460731499858, "train_speed(iter/s)": 0.151524 }, { "epoch": 0.6103111243751399, "grad_norm": 0.7668341398239136, "learning_rate": 8.323206823917254e-05, "loss": 0.9263362884521484, "memory(GiB)": 91.52, "step": 47035, "token_acc": 0.7630800226657433, "train_speed(iter/s)": 0.151519 }, { "epoch": 0.6103760027767956, "grad_norm": 0.906775176525116, "learning_rate": 8.322806045316199e-05, "loss": 0.9180339813232422, "memory(GiB)": 91.52, "step": 47040, "token_acc": 0.7492793285896734, "train_speed(iter/s)": 0.151515 }, { "epoch": 0.6104408811784513, "grad_norm": 0.8935883045196533, "learning_rate": 8.322405228476509e-05, "loss": 0.9276777267456054, "memory(GiB)": 91.52, "step": 47045, "token_acc": 0.7436826953833031, "train_speed(iter/s)": 0.151512 }, { "epoch": 0.610505759580107, "grad_norm": 0.7405577898025513, "learning_rate": 8.322004373402791e-05, "loss": 0.9080876350402832, "memory(GiB)": 91.52, "step": 47050, "token_acc": 0.7505891737065371, "train_speed(iter/s)": 0.151508 }, { "epoch": 0.6105706379817627, "grad_norm": 0.7716673612594604, "learning_rate": 8.321603480099661e-05, "loss": 0.8943829536437988, "memory(GiB)": 91.52, "step": 47055, "token_acc": 0.7677900314213542, "train_speed(iter/s)": 0.151504 }, { "epoch": 0.6106355163834184, "grad_norm": 0.7632371187210083, "learning_rate": 8.321202548571731e-05, "loss": 0.9454373359680176, "memory(GiB)": 91.52, "step": 47060, "token_acc": 0.7544545548904958, "train_speed(iter/s)": 0.151499 }, { "epoch": 0.6107003947850741, "grad_norm": 0.7011064887046814, "learning_rate": 8.320801578823618e-05, "loss": 0.8850915908813477, "memory(GiB)": 91.52, "step": 47065, "token_acc": 0.7610906943168395, "train_speed(iter/s)": 0.151495 }, { "epoch": 0.6107652731867298, "grad_norm": 0.8031762838363647, "learning_rate": 8.320400570859934e-05, "loss": 0.933047103881836, "memory(GiB)": 91.52, "step": 47070, "token_acc": 0.7392893098176666, "train_speed(iter/s)": 0.151491 }, { "epoch": 0.6108301515883855, "grad_norm": 0.7221530079841614, "learning_rate": 8.319999524685292e-05, "loss": 0.8964573860168457, "memory(GiB)": 91.52, "step": 47075, "token_acc": 0.767398572955086, "train_speed(iter/s)": 0.151486 }, { "epoch": 0.6108950299900412, "grad_norm": 0.8287214040756226, "learning_rate": 8.319598440304311e-05, "loss": 0.9139228820800781, "memory(GiB)": 91.52, "step": 47080, "token_acc": 0.7486104270738057, "train_speed(iter/s)": 0.151482 }, { "epoch": 0.6109599083916969, "grad_norm": 0.7892794013023376, "learning_rate": 8.319197317721603e-05, "loss": 0.9216828346252441, "memory(GiB)": 91.52, "step": 47085, "token_acc": 0.748464226595549, "train_speed(iter/s)": 0.151478 }, { "epoch": 0.6110247867933526, "grad_norm": 0.7917786836624146, "learning_rate": 8.318796156941788e-05, "loss": 0.9563300132751464, "memory(GiB)": 91.52, "step": 47090, "token_acc": 0.7392757468214864, "train_speed(iter/s)": 0.151475 }, { "epoch": 0.6110896651950083, "grad_norm": 0.8655663132667542, "learning_rate": 8.318394957969479e-05, "loss": 0.8560302734375, "memory(GiB)": 91.52, "step": 47095, "token_acc": 0.7500709018718095, "train_speed(iter/s)": 0.151471 }, { "epoch": 0.611154543596664, "grad_norm": 0.7082943320274353, "learning_rate": 8.317993720809295e-05, "loss": 0.8870494842529297, "memory(GiB)": 91.52, "step": 47100, "token_acc": 0.7619416382132422, "train_speed(iter/s)": 0.151467 }, { "epoch": 0.6112194219983197, "grad_norm": 0.7589390277862549, "learning_rate": 8.317592445465851e-05, "loss": 0.8920601844787598, "memory(GiB)": 91.52, "step": 47105, "token_acc": 0.7445134399455597, "train_speed(iter/s)": 0.151463 }, { "epoch": 0.6112843003999754, "grad_norm": 0.76454097032547, "learning_rate": 8.317191131943769e-05, "loss": 0.8571554183959961, "memory(GiB)": 91.52, "step": 47110, "token_acc": 0.7751095118898623, "train_speed(iter/s)": 0.15146 }, { "epoch": 0.6113491788016311, "grad_norm": 0.8179759383201599, "learning_rate": 8.316789780247665e-05, "loss": 0.9282301902770996, "memory(GiB)": 91.52, "step": 47115, "token_acc": 0.7527105040833568, "train_speed(iter/s)": 0.151456 }, { "epoch": 0.6114140572032868, "grad_norm": 0.8708111643791199, "learning_rate": 8.316388390382156e-05, "loss": 0.9234352111816406, "memory(GiB)": 91.52, "step": 47120, "token_acc": 0.750972093183304, "train_speed(iter/s)": 0.151453 }, { "epoch": 0.6114789356049425, "grad_norm": 0.802275538444519, "learning_rate": 8.315986962351864e-05, "loss": 0.8886789321899414, "memory(GiB)": 91.52, "step": 47125, "token_acc": 0.7640765765765766, "train_speed(iter/s)": 0.151449 }, { "epoch": 0.6115438140065982, "grad_norm": 0.8234026432037354, "learning_rate": 8.315585496161406e-05, "loss": 0.8806601524353027, "memory(GiB)": 91.52, "step": 47130, "token_acc": 0.7634362112544898, "train_speed(iter/s)": 0.151445 }, { "epoch": 0.6116086924082539, "grad_norm": 0.7667402029037476, "learning_rate": 8.315183991815406e-05, "loss": 0.9030723571777344, "memory(GiB)": 91.52, "step": 47135, "token_acc": 0.7570545752363934, "train_speed(iter/s)": 0.151441 }, { "epoch": 0.6116735708099095, "grad_norm": 0.8481952548027039, "learning_rate": 8.31478244931848e-05, "loss": 0.9404629707336426, "memory(GiB)": 91.52, "step": 47140, "token_acc": 0.7707522602920093, "train_speed(iter/s)": 0.151437 }, { "epoch": 0.6117384492115652, "grad_norm": 0.6966739296913147, "learning_rate": 8.31438086867525e-05, "loss": 0.9210696220397949, "memory(GiB)": 91.52, "step": 47145, "token_acc": 0.7516881707464126, "train_speed(iter/s)": 0.151431 }, { "epoch": 0.6118033276132209, "grad_norm": 0.7410143613815308, "learning_rate": 8.31397924989034e-05, "loss": 0.9130613327026367, "memory(GiB)": 91.52, "step": 47150, "token_acc": 0.7629484510836292, "train_speed(iter/s)": 0.151428 }, { "epoch": 0.6118682060148766, "grad_norm": 0.7343726754188538, "learning_rate": 8.313577592968371e-05, "loss": 0.8198616027832031, "memory(GiB)": 91.52, "step": 47155, "token_acc": 0.7586744639376218, "train_speed(iter/s)": 0.151424 }, { "epoch": 0.6119330844165323, "grad_norm": 0.6687002182006836, "learning_rate": 8.313175897913963e-05, "loss": 0.9560497283935547, "memory(GiB)": 91.52, "step": 47160, "token_acc": 0.7408686585644, "train_speed(iter/s)": 0.151421 }, { "epoch": 0.611997962818188, "grad_norm": 0.7032124400138855, "learning_rate": 8.31277416473174e-05, "loss": 0.9305840492248535, "memory(GiB)": 91.52, "step": 47165, "token_acc": 0.7555935728422395, "train_speed(iter/s)": 0.151417 }, { "epoch": 0.6120628412198437, "grad_norm": 0.7753682732582092, "learning_rate": 8.312372393426327e-05, "loss": 0.8768239974975586, "memory(GiB)": 91.52, "step": 47170, "token_acc": 0.7698649902092136, "train_speed(iter/s)": 0.151413 }, { "epoch": 0.6121277196214994, "grad_norm": 0.7773409485816956, "learning_rate": 8.311970584002343e-05, "loss": 0.9167413711547852, "memory(GiB)": 91.52, "step": 47175, "token_acc": 0.7510292164674635, "train_speed(iter/s)": 0.151409 }, { "epoch": 0.6121925980231551, "grad_norm": 0.7831675410270691, "learning_rate": 8.311568736464416e-05, "loss": 0.8942785263061523, "memory(GiB)": 91.52, "step": 47180, "token_acc": 0.7717697915521852, "train_speed(iter/s)": 0.151406 }, { "epoch": 0.6122574764248108, "grad_norm": 0.8085764646530151, "learning_rate": 8.311166850817168e-05, "loss": 0.9395095825195312, "memory(GiB)": 91.52, "step": 47185, "token_acc": 0.740361495730039, "train_speed(iter/s)": 0.151402 }, { "epoch": 0.6123223548264665, "grad_norm": 0.7730945348739624, "learning_rate": 8.310764927065226e-05, "loss": 0.9115083694458008, "memory(GiB)": 91.52, "step": 47190, "token_acc": 0.7608087194178274, "train_speed(iter/s)": 0.151398 }, { "epoch": 0.6123872332281222, "grad_norm": 0.6387589573860168, "learning_rate": 8.310362965213214e-05, "loss": 0.9393129348754883, "memory(GiB)": 91.52, "step": 47195, "token_acc": 0.7473563053017769, "train_speed(iter/s)": 0.151394 }, { "epoch": 0.6124521116297779, "grad_norm": 0.8073731660842896, "learning_rate": 8.309960965265758e-05, "loss": 0.9265369415283203, "memory(GiB)": 91.52, "step": 47200, "token_acc": 0.7511935887234285, "train_speed(iter/s)": 0.15139 }, { "epoch": 0.6125169900314336, "grad_norm": 0.8992177248001099, "learning_rate": 8.309558927227486e-05, "loss": 0.9441468238830566, "memory(GiB)": 91.52, "step": 47205, "token_acc": 0.768136722988684, "train_speed(iter/s)": 0.151386 }, { "epoch": 0.6125818684330893, "grad_norm": 0.7870897054672241, "learning_rate": 8.30915685110302e-05, "loss": 0.9074204444885254, "memory(GiB)": 91.52, "step": 47210, "token_acc": 0.7870267260579065, "train_speed(iter/s)": 0.151382 }, { "epoch": 0.612646746834745, "grad_norm": 0.7368381023406982, "learning_rate": 8.30875473689699e-05, "loss": 0.8999859809875488, "memory(GiB)": 91.52, "step": 47215, "token_acc": 0.7456904975960599, "train_speed(iter/s)": 0.151377 }, { "epoch": 0.6127116252364007, "grad_norm": 0.7223094701766968, "learning_rate": 8.308352584614025e-05, "loss": 0.8847513198852539, "memory(GiB)": 91.52, "step": 47220, "token_acc": 0.7584660475299269, "train_speed(iter/s)": 0.151374 }, { "epoch": 0.6127765036380564, "grad_norm": 0.786582350730896, "learning_rate": 8.307950394258752e-05, "loss": 0.9483963966369628, "memory(GiB)": 91.52, "step": 47225, "token_acc": 0.747260059495851, "train_speed(iter/s)": 0.15137 }, { "epoch": 0.6128413820397121, "grad_norm": 0.7793516516685486, "learning_rate": 8.307548165835797e-05, "loss": 0.9332281112670898, "memory(GiB)": 91.52, "step": 47230, "token_acc": 0.7445046893317703, "train_speed(iter/s)": 0.151367 }, { "epoch": 0.6129062604413678, "grad_norm": 0.7903251051902771, "learning_rate": 8.30714589934979e-05, "loss": 0.9182346343994141, "memory(GiB)": 91.52, "step": 47235, "token_acc": 0.7528085867620752, "train_speed(iter/s)": 0.151362 }, { "epoch": 0.6129711388430235, "grad_norm": 0.7844015955924988, "learning_rate": 8.306743594805364e-05, "loss": 0.9304535865783692, "memory(GiB)": 91.52, "step": 47240, "token_acc": 0.7742681896646711, "train_speed(iter/s)": 0.151358 }, { "epoch": 0.6130360172446792, "grad_norm": 0.7752413153648376, "learning_rate": 8.306341252207143e-05, "loss": 0.8999183654785157, "memory(GiB)": 91.52, "step": 47245, "token_acc": 0.753226293031207, "train_speed(iter/s)": 0.151353 }, { "epoch": 0.6131008956463349, "grad_norm": 0.7978798151016235, "learning_rate": 8.30593887155976e-05, "loss": 0.9089446067810059, "memory(GiB)": 91.52, "step": 47250, "token_acc": 0.7584720121028744, "train_speed(iter/s)": 0.15135 }, { "epoch": 0.6131657740479906, "grad_norm": 0.8036659955978394, "learning_rate": 8.305536452867845e-05, "loss": 0.9088050842285156, "memory(GiB)": 91.52, "step": 47255, "token_acc": 0.7630929348188672, "train_speed(iter/s)": 0.151346 }, { "epoch": 0.6132306524496463, "grad_norm": 0.734079122543335, "learning_rate": 8.30513399613603e-05, "loss": 0.8948587417602539, "memory(GiB)": 91.52, "step": 47260, "token_acc": 0.7598510096059596, "train_speed(iter/s)": 0.151342 }, { "epoch": 0.613295530851302, "grad_norm": 0.7924689650535583, "learning_rate": 8.304731501368947e-05, "loss": 0.8733797073364258, "memory(GiB)": 91.52, "step": 47265, "token_acc": 0.7647407524502055, "train_speed(iter/s)": 0.151338 }, { "epoch": 0.6133604092529577, "grad_norm": 0.8904873132705688, "learning_rate": 8.304328968571225e-05, "loss": 0.8611993789672852, "memory(GiB)": 91.52, "step": 47270, "token_acc": 0.7696510110893673, "train_speed(iter/s)": 0.151335 }, { "epoch": 0.6134252876546133, "grad_norm": 0.6976816058158875, "learning_rate": 8.303926397747497e-05, "loss": 0.8901882171630859, "memory(GiB)": 91.52, "step": 47275, "token_acc": 0.7653310265995242, "train_speed(iter/s)": 0.151332 }, { "epoch": 0.613490166056269, "grad_norm": 0.8333035707473755, "learning_rate": 8.303523788902398e-05, "loss": 0.8716262817382813, "memory(GiB)": 91.52, "step": 47280, "token_acc": 0.7583892617449665, "train_speed(iter/s)": 0.151328 }, { "epoch": 0.6135550444579247, "grad_norm": 0.8370442986488342, "learning_rate": 8.30312114204056e-05, "loss": 0.9311885833740234, "memory(GiB)": 91.52, "step": 47285, "token_acc": 0.7390387551806878, "train_speed(iter/s)": 0.151324 }, { "epoch": 0.6136199228595804, "grad_norm": 0.8556503653526306, "learning_rate": 8.302718457166615e-05, "loss": 0.9005985260009766, "memory(GiB)": 91.52, "step": 47290, "token_acc": 0.7627480705622933, "train_speed(iter/s)": 0.15132 }, { "epoch": 0.6136848012612361, "grad_norm": 0.818830132484436, "learning_rate": 8.3023157342852e-05, "loss": 0.9419896125793457, "memory(GiB)": 91.52, "step": 47295, "token_acc": 0.7574943746824417, "train_speed(iter/s)": 0.151317 }, { "epoch": 0.6137496796628918, "grad_norm": 0.7625954747200012, "learning_rate": 8.301912973400948e-05, "loss": 0.9160089492797852, "memory(GiB)": 91.52, "step": 47300, "token_acc": 0.7541570056310036, "train_speed(iter/s)": 0.151312 }, { "epoch": 0.6138145580645475, "grad_norm": 0.8093103766441345, "learning_rate": 8.301510174518493e-05, "loss": 0.9075413703918457, "memory(GiB)": 91.52, "step": 47305, "token_acc": 0.7619944919553558, "train_speed(iter/s)": 0.151308 }, { "epoch": 0.6138794364662032, "grad_norm": 0.7446954846382141, "learning_rate": 8.301107337642472e-05, "loss": 0.8743038177490234, "memory(GiB)": 91.52, "step": 47310, "token_acc": 0.7608449214026602, "train_speed(iter/s)": 0.151304 }, { "epoch": 0.6139443148678589, "grad_norm": 0.7426902055740356, "learning_rate": 8.30070446277752e-05, "loss": 0.9292617797851562, "memory(GiB)": 91.52, "step": 47315, "token_acc": 0.7777271622934888, "train_speed(iter/s)": 0.1513 }, { "epoch": 0.6140091932695146, "grad_norm": 0.9581027030944824, "learning_rate": 8.300301549928273e-05, "loss": 0.9219964981079102, "memory(GiB)": 91.52, "step": 47320, "token_acc": 0.7468463798168308, "train_speed(iter/s)": 0.151297 }, { "epoch": 0.6140740716711703, "grad_norm": 0.8211404085159302, "learning_rate": 8.299898599099369e-05, "loss": 0.8905862808227539, "memory(GiB)": 91.52, "step": 47325, "token_acc": 0.7615529479936259, "train_speed(iter/s)": 0.151293 }, { "epoch": 0.614138950072826, "grad_norm": 0.7371664047241211, "learning_rate": 8.299495610295444e-05, "loss": 0.9115791320800781, "memory(GiB)": 91.52, "step": 47330, "token_acc": 0.7503292635176022, "train_speed(iter/s)": 0.15129 }, { "epoch": 0.6142038284744817, "grad_norm": 0.7800536155700684, "learning_rate": 8.299092583521137e-05, "loss": 0.9230289459228516, "memory(GiB)": 91.52, "step": 47335, "token_acc": 0.7755282288198927, "train_speed(iter/s)": 0.151286 }, { "epoch": 0.6142687068761374, "grad_norm": 0.7658205628395081, "learning_rate": 8.298689518781084e-05, "loss": 0.891353702545166, "memory(GiB)": 91.52, "step": 47340, "token_acc": 0.7677426964966776, "train_speed(iter/s)": 0.151281 }, { "epoch": 0.6143335852777931, "grad_norm": 0.6829959750175476, "learning_rate": 8.298286416079923e-05, "loss": 0.8753022193908692, "memory(GiB)": 91.52, "step": 47345, "token_acc": 0.7468534240692412, "train_speed(iter/s)": 0.151277 }, { "epoch": 0.6143984636794488, "grad_norm": 0.7598987817764282, "learning_rate": 8.297883275422296e-05, "loss": 0.9190703392028808, "memory(GiB)": 91.52, "step": 47350, "token_acc": 0.7765741130276259, "train_speed(iter/s)": 0.151274 }, { "epoch": 0.6144633420811045, "grad_norm": 0.7989799380302429, "learning_rate": 8.297480096812839e-05, "loss": 0.8969183921813965, "memory(GiB)": 91.52, "step": 47355, "token_acc": 0.7532120866996477, "train_speed(iter/s)": 0.15127 }, { "epoch": 0.6145282204827602, "grad_norm": 0.7288308143615723, "learning_rate": 8.297076880256196e-05, "loss": 0.9051471710205078, "memory(GiB)": 91.52, "step": 47360, "token_acc": 0.7481349807669891, "train_speed(iter/s)": 0.151267 }, { "epoch": 0.6145930988844159, "grad_norm": 0.8965965509414673, "learning_rate": 8.296673625757001e-05, "loss": 0.8889787673950196, "memory(GiB)": 91.52, "step": 47365, "token_acc": 0.7815940597680968, "train_speed(iter/s)": 0.151263 }, { "epoch": 0.6146579772860716, "grad_norm": 0.7383771538734436, "learning_rate": 8.296270333319901e-05, "loss": 0.8662934303283691, "memory(GiB)": 91.52, "step": 47370, "token_acc": 0.7510514396635393, "train_speed(iter/s)": 0.151259 }, { "epoch": 0.6147228556877273, "grad_norm": 0.7451352477073669, "learning_rate": 8.295867002949533e-05, "loss": 0.9246660232543945, "memory(GiB)": 91.52, "step": 47375, "token_acc": 0.7282421740626075, "train_speed(iter/s)": 0.151256 }, { "epoch": 0.6147877340893829, "grad_norm": 0.7987974286079407, "learning_rate": 8.29546363465054e-05, "loss": 0.8991070747375488, "memory(GiB)": 91.52, "step": 47380, "token_acc": 0.7486245641224332, "train_speed(iter/s)": 0.151252 }, { "epoch": 0.6148526124910386, "grad_norm": 0.6926356554031372, "learning_rate": 8.295060228427563e-05, "loss": 0.9179132461547852, "memory(GiB)": 91.52, "step": 47385, "token_acc": 0.7618483412322274, "train_speed(iter/s)": 0.151249 }, { "epoch": 0.6149174908926943, "grad_norm": 0.7401725649833679, "learning_rate": 8.294656784285244e-05, "loss": 0.9278213500976562, "memory(GiB)": 91.52, "step": 47390, "token_acc": 0.7734553775743707, "train_speed(iter/s)": 0.151246 }, { "epoch": 0.61498236929435, "grad_norm": 0.7529772520065308, "learning_rate": 8.294253302228227e-05, "loss": 0.8856592178344727, "memory(GiB)": 91.52, "step": 47395, "token_acc": 0.7715396853455896, "train_speed(iter/s)": 0.151242 }, { "epoch": 0.6150472476960057, "grad_norm": 0.7965887784957886, "learning_rate": 8.293849782261155e-05, "loss": 0.9191424369812011, "memory(GiB)": 91.52, "step": 47400, "token_acc": 0.754593309560197, "train_speed(iter/s)": 0.151239 }, { "epoch": 0.6151121260976614, "grad_norm": 0.769196093082428, "learning_rate": 8.293446224388675e-05, "loss": 0.9073251724243164, "memory(GiB)": 91.52, "step": 47405, "token_acc": 0.7397911749982121, "train_speed(iter/s)": 0.151235 }, { "epoch": 0.6151770044993171, "grad_norm": 0.7429274916648865, "learning_rate": 8.293042628615425e-05, "loss": 0.9026290893554687, "memory(GiB)": 91.52, "step": 47410, "token_acc": 0.7568453844892605, "train_speed(iter/s)": 0.151231 }, { "epoch": 0.6152418829009728, "grad_norm": 0.6938990950584412, "learning_rate": 8.292638994946054e-05, "loss": 0.9012941360473633, "memory(GiB)": 91.52, "step": 47415, "token_acc": 0.7585326213141398, "train_speed(iter/s)": 0.151226 }, { "epoch": 0.6153067613026285, "grad_norm": 0.6899986863136292, "learning_rate": 8.292235323385203e-05, "loss": 0.9120342254638671, "memory(GiB)": 91.52, "step": 47420, "token_acc": 0.7520879279800426, "train_speed(iter/s)": 0.151223 }, { "epoch": 0.6153716397042842, "grad_norm": 0.7418481707572937, "learning_rate": 8.291831613937523e-05, "loss": 0.938321590423584, "memory(GiB)": 91.52, "step": 47425, "token_acc": 0.7419660108039904, "train_speed(iter/s)": 0.15122 }, { "epoch": 0.6154365181059399, "grad_norm": 0.7899881601333618, "learning_rate": 8.291427866607653e-05, "loss": 0.8998880386352539, "memory(GiB)": 91.52, "step": 47430, "token_acc": 0.7587204987383108, "train_speed(iter/s)": 0.151215 }, { "epoch": 0.6155013965075956, "grad_norm": 0.6470940709114075, "learning_rate": 8.291024081400244e-05, "loss": 0.8953011512756348, "memory(GiB)": 91.52, "step": 47435, "token_acc": 0.7451283404319475, "train_speed(iter/s)": 0.151211 }, { "epoch": 0.6155662749092513, "grad_norm": 0.8086926937103271, "learning_rate": 8.290620258319942e-05, "loss": 0.9148014068603516, "memory(GiB)": 91.52, "step": 47440, "token_acc": 0.7530476310167148, "train_speed(iter/s)": 0.151206 }, { "epoch": 0.615631153310907, "grad_norm": 0.6964075565338135, "learning_rate": 8.290216397371395e-05, "loss": 0.9111326217651368, "memory(GiB)": 91.52, "step": 47445, "token_acc": 0.7523878905617614, "train_speed(iter/s)": 0.151202 }, { "epoch": 0.6156960317125627, "grad_norm": 0.7923653721809387, "learning_rate": 8.289812498559248e-05, "loss": 0.9146966934204102, "memory(GiB)": 91.52, "step": 47450, "token_acc": 0.7519848065008251, "train_speed(iter/s)": 0.151199 }, { "epoch": 0.6157609101142184, "grad_norm": 0.721120297908783, "learning_rate": 8.28940856188815e-05, "loss": 0.887209129333496, "memory(GiB)": 91.52, "step": 47455, "token_acc": 0.761240404532716, "train_speed(iter/s)": 0.151195 }, { "epoch": 0.6158257885158741, "grad_norm": 0.6900761723518372, "learning_rate": 8.289004587362752e-05, "loss": 0.8979728698730469, "memory(GiB)": 91.52, "step": 47460, "token_acc": 0.7557353430123019, "train_speed(iter/s)": 0.151191 }, { "epoch": 0.6158906669175298, "grad_norm": 0.7379868626594543, "learning_rate": 8.2886005749877e-05, "loss": 0.9073113441467285, "memory(GiB)": 91.52, "step": 47465, "token_acc": 0.7585207272124902, "train_speed(iter/s)": 0.151187 }, { "epoch": 0.6159555453191855, "grad_norm": 0.6808361411094666, "learning_rate": 8.288196524767643e-05, "loss": 0.9046834945678711, "memory(GiB)": 91.52, "step": 47470, "token_acc": 0.7592136077899667, "train_speed(iter/s)": 0.151183 }, { "epoch": 0.6160204237208412, "grad_norm": 0.770475447177887, "learning_rate": 8.287792436707233e-05, "loss": 0.9238228797912598, "memory(GiB)": 91.52, "step": 47475, "token_acc": 0.7632721724502172, "train_speed(iter/s)": 0.151179 }, { "epoch": 0.6160853021224969, "grad_norm": 0.7934204339981079, "learning_rate": 8.287388310811119e-05, "loss": 0.9225869178771973, "memory(GiB)": 91.52, "step": 47480, "token_acc": 0.7343560082149294, "train_speed(iter/s)": 0.151176 }, { "epoch": 0.6161501805241526, "grad_norm": 0.7607600092887878, "learning_rate": 8.286984147083954e-05, "loss": 0.8744071960449219, "memory(GiB)": 91.52, "step": 47485, "token_acc": 0.7497432935438326, "train_speed(iter/s)": 0.151172 }, { "epoch": 0.6162150589258083, "grad_norm": 0.7201524376869202, "learning_rate": 8.286579945530384e-05, "loss": 0.8694347381591797, "memory(GiB)": 91.52, "step": 47490, "token_acc": 0.7758964143426295, "train_speed(iter/s)": 0.151168 }, { "epoch": 0.616279937327464, "grad_norm": 0.7559707164764404, "learning_rate": 8.286175706155066e-05, "loss": 0.8677292823791504, "memory(GiB)": 91.52, "step": 47495, "token_acc": 0.756199287176765, "train_speed(iter/s)": 0.151165 }, { "epoch": 0.6163448157291197, "grad_norm": 0.7579864263534546, "learning_rate": 8.285771428962648e-05, "loss": 0.8956171035766601, "memory(GiB)": 91.52, "step": 47500, "token_acc": 0.7619751219847433, "train_speed(iter/s)": 0.151162 }, { "epoch": 0.6164096941307754, "grad_norm": 0.7684990167617798, "learning_rate": 8.285367113957783e-05, "loss": 0.9259527206420899, "memory(GiB)": 91.52, "step": 47505, "token_acc": 0.7539218293007179, "train_speed(iter/s)": 0.151158 }, { "epoch": 0.6164745725324311, "grad_norm": 0.8014104962348938, "learning_rate": 8.284962761145126e-05, "loss": 0.9052645683288574, "memory(GiB)": 91.52, "step": 47510, "token_acc": 0.7632725206741998, "train_speed(iter/s)": 0.151153 }, { "epoch": 0.6165394509340868, "grad_norm": 0.938937783241272, "learning_rate": 8.284558370529328e-05, "loss": 0.9052759170532226, "memory(GiB)": 91.52, "step": 47515, "token_acc": 0.7706604426002767, "train_speed(iter/s)": 0.151148 }, { "epoch": 0.6166043293357425, "grad_norm": 0.7216717004776001, "learning_rate": 8.284153942115045e-05, "loss": 0.874897575378418, "memory(GiB)": 91.52, "step": 47520, "token_acc": 0.7583220171028003, "train_speed(iter/s)": 0.151144 }, { "epoch": 0.6166692077373982, "grad_norm": 0.7531216144561768, "learning_rate": 8.283749475906929e-05, "loss": 0.8838624954223633, "memory(GiB)": 91.52, "step": 47525, "token_acc": 0.7843301877287839, "train_speed(iter/s)": 0.15114 }, { "epoch": 0.6167340861390539, "grad_norm": 0.6768922209739685, "learning_rate": 8.283344971909637e-05, "loss": 0.8944780349731445, "memory(GiB)": 91.52, "step": 47530, "token_acc": 0.7490067369148384, "train_speed(iter/s)": 0.151136 }, { "epoch": 0.6167989645407096, "grad_norm": 0.8590599894523621, "learning_rate": 8.28294043012782e-05, "loss": 0.9260124206542969, "memory(GiB)": 91.52, "step": 47535, "token_acc": 0.7527031367091318, "train_speed(iter/s)": 0.151133 }, { "epoch": 0.6168638429423653, "grad_norm": 0.7876222729682922, "learning_rate": 8.282535850566135e-05, "loss": 0.9396450042724609, "memory(GiB)": 91.52, "step": 47540, "token_acc": 0.7526078710289237, "train_speed(iter/s)": 0.151129 }, { "epoch": 0.616928721344021, "grad_norm": 0.8674687147140503, "learning_rate": 8.28213123322924e-05, "loss": 0.9323704719543457, "memory(GiB)": 91.52, "step": 47545, "token_acc": 0.7490133072543746, "train_speed(iter/s)": 0.151125 }, { "epoch": 0.6169935997456767, "grad_norm": 0.7719108462333679, "learning_rate": 8.281726578121792e-05, "loss": 0.9071510314941407, "memory(GiB)": 91.52, "step": 47550, "token_acc": 0.7368820244087431, "train_speed(iter/s)": 0.151121 }, { "epoch": 0.6170584781473324, "grad_norm": 0.7644121050834656, "learning_rate": 8.281321885248444e-05, "loss": 0.8867960929870605, "memory(GiB)": 91.52, "step": 47555, "token_acc": 0.7518525484295655, "train_speed(iter/s)": 0.151118 }, { "epoch": 0.6171233565489881, "grad_norm": 0.7543298006057739, "learning_rate": 8.280917154613855e-05, "loss": 0.8867181777954102, "memory(GiB)": 91.52, "step": 47560, "token_acc": 0.7503885607709045, "train_speed(iter/s)": 0.151115 }, { "epoch": 0.6171882349506438, "grad_norm": 0.8612163066864014, "learning_rate": 8.280512386222684e-05, "loss": 0.8977134704589844, "memory(GiB)": 91.52, "step": 47565, "token_acc": 0.757876119237997, "train_speed(iter/s)": 0.151111 }, { "epoch": 0.6172531133522995, "grad_norm": 0.8616824746131897, "learning_rate": 8.280107580079585e-05, "loss": 0.9265956878662109, "memory(GiB)": 91.52, "step": 47570, "token_acc": 0.7557523138198802, "train_speed(iter/s)": 0.151106 }, { "epoch": 0.6173179917539552, "grad_norm": 0.7185134291648865, "learning_rate": 8.279702736189221e-05, "loss": 0.9283046722412109, "memory(GiB)": 91.52, "step": 47575, "token_acc": 0.738696622941669, "train_speed(iter/s)": 0.151103 }, { "epoch": 0.6173828701556109, "grad_norm": 0.8209149837493896, "learning_rate": 8.279297854556247e-05, "loss": 0.9318269729614258, "memory(GiB)": 91.52, "step": 47580, "token_acc": 0.7559751839170085, "train_speed(iter/s)": 0.1511 }, { "epoch": 0.6174477485572666, "grad_norm": 0.7082929611206055, "learning_rate": 8.278892935185327e-05, "loss": 0.8655011177062988, "memory(GiB)": 91.52, "step": 47585, "token_acc": 0.7341302152884083, "train_speed(iter/s)": 0.151095 }, { "epoch": 0.6175126269589223, "grad_norm": 0.715697705745697, "learning_rate": 8.278487978081116e-05, "loss": 0.9415092468261719, "memory(GiB)": 91.52, "step": 47590, "token_acc": 0.7482048343053636, "train_speed(iter/s)": 0.151091 }, { "epoch": 0.617577505360578, "grad_norm": 0.7983992099761963, "learning_rate": 8.278082983248278e-05, "loss": 0.9262670516967774, "memory(GiB)": 91.52, "step": 47595, "token_acc": 0.7669575442505261, "train_speed(iter/s)": 0.151087 }, { "epoch": 0.6176423837622337, "grad_norm": 0.7320743799209595, "learning_rate": 8.27767795069147e-05, "loss": 0.8885078430175781, "memory(GiB)": 91.52, "step": 47600, "token_acc": 0.7723539985197194, "train_speed(iter/s)": 0.151084 }, { "epoch": 0.6177072621638894, "grad_norm": 0.7336322665214539, "learning_rate": 8.277272880415357e-05, "loss": 0.8913883209228516, "memory(GiB)": 91.52, "step": 47605, "token_acc": 0.7553366174055829, "train_speed(iter/s)": 0.151079 }, { "epoch": 0.6177721405655451, "grad_norm": 0.774115800857544, "learning_rate": 8.276867772424598e-05, "loss": 0.9244283676147461, "memory(GiB)": 91.52, "step": 47610, "token_acc": 0.7694666838842975, "train_speed(iter/s)": 0.151076 }, { "epoch": 0.6178370189672008, "grad_norm": 0.769111156463623, "learning_rate": 8.276462626723856e-05, "loss": 0.9513114929199219, "memory(GiB)": 91.52, "step": 47615, "token_acc": 0.7545801382540702, "train_speed(iter/s)": 0.151071 }, { "epoch": 0.6179018973688564, "grad_norm": 0.847662091255188, "learning_rate": 8.276057443317792e-05, "loss": 0.8967721939086915, "memory(GiB)": 91.52, "step": 47620, "token_acc": 0.7607995934270709, "train_speed(iter/s)": 0.151067 }, { "epoch": 0.6179667757705121, "grad_norm": 0.7407526969909668, "learning_rate": 8.27565222221107e-05, "loss": 0.9344522476196289, "memory(GiB)": 91.52, "step": 47625, "token_acc": 0.7623429306421292, "train_speed(iter/s)": 0.151064 }, { "epoch": 0.6180316541721678, "grad_norm": 0.7150853872299194, "learning_rate": 8.275246963408353e-05, "loss": 0.8949999809265137, "memory(GiB)": 91.52, "step": 47630, "token_acc": 0.7614795468181978, "train_speed(iter/s)": 0.15106 }, { "epoch": 0.6180965325738235, "grad_norm": 0.812540590763092, "learning_rate": 8.274841666914305e-05, "loss": 0.9319202423095703, "memory(GiB)": 91.52, "step": 47635, "token_acc": 0.7603090687271249, "train_speed(iter/s)": 0.151057 }, { "epoch": 0.6181614109754792, "grad_norm": 0.7404698729515076, "learning_rate": 8.27443633273359e-05, "loss": 0.9205611228942872, "memory(GiB)": 91.52, "step": 47640, "token_acc": 0.7483034964131067, "train_speed(iter/s)": 0.151053 }, { "epoch": 0.6182262893771349, "grad_norm": 0.7753533124923706, "learning_rate": 8.274030960870872e-05, "loss": 0.853603458404541, "memory(GiB)": 91.52, "step": 47645, "token_acc": 0.7837553781981804, "train_speed(iter/s)": 0.151049 }, { "epoch": 0.6182911677787906, "grad_norm": 0.7998868823051453, "learning_rate": 8.273625551330817e-05, "loss": 0.9528326034545899, "memory(GiB)": 91.52, "step": 47650, "token_acc": 0.7483467795265176, "train_speed(iter/s)": 0.151045 }, { "epoch": 0.6183560461804463, "grad_norm": 0.7554352879524231, "learning_rate": 8.273220104118089e-05, "loss": 0.8912567138671875, "memory(GiB)": 91.52, "step": 47655, "token_acc": 0.7504802213902002, "train_speed(iter/s)": 0.151041 }, { "epoch": 0.618420924582102, "grad_norm": 0.7885903716087341, "learning_rate": 8.272814619237355e-05, "loss": 0.8996738433837891, "memory(GiB)": 91.52, "step": 47660, "token_acc": 0.7499024580569645, "train_speed(iter/s)": 0.151038 }, { "epoch": 0.6184858029837577, "grad_norm": 0.731393039226532, "learning_rate": 8.272409096693281e-05, "loss": 0.9111278533935547, "memory(GiB)": 91.52, "step": 47665, "token_acc": 0.7400864356660853, "train_speed(iter/s)": 0.151035 }, { "epoch": 0.6185506813854134, "grad_norm": 0.7451720833778381, "learning_rate": 8.272003536490535e-05, "loss": 0.9006748199462891, "memory(GiB)": 91.52, "step": 47670, "token_acc": 0.7544563995503453, "train_speed(iter/s)": 0.151031 }, { "epoch": 0.6186155597870691, "grad_norm": 0.702341616153717, "learning_rate": 8.271597938633781e-05, "loss": 0.8971691131591797, "memory(GiB)": 91.52, "step": 47675, "token_acc": 0.7442130397293891, "train_speed(iter/s)": 0.151027 }, { "epoch": 0.6186804381887248, "grad_norm": 0.6968240737915039, "learning_rate": 8.27119230312769e-05, "loss": 0.9049409866333008, "memory(GiB)": 91.52, "step": 47680, "token_acc": 0.7603219263233658, "train_speed(iter/s)": 0.151023 }, { "epoch": 0.6187453165903805, "grad_norm": 0.8092896938323975, "learning_rate": 8.270786629976927e-05, "loss": 0.9052229881286621, "memory(GiB)": 91.52, "step": 47685, "token_acc": 0.7579857870935324, "train_speed(iter/s)": 0.151018 }, { "epoch": 0.6188101949920362, "grad_norm": 0.7316973805427551, "learning_rate": 8.270380919186163e-05, "loss": 0.9198700904846191, "memory(GiB)": 91.52, "step": 47690, "token_acc": 0.7744138402003187, "train_speed(iter/s)": 0.151016 }, { "epoch": 0.6188750733936919, "grad_norm": 0.7727027535438538, "learning_rate": 8.269975170760065e-05, "loss": 0.8745668411254883, "memory(GiB)": 91.52, "step": 47695, "token_acc": 0.7816186556927298, "train_speed(iter/s)": 0.151012 }, { "epoch": 0.6189399517953476, "grad_norm": 0.826984703540802, "learning_rate": 8.269569384703304e-05, "loss": 0.9100477218627929, "memory(GiB)": 91.52, "step": 47700, "token_acc": 0.7439569967053927, "train_speed(iter/s)": 0.151008 }, { "epoch": 0.6190048301970033, "grad_norm": 0.7679373621940613, "learning_rate": 8.26916356102055e-05, "loss": 0.8783522605895996, "memory(GiB)": 91.52, "step": 47705, "token_acc": 0.7559343689059242, "train_speed(iter/s)": 0.151003 }, { "epoch": 0.619069708598659, "grad_norm": 0.6705203056335449, "learning_rate": 8.268757699716472e-05, "loss": 0.881930160522461, "memory(GiB)": 91.52, "step": 47710, "token_acc": 0.7584796529642636, "train_speed(iter/s)": 0.151 }, { "epoch": 0.6191345870003147, "grad_norm": 0.7560821175575256, "learning_rate": 8.268351800795738e-05, "loss": 0.9163120269775391, "memory(GiB)": 91.52, "step": 47715, "token_acc": 0.7452240735420856, "train_speed(iter/s)": 0.150996 }, { "epoch": 0.6191994654019704, "grad_norm": 0.7113154530525208, "learning_rate": 8.267945864263023e-05, "loss": 0.9025503158569336, "memory(GiB)": 91.52, "step": 47720, "token_acc": 0.763554263446015, "train_speed(iter/s)": 0.150992 }, { "epoch": 0.619264343803626, "grad_norm": 0.8399767875671387, "learning_rate": 8.267539890122998e-05, "loss": 0.8843276977539063, "memory(GiB)": 91.52, "step": 47725, "token_acc": 0.7537490950460234, "train_speed(iter/s)": 0.150988 }, { "epoch": 0.6193292222052817, "grad_norm": 0.7979794144630432, "learning_rate": 8.267133878380336e-05, "loss": 0.8921999931335449, "memory(GiB)": 91.52, "step": 47730, "token_acc": 0.7475448474531884, "train_speed(iter/s)": 0.150984 }, { "epoch": 0.6193941006069374, "grad_norm": 0.7744570374488831, "learning_rate": 8.266727829039705e-05, "loss": 0.9133007049560546, "memory(GiB)": 91.52, "step": 47735, "token_acc": 0.742230083384471, "train_speed(iter/s)": 0.15098 }, { "epoch": 0.6194589790085931, "grad_norm": 0.8575723767280579, "learning_rate": 8.26632174210578e-05, "loss": 0.9214101791381836, "memory(GiB)": 91.52, "step": 47740, "token_acc": 0.7741430700447094, "train_speed(iter/s)": 0.150977 }, { "epoch": 0.6195238574102488, "grad_norm": 0.7551814913749695, "learning_rate": 8.265915617583237e-05, "loss": 0.885252571105957, "memory(GiB)": 91.52, "step": 47745, "token_acc": 0.766136825618981, "train_speed(iter/s)": 0.150973 }, { "epoch": 0.6195887358119045, "grad_norm": 0.705577552318573, "learning_rate": 8.265509455476746e-05, "loss": 0.8800216674804687, "memory(GiB)": 91.52, "step": 47750, "token_acc": 0.7863073509412354, "train_speed(iter/s)": 0.150969 }, { "epoch": 0.6196536142135602, "grad_norm": 0.7008650898933411, "learning_rate": 8.265103255790981e-05, "loss": 0.8784419059753418, "memory(GiB)": 91.52, "step": 47755, "token_acc": 0.7638196915776987, "train_speed(iter/s)": 0.150965 }, { "epoch": 0.6197184926152159, "grad_norm": 0.7964631915092468, "learning_rate": 8.264697018530618e-05, "loss": 0.9397943496704102, "memory(GiB)": 91.52, "step": 47760, "token_acc": 0.7682323176768232, "train_speed(iter/s)": 0.150962 }, { "epoch": 0.6197833710168716, "grad_norm": 0.718069314956665, "learning_rate": 8.264290743700335e-05, "loss": 0.9345928192138672, "memory(GiB)": 91.52, "step": 47765, "token_acc": 0.7454718779790277, "train_speed(iter/s)": 0.150958 }, { "epoch": 0.6198482494185273, "grad_norm": 0.738615870475769, "learning_rate": 8.263884431304801e-05, "loss": 0.9028396606445312, "memory(GiB)": 91.52, "step": 47770, "token_acc": 0.7514804514968101, "train_speed(iter/s)": 0.150954 }, { "epoch": 0.619913127820183, "grad_norm": 0.7970812320709229, "learning_rate": 8.263478081348697e-05, "loss": 0.9327327728271484, "memory(GiB)": 91.52, "step": 47775, "token_acc": 0.7354424281355387, "train_speed(iter/s)": 0.150951 }, { "epoch": 0.6199780062218387, "grad_norm": 0.7032647728919983, "learning_rate": 8.263071693836696e-05, "loss": 0.8905645370483398, "memory(GiB)": 91.52, "step": 47780, "token_acc": 0.7646820438350613, "train_speed(iter/s)": 0.150946 }, { "epoch": 0.6200428846234944, "grad_norm": 0.745303213596344, "learning_rate": 8.262665268773477e-05, "loss": 0.9166529655456543, "memory(GiB)": 91.52, "step": 47785, "token_acc": 0.7663995358450884, "train_speed(iter/s)": 0.150942 }, { "epoch": 0.6201077630251501, "grad_norm": 0.752937912940979, "learning_rate": 8.262258806163716e-05, "loss": 0.8980366706848144, "memory(GiB)": 91.52, "step": 47790, "token_acc": 0.772910098923796, "train_speed(iter/s)": 0.150938 }, { "epoch": 0.6201726414268058, "grad_norm": 0.8323113322257996, "learning_rate": 8.26185230601209e-05, "loss": 0.9029218673706054, "memory(GiB)": 91.52, "step": 47795, "token_acc": 0.7467427032592768, "train_speed(iter/s)": 0.150935 }, { "epoch": 0.6202375198284615, "grad_norm": 0.8402870893478394, "learning_rate": 8.261445768323278e-05, "loss": 0.8934474945068359, "memory(GiB)": 91.52, "step": 47800, "token_acc": 0.7647210174226273, "train_speed(iter/s)": 0.150932 }, { "epoch": 0.6203023982301172, "grad_norm": 0.8459683060646057, "learning_rate": 8.261039193101958e-05, "loss": 0.8974952697753906, "memory(GiB)": 91.52, "step": 47805, "token_acc": 0.7682581876427087, "train_speed(iter/s)": 0.150928 }, { "epoch": 0.6203672766317729, "grad_norm": 0.812947154045105, "learning_rate": 8.260632580352808e-05, "loss": 0.9332489013671875, "memory(GiB)": 91.52, "step": 47810, "token_acc": 0.7452486149098815, "train_speed(iter/s)": 0.150925 }, { "epoch": 0.6204321550334286, "grad_norm": 0.7140143513679504, "learning_rate": 8.260225930080508e-05, "loss": 0.8519440650939941, "memory(GiB)": 91.52, "step": 47815, "token_acc": 0.7648037041867745, "train_speed(iter/s)": 0.15092 }, { "epoch": 0.6204970334350843, "grad_norm": 0.7601797580718994, "learning_rate": 8.25981924228974e-05, "loss": 0.9353263854980469, "memory(GiB)": 91.52, "step": 47820, "token_acc": 0.7646146087901005, "train_speed(iter/s)": 0.150917 }, { "epoch": 0.62056191183674, "grad_norm": 0.763222873210907, "learning_rate": 8.25941251698518e-05, "loss": 0.8729944229125977, "memory(GiB)": 91.52, "step": 47825, "token_acc": 0.7713212104945313, "train_speed(iter/s)": 0.150914 }, { "epoch": 0.6206267902383957, "grad_norm": 0.7194887399673462, "learning_rate": 8.259005754171509e-05, "loss": 0.9177962303161621, "memory(GiB)": 91.52, "step": 47830, "token_acc": 0.7668164822298599, "train_speed(iter/s)": 0.150911 }, { "epoch": 0.6206916686400514, "grad_norm": 0.7775047421455383, "learning_rate": 8.258598953853413e-05, "loss": 0.9269364356994629, "memory(GiB)": 91.52, "step": 47835, "token_acc": 0.7676570871894788, "train_speed(iter/s)": 0.150907 }, { "epoch": 0.6207565470417071, "grad_norm": 0.7048811316490173, "learning_rate": 8.258192116035568e-05, "loss": 0.9034796714782715, "memory(GiB)": 91.52, "step": 47840, "token_acc": 0.7424490836885634, "train_speed(iter/s)": 0.150903 }, { "epoch": 0.6208214254433628, "grad_norm": 0.6862676739692688, "learning_rate": 8.25778524072266e-05, "loss": 0.9368346214294434, "memory(GiB)": 91.52, "step": 47845, "token_acc": 0.7429713092623924, "train_speed(iter/s)": 0.150899 }, { "epoch": 0.6208863038450185, "grad_norm": 0.8412541747093201, "learning_rate": 8.257378327919366e-05, "loss": 0.9351312637329101, "memory(GiB)": 91.52, "step": 47850, "token_acc": 0.7560246590696805, "train_speed(iter/s)": 0.150895 }, { "epoch": 0.6209511822466741, "grad_norm": 0.7964047789573669, "learning_rate": 8.256971377630373e-05, "loss": 0.8320905685424804, "memory(GiB)": 91.52, "step": 47855, "token_acc": 0.774941578285098, "train_speed(iter/s)": 0.150892 }, { "epoch": 0.6210160606483298, "grad_norm": 0.6945655345916748, "learning_rate": 8.256564389860365e-05, "loss": 0.9144608497619628, "memory(GiB)": 91.52, "step": 47860, "token_acc": 0.7617055510860821, "train_speed(iter/s)": 0.150889 }, { "epoch": 0.6210809390499855, "grad_norm": 0.7720966935157776, "learning_rate": 8.256157364614022e-05, "loss": 0.9182361602783203, "memory(GiB)": 91.52, "step": 47865, "token_acc": 0.7614698926687284, "train_speed(iter/s)": 0.150885 }, { "epoch": 0.6211458174516412, "grad_norm": 0.789471447467804, "learning_rate": 8.255750301896028e-05, "loss": 0.8793176651000977, "memory(GiB)": 91.52, "step": 47870, "token_acc": 0.7434716353813119, "train_speed(iter/s)": 0.15088 }, { "epoch": 0.6212106958532969, "grad_norm": 0.7432973980903625, "learning_rate": 8.25534320171107e-05, "loss": 0.8928279876708984, "memory(GiB)": 91.52, "step": 47875, "token_acc": 0.7487285894462753, "train_speed(iter/s)": 0.150876 }, { "epoch": 0.6212755742549526, "grad_norm": 0.8093062043190002, "learning_rate": 8.254936064063834e-05, "loss": 0.9045826911926269, "memory(GiB)": 91.52, "step": 47880, "token_acc": 0.7391352468088246, "train_speed(iter/s)": 0.150873 }, { "epoch": 0.6213404526566083, "grad_norm": 0.7727893590927124, "learning_rate": 8.254528888959001e-05, "loss": 0.9321596145629882, "memory(GiB)": 91.52, "step": 47885, "token_acc": 0.7543168244423678, "train_speed(iter/s)": 0.15087 }, { "epoch": 0.621405331058264, "grad_norm": 0.8079776763916016, "learning_rate": 8.25412167640126e-05, "loss": 0.9378454208374023, "memory(GiB)": 91.52, "step": 47890, "token_acc": 0.7600882314778773, "train_speed(iter/s)": 0.150865 }, { "epoch": 0.6214702094599197, "grad_norm": 0.7691839933395386, "learning_rate": 8.253714426395296e-05, "loss": 0.9324203491210937, "memory(GiB)": 91.52, "step": 47895, "token_acc": 0.7506942092786997, "train_speed(iter/s)": 0.150861 }, { "epoch": 0.6215350878615754, "grad_norm": 0.8719114065170288, "learning_rate": 8.253307138945796e-05, "loss": 0.9602554321289063, "memory(GiB)": 91.52, "step": 47900, "token_acc": 0.75233066040811, "train_speed(iter/s)": 0.150859 }, { "epoch": 0.6215999662632311, "grad_norm": 0.7735220193862915, "learning_rate": 8.252899814057446e-05, "loss": 0.909693717956543, "memory(GiB)": 91.52, "step": 47905, "token_acc": 0.7599425793611954, "train_speed(iter/s)": 0.150856 }, { "epoch": 0.6216648446648868, "grad_norm": 0.7895457744598389, "learning_rate": 8.252492451734934e-05, "loss": 0.8620671272277832, "memory(GiB)": 91.52, "step": 47910, "token_acc": 0.7672152895816923, "train_speed(iter/s)": 0.150851 }, { "epoch": 0.6217297230665425, "grad_norm": 0.8350714445114136, "learning_rate": 8.25208505198295e-05, "loss": 0.895135498046875, "memory(GiB)": 91.52, "step": 47915, "token_acc": 0.7568917453925972, "train_speed(iter/s)": 0.150848 }, { "epoch": 0.6217946014681982, "grad_norm": 0.7541400194168091, "learning_rate": 8.251677614806179e-05, "loss": 0.9252694129943848, "memory(GiB)": 91.52, "step": 47920, "token_acc": 0.7569412385411858, "train_speed(iter/s)": 0.150844 }, { "epoch": 0.6218594798698539, "grad_norm": 0.881422221660614, "learning_rate": 8.251270140209313e-05, "loss": 0.864891242980957, "memory(GiB)": 91.52, "step": 47925, "token_acc": 0.7528247643243966, "train_speed(iter/s)": 0.150839 }, { "epoch": 0.6219243582715096, "grad_norm": 0.7795678377151489, "learning_rate": 8.250862628197038e-05, "loss": 0.8907389640808105, "memory(GiB)": 91.52, "step": 47930, "token_acc": 0.7560236305712783, "train_speed(iter/s)": 0.150835 }, { "epoch": 0.6219892366731653, "grad_norm": 0.7357072830200195, "learning_rate": 8.250455078774046e-05, "loss": 0.873259162902832, "memory(GiB)": 91.52, "step": 47935, "token_acc": 0.7539508506616257, "train_speed(iter/s)": 0.150832 }, { "epoch": 0.622054115074821, "grad_norm": 0.7632442712783813, "learning_rate": 8.250047491945024e-05, "loss": 0.9728625297546387, "memory(GiB)": 91.52, "step": 47940, "token_acc": 0.7452262170996043, "train_speed(iter/s)": 0.150828 }, { "epoch": 0.6221189934764767, "grad_norm": 0.7356510162353516, "learning_rate": 8.249639867714667e-05, "loss": 0.9171344757080078, "memory(GiB)": 91.52, "step": 47945, "token_acc": 0.7574472917218669, "train_speed(iter/s)": 0.150824 }, { "epoch": 0.6221838718781324, "grad_norm": 0.733771026134491, "learning_rate": 8.249232206087662e-05, "loss": 0.8612239837646485, "memory(GiB)": 91.52, "step": 47950, "token_acc": 0.7570317395952372, "train_speed(iter/s)": 0.15082 }, { "epoch": 0.6222487502797881, "grad_norm": 0.7011624574661255, "learning_rate": 8.248824507068704e-05, "loss": 0.9426662445068359, "memory(GiB)": 91.52, "step": 47955, "token_acc": 0.7485557179647804, "train_speed(iter/s)": 0.150817 }, { "epoch": 0.6223136286814438, "grad_norm": 0.7813484072685242, "learning_rate": 8.248416770662481e-05, "loss": 0.8677167892456055, "memory(GiB)": 91.52, "step": 47960, "token_acc": 0.7719148667424529, "train_speed(iter/s)": 0.150813 }, { "epoch": 0.6223785070830995, "grad_norm": 0.7573915719985962, "learning_rate": 8.248008996873687e-05, "loss": 0.9155227661132812, "memory(GiB)": 91.52, "step": 47965, "token_acc": 0.7696082048254999, "train_speed(iter/s)": 0.150809 }, { "epoch": 0.6224433854847552, "grad_norm": 0.7854450345039368, "learning_rate": 8.247601185707015e-05, "loss": 0.8677128791809082, "memory(GiB)": 91.52, "step": 47970, "token_acc": 0.764957823840973, "train_speed(iter/s)": 0.150804 }, { "epoch": 0.6225082638864109, "grad_norm": 0.7556080222129822, "learning_rate": 8.247193337167157e-05, "loss": 0.9551603317260742, "memory(GiB)": 91.52, "step": 47975, "token_acc": 0.7517478152309613, "train_speed(iter/s)": 0.1508 }, { "epoch": 0.6225731422880666, "grad_norm": 0.8410745859146118, "learning_rate": 8.246785451258807e-05, "loss": 0.9061458587646485, "memory(GiB)": 91.52, "step": 47980, "token_acc": 0.7577647093590236, "train_speed(iter/s)": 0.150796 }, { "epoch": 0.6226380206897223, "grad_norm": 0.7953965663909912, "learning_rate": 8.246377527986659e-05, "loss": 0.9333549499511719, "memory(GiB)": 91.52, "step": 47985, "token_acc": 0.7534375101103239, "train_speed(iter/s)": 0.150792 }, { "epoch": 0.622702899091378, "grad_norm": 0.7874994874000549, "learning_rate": 8.24596956735541e-05, "loss": 0.9116447448730469, "memory(GiB)": 91.52, "step": 47990, "token_acc": 0.7703588369482565, "train_speed(iter/s)": 0.150788 }, { "epoch": 0.6227677774930337, "grad_norm": 0.7807138562202454, "learning_rate": 8.245561569369749e-05, "loss": 0.9844741821289062, "memory(GiB)": 91.52, "step": 47995, "token_acc": 0.7286206768728917, "train_speed(iter/s)": 0.150785 }, { "epoch": 0.6228326558946894, "grad_norm": 0.7882904410362244, "learning_rate": 8.245153534034375e-05, "loss": 0.9373954772949219, "memory(GiB)": 91.52, "step": 48000, "token_acc": 0.7485881841876629, "train_speed(iter/s)": 0.150781 }, { "epoch": 0.6228975342963451, "grad_norm": 0.7177576422691345, "learning_rate": 8.244745461353983e-05, "loss": 0.8712820053100586, "memory(GiB)": 91.52, "step": 48005, "token_acc": 0.768988101361314, "train_speed(iter/s)": 0.150778 }, { "epoch": 0.6229624126980008, "grad_norm": 0.7981554865837097, "learning_rate": 8.244337351333269e-05, "loss": 0.9039268493652344, "memory(GiB)": 91.52, "step": 48010, "token_acc": 0.7570904824754363, "train_speed(iter/s)": 0.150775 }, { "epoch": 0.6230272910996565, "grad_norm": 0.7648643851280212, "learning_rate": 8.24392920397693e-05, "loss": 0.9239508628845214, "memory(GiB)": 91.52, "step": 48015, "token_acc": 0.7563697548245313, "train_speed(iter/s)": 0.150771 }, { "epoch": 0.6230921695013122, "grad_norm": 0.7250347137451172, "learning_rate": 8.243521019289662e-05, "loss": 0.8410571098327637, "memory(GiB)": 91.52, "step": 48020, "token_acc": 0.7835251605995718, "train_speed(iter/s)": 0.150767 }, { "epoch": 0.6231570479029679, "grad_norm": 0.7657439112663269, "learning_rate": 8.243112797276163e-05, "loss": 0.9468907356262207, "memory(GiB)": 91.52, "step": 48025, "token_acc": 0.74713806075806, "train_speed(iter/s)": 0.150763 }, { "epoch": 0.6232219263046236, "grad_norm": 0.8370407819747925, "learning_rate": 8.24270453794113e-05, "loss": 0.9026629447937011, "memory(GiB)": 91.52, "step": 48030, "token_acc": 0.7439860037906401, "train_speed(iter/s)": 0.15076 }, { "epoch": 0.6232868047062793, "grad_norm": 0.7136421799659729, "learning_rate": 8.242296241289264e-05, "loss": 0.8937400817871094, "memory(GiB)": 91.52, "step": 48035, "token_acc": 0.7675763416861114, "train_speed(iter/s)": 0.150755 }, { "epoch": 0.623351683107935, "grad_norm": 0.7577604055404663, "learning_rate": 8.24188790732526e-05, "loss": 0.9202756881713867, "memory(GiB)": 91.52, "step": 48040, "token_acc": 0.7575715764056571, "train_speed(iter/s)": 0.150751 }, { "epoch": 0.6234165615095907, "grad_norm": 0.7957627773284912, "learning_rate": 8.241479536053817e-05, "loss": 0.907048225402832, "memory(GiB)": 91.52, "step": 48045, "token_acc": 0.7573191911673489, "train_speed(iter/s)": 0.150748 }, { "epoch": 0.6234814399112464, "grad_norm": 0.7560232281684875, "learning_rate": 8.241071127479639e-05, "loss": 0.9087526321411132, "memory(GiB)": 91.52, "step": 48050, "token_acc": 0.7418766908728464, "train_speed(iter/s)": 0.150743 }, { "epoch": 0.6235463183129021, "grad_norm": 0.8145696520805359, "learning_rate": 8.240662681607423e-05, "loss": 0.8806221008300781, "memory(GiB)": 91.52, "step": 48055, "token_acc": 0.765418723165202, "train_speed(iter/s)": 0.150739 }, { "epoch": 0.6236111967145578, "grad_norm": 0.7652111649513245, "learning_rate": 8.240254198441868e-05, "loss": 0.8772087097167969, "memory(GiB)": 91.52, "step": 48060, "token_acc": 0.7645655990195753, "train_speed(iter/s)": 0.150735 }, { "epoch": 0.6236760751162135, "grad_norm": 0.8195810914039612, "learning_rate": 8.239845677987676e-05, "loss": 0.9144081115722656, "memory(GiB)": 91.52, "step": 48065, "token_acc": 0.7465888900063696, "train_speed(iter/s)": 0.150732 }, { "epoch": 0.6237409535178692, "grad_norm": 0.8407844305038452, "learning_rate": 8.23943712024955e-05, "loss": 0.8872230529785157, "memory(GiB)": 91.52, "step": 48070, "token_acc": 0.7489530191121602, "train_speed(iter/s)": 0.150728 }, { "epoch": 0.6238058319195249, "grad_norm": 0.7989033460617065, "learning_rate": 8.239028525232189e-05, "loss": 0.9678380966186524, "memory(GiB)": 91.52, "step": 48075, "token_acc": 0.7363519128815017, "train_speed(iter/s)": 0.150724 }, { "epoch": 0.6238707103211806, "grad_norm": 0.7311700582504272, "learning_rate": 8.238619892940296e-05, "loss": 0.8801925659179688, "memory(GiB)": 91.52, "step": 48080, "token_acc": 0.7560401277378149, "train_speed(iter/s)": 0.15072 }, { "epoch": 0.6239355887228363, "grad_norm": 0.7439350485801697, "learning_rate": 8.238211223378574e-05, "loss": 0.9160205841064453, "memory(GiB)": 91.52, "step": 48085, "token_acc": 0.7594707632912411, "train_speed(iter/s)": 0.150717 }, { "epoch": 0.624000467124492, "grad_norm": 0.7060187458992004, "learning_rate": 8.237802516551726e-05, "loss": 0.8424263000488281, "memory(GiB)": 91.52, "step": 48090, "token_acc": 0.7934367501436345, "train_speed(iter/s)": 0.150712 }, { "epoch": 0.6240653455261476, "grad_norm": 0.8049753904342651, "learning_rate": 8.237393772464455e-05, "loss": 0.8780221939086914, "memory(GiB)": 91.52, "step": 48095, "token_acc": 0.7622716497001812, "train_speed(iter/s)": 0.150708 }, { "epoch": 0.6241302239278033, "grad_norm": 0.6966526508331299, "learning_rate": 8.236984991121463e-05, "loss": 0.9206050872802735, "memory(GiB)": 91.52, "step": 48100, "token_acc": 0.7262101111509501, "train_speed(iter/s)": 0.150705 }, { "epoch": 0.624195102329459, "grad_norm": 0.8505682945251465, "learning_rate": 8.236576172527459e-05, "loss": 0.920224666595459, "memory(GiB)": 91.52, "step": 48105, "token_acc": 0.7585801509348098, "train_speed(iter/s)": 0.1507 }, { "epoch": 0.6242599807311147, "grad_norm": 0.7546423077583313, "learning_rate": 8.236167316687143e-05, "loss": 0.8767363548278808, "memory(GiB)": 91.52, "step": 48110, "token_acc": 0.779028998061296, "train_speed(iter/s)": 0.150697 }, { "epoch": 0.6243248591327704, "grad_norm": 0.8614649176597595, "learning_rate": 8.235758423605222e-05, "loss": 0.8950309753417969, "memory(GiB)": 91.52, "step": 48115, "token_acc": 0.7647514819881441, "train_speed(iter/s)": 0.150693 }, { "epoch": 0.6243897375344261, "grad_norm": 0.7191919684410095, "learning_rate": 8.235349493286402e-05, "loss": 0.9125771522521973, "memory(GiB)": 91.52, "step": 48120, "token_acc": 0.761759822910902, "train_speed(iter/s)": 0.150688 }, { "epoch": 0.6244546159360818, "grad_norm": 0.8345818519592285, "learning_rate": 8.234940525735389e-05, "loss": 0.891202163696289, "memory(GiB)": 91.52, "step": 48125, "token_acc": 0.7552626954840977, "train_speed(iter/s)": 0.150684 }, { "epoch": 0.6245194943377375, "grad_norm": 0.7790125012397766, "learning_rate": 8.234531520956887e-05, "loss": 0.9028886795043946, "memory(GiB)": 91.52, "step": 48130, "token_acc": 0.748501314636284, "train_speed(iter/s)": 0.150681 }, { "epoch": 0.6245843727393932, "grad_norm": 0.797340989112854, "learning_rate": 8.234122478955603e-05, "loss": 0.8865116119384766, "memory(GiB)": 91.52, "step": 48135, "token_acc": 0.7491874322860238, "train_speed(iter/s)": 0.150677 }, { "epoch": 0.6246492511410489, "grad_norm": 0.8411600589752197, "learning_rate": 8.233713399736248e-05, "loss": 0.9130634307861328, "memory(GiB)": 91.52, "step": 48140, "token_acc": 0.7503722264589462, "train_speed(iter/s)": 0.150674 }, { "epoch": 0.6247141295427046, "grad_norm": 0.8595064878463745, "learning_rate": 8.233304283303527e-05, "loss": 0.9093450546264649, "memory(GiB)": 91.52, "step": 48145, "token_acc": 0.7504111090809428, "train_speed(iter/s)": 0.15067 }, { "epoch": 0.6247790079443603, "grad_norm": 0.7744112610816956, "learning_rate": 8.232895129662148e-05, "loss": 0.9016807556152344, "memory(GiB)": 91.52, "step": 48150, "token_acc": 0.7322364852648707, "train_speed(iter/s)": 0.150668 }, { "epoch": 0.624843886346016, "grad_norm": 0.7681029438972473, "learning_rate": 8.232485938816819e-05, "loss": 0.8787018775939941, "memory(GiB)": 91.52, "step": 48155, "token_acc": 0.7696994017218737, "train_speed(iter/s)": 0.150665 }, { "epoch": 0.6249087647476717, "grad_norm": 0.6889796257019043, "learning_rate": 8.232076710772252e-05, "loss": 0.8568533897399903, "memory(GiB)": 91.52, "step": 48160, "token_acc": 0.7581212497413615, "train_speed(iter/s)": 0.150661 }, { "epoch": 0.6249736431493274, "grad_norm": 0.775458812713623, "learning_rate": 8.231667445533154e-05, "loss": 0.9130994796752929, "memory(GiB)": 91.52, "step": 48165, "token_acc": 0.7420670411047114, "train_speed(iter/s)": 0.150657 }, { "epoch": 0.625038521550983, "grad_norm": 0.762185275554657, "learning_rate": 8.231258143104233e-05, "loss": 0.9261754989624024, "memory(GiB)": 91.52, "step": 48170, "token_acc": 0.7557026280280151, "train_speed(iter/s)": 0.150653 }, { "epoch": 0.6251033999526387, "grad_norm": 0.7789055109024048, "learning_rate": 8.230848803490203e-05, "loss": 0.9026885032653809, "memory(GiB)": 91.52, "step": 48175, "token_acc": 0.7507491963166785, "train_speed(iter/s)": 0.150649 }, { "epoch": 0.6251682783542944, "grad_norm": 0.8619564771652222, "learning_rate": 8.230439426695774e-05, "loss": 0.8688854217529297, "memory(GiB)": 91.52, "step": 48180, "token_acc": 0.7500715512306811, "train_speed(iter/s)": 0.150644 }, { "epoch": 0.6252331567559501, "grad_norm": 0.7623466849327087, "learning_rate": 8.230030012725652e-05, "loss": 0.8761443138122559, "memory(GiB)": 91.52, "step": 48185, "token_acc": 0.7552873968997721, "train_speed(iter/s)": 0.15064 }, { "epoch": 0.6252980351576058, "grad_norm": 0.7483389377593994, "learning_rate": 8.229620561584554e-05, "loss": 0.8896944999694825, "memory(GiB)": 91.52, "step": 48190, "token_acc": 0.7550609061431952, "train_speed(iter/s)": 0.150635 }, { "epoch": 0.6253629135592615, "grad_norm": 0.8858396410942078, "learning_rate": 8.229211073277191e-05, "loss": 0.9298004150390625, "memory(GiB)": 91.52, "step": 48195, "token_acc": 0.7567595423781222, "train_speed(iter/s)": 0.150631 }, { "epoch": 0.6254277919609172, "grad_norm": 0.8958259224891663, "learning_rate": 8.228801547808276e-05, "loss": 0.9028751373291015, "memory(GiB)": 91.52, "step": 48200, "token_acc": 0.7562053937977642, "train_speed(iter/s)": 0.150627 }, { "epoch": 0.625492670362573, "grad_norm": 0.8618268966674805, "learning_rate": 8.22839198518252e-05, "loss": 0.9343068122863769, "memory(GiB)": 91.52, "step": 48205, "token_acc": 0.746979728346188, "train_speed(iter/s)": 0.150624 }, { "epoch": 0.6255575487642286, "grad_norm": 0.7776549458503723, "learning_rate": 8.227982385404636e-05, "loss": 0.9217507362365722, "memory(GiB)": 91.52, "step": 48210, "token_acc": 0.7335833061623737, "train_speed(iter/s)": 0.15062 }, { "epoch": 0.6256224271658843, "grad_norm": 0.8208039999008179, "learning_rate": 8.22757274847934e-05, "loss": 0.9136886596679688, "memory(GiB)": 91.52, "step": 48215, "token_acc": 0.7467139771675408, "train_speed(iter/s)": 0.150617 }, { "epoch": 0.62568730556754, "grad_norm": 0.7436546683311462, "learning_rate": 8.227163074411343e-05, "loss": 0.9384109497070312, "memory(GiB)": 91.52, "step": 48220, "token_acc": 0.7532303183695218, "train_speed(iter/s)": 0.150614 }, { "epoch": 0.6257521839691957, "grad_norm": 0.7596924304962158, "learning_rate": 8.226753363205362e-05, "loss": 0.9316610336303711, "memory(GiB)": 91.52, "step": 48225, "token_acc": 0.7433708004719618, "train_speed(iter/s)": 0.15061 }, { "epoch": 0.6258170623708514, "grad_norm": 0.6739394068717957, "learning_rate": 8.22634361486611e-05, "loss": 0.9090553283691406, "memory(GiB)": 91.52, "step": 48230, "token_acc": 0.7390324963072378, "train_speed(iter/s)": 0.150607 }, { "epoch": 0.6258819407725071, "grad_norm": 0.7390378713607788, "learning_rate": 8.225933829398304e-05, "loss": 0.8790021896362304, "memory(GiB)": 91.52, "step": 48235, "token_acc": 0.7853406586176267, "train_speed(iter/s)": 0.150602 }, { "epoch": 0.6259468191741628, "grad_norm": 0.8025129437446594, "learning_rate": 8.225524006806659e-05, "loss": 0.8538154602050781, "memory(GiB)": 91.52, "step": 48240, "token_acc": 0.7687170027351949, "train_speed(iter/s)": 0.150598 }, { "epoch": 0.6260116975758185, "grad_norm": 0.7703980207443237, "learning_rate": 8.22511414709589e-05, "loss": 0.8726912498474121, "memory(GiB)": 91.52, "step": 48245, "token_acc": 0.7593174999096266, "train_speed(iter/s)": 0.150595 }, { "epoch": 0.6260765759774742, "grad_norm": 0.761576771736145, "learning_rate": 8.224704250270718e-05, "loss": 0.9346317291259766, "memory(GiB)": 91.52, "step": 48250, "token_acc": 0.7457632408476639, "train_speed(iter/s)": 0.150592 }, { "epoch": 0.6261414543791299, "grad_norm": 0.7001541256904602, "learning_rate": 8.224294316335855e-05, "loss": 0.9458448410034179, "memory(GiB)": 91.52, "step": 48255, "token_acc": 0.7326032929481205, "train_speed(iter/s)": 0.150589 }, { "epoch": 0.6262063327807856, "grad_norm": 0.6817945837974548, "learning_rate": 8.22388434529602e-05, "loss": 0.9574496269226074, "memory(GiB)": 91.52, "step": 48260, "token_acc": 0.7510314134424845, "train_speed(iter/s)": 0.150585 }, { "epoch": 0.6262712111824413, "grad_norm": 0.7773733735084534, "learning_rate": 8.223474337155932e-05, "loss": 0.9126109123229981, "memory(GiB)": 91.52, "step": 48265, "token_acc": 0.762063227953411, "train_speed(iter/s)": 0.150581 }, { "epoch": 0.626336089584097, "grad_norm": 0.7590768337249756, "learning_rate": 8.223064291920308e-05, "loss": 0.9029485702514648, "memory(GiB)": 91.52, "step": 48270, "token_acc": 0.7483549939437588, "train_speed(iter/s)": 0.150578 }, { "epoch": 0.6264009679857527, "grad_norm": 0.8312452435493469, "learning_rate": 8.22265420959387e-05, "loss": 0.9329007148742676, "memory(GiB)": 91.52, "step": 48275, "token_acc": 0.7625118408588569, "train_speed(iter/s)": 0.150574 }, { "epoch": 0.6264658463874084, "grad_norm": 0.7166520357131958, "learning_rate": 8.222244090181332e-05, "loss": 0.9022720336914063, "memory(GiB)": 91.52, "step": 48280, "token_acc": 0.7654293523167863, "train_speed(iter/s)": 0.15057 }, { "epoch": 0.6265307247890641, "grad_norm": 0.7614984512329102, "learning_rate": 8.221833933687417e-05, "loss": 0.9032426834106445, "memory(GiB)": 91.52, "step": 48285, "token_acc": 0.7741837560712358, "train_speed(iter/s)": 0.150566 }, { "epoch": 0.6265956031907198, "grad_norm": 0.748877227306366, "learning_rate": 8.221423740116848e-05, "loss": 0.9119200706481934, "memory(GiB)": 91.52, "step": 48290, "token_acc": 0.7402744912446758, "train_speed(iter/s)": 0.150562 }, { "epoch": 0.6266604815923755, "grad_norm": 0.7202255725860596, "learning_rate": 8.221013509474338e-05, "loss": 0.9161401748657226, "memory(GiB)": 91.52, "step": 48295, "token_acc": 0.7578280916660932, "train_speed(iter/s)": 0.150558 }, { "epoch": 0.6267253599940312, "grad_norm": 0.7850733995437622, "learning_rate": 8.220603241764613e-05, "loss": 0.901212501525879, "memory(GiB)": 91.52, "step": 48300, "token_acc": 0.7595643021818304, "train_speed(iter/s)": 0.150554 }, { "epoch": 0.6267902383956869, "grad_norm": 0.7643979787826538, "learning_rate": 8.220192936992392e-05, "loss": 0.9035562515258789, "memory(GiB)": 91.52, "step": 48305, "token_acc": 0.7790373212417161, "train_speed(iter/s)": 0.150551 }, { "epoch": 0.6268551167973426, "grad_norm": 0.7239747047424316, "learning_rate": 8.2197825951624e-05, "loss": 0.8799505233764648, "memory(GiB)": 91.52, "step": 48310, "token_acc": 0.7786864574701877, "train_speed(iter/s)": 0.150547 }, { "epoch": 0.6269199951989983, "grad_norm": 0.7602390646934509, "learning_rate": 8.219372216279356e-05, "loss": 0.9394960403442383, "memory(GiB)": 91.52, "step": 48315, "token_acc": 0.7232115723576248, "train_speed(iter/s)": 0.150545 }, { "epoch": 0.626984873600654, "grad_norm": 0.7174022197723389, "learning_rate": 8.218961800347983e-05, "loss": 0.8630875587463379, "memory(GiB)": 91.52, "step": 48320, "token_acc": 0.7389014550487829, "train_speed(iter/s)": 0.15054 }, { "epoch": 0.6270497520023097, "grad_norm": 0.7870537638664246, "learning_rate": 8.218551347373007e-05, "loss": 0.904726791381836, "memory(GiB)": 91.52, "step": 48325, "token_acc": 0.7669618851810849, "train_speed(iter/s)": 0.150537 }, { "epoch": 0.6271146304039654, "grad_norm": 0.8009892106056213, "learning_rate": 8.218140857359147e-05, "loss": 0.8950058937072753, "memory(GiB)": 91.52, "step": 48330, "token_acc": 0.7784039322500861, "train_speed(iter/s)": 0.150532 }, { "epoch": 0.627179508805621, "grad_norm": 0.8205573558807373, "learning_rate": 8.21773033031113e-05, "loss": 0.971717643737793, "memory(GiB)": 91.52, "step": 48335, "token_acc": 0.733926981300089, "train_speed(iter/s)": 0.150529 }, { "epoch": 0.6272443872072767, "grad_norm": 0.7530282735824585, "learning_rate": 8.21731976623368e-05, "loss": 0.8810121536254882, "memory(GiB)": 91.52, "step": 48340, "token_acc": 0.7715885458326668, "train_speed(iter/s)": 0.150525 }, { "epoch": 0.6273092656089324, "grad_norm": 0.7532617449760437, "learning_rate": 8.21690916513152e-05, "loss": 0.8988515853881835, "memory(GiB)": 91.52, "step": 48345, "token_acc": 0.7541641781140089, "train_speed(iter/s)": 0.150522 }, { "epoch": 0.6273741440105881, "grad_norm": 0.8363198041915894, "learning_rate": 8.216498527009378e-05, "loss": 0.9183835983276367, "memory(GiB)": 91.52, "step": 48350, "token_acc": 0.7406105100463678, "train_speed(iter/s)": 0.150518 }, { "epoch": 0.6274390224122438, "grad_norm": 0.8193973302841187, "learning_rate": 8.216087851871977e-05, "loss": 0.8928765296936035, "memory(GiB)": 91.52, "step": 48355, "token_acc": 0.746821916798703, "train_speed(iter/s)": 0.150514 }, { "epoch": 0.6275039008138995, "grad_norm": 0.8164350390434265, "learning_rate": 8.215677139724043e-05, "loss": 0.8719528198242188, "memory(GiB)": 91.52, "step": 48360, "token_acc": 0.7673241460387956, "train_speed(iter/s)": 0.15051 }, { "epoch": 0.6275687792155552, "grad_norm": 0.8083869814872742, "learning_rate": 8.215266390570305e-05, "loss": 0.9244440078735352, "memory(GiB)": 91.52, "step": 48365, "token_acc": 0.7537029881504379, "train_speed(iter/s)": 0.150506 }, { "epoch": 0.6276336576172109, "grad_norm": 0.7825356125831604, "learning_rate": 8.214855604415487e-05, "loss": 0.9577841758728027, "memory(GiB)": 91.52, "step": 48370, "token_acc": 0.7355032998013712, "train_speed(iter/s)": 0.150502 }, { "epoch": 0.6276985360188666, "grad_norm": 0.7940748333930969, "learning_rate": 8.214444781264319e-05, "loss": 0.9275127410888672, "memory(GiB)": 91.52, "step": 48375, "token_acc": 0.7406135531135531, "train_speed(iter/s)": 0.150499 }, { "epoch": 0.6277634144205223, "grad_norm": 0.6634221076965332, "learning_rate": 8.214033921121525e-05, "loss": 0.8795269966125489, "memory(GiB)": 91.52, "step": 48380, "token_acc": 0.7701897769774207, "train_speed(iter/s)": 0.150496 }, { "epoch": 0.627828292822178, "grad_norm": 0.8031619191169739, "learning_rate": 8.213623023991839e-05, "loss": 0.869206428527832, "memory(GiB)": 91.52, "step": 48385, "token_acc": 0.7600309160561866, "train_speed(iter/s)": 0.150491 }, { "epoch": 0.6278931712238337, "grad_norm": 0.7854105830192566, "learning_rate": 8.213212089879982e-05, "loss": 0.8673423767089844, "memory(GiB)": 91.52, "step": 48390, "token_acc": 0.7740452165027462, "train_speed(iter/s)": 0.150487 }, { "epoch": 0.6279580496254894, "grad_norm": 0.9651669859886169, "learning_rate": 8.212801118790691e-05, "loss": 0.9551671028137207, "memory(GiB)": 91.52, "step": 48395, "token_acc": 0.726314145122237, "train_speed(iter/s)": 0.150484 }, { "epoch": 0.6280229280271451, "grad_norm": 0.8683618307113647, "learning_rate": 8.212390110728689e-05, "loss": 0.8676420211791992, "memory(GiB)": 91.52, "step": 48400, "token_acc": 0.7372884230521353, "train_speed(iter/s)": 0.15048 }, { "epoch": 0.6280878064288008, "grad_norm": 0.8021453619003296, "learning_rate": 8.21197906569871e-05, "loss": 0.8838553428649902, "memory(GiB)": 91.52, "step": 48405, "token_acc": 0.7613191727221912, "train_speed(iter/s)": 0.150476 }, { "epoch": 0.6281526848304565, "grad_norm": 0.7433492541313171, "learning_rate": 8.211567983705481e-05, "loss": 0.8699322700500488, "memory(GiB)": 91.52, "step": 48410, "token_acc": 0.7676093514328809, "train_speed(iter/s)": 0.150473 }, { "epoch": 0.6282175632321122, "grad_norm": 0.8942570686340332, "learning_rate": 8.211156864753736e-05, "loss": 0.9628105163574219, "memory(GiB)": 91.52, "step": 48415, "token_acc": 0.7411232187761945, "train_speed(iter/s)": 0.150469 }, { "epoch": 0.6282824416337679, "grad_norm": 0.751166045665741, "learning_rate": 8.210745708848203e-05, "loss": 0.9089256286621094, "memory(GiB)": 91.52, "step": 48420, "token_acc": 0.7461494170145386, "train_speed(iter/s)": 0.150466 }, { "epoch": 0.6283473200354236, "grad_norm": 0.7513813376426697, "learning_rate": 8.210334515993615e-05, "loss": 0.9172898292541504, "memory(GiB)": 91.52, "step": 48425, "token_acc": 0.7392848004094166, "train_speed(iter/s)": 0.150463 }, { "epoch": 0.6284121984370793, "grad_norm": 0.8028303980827332, "learning_rate": 8.209923286194705e-05, "loss": 0.8566669464111328, "memory(GiB)": 91.52, "step": 48430, "token_acc": 0.7635595381196955, "train_speed(iter/s)": 0.15046 }, { "epoch": 0.628477076838735, "grad_norm": 0.8404502272605896, "learning_rate": 8.209512019456205e-05, "loss": 0.8394027709960937, "memory(GiB)": 91.52, "step": 48435, "token_acc": 0.7679674864181147, "train_speed(iter/s)": 0.150456 }, { "epoch": 0.6285419552403907, "grad_norm": 0.7700358629226685, "learning_rate": 8.209100715782844e-05, "loss": 0.8647358894348145, "memory(GiB)": 91.52, "step": 48440, "token_acc": 0.7552686470545925, "train_speed(iter/s)": 0.150453 }, { "epoch": 0.6286068336420464, "grad_norm": 0.8218814134597778, "learning_rate": 8.208689375179362e-05, "loss": 0.9464834213256836, "memory(GiB)": 91.52, "step": 48445, "token_acc": 0.7685778748701787, "train_speed(iter/s)": 0.150449 }, { "epoch": 0.6286717120437021, "grad_norm": 0.8135143518447876, "learning_rate": 8.208277997650488e-05, "loss": 0.9091852188110352, "memory(GiB)": 91.52, "step": 48450, "token_acc": 0.7646597064133762, "train_speed(iter/s)": 0.150446 }, { "epoch": 0.6287365904453578, "grad_norm": 0.6901909112930298, "learning_rate": 8.207866583200957e-05, "loss": 0.9080996513366699, "memory(GiB)": 91.52, "step": 48455, "token_acc": 0.750316206884764, "train_speed(iter/s)": 0.150443 }, { "epoch": 0.6288014688470135, "grad_norm": 0.844093918800354, "learning_rate": 8.207455131835504e-05, "loss": 0.9060042381286622, "memory(GiB)": 91.52, "step": 48460, "token_acc": 0.7725627553998833, "train_speed(iter/s)": 0.150439 }, { "epoch": 0.6288663472486692, "grad_norm": 0.8106334209442139, "learning_rate": 8.207043643558863e-05, "loss": 0.8809717178344727, "memory(GiB)": 91.52, "step": 48465, "token_acc": 0.7709881640517479, "train_speed(iter/s)": 0.150435 }, { "epoch": 0.6289312256503249, "grad_norm": 0.8113794922828674, "learning_rate": 8.206632118375773e-05, "loss": 0.911174201965332, "memory(GiB)": 91.52, "step": 48470, "token_acc": 0.7611599352771344, "train_speed(iter/s)": 0.150432 }, { "epoch": 0.6289961040519806, "grad_norm": 0.7425387501716614, "learning_rate": 8.206220556290963e-05, "loss": 0.8978414535522461, "memory(GiB)": 91.52, "step": 48475, "token_acc": 0.7549972192233454, "train_speed(iter/s)": 0.150428 }, { "epoch": 0.6290609824536363, "grad_norm": 0.7553325295448303, "learning_rate": 8.205808957309176e-05, "loss": 0.8913860321044922, "memory(GiB)": 91.52, "step": 48480, "token_acc": 0.772664713978191, "train_speed(iter/s)": 0.150424 }, { "epoch": 0.629125860855292, "grad_norm": 0.8524496555328369, "learning_rate": 8.205397321435144e-05, "loss": 0.9078540802001953, "memory(GiB)": 91.52, "step": 48485, "token_acc": 0.757840945384861, "train_speed(iter/s)": 0.15042 }, { "epoch": 0.6291907392569477, "grad_norm": 0.7616776823997498, "learning_rate": 8.204985648673608e-05, "loss": 0.9119976997375489, "memory(GiB)": 91.52, "step": 48490, "token_acc": 0.7586158489506563, "train_speed(iter/s)": 0.150416 }, { "epoch": 0.6292556176586034, "grad_norm": 0.7483240962028503, "learning_rate": 8.204573939029302e-05, "loss": 0.8561222076416015, "memory(GiB)": 91.52, "step": 48495, "token_acc": 0.7799975636496528, "train_speed(iter/s)": 0.150412 }, { "epoch": 0.6293204960602591, "grad_norm": 0.9232606291770935, "learning_rate": 8.204162192506965e-05, "loss": 0.915102195739746, "memory(GiB)": 91.52, "step": 48500, "token_acc": 0.7461009499503757, "train_speed(iter/s)": 0.150409 }, { "epoch": 0.6293853744619148, "grad_norm": 0.8608232140541077, "learning_rate": 8.203750409111338e-05, "loss": 0.9232125282287598, "memory(GiB)": 91.52, "step": 48505, "token_acc": 0.7453220237841525, "train_speed(iter/s)": 0.150405 }, { "epoch": 0.6294502528635705, "grad_norm": 0.7644767165184021, "learning_rate": 8.203338588847155e-05, "loss": 0.8954833030700684, "memory(GiB)": 91.52, "step": 48510, "token_acc": 0.7412819298181685, "train_speed(iter/s)": 0.150402 }, { "epoch": 0.6295151312652262, "grad_norm": 0.7758306264877319, "learning_rate": 8.202926731719159e-05, "loss": 0.8747941970825195, "memory(GiB)": 91.52, "step": 48515, "token_acc": 0.7546170991880273, "train_speed(iter/s)": 0.150398 }, { "epoch": 0.6295800096668819, "grad_norm": 0.8736361265182495, "learning_rate": 8.202514837732088e-05, "loss": 0.9073495864868164, "memory(GiB)": 91.52, "step": 48520, "token_acc": 0.7708157962156424, "train_speed(iter/s)": 0.150395 }, { "epoch": 0.6296448880685376, "grad_norm": 0.7907730937004089, "learning_rate": 8.202102906890682e-05, "loss": 0.940827751159668, "memory(GiB)": 91.52, "step": 48525, "token_acc": 0.7464946502375527, "train_speed(iter/s)": 0.150392 }, { "epoch": 0.6297097664701933, "grad_norm": 0.6917264461517334, "learning_rate": 8.201690939199683e-05, "loss": 0.8321034431457519, "memory(GiB)": 91.52, "step": 48530, "token_acc": 0.7697225290357131, "train_speed(iter/s)": 0.150388 }, { "epoch": 0.629774644871849, "grad_norm": 0.7927380800247192, "learning_rate": 8.201278934663828e-05, "loss": 0.9172091484069824, "memory(GiB)": 91.52, "step": 48535, "token_acc": 0.7560348169292033, "train_speed(iter/s)": 0.150385 }, { "epoch": 0.6298395232735047, "grad_norm": 0.8315426111221313, "learning_rate": 8.200866893287863e-05, "loss": 0.9596320152282715, "memory(GiB)": 91.52, "step": 48540, "token_acc": 0.7319377474616802, "train_speed(iter/s)": 0.150381 }, { "epoch": 0.6299044016751604, "grad_norm": 0.7066714763641357, "learning_rate": 8.200454815076528e-05, "loss": 0.8473048210144043, "memory(GiB)": 91.52, "step": 48545, "token_acc": 0.7600729449815917, "train_speed(iter/s)": 0.150378 }, { "epoch": 0.6299692800768161, "grad_norm": 0.84248948097229, "learning_rate": 8.200042700034563e-05, "loss": 0.943422794342041, "memory(GiB)": 91.52, "step": 48550, "token_acc": 0.7614844307553619, "train_speed(iter/s)": 0.150374 }, { "epoch": 0.6300341584784718, "grad_norm": 0.7964866757392883, "learning_rate": 8.199630548166715e-05, "loss": 0.9288297653198242, "memory(GiB)": 91.52, "step": 48555, "token_acc": 0.7621138104040137, "train_speed(iter/s)": 0.15037 }, { "epoch": 0.6300990368801275, "grad_norm": 0.8299693465232849, "learning_rate": 8.199218359477725e-05, "loss": 0.9319437980651856, "memory(GiB)": 91.52, "step": 48560, "token_acc": 0.7516205067766647, "train_speed(iter/s)": 0.150366 }, { "epoch": 0.6301639152817832, "grad_norm": 0.8068399429321289, "learning_rate": 8.198806133972334e-05, "loss": 0.8818501472473145, "memory(GiB)": 91.52, "step": 48565, "token_acc": 0.7557015011547344, "train_speed(iter/s)": 0.150363 }, { "epoch": 0.6302287936834388, "grad_norm": 0.7293136119842529, "learning_rate": 8.198393871655287e-05, "loss": 0.883023738861084, "memory(GiB)": 91.52, "step": 48570, "token_acc": 0.758311453609302, "train_speed(iter/s)": 0.150359 }, { "epoch": 0.6302936720850945, "grad_norm": 0.7685503959655762, "learning_rate": 8.197981572531333e-05, "loss": 0.9007955551147461, "memory(GiB)": 91.52, "step": 48575, "token_acc": 0.7505454025249455, "train_speed(iter/s)": 0.150356 }, { "epoch": 0.6303585504867502, "grad_norm": 0.7509061098098755, "learning_rate": 8.197569236605209e-05, "loss": 0.8732734680175781, "memory(GiB)": 91.52, "step": 48580, "token_acc": 0.7522884425419822, "train_speed(iter/s)": 0.150351 }, { "epoch": 0.6304234288884059, "grad_norm": 0.7958953976631165, "learning_rate": 8.197156863881665e-05, "loss": 0.9167971611022949, "memory(GiB)": 91.52, "step": 48585, "token_acc": 0.7395766997565087, "train_speed(iter/s)": 0.150347 }, { "epoch": 0.6304883072900616, "grad_norm": 0.7458211183547974, "learning_rate": 8.196744454365449e-05, "loss": 0.8965482711791992, "memory(GiB)": 91.52, "step": 48590, "token_acc": 0.7726575859425674, "train_speed(iter/s)": 0.150343 }, { "epoch": 0.6305531856917173, "grad_norm": 0.8103258609771729, "learning_rate": 8.196332008061299e-05, "loss": 0.9733462333679199, "memory(GiB)": 91.52, "step": 48595, "token_acc": 0.7231316725978648, "train_speed(iter/s)": 0.150339 }, { "epoch": 0.630618064093373, "grad_norm": 0.8007723093032837, "learning_rate": 8.195919524973968e-05, "loss": 0.8941727638244629, "memory(GiB)": 91.52, "step": 48600, "token_acc": 0.7592437115894818, "train_speed(iter/s)": 0.150336 }, { "epoch": 0.6306829424950287, "grad_norm": 0.6941136121749878, "learning_rate": 8.195507005108201e-05, "loss": 0.8772241592407226, "memory(GiB)": 91.52, "step": 48605, "token_acc": 0.7843674409116078, "train_speed(iter/s)": 0.150332 }, { "epoch": 0.6307478208966844, "grad_norm": 0.8186010718345642, "learning_rate": 8.195094448468745e-05, "loss": 0.8937620162963867, "memory(GiB)": 91.52, "step": 48610, "token_acc": 0.750511855227528, "train_speed(iter/s)": 0.150329 }, { "epoch": 0.63081269929834, "grad_norm": 0.731939971446991, "learning_rate": 8.194681855060347e-05, "loss": 0.8853736877441406, "memory(GiB)": 91.52, "step": 48615, "token_acc": 0.7533011633561516, "train_speed(iter/s)": 0.150325 }, { "epoch": 0.6308775776999958, "grad_norm": 0.7936692237854004, "learning_rate": 8.194269224887756e-05, "loss": 0.8843273162841797, "memory(GiB)": 91.52, "step": 48620, "token_acc": 0.7553434147165667, "train_speed(iter/s)": 0.150321 }, { "epoch": 0.6309424561016514, "grad_norm": 0.738589346408844, "learning_rate": 8.19385655795572e-05, "loss": 0.9377488136291504, "memory(GiB)": 91.52, "step": 48625, "token_acc": 0.7531772575250836, "train_speed(iter/s)": 0.150318 }, { "epoch": 0.6310073345033071, "grad_norm": 0.7350278496742249, "learning_rate": 8.193443854268989e-05, "loss": 0.9344624519348145, "memory(GiB)": 91.52, "step": 48630, "token_acc": 0.7586830958798614, "train_speed(iter/s)": 0.150313 }, { "epoch": 0.6310722129049628, "grad_norm": 0.815022885799408, "learning_rate": 8.193031113832312e-05, "loss": 0.8991596221923828, "memory(GiB)": 91.52, "step": 48635, "token_acc": 0.7610111314278382, "train_speed(iter/s)": 0.15031 }, { "epoch": 0.6311370913066185, "grad_norm": 0.8801852464675903, "learning_rate": 8.192618336650437e-05, "loss": 0.8902617454528808, "memory(GiB)": 91.52, "step": 48640, "token_acc": 0.7455361405518953, "train_speed(iter/s)": 0.150306 }, { "epoch": 0.6312019697082742, "grad_norm": 0.6661524772644043, "learning_rate": 8.192205522728117e-05, "loss": 0.9149116516113281, "memory(GiB)": 91.52, "step": 48645, "token_acc": 0.74852228303362, "train_speed(iter/s)": 0.150302 }, { "epoch": 0.63126684810993, "grad_norm": 0.7630476355552673, "learning_rate": 8.191792672070103e-05, "loss": 0.946682357788086, "memory(GiB)": 91.52, "step": 48650, "token_acc": 0.7311979148804829, "train_speed(iter/s)": 0.150298 }, { "epoch": 0.6313317265115856, "grad_norm": 0.7318201065063477, "learning_rate": 8.19137978468114e-05, "loss": 0.8990897178649903, "memory(GiB)": 91.52, "step": 48655, "token_acc": 0.7532490527284961, "train_speed(iter/s)": 0.150295 }, { "epoch": 0.6313966049132413, "grad_norm": 0.7260342240333557, "learning_rate": 8.190966860565986e-05, "loss": 0.8371121406555175, "memory(GiB)": 91.52, "step": 48660, "token_acc": 0.7774318550657437, "train_speed(iter/s)": 0.150291 }, { "epoch": 0.631461483314897, "grad_norm": 0.7716198563575745, "learning_rate": 8.190553899729392e-05, "loss": 0.9417644500732422, "memory(GiB)": 91.52, "step": 48665, "token_acc": 0.76587358463523, "train_speed(iter/s)": 0.150287 }, { "epoch": 0.6315263617165527, "grad_norm": 0.8762434124946594, "learning_rate": 8.190140902176107e-05, "loss": 0.9014989852905273, "memory(GiB)": 91.52, "step": 48670, "token_acc": 0.7667906794595652, "train_speed(iter/s)": 0.150283 }, { "epoch": 0.6315912401182084, "grad_norm": 0.8527013063430786, "learning_rate": 8.189727867910887e-05, "loss": 0.8799519538879395, "memory(GiB)": 91.52, "step": 48675, "token_acc": 0.7564674057987579, "train_speed(iter/s)": 0.150279 }, { "epoch": 0.6316561185198641, "grad_norm": 0.9317147135734558, "learning_rate": 8.189314796938483e-05, "loss": 0.9010416030883789, "memory(GiB)": 91.52, "step": 48680, "token_acc": 0.7484183103833272, "train_speed(iter/s)": 0.150276 }, { "epoch": 0.6317209969215198, "grad_norm": 0.8138973712921143, "learning_rate": 8.18890168926365e-05, "loss": 0.8641510009765625, "memory(GiB)": 91.52, "step": 48685, "token_acc": 0.775249948990002, "train_speed(iter/s)": 0.150272 }, { "epoch": 0.6317858753231755, "grad_norm": 0.7726796865463257, "learning_rate": 8.188488544891142e-05, "loss": 0.9484591484069824, "memory(GiB)": 91.52, "step": 48690, "token_acc": 0.7460661964188823, "train_speed(iter/s)": 0.150269 }, { "epoch": 0.6318507537248312, "grad_norm": 0.7567886114120483, "learning_rate": 8.188075363825713e-05, "loss": 0.887003231048584, "memory(GiB)": 91.52, "step": 48695, "token_acc": 0.7532160389866341, "train_speed(iter/s)": 0.150266 }, { "epoch": 0.6319156321264869, "grad_norm": 0.777923583984375, "learning_rate": 8.187662146072118e-05, "loss": 0.8954218864440918, "memory(GiB)": 91.52, "step": 48700, "token_acc": 0.7684301464538423, "train_speed(iter/s)": 0.150261 }, { "epoch": 0.6319805105281426, "grad_norm": 0.7882512807846069, "learning_rate": 8.187248891635112e-05, "loss": 0.9792285919189453, "memory(GiB)": 91.52, "step": 48705, "token_acc": 0.7360588972431078, "train_speed(iter/s)": 0.150258 }, { "epoch": 0.6320453889297983, "grad_norm": 0.7695382237434387, "learning_rate": 8.186835600519452e-05, "loss": 0.8684556007385253, "memory(GiB)": 91.52, "step": 48710, "token_acc": 0.7659058933875253, "train_speed(iter/s)": 0.150254 }, { "epoch": 0.632110267331454, "grad_norm": 0.7602781057357788, "learning_rate": 8.18642227272989e-05, "loss": 0.8906379699707031, "memory(GiB)": 91.52, "step": 48715, "token_acc": 0.7700696853223337, "train_speed(iter/s)": 0.15025 }, { "epoch": 0.6321751457331097, "grad_norm": 0.7485826015472412, "learning_rate": 8.186008908271188e-05, "loss": 0.8795688629150391, "memory(GiB)": 91.52, "step": 48720, "token_acc": 0.7480437604301873, "train_speed(iter/s)": 0.150246 }, { "epoch": 0.6322400241347654, "grad_norm": 0.7551665306091309, "learning_rate": 8.1855955071481e-05, "loss": 0.9067152976989746, "memory(GiB)": 91.52, "step": 48725, "token_acc": 0.7380480554867476, "train_speed(iter/s)": 0.150243 }, { "epoch": 0.6323049025364211, "grad_norm": 0.7293605208396912, "learning_rate": 8.185182069365384e-05, "loss": 0.9627855300903321, "memory(GiB)": 91.52, "step": 48730, "token_acc": 0.7383762132183023, "train_speed(iter/s)": 0.15024 }, { "epoch": 0.6323697809380768, "grad_norm": 0.7457404732704163, "learning_rate": 8.184768594927796e-05, "loss": 0.9116767883300781, "memory(GiB)": 91.52, "step": 48735, "token_acc": 0.7722795136216716, "train_speed(iter/s)": 0.150236 }, { "epoch": 0.6324346593397325, "grad_norm": 0.7757263779640198, "learning_rate": 8.184355083840098e-05, "loss": 0.9573565483093261, "memory(GiB)": 91.52, "step": 48740, "token_acc": 0.7307594430508598, "train_speed(iter/s)": 0.150233 }, { "epoch": 0.6324995377413882, "grad_norm": 0.7428579330444336, "learning_rate": 8.183941536107048e-05, "loss": 0.9357538223266602, "memory(GiB)": 91.52, "step": 48745, "token_acc": 0.7281900478579254, "train_speed(iter/s)": 0.15023 }, { "epoch": 0.6325644161430439, "grad_norm": 0.779048502445221, "learning_rate": 8.183527951733401e-05, "loss": 0.9391632080078125, "memory(GiB)": 91.52, "step": 48750, "token_acc": 0.7465942992703354, "train_speed(iter/s)": 0.150227 }, { "epoch": 0.6326292945446996, "grad_norm": 0.7269995212554932, "learning_rate": 8.18311433072392e-05, "loss": 0.8849333763122559, "memory(GiB)": 91.52, "step": 48755, "token_acc": 0.7405168986083499, "train_speed(iter/s)": 0.150222 }, { "epoch": 0.6326941729463553, "grad_norm": 0.8395586013793945, "learning_rate": 8.182700673083364e-05, "loss": 0.8812058448791504, "memory(GiB)": 91.52, "step": 48760, "token_acc": 0.7590949068521627, "train_speed(iter/s)": 0.150219 }, { "epoch": 0.632759051348011, "grad_norm": 0.7192484736442566, "learning_rate": 8.182286978816493e-05, "loss": 0.8987283706665039, "memory(GiB)": 91.52, "step": 48765, "token_acc": 0.7547512486216514, "train_speed(iter/s)": 0.150215 }, { "epoch": 0.6328239297496667, "grad_norm": 0.736068606376648, "learning_rate": 8.18187324792807e-05, "loss": 0.8464418411254883, "memory(GiB)": 91.52, "step": 48770, "token_acc": 0.7616749032678236, "train_speed(iter/s)": 0.150211 }, { "epoch": 0.6328888081513224, "grad_norm": 0.7533125281333923, "learning_rate": 8.181459480422853e-05, "loss": 0.8623542785644531, "memory(GiB)": 91.52, "step": 48775, "token_acc": 0.7633941822499258, "train_speed(iter/s)": 0.150208 }, { "epoch": 0.6329536865529781, "grad_norm": 0.7988277673721313, "learning_rate": 8.181045676305607e-05, "loss": 0.9738304138183593, "memory(GiB)": 91.52, "step": 48780, "token_acc": 0.7453272714151692, "train_speed(iter/s)": 0.150204 }, { "epoch": 0.6330185649546338, "grad_norm": 0.7434374094009399, "learning_rate": 8.180631835581089e-05, "loss": 0.9073654174804687, "memory(GiB)": 91.52, "step": 48785, "token_acc": 0.744416456759026, "train_speed(iter/s)": 0.1502 }, { "epoch": 0.6330834433562895, "grad_norm": 0.8623983860015869, "learning_rate": 8.180217958254068e-05, "loss": 0.9113882064819336, "memory(GiB)": 91.52, "step": 48790, "token_acc": 0.7601104209799862, "train_speed(iter/s)": 0.150197 }, { "epoch": 0.6331483217579452, "grad_norm": 0.7306343913078308, "learning_rate": 8.179804044329301e-05, "loss": 0.9235441207885742, "memory(GiB)": 91.52, "step": 48795, "token_acc": 0.7555805929159918, "train_speed(iter/s)": 0.150193 }, { "epoch": 0.6332132001596009, "grad_norm": 0.7872875928878784, "learning_rate": 8.179390093811554e-05, "loss": 0.8485476493835449, "memory(GiB)": 91.52, "step": 48800, "token_acc": 0.7714653045623545, "train_speed(iter/s)": 0.150189 }, { "epoch": 0.6332780785612566, "grad_norm": 0.7562640309333801, "learning_rate": 8.17897610670559e-05, "loss": 0.9215180397033691, "memory(GiB)": 91.52, "step": 48805, "token_acc": 0.7688939199857011, "train_speed(iter/s)": 0.150185 }, { "epoch": 0.6333429569629122, "grad_norm": 0.8203831315040588, "learning_rate": 8.178562083016175e-05, "loss": 0.8783920288085938, "memory(GiB)": 91.52, "step": 48810, "token_acc": 0.7546315226622212, "train_speed(iter/s)": 0.150181 }, { "epoch": 0.6334078353645679, "grad_norm": 0.8022199869155884, "learning_rate": 8.178148022748071e-05, "loss": 0.9468399047851562, "memory(GiB)": 91.52, "step": 48815, "token_acc": 0.7621605490898239, "train_speed(iter/s)": 0.150177 }, { "epoch": 0.6334727137662236, "grad_norm": 0.7382869124412537, "learning_rate": 8.177733925906045e-05, "loss": 0.892311954498291, "memory(GiB)": 91.52, "step": 48820, "token_acc": 0.7436615619657342, "train_speed(iter/s)": 0.150175 }, { "epoch": 0.6335375921678793, "grad_norm": 0.8396251797676086, "learning_rate": 8.17731979249486e-05, "loss": 0.9477767944335938, "memory(GiB)": 91.52, "step": 48825, "token_acc": 0.7627551020408163, "train_speed(iter/s)": 0.150172 }, { "epoch": 0.633602470569535, "grad_norm": 0.7123351097106934, "learning_rate": 8.176905622519285e-05, "loss": 0.8670343399047852, "memory(GiB)": 91.52, "step": 48830, "token_acc": 0.7782069326699617, "train_speed(iter/s)": 0.150168 }, { "epoch": 0.6336673489711907, "grad_norm": 0.8098528981208801, "learning_rate": 8.176491415984084e-05, "loss": 0.9259923934936524, "memory(GiB)": 91.52, "step": 48835, "token_acc": 0.7417527027496784, "train_speed(iter/s)": 0.150164 }, { "epoch": 0.6337322273728464, "grad_norm": 0.7245531678199768, "learning_rate": 8.176077172894025e-05, "loss": 0.8736003875732422, "memory(GiB)": 91.52, "step": 48840, "token_acc": 0.7652948072109749, "train_speed(iter/s)": 0.15016 }, { "epoch": 0.6337971057745021, "grad_norm": 0.7249601483345032, "learning_rate": 8.175662893253872e-05, "loss": 0.8915424346923828, "memory(GiB)": 91.52, "step": 48845, "token_acc": 0.7589987209939704, "train_speed(iter/s)": 0.150157 }, { "epoch": 0.6338619841761578, "grad_norm": 0.7267927527427673, "learning_rate": 8.175248577068397e-05, "loss": 0.9794862747192383, "memory(GiB)": 91.52, "step": 48850, "token_acc": 0.7559072120426923, "train_speed(iter/s)": 0.150153 }, { "epoch": 0.6339268625778135, "grad_norm": 0.7855822443962097, "learning_rate": 8.174834224342364e-05, "loss": 0.9011011123657227, "memory(GiB)": 91.52, "step": 48855, "token_acc": 0.7744774477447744, "train_speed(iter/s)": 0.15015 }, { "epoch": 0.6339917409794692, "grad_norm": 0.7998640537261963, "learning_rate": 8.174419835080545e-05, "loss": 0.9337017059326171, "memory(GiB)": 91.52, "step": 48860, "token_acc": 0.7666591456077015, "train_speed(iter/s)": 0.150146 }, { "epoch": 0.6340566193811249, "grad_norm": 0.6894059777259827, "learning_rate": 8.174005409287706e-05, "loss": 0.8485482215881348, "memory(GiB)": 91.52, "step": 48865, "token_acc": 0.7704595118648241, "train_speed(iter/s)": 0.150141 }, { "epoch": 0.6341214977827806, "grad_norm": 0.8220369815826416, "learning_rate": 8.173590946968617e-05, "loss": 0.8954757690429688, "memory(GiB)": 91.52, "step": 48870, "token_acc": 0.7520374672714534, "train_speed(iter/s)": 0.150138 }, { "epoch": 0.6341863761844363, "grad_norm": 0.807267427444458, "learning_rate": 8.173176448128047e-05, "loss": 0.9425742149353027, "memory(GiB)": 91.52, "step": 48875, "token_acc": 0.7566205182818602, "train_speed(iter/s)": 0.150135 }, { "epoch": 0.634251254586092, "grad_norm": 0.8062065839767456, "learning_rate": 8.172761912770768e-05, "loss": 0.9676765441894531, "memory(GiB)": 91.52, "step": 48880, "token_acc": 0.7342000515449156, "train_speed(iter/s)": 0.150131 }, { "epoch": 0.6343161329877477, "grad_norm": 0.7729055285453796, "learning_rate": 8.172347340901547e-05, "loss": 0.9371904373168946, "memory(GiB)": 91.52, "step": 48885, "token_acc": 0.7545369048033718, "train_speed(iter/s)": 0.150127 }, { "epoch": 0.6343810113894034, "grad_norm": 0.7747371196746826, "learning_rate": 8.171932732525159e-05, "loss": 0.956664752960205, "memory(GiB)": 91.52, "step": 48890, "token_acc": 0.7400515441401302, "train_speed(iter/s)": 0.150124 }, { "epoch": 0.6344458897910591, "grad_norm": 0.747932493686676, "learning_rate": 8.171518087646372e-05, "loss": 0.9109374046325683, "memory(GiB)": 91.52, "step": 48895, "token_acc": 0.7601757631822387, "train_speed(iter/s)": 0.15012 }, { "epoch": 0.6345107681927148, "grad_norm": 0.7784586548805237, "learning_rate": 8.17110340626996e-05, "loss": 0.9187057495117188, "memory(GiB)": 91.52, "step": 48900, "token_acc": 0.7510171372210578, "train_speed(iter/s)": 0.150116 }, { "epoch": 0.6345756465943705, "grad_norm": 0.8319282531738281, "learning_rate": 8.170688688400694e-05, "loss": 0.9109831809997558, "memory(GiB)": 91.52, "step": 48905, "token_acc": 0.7423479005998286, "train_speed(iter/s)": 0.150112 }, { "epoch": 0.6346405249960262, "grad_norm": 0.7733145356178284, "learning_rate": 8.170273934043345e-05, "loss": 0.8597244262695313, "memory(GiB)": 91.52, "step": 48910, "token_acc": 0.7560959248700319, "train_speed(iter/s)": 0.150109 }, { "epoch": 0.6347054033976819, "grad_norm": 0.6882641315460205, "learning_rate": 8.169859143202689e-05, "loss": 0.8830093383789063, "memory(GiB)": 91.52, "step": 48915, "token_acc": 0.7757864058155506, "train_speed(iter/s)": 0.150104 }, { "epoch": 0.6347702817993376, "grad_norm": 0.7378524541854858, "learning_rate": 8.1694443158835e-05, "loss": 0.9223185539245605, "memory(GiB)": 91.52, "step": 48920, "token_acc": 0.757453637660485, "train_speed(iter/s)": 0.1501 }, { "epoch": 0.6348351602009933, "grad_norm": 0.8493019342422485, "learning_rate": 8.169029452090547e-05, "loss": 0.8625117301940918, "memory(GiB)": 91.52, "step": 48925, "token_acc": 0.769950976452624, "train_speed(iter/s)": 0.150097 }, { "epoch": 0.634900038602649, "grad_norm": 0.7970516681671143, "learning_rate": 8.168614551828606e-05, "loss": 0.8591936111450196, "memory(GiB)": 91.52, "step": 48930, "token_acc": 0.769880208997069, "train_speed(iter/s)": 0.150093 }, { "epoch": 0.6349649170043047, "grad_norm": 0.7875433564186096, "learning_rate": 8.168199615102455e-05, "loss": 0.9274204254150391, "memory(GiB)": 91.52, "step": 48935, "token_acc": 0.7346337745132008, "train_speed(iter/s)": 0.150089 }, { "epoch": 0.6350297954059604, "grad_norm": 0.8148036599159241, "learning_rate": 8.167784641916865e-05, "loss": 0.9061050415039062, "memory(GiB)": 91.52, "step": 48940, "token_acc": 0.7486229344016024, "train_speed(iter/s)": 0.150086 }, { "epoch": 0.6350946738076161, "grad_norm": 0.6658105850219727, "learning_rate": 8.167369632276615e-05, "loss": 0.8548098564147949, "memory(GiB)": 91.52, "step": 48945, "token_acc": 0.7966015293118097, "train_speed(iter/s)": 0.150082 }, { "epoch": 0.6351595522092718, "grad_norm": 0.7619851231575012, "learning_rate": 8.166954586186478e-05, "loss": 0.9370454788208008, "memory(GiB)": 91.52, "step": 48950, "token_acc": 0.7435506417588004, "train_speed(iter/s)": 0.150078 }, { "epoch": 0.6352244306109275, "grad_norm": 0.7732502818107605, "learning_rate": 8.166539503651233e-05, "loss": 0.9085757255554199, "memory(GiB)": 91.52, "step": 48955, "token_acc": 0.7747813876300504, "train_speed(iter/s)": 0.150075 }, { "epoch": 0.6352893090125832, "grad_norm": 0.6731835603713989, "learning_rate": 8.166124384675652e-05, "loss": 0.8888862609863282, "memory(GiB)": 91.52, "step": 48960, "token_acc": 0.7557737708283436, "train_speed(iter/s)": 0.150072 }, { "epoch": 0.6353541874142389, "grad_norm": 0.7772451043128967, "learning_rate": 8.165709229264517e-05, "loss": 0.8979110717773438, "memory(GiB)": 91.52, "step": 48965, "token_acc": 0.7626063762225327, "train_speed(iter/s)": 0.150067 }, { "epoch": 0.6354190658158946, "grad_norm": 0.6981971263885498, "learning_rate": 8.165294037422606e-05, "loss": 0.9398526191711426, "memory(GiB)": 91.52, "step": 48970, "token_acc": 0.7531757859626712, "train_speed(iter/s)": 0.150064 }, { "epoch": 0.6354839442175503, "grad_norm": 0.7180098295211792, "learning_rate": 8.164878809154693e-05, "loss": 0.9080929756164551, "memory(GiB)": 91.52, "step": 48975, "token_acc": 0.7446041815247175, "train_speed(iter/s)": 0.150061 }, { "epoch": 0.635548822619206, "grad_norm": 0.7977572083473206, "learning_rate": 8.164463544465559e-05, "loss": 0.9605751991271972, "memory(GiB)": 91.52, "step": 48980, "token_acc": 0.7274131565066687, "train_speed(iter/s)": 0.150058 }, { "epoch": 0.6356137010208617, "grad_norm": 0.7201310396194458, "learning_rate": 8.164048243359981e-05, "loss": 0.8779549598693848, "memory(GiB)": 91.52, "step": 48985, "token_acc": 0.7536335788307371, "train_speed(iter/s)": 0.150054 }, { "epoch": 0.6356785794225174, "grad_norm": 0.7711949944496155, "learning_rate": 8.163632905842741e-05, "loss": 0.9315104484558105, "memory(GiB)": 91.52, "step": 48990, "token_acc": 0.7466915618448637, "train_speed(iter/s)": 0.15005 }, { "epoch": 0.6357434578241731, "grad_norm": 0.7634446620941162, "learning_rate": 8.163217531918616e-05, "loss": 0.9454336166381836, "memory(GiB)": 91.52, "step": 48995, "token_acc": 0.7357502755471579, "train_speed(iter/s)": 0.150047 }, { "epoch": 0.6358083362258288, "grad_norm": 0.7315118312835693, "learning_rate": 8.162802121592387e-05, "loss": 0.8892663955688477, "memory(GiB)": 91.52, "step": 49000, "token_acc": 0.7795919878558999, "train_speed(iter/s)": 0.150044 }, { "epoch": 0.6358732146274845, "grad_norm": 0.7811680436134338, "learning_rate": 8.162386674868837e-05, "loss": 0.8797781944274903, "memory(GiB)": 91.52, "step": 49005, "token_acc": 0.7698101950255435, "train_speed(iter/s)": 0.15004 }, { "epoch": 0.6359380930291402, "grad_norm": 0.6806870102882385, "learning_rate": 8.161971191752742e-05, "loss": 0.9081968307495117, "memory(GiB)": 91.52, "step": 49010, "token_acc": 0.7482543182653436, "train_speed(iter/s)": 0.150036 }, { "epoch": 0.6360029714307959, "grad_norm": 0.6900807619094849, "learning_rate": 8.161555672248887e-05, "loss": 0.8835399627685547, "memory(GiB)": 91.52, "step": 49015, "token_acc": 0.75903451995685, "train_speed(iter/s)": 0.150033 }, { "epoch": 0.6360678498324516, "grad_norm": 0.7540825605392456, "learning_rate": 8.161140116362053e-05, "loss": 0.8848972320556641, "memory(GiB)": 91.52, "step": 49020, "token_acc": 0.7644122191845952, "train_speed(iter/s)": 0.15003 }, { "epoch": 0.6361327282341073, "grad_norm": 0.768096923828125, "learning_rate": 8.160724524097021e-05, "loss": 0.8816213607788086, "memory(GiB)": 91.52, "step": 49025, "token_acc": 0.764778041861535, "train_speed(iter/s)": 0.150027 }, { "epoch": 0.636197606635763, "grad_norm": 0.8037617206573486, "learning_rate": 8.160308895458576e-05, "loss": 0.8857828140258789, "memory(GiB)": 91.52, "step": 49030, "token_acc": 0.760246533127889, "train_speed(iter/s)": 0.150024 }, { "epoch": 0.6362624850374187, "grad_norm": 0.8444920778274536, "learning_rate": 8.159893230451499e-05, "loss": 0.9306694984436035, "memory(GiB)": 91.52, "step": 49035, "token_acc": 0.7556053811659192, "train_speed(iter/s)": 0.15002 }, { "epoch": 0.6363273634390744, "grad_norm": 0.8149263262748718, "learning_rate": 8.159477529080574e-05, "loss": 0.929880428314209, "memory(GiB)": 91.52, "step": 49040, "token_acc": 0.7571025087223792, "train_speed(iter/s)": 0.150016 }, { "epoch": 0.6363922418407301, "grad_norm": 0.7728058099746704, "learning_rate": 8.159061791350585e-05, "loss": 0.933962345123291, "memory(GiB)": 91.52, "step": 49045, "token_acc": 0.7249163308517178, "train_speed(iter/s)": 0.150013 }, { "epoch": 0.6364571202423857, "grad_norm": 0.7036052346229553, "learning_rate": 8.158646017266316e-05, "loss": 0.8627494812011719, "memory(GiB)": 91.52, "step": 49050, "token_acc": 0.7584524696795176, "train_speed(iter/s)": 0.150009 }, { "epoch": 0.6365219986440414, "grad_norm": 0.7071139216423035, "learning_rate": 8.158230206832553e-05, "loss": 0.9134273529052734, "memory(GiB)": 91.52, "step": 49055, "token_acc": 0.7575334728728529, "train_speed(iter/s)": 0.150006 }, { "epoch": 0.636586877045697, "grad_norm": 0.9334726333618164, "learning_rate": 8.157814360054078e-05, "loss": 0.9156093597412109, "memory(GiB)": 91.52, "step": 49060, "token_acc": 0.7532436802460859, "train_speed(iter/s)": 0.150003 }, { "epoch": 0.6366517554473528, "grad_norm": 0.7215054035186768, "learning_rate": 8.15739847693568e-05, "loss": 0.8933921813964844, "memory(GiB)": 91.52, "step": 49065, "token_acc": 0.7695139911634757, "train_speed(iter/s)": 0.15 }, { "epoch": 0.6367166338490085, "grad_norm": 0.7431822419166565, "learning_rate": 8.156982557482144e-05, "loss": 0.9162097930908203, "memory(GiB)": 91.52, "step": 49070, "token_acc": 0.7495199860359574, "train_speed(iter/s)": 0.149997 }, { "epoch": 0.6367815122506642, "grad_norm": 0.8458612561225891, "learning_rate": 8.156566601698254e-05, "loss": 0.9349983215332032, "memory(GiB)": 91.52, "step": 49075, "token_acc": 0.7549460431654677, "train_speed(iter/s)": 0.149994 }, { "epoch": 0.6368463906523198, "grad_norm": 0.6946824193000793, "learning_rate": 8.156150609588802e-05, "loss": 0.9123139381408691, "memory(GiB)": 91.52, "step": 49080, "token_acc": 0.7505937387549478, "train_speed(iter/s)": 0.14999 }, { "epoch": 0.6369112690539755, "grad_norm": 0.7353752851486206, "learning_rate": 8.15573458115857e-05, "loss": 0.8742551803588867, "memory(GiB)": 91.52, "step": 49085, "token_acc": 0.7490927204155696, "train_speed(iter/s)": 0.149987 }, { "epoch": 0.6369761474556312, "grad_norm": 0.747835636138916, "learning_rate": 8.155318516412346e-05, "loss": 0.9068028450012207, "memory(GiB)": 91.52, "step": 49090, "token_acc": 0.7434489825794174, "train_speed(iter/s)": 0.149983 }, { "epoch": 0.637041025857287, "grad_norm": 0.8205708861351013, "learning_rate": 8.154902415354921e-05, "loss": 0.9489249229431153, "memory(GiB)": 91.52, "step": 49095, "token_acc": 0.7285475172029012, "train_speed(iter/s)": 0.14998 }, { "epoch": 0.6371059042589426, "grad_norm": 0.7806447744369507, "learning_rate": 8.154486277991082e-05, "loss": 0.8961312294006347, "memory(GiB)": 91.52, "step": 49100, "token_acc": 0.7521862620524703, "train_speed(iter/s)": 0.149976 }, { "epoch": 0.6371707826605983, "grad_norm": 0.797362208366394, "learning_rate": 8.154070104325618e-05, "loss": 0.8598583221435547, "memory(GiB)": 91.52, "step": 49105, "token_acc": 0.7719701111718608, "train_speed(iter/s)": 0.149973 }, { "epoch": 0.637235661062254, "grad_norm": 0.7860172986984253, "learning_rate": 8.153653894363318e-05, "loss": 0.9015816688537598, "memory(GiB)": 91.52, "step": 49110, "token_acc": 0.7521803871516699, "train_speed(iter/s)": 0.14997 }, { "epoch": 0.6373005394639097, "grad_norm": 0.7275068759918213, "learning_rate": 8.153237648108973e-05, "loss": 0.8621298789978027, "memory(GiB)": 91.52, "step": 49115, "token_acc": 0.8075564121095319, "train_speed(iter/s)": 0.149966 }, { "epoch": 0.6373654178655654, "grad_norm": 0.7138658761978149, "learning_rate": 8.152821365567372e-05, "loss": 0.8775721549987793, "memory(GiB)": 91.52, "step": 49120, "token_acc": 0.7607981999926229, "train_speed(iter/s)": 0.149963 }, { "epoch": 0.6374302962672211, "grad_norm": 0.7613016963005066, "learning_rate": 8.152405046743305e-05, "loss": 0.8877339363098145, "memory(GiB)": 91.52, "step": 49125, "token_acc": 0.7545785056573001, "train_speed(iter/s)": 0.149959 }, { "epoch": 0.6374951746688768, "grad_norm": 0.7446520924568176, "learning_rate": 8.151988691641563e-05, "loss": 0.8941215515136719, "memory(GiB)": 91.52, "step": 49130, "token_acc": 0.7778865769073847, "train_speed(iter/s)": 0.149956 }, { "epoch": 0.6375600530705325, "grad_norm": 0.8353784680366516, "learning_rate": 8.151572300266939e-05, "loss": 0.8993538856506348, "memory(GiB)": 91.52, "step": 49135, "token_acc": 0.7723217873503174, "train_speed(iter/s)": 0.149953 }, { "epoch": 0.6376249314721882, "grad_norm": 0.6718891859054565, "learning_rate": 8.151155872624223e-05, "loss": 0.8889755249023438, "memory(GiB)": 91.52, "step": 49140, "token_acc": 0.7523724202004114, "train_speed(iter/s)": 0.149949 }, { "epoch": 0.6376898098738439, "grad_norm": 0.7523990869522095, "learning_rate": 8.150739408718208e-05, "loss": 0.9479848861694335, "memory(GiB)": 91.52, "step": 49145, "token_acc": 0.7474981046247157, "train_speed(iter/s)": 0.149946 }, { "epoch": 0.6377546882754996, "grad_norm": 0.8140077590942383, "learning_rate": 8.150322908553687e-05, "loss": 0.8908072471618652, "memory(GiB)": 91.52, "step": 49150, "token_acc": 0.7721088435374149, "train_speed(iter/s)": 0.149942 }, { "epoch": 0.6378195666771553, "grad_norm": 0.7687255144119263, "learning_rate": 8.149906372135451e-05, "loss": 0.8681316375732422, "memory(GiB)": 91.52, "step": 49155, "token_acc": 0.7703224946695096, "train_speed(iter/s)": 0.149939 }, { "epoch": 0.637884445078811, "grad_norm": 0.7781481146812439, "learning_rate": 8.149489799468297e-05, "loss": 0.9204624176025391, "memory(GiB)": 91.52, "step": 49160, "token_acc": 0.7719427842320865, "train_speed(iter/s)": 0.149936 }, { "epoch": 0.6379493234804667, "grad_norm": 0.6766704320907593, "learning_rate": 8.149073190557019e-05, "loss": 0.887886905670166, "memory(GiB)": 91.52, "step": 49165, "token_acc": 0.7743079936435198, "train_speed(iter/s)": 0.149933 }, { "epoch": 0.6380142018821224, "grad_norm": 0.8224804401397705, "learning_rate": 8.148656545406408e-05, "loss": 0.8965963363647461, "memory(GiB)": 91.52, "step": 49170, "token_acc": 0.7677121416201654, "train_speed(iter/s)": 0.149931 }, { "epoch": 0.6380790802837781, "grad_norm": 0.6983288526535034, "learning_rate": 8.148239864021259e-05, "loss": 0.882144546508789, "memory(GiB)": 91.52, "step": 49175, "token_acc": 0.7554336283185841, "train_speed(iter/s)": 0.149928 }, { "epoch": 0.6381439586854338, "grad_norm": 0.8784216046333313, "learning_rate": 8.147823146406368e-05, "loss": 0.9373067855834961, "memory(GiB)": 91.52, "step": 49180, "token_acc": 0.7532218739115291, "train_speed(iter/s)": 0.149924 }, { "epoch": 0.6382088370870895, "grad_norm": 0.7974181771278381, "learning_rate": 8.147406392566531e-05, "loss": 0.9077917098999023, "memory(GiB)": 91.52, "step": 49185, "token_acc": 0.7518665061070016, "train_speed(iter/s)": 0.149921 }, { "epoch": 0.6382737154887452, "grad_norm": 0.6984332203865051, "learning_rate": 8.146989602506545e-05, "loss": 0.9268892288208008, "memory(GiB)": 91.52, "step": 49190, "token_acc": 0.7862135299775569, "train_speed(iter/s)": 0.149917 }, { "epoch": 0.6383385938904009, "grad_norm": 0.7455404996871948, "learning_rate": 8.146572776231205e-05, "loss": 0.8722537994384766, "memory(GiB)": 91.52, "step": 49195, "token_acc": 0.7691898363708927, "train_speed(iter/s)": 0.149913 }, { "epoch": 0.6384034722920566, "grad_norm": 0.7911468148231506, "learning_rate": 8.146155913745306e-05, "loss": 0.9052016258239746, "memory(GiB)": 91.52, "step": 49200, "token_acc": 0.743887506418612, "train_speed(iter/s)": 0.14991 }, { "epoch": 0.6384683506937123, "grad_norm": 0.834312379360199, "learning_rate": 8.14573901505365e-05, "loss": 0.9364454269409179, "memory(GiB)": 91.52, "step": 49205, "token_acc": 0.7310192709542606, "train_speed(iter/s)": 0.149907 }, { "epoch": 0.638533229095368, "grad_norm": 0.8087106347084045, "learning_rate": 8.145322080161031e-05, "loss": 0.9201543807983399, "memory(GiB)": 91.52, "step": 49210, "token_acc": 0.7470965158189827, "train_speed(iter/s)": 0.149904 }, { "epoch": 0.6385981074970237, "grad_norm": 0.7720547914505005, "learning_rate": 8.144905109072247e-05, "loss": 0.8806000709533691, "memory(GiB)": 91.52, "step": 49215, "token_acc": 0.7523608349900597, "train_speed(iter/s)": 0.149901 }, { "epoch": 0.6386629858986794, "grad_norm": 0.9461379647254944, "learning_rate": 8.144488101792099e-05, "loss": 0.9215453147888184, "memory(GiB)": 91.52, "step": 49220, "token_acc": 0.757549478118128, "train_speed(iter/s)": 0.149897 }, { "epoch": 0.6387278643003351, "grad_norm": 0.740497887134552, "learning_rate": 8.144071058325384e-05, "loss": 0.8757728576660156, "memory(GiB)": 91.52, "step": 49225, "token_acc": 0.7480539206379343, "train_speed(iter/s)": 0.149894 }, { "epoch": 0.6387927427019908, "grad_norm": 0.8272427320480347, "learning_rate": 8.143653978676899e-05, "loss": 0.9199735641479492, "memory(GiB)": 91.52, "step": 49230, "token_acc": 0.761049339819319, "train_speed(iter/s)": 0.149891 }, { "epoch": 0.6388576211036465, "grad_norm": 0.8170849084854126, "learning_rate": 8.143236862851449e-05, "loss": 0.9550283432006836, "memory(GiB)": 91.52, "step": 49235, "token_acc": 0.7344131351231418, "train_speed(iter/s)": 0.149888 }, { "epoch": 0.6389224995053022, "grad_norm": 0.7767136096954346, "learning_rate": 8.14281971085383e-05, "loss": 0.8825643539428711, "memory(GiB)": 91.52, "step": 49240, "token_acc": 0.7442774566473989, "train_speed(iter/s)": 0.149885 }, { "epoch": 0.6389873779069579, "grad_norm": 0.7032446265220642, "learning_rate": 8.142402522688846e-05, "loss": 0.9163648605346679, "memory(GiB)": 91.52, "step": 49245, "token_acc": 0.7580911152339723, "train_speed(iter/s)": 0.149882 }, { "epoch": 0.6390522563086136, "grad_norm": 0.7925724387168884, "learning_rate": 8.141985298361294e-05, "loss": 0.8805914878845215, "memory(GiB)": 91.52, "step": 49250, "token_acc": 0.7360756170138281, "train_speed(iter/s)": 0.149878 }, { "epoch": 0.6391171347102693, "grad_norm": 0.7828801870346069, "learning_rate": 8.141568037875978e-05, "loss": 0.9034282684326171, "memory(GiB)": 91.52, "step": 49255, "token_acc": 0.7471232238527836, "train_speed(iter/s)": 0.149875 }, { "epoch": 0.639182013111925, "grad_norm": 0.7602846622467041, "learning_rate": 8.141150741237699e-05, "loss": 0.8644323348999023, "memory(GiB)": 91.52, "step": 49260, "token_acc": 0.7669872355306872, "train_speed(iter/s)": 0.149871 }, { "epoch": 0.6392468915135807, "grad_norm": 0.7332777380943298, "learning_rate": 8.14073340845126e-05, "loss": 0.8716106414794922, "memory(GiB)": 91.52, "step": 49265, "token_acc": 0.7697707953855495, "train_speed(iter/s)": 0.149868 }, { "epoch": 0.6393117699152364, "grad_norm": 0.8129457235336304, "learning_rate": 8.140316039521464e-05, "loss": 0.9175817489624023, "memory(GiB)": 91.52, "step": 49270, "token_acc": 0.7591668556740001, "train_speed(iter/s)": 0.149864 }, { "epoch": 0.6393766483168921, "grad_norm": 0.7909567952156067, "learning_rate": 8.139898634453111e-05, "loss": 0.8495919227600097, "memory(GiB)": 91.52, "step": 49275, "token_acc": 0.7789302177714195, "train_speed(iter/s)": 0.149861 }, { "epoch": 0.6394415267185478, "grad_norm": 0.7807796597480774, "learning_rate": 8.139481193251009e-05, "loss": 0.8685233116149902, "memory(GiB)": 91.52, "step": 49280, "token_acc": 0.7974392442041146, "train_speed(iter/s)": 0.149857 }, { "epoch": 0.6395064051202035, "grad_norm": 0.7307697534561157, "learning_rate": 8.139063715919958e-05, "loss": 0.9246739387512207, "memory(GiB)": 91.52, "step": 49285, "token_acc": 0.7380152329749103, "train_speed(iter/s)": 0.149855 }, { "epoch": 0.6395712835218591, "grad_norm": 0.7863356471061707, "learning_rate": 8.138646202464764e-05, "loss": 0.8960694313049317, "memory(GiB)": 91.52, "step": 49290, "token_acc": 0.7557859583721652, "train_speed(iter/s)": 0.149851 }, { "epoch": 0.6396361619235148, "grad_norm": 0.7061884999275208, "learning_rate": 8.138228652890232e-05, "loss": 0.8795259475708008, "memory(GiB)": 91.52, "step": 49295, "token_acc": 0.7536431739405911, "train_speed(iter/s)": 0.149848 }, { "epoch": 0.6397010403251705, "grad_norm": 0.7805972695350647, "learning_rate": 8.137811067201165e-05, "loss": 0.8813577651977539, "memory(GiB)": 91.52, "step": 49300, "token_acc": 0.7527099820442907, "train_speed(iter/s)": 0.149844 }, { "epoch": 0.6397659187268262, "grad_norm": 0.7568279504776001, "learning_rate": 8.137393445402372e-05, "loss": 0.9140899658203125, "memory(GiB)": 91.52, "step": 49305, "token_acc": 0.7600831909678378, "train_speed(iter/s)": 0.14984 }, { "epoch": 0.6398307971284819, "grad_norm": 0.7594281435012817, "learning_rate": 8.136975787498655e-05, "loss": 0.9102043151855469, "memory(GiB)": 91.52, "step": 49310, "token_acc": 0.7457467554300317, "train_speed(iter/s)": 0.149837 }, { "epoch": 0.6398956755301376, "grad_norm": 0.7521346807479858, "learning_rate": 8.136558093494826e-05, "loss": 0.9045258522033691, "memory(GiB)": 91.52, "step": 49315, "token_acc": 0.7589504659146641, "train_speed(iter/s)": 0.149833 }, { "epoch": 0.6399605539317933, "grad_norm": 0.8290374875068665, "learning_rate": 8.136140363395686e-05, "loss": 0.9667557716369629, "memory(GiB)": 91.52, "step": 49320, "token_acc": 0.7447340753362578, "train_speed(iter/s)": 0.149829 }, { "epoch": 0.640025432333449, "grad_norm": 0.8006407618522644, "learning_rate": 8.135722597206044e-05, "loss": 0.8913063049316406, "memory(GiB)": 91.52, "step": 49325, "token_acc": 0.7692809983312776, "train_speed(iter/s)": 0.149826 }, { "epoch": 0.6400903107351047, "grad_norm": 0.8365021347999573, "learning_rate": 8.135304794930709e-05, "loss": 0.9480034828186035, "memory(GiB)": 91.52, "step": 49330, "token_acc": 0.7421778290007257, "train_speed(iter/s)": 0.149823 }, { "epoch": 0.6401551891367604, "grad_norm": 0.811591386795044, "learning_rate": 8.134886956574488e-05, "loss": 0.9398590087890625, "memory(GiB)": 91.52, "step": 49335, "token_acc": 0.7486336738155366, "train_speed(iter/s)": 0.149818 }, { "epoch": 0.6402200675384161, "grad_norm": 0.7928442358970642, "learning_rate": 8.13446908214219e-05, "loss": 0.8852805137634278, "memory(GiB)": 91.52, "step": 49340, "token_acc": 0.7575394770903443, "train_speed(iter/s)": 0.149814 }, { "epoch": 0.6402849459400718, "grad_norm": 0.7161261439323425, "learning_rate": 8.134051171638624e-05, "loss": 0.8431802749633789, "memory(GiB)": 91.52, "step": 49345, "token_acc": 0.7787243337702053, "train_speed(iter/s)": 0.149811 }, { "epoch": 0.6403498243417275, "grad_norm": 0.7987274527549744, "learning_rate": 8.133633225068597e-05, "loss": 0.9274738311767579, "memory(GiB)": 91.52, "step": 49350, "token_acc": 0.7475516046406508, "train_speed(iter/s)": 0.149808 }, { "epoch": 0.6404147027433832, "grad_norm": 0.7514729499816895, "learning_rate": 8.133215242436923e-05, "loss": 0.8428007125854492, "memory(GiB)": 91.52, "step": 49355, "token_acc": 0.7710268745852688, "train_speed(iter/s)": 0.149804 }, { "epoch": 0.6404795811450389, "grad_norm": 0.7626616358757019, "learning_rate": 8.132797223748407e-05, "loss": 0.9221768379211426, "memory(GiB)": 91.52, "step": 49360, "token_acc": 0.7567818654775177, "train_speed(iter/s)": 0.149801 }, { "epoch": 0.6405444595466946, "grad_norm": 0.7030034065246582, "learning_rate": 8.132379169007864e-05, "loss": 0.8991737365722656, "memory(GiB)": 91.52, "step": 49365, "token_acc": 0.7613969608104505, "train_speed(iter/s)": 0.149797 }, { "epoch": 0.6406093379483503, "grad_norm": 0.8156353831291199, "learning_rate": 8.131961078220103e-05, "loss": 0.9360214233398437, "memory(GiB)": 91.52, "step": 49370, "token_acc": 0.74823662850459, "train_speed(iter/s)": 0.149794 }, { "epoch": 0.640674216350006, "grad_norm": 0.7390492558479309, "learning_rate": 8.131542951389934e-05, "loss": 0.8673875808715821, "memory(GiB)": 91.52, "step": 49375, "token_acc": 0.7713650322990575, "train_speed(iter/s)": 0.14979 }, { "epoch": 0.6407390947516617, "grad_norm": 0.8674196600914001, "learning_rate": 8.131124788522173e-05, "loss": 0.8885915756225586, "memory(GiB)": 91.52, "step": 49380, "token_acc": 0.7450746776732905, "train_speed(iter/s)": 0.149787 }, { "epoch": 0.6408039731533174, "grad_norm": 0.7608280777931213, "learning_rate": 8.130706589621629e-05, "loss": 0.8915373802185058, "memory(GiB)": 91.52, "step": 49385, "token_acc": 0.7524341208919018, "train_speed(iter/s)": 0.149783 }, { "epoch": 0.6408688515549731, "grad_norm": 0.8490849137306213, "learning_rate": 8.130288354693112e-05, "loss": 0.9156222343444824, "memory(GiB)": 91.52, "step": 49390, "token_acc": 0.7513554868110301, "train_speed(iter/s)": 0.14978 }, { "epoch": 0.6409337299566288, "grad_norm": 0.739051342010498, "learning_rate": 8.12987008374144e-05, "loss": 0.9609882354736328, "memory(GiB)": 91.52, "step": 49395, "token_acc": 0.7176293751342065, "train_speed(iter/s)": 0.149776 }, { "epoch": 0.6409986083582845, "grad_norm": 0.763847827911377, "learning_rate": 8.129451776771424e-05, "loss": 0.928800392150879, "memory(GiB)": 91.52, "step": 49400, "token_acc": 0.7368217179379579, "train_speed(iter/s)": 0.149772 }, { "epoch": 0.6410634867599402, "grad_norm": 0.8567410707473755, "learning_rate": 8.129033433787881e-05, "loss": 0.8695401191711426, "memory(GiB)": 91.52, "step": 49405, "token_acc": 0.7703539178561025, "train_speed(iter/s)": 0.149769 }, { "epoch": 0.6411283651615959, "grad_norm": 0.7679409384727478, "learning_rate": 8.128615054795619e-05, "loss": 0.8816486358642578, "memory(GiB)": 91.52, "step": 49410, "token_acc": 0.7500176317088653, "train_speed(iter/s)": 0.149766 }, { "epoch": 0.6411932435632516, "grad_norm": 0.7298591136932373, "learning_rate": 8.128196639799458e-05, "loss": 0.936060905456543, "memory(GiB)": 91.52, "step": 49415, "token_acc": 0.7415058439793422, "train_speed(iter/s)": 0.149763 }, { "epoch": 0.6412581219649073, "grad_norm": 0.8142129182815552, "learning_rate": 8.127778188804212e-05, "loss": 0.8652677536010742, "memory(GiB)": 91.52, "step": 49420, "token_acc": 0.7533501147553121, "train_speed(iter/s)": 0.149759 }, { "epoch": 0.641323000366563, "grad_norm": 0.7314475774765015, "learning_rate": 8.127359701814693e-05, "loss": 0.8961074829101563, "memory(GiB)": 91.52, "step": 49425, "token_acc": 0.752857717272675, "train_speed(iter/s)": 0.149755 }, { "epoch": 0.6413878787682187, "grad_norm": 0.7935310006141663, "learning_rate": 8.126941178835724e-05, "loss": 0.9308669090270996, "memory(GiB)": 91.52, "step": 49430, "token_acc": 0.7547856108011467, "train_speed(iter/s)": 0.149751 }, { "epoch": 0.6414527571698744, "grad_norm": 0.7516577839851379, "learning_rate": 8.126522619872115e-05, "loss": 0.923919677734375, "memory(GiB)": 91.52, "step": 49435, "token_acc": 0.750364190173487, "train_speed(iter/s)": 0.149748 }, { "epoch": 0.6415176355715301, "grad_norm": 0.8481065630912781, "learning_rate": 8.126104024928685e-05, "loss": 0.8966983795166016, "memory(GiB)": 91.52, "step": 49440, "token_acc": 0.764020676957938, "train_speed(iter/s)": 0.149745 }, { "epoch": 0.6415825139731858, "grad_norm": 0.779757559299469, "learning_rate": 8.12568539401025e-05, "loss": 0.9102634429931641, "memory(GiB)": 91.52, "step": 49445, "token_acc": 0.7521058502250086, "train_speed(iter/s)": 0.149742 }, { "epoch": 0.6416473923748415, "grad_norm": 0.7452539205551147, "learning_rate": 8.12526672712163e-05, "loss": 0.8805130004882813, "memory(GiB)": 91.52, "step": 49450, "token_acc": 0.7786472326706072, "train_speed(iter/s)": 0.149739 }, { "epoch": 0.6417122707764972, "grad_norm": 0.7784113883972168, "learning_rate": 8.124848024267641e-05, "loss": 0.8901300430297852, "memory(GiB)": 91.52, "step": 49455, "token_acc": 0.7671496669507845, "train_speed(iter/s)": 0.149736 }, { "epoch": 0.6417771491781529, "grad_norm": 0.718606173992157, "learning_rate": 8.124429285453104e-05, "loss": 0.9267077445983887, "memory(GiB)": 91.52, "step": 49460, "token_acc": 0.7621597190835449, "train_speed(iter/s)": 0.149732 }, { "epoch": 0.6418420275798086, "grad_norm": 0.6953202486038208, "learning_rate": 8.124010510682834e-05, "loss": 0.9276451110839844, "memory(GiB)": 91.52, "step": 49465, "token_acc": 0.747795702436157, "train_speed(iter/s)": 0.149729 }, { "epoch": 0.6419069059814643, "grad_norm": 0.8006352782249451, "learning_rate": 8.123591699961653e-05, "loss": 0.9050453186035157, "memory(GiB)": 91.52, "step": 49470, "token_acc": 0.7565370678679855, "train_speed(iter/s)": 0.149726 }, { "epoch": 0.64197178438312, "grad_norm": 0.7534513473510742, "learning_rate": 8.123172853294378e-05, "loss": 0.9156845092773438, "memory(GiB)": 91.52, "step": 49475, "token_acc": 0.7469414674361088, "train_speed(iter/s)": 0.149723 }, { "epoch": 0.6420366627847757, "grad_norm": 0.7413085699081421, "learning_rate": 8.122753970685832e-05, "loss": 0.9174403190612793, "memory(GiB)": 91.52, "step": 49480, "token_acc": 0.7471193016488846, "train_speed(iter/s)": 0.14972 }, { "epoch": 0.6421015411864314, "grad_norm": 0.8011002540588379, "learning_rate": 8.122335052140834e-05, "loss": 0.9305213928222656, "memory(GiB)": 91.52, "step": 49485, "token_acc": 0.7478432074458564, "train_speed(iter/s)": 0.149716 }, { "epoch": 0.6421664195880871, "grad_norm": 0.7401541471481323, "learning_rate": 8.121916097664206e-05, "loss": 0.9008993148803711, "memory(GiB)": 91.52, "step": 49490, "token_acc": 0.7607056480295917, "train_speed(iter/s)": 0.149712 }, { "epoch": 0.6422312979897428, "grad_norm": 0.7609550952911377, "learning_rate": 8.121497107260769e-05, "loss": 0.9005534172058105, "memory(GiB)": 91.52, "step": 49495, "token_acc": 0.7441179510145027, "train_speed(iter/s)": 0.149709 }, { "epoch": 0.6422961763913985, "grad_norm": 0.6770529747009277, "learning_rate": 8.121078080935342e-05, "loss": 0.8878915786743165, "memory(GiB)": 91.52, "step": 49500, "token_acc": 0.742347455909183, "train_speed(iter/s)": 0.149706 }, { "epoch": 0.6423610547930542, "grad_norm": 0.7459476590156555, "learning_rate": 8.12065901869275e-05, "loss": 0.8828920364379883, "memory(GiB)": 91.52, "step": 49505, "token_acc": 0.7463905281520247, "train_speed(iter/s)": 0.149702 }, { "epoch": 0.6424259331947099, "grad_norm": 0.7412424087524414, "learning_rate": 8.120239920537816e-05, "loss": 0.902066707611084, "memory(GiB)": 91.52, "step": 49510, "token_acc": 0.7551346315929557, "train_speed(iter/s)": 0.149699 }, { "epoch": 0.6424908115963656, "grad_norm": 0.8403762578964233, "learning_rate": 8.119820786475361e-05, "loss": 0.8954699516296387, "memory(GiB)": 91.52, "step": 49515, "token_acc": 0.7759892445768686, "train_speed(iter/s)": 0.149696 }, { "epoch": 0.6425556899980213, "grad_norm": 0.8010275959968567, "learning_rate": 8.119401616510209e-05, "loss": 0.8961166381835938, "memory(GiB)": 91.52, "step": 49520, "token_acc": 0.7667532046081453, "train_speed(iter/s)": 0.149693 }, { "epoch": 0.6426205683996769, "grad_norm": 0.8348206281661987, "learning_rate": 8.118982410647185e-05, "loss": 0.8906021118164062, "memory(GiB)": 91.52, "step": 49525, "token_acc": 0.7636350475914734, "train_speed(iter/s)": 0.14969 }, { "epoch": 0.6426854468013325, "grad_norm": 0.6829365491867065, "learning_rate": 8.118563168891112e-05, "loss": 0.8782393455505371, "memory(GiB)": 91.52, "step": 49530, "token_acc": 0.7737431650475128, "train_speed(iter/s)": 0.149687 }, { "epoch": 0.6427503252029882, "grad_norm": 0.7752141952514648, "learning_rate": 8.118143891246812e-05, "loss": 0.87767333984375, "memory(GiB)": 91.52, "step": 49535, "token_acc": 0.7840619939856581, "train_speed(iter/s)": 0.149684 }, { "epoch": 0.642815203604644, "grad_norm": 0.7703221440315247, "learning_rate": 8.117724577719115e-05, "loss": 0.8661949157714843, "memory(GiB)": 91.52, "step": 49540, "token_acc": 0.764570311135943, "train_speed(iter/s)": 0.14968 }, { "epoch": 0.6428800820062996, "grad_norm": 0.7471147775650024, "learning_rate": 8.117305228312843e-05, "loss": 0.8930585861206055, "memory(GiB)": 91.52, "step": 49545, "token_acc": 0.7764919941775837, "train_speed(iter/s)": 0.149677 }, { "epoch": 0.6429449604079553, "grad_norm": 0.6845870018005371, "learning_rate": 8.116885843032826e-05, "loss": 0.829263973236084, "memory(GiB)": 91.52, "step": 49550, "token_acc": 0.7705596488477818, "train_speed(iter/s)": 0.149673 }, { "epoch": 0.643009838809611, "grad_norm": 0.8042904734611511, "learning_rate": 8.116466421883885e-05, "loss": 0.8925285339355469, "memory(GiB)": 91.52, "step": 49555, "token_acc": 0.7559523809523809, "train_speed(iter/s)": 0.14967 }, { "epoch": 0.6430747172112667, "grad_norm": 0.8192145228385925, "learning_rate": 8.116046964870849e-05, "loss": 0.8679651260375977, "memory(GiB)": 91.52, "step": 49560, "token_acc": 0.7840884389653138, "train_speed(iter/s)": 0.149667 }, { "epoch": 0.6431395956129224, "grad_norm": 0.8307878971099854, "learning_rate": 8.115627471998546e-05, "loss": 0.9444107055664063, "memory(GiB)": 91.52, "step": 49565, "token_acc": 0.7379088963619688, "train_speed(iter/s)": 0.149663 }, { "epoch": 0.6432044740145781, "grad_norm": 0.7462762594223022, "learning_rate": 8.115207943271804e-05, "loss": 0.8691464424133301, "memory(GiB)": 91.52, "step": 49570, "token_acc": 0.760077451152966, "train_speed(iter/s)": 0.14966 }, { "epoch": 0.6432693524162338, "grad_norm": 0.7684543132781982, "learning_rate": 8.114788378695446e-05, "loss": 0.9179780960083008, "memory(GiB)": 91.52, "step": 49575, "token_acc": 0.7595955742083175, "train_speed(iter/s)": 0.149657 }, { "epoch": 0.6433342308178895, "grad_norm": 0.715913712978363, "learning_rate": 8.114368778274305e-05, "loss": 0.9169155120849609, "memory(GiB)": 91.52, "step": 49580, "token_acc": 0.7469323954669793, "train_speed(iter/s)": 0.149654 }, { "epoch": 0.6433991092195452, "grad_norm": 0.8659994006156921, "learning_rate": 8.11394914201321e-05, "loss": 0.8931743621826171, "memory(GiB)": 91.52, "step": 49585, "token_acc": 0.7582130229817324, "train_speed(iter/s)": 0.14965 }, { "epoch": 0.6434639876212009, "grad_norm": 0.7547791600227356, "learning_rate": 8.113529469916987e-05, "loss": 0.925896167755127, "memory(GiB)": 91.52, "step": 49590, "token_acc": 0.7548415052831572, "train_speed(iter/s)": 0.149646 }, { "epoch": 0.6435288660228566, "grad_norm": 0.777090311050415, "learning_rate": 8.113109761990467e-05, "loss": 0.9109709739685059, "memory(GiB)": 91.52, "step": 49595, "token_acc": 0.7470586052035779, "train_speed(iter/s)": 0.149643 }, { "epoch": 0.6435937444245123, "grad_norm": 0.791471540927887, "learning_rate": 8.112690018238482e-05, "loss": 0.9082378387451172, "memory(GiB)": 91.52, "step": 49600, "token_acc": 0.7531229140841041, "train_speed(iter/s)": 0.14964 }, { "epoch": 0.643658622826168, "grad_norm": 0.6956110000610352, "learning_rate": 8.11227023866586e-05, "loss": 0.9125131607055664, "memory(GiB)": 91.52, "step": 49605, "token_acc": 0.7612552460892789, "train_speed(iter/s)": 0.149637 }, { "epoch": 0.6437235012278237, "grad_norm": 0.7669951915740967, "learning_rate": 8.111850423277431e-05, "loss": 0.9102482795715332, "memory(GiB)": 91.52, "step": 49610, "token_acc": 0.7541873669268985, "train_speed(iter/s)": 0.149634 }, { "epoch": 0.6437883796294794, "grad_norm": 0.8195140957832336, "learning_rate": 8.11143057207803e-05, "loss": 0.8527503967285156, "memory(GiB)": 91.52, "step": 49615, "token_acc": 0.7528836824894893, "train_speed(iter/s)": 0.14963 }, { "epoch": 0.6438532580311351, "grad_norm": 0.7864144444465637, "learning_rate": 8.111010685072484e-05, "loss": 0.9499554634094238, "memory(GiB)": 91.52, "step": 49620, "token_acc": 0.7468145567879733, "train_speed(iter/s)": 0.149627 }, { "epoch": 0.6439181364327908, "grad_norm": 0.8229067325592041, "learning_rate": 8.110590762265629e-05, "loss": 0.9335556983947754, "memory(GiB)": 91.52, "step": 49625, "token_acc": 0.7526595744680851, "train_speed(iter/s)": 0.149624 }, { "epoch": 0.6439830148344465, "grad_norm": 0.6919890642166138, "learning_rate": 8.110170803662295e-05, "loss": 0.9019654273986817, "memory(GiB)": 91.52, "step": 49630, "token_acc": 0.7669929972476744, "train_speed(iter/s)": 0.149621 }, { "epoch": 0.6440478932361022, "grad_norm": 0.7480737566947937, "learning_rate": 8.109750809267315e-05, "loss": 0.890414047241211, "memory(GiB)": 91.52, "step": 49635, "token_acc": 0.771016916446903, "train_speed(iter/s)": 0.149617 }, { "epoch": 0.6441127716377579, "grad_norm": 0.6954650282859802, "learning_rate": 8.109330779085524e-05, "loss": 0.9231174468994141, "memory(GiB)": 91.52, "step": 49640, "token_acc": 0.7654716759842058, "train_speed(iter/s)": 0.149614 }, { "epoch": 0.6441776500394136, "grad_norm": 0.7322177290916443, "learning_rate": 8.108910713121753e-05, "loss": 0.8609831809997559, "memory(GiB)": 91.52, "step": 49645, "token_acc": 0.7655250860796852, "train_speed(iter/s)": 0.14961 }, { "epoch": 0.6442425284410693, "grad_norm": 0.781663715839386, "learning_rate": 8.108490611380839e-05, "loss": 0.9427590370178223, "memory(GiB)": 91.52, "step": 49650, "token_acc": 0.7709828187742742, "train_speed(iter/s)": 0.149607 }, { "epoch": 0.644307406842725, "grad_norm": 0.7204242944717407, "learning_rate": 8.108070473867615e-05, "loss": 0.9020479202270508, "memory(GiB)": 91.52, "step": 49655, "token_acc": 0.7714170665413912, "train_speed(iter/s)": 0.149604 }, { "epoch": 0.6443722852443807, "grad_norm": 0.7511578798294067, "learning_rate": 8.107650300586916e-05, "loss": 0.8784638404846191, "memory(GiB)": 91.52, "step": 49660, "token_acc": 0.7495442820292347, "train_speed(iter/s)": 0.149601 }, { "epoch": 0.6444371636460364, "grad_norm": 0.8493988513946533, "learning_rate": 8.107230091543576e-05, "loss": 0.931217098236084, "memory(GiB)": 91.52, "step": 49665, "token_acc": 0.7573357844870254, "train_speed(iter/s)": 0.149597 }, { "epoch": 0.6445020420476921, "grad_norm": 1.0437849760055542, "learning_rate": 8.106809846742432e-05, "loss": 0.8893369674682617, "memory(GiB)": 91.52, "step": 49670, "token_acc": 0.767685083546774, "train_speed(iter/s)": 0.149592 }, { "epoch": 0.6445669204493478, "grad_norm": 0.8303335905075073, "learning_rate": 8.106389566188322e-05, "loss": 0.9082963943481446, "memory(GiB)": 91.52, "step": 49675, "token_acc": 0.7523133626097867, "train_speed(iter/s)": 0.149588 }, { "epoch": 0.6446317988510035, "grad_norm": 0.8619617819786072, "learning_rate": 8.105969249886081e-05, "loss": 0.8885624885559082, "memory(GiB)": 91.52, "step": 49680, "token_acc": 0.773571829312564, "train_speed(iter/s)": 0.149583 }, { "epoch": 0.6446966772526592, "grad_norm": 0.7896336913108826, "learning_rate": 8.105548897840543e-05, "loss": 0.8695259094238281, "memory(GiB)": 91.52, "step": 49685, "token_acc": 0.7557399937946013, "train_speed(iter/s)": 0.149581 }, { "epoch": 0.6447615556543149, "grad_norm": 0.7612184882164001, "learning_rate": 8.10512851005655e-05, "loss": 0.9080355644226075, "memory(GiB)": 91.52, "step": 49690, "token_acc": 0.757543835802637, "train_speed(iter/s)": 0.149577 }, { "epoch": 0.6448264340559706, "grad_norm": 0.7769936919212341, "learning_rate": 8.104708086538939e-05, "loss": 0.9133729934692383, "memory(GiB)": 91.52, "step": 49695, "token_acc": 0.7495701796100523, "train_speed(iter/s)": 0.149573 }, { "epoch": 0.6448913124576263, "grad_norm": 0.8015763759613037, "learning_rate": 8.104287627292546e-05, "loss": 0.9091745376586914, "memory(GiB)": 91.52, "step": 49700, "token_acc": 0.7692332800208898, "train_speed(iter/s)": 0.14957 }, { "epoch": 0.644956190859282, "grad_norm": 0.7398888468742371, "learning_rate": 8.10386713232221e-05, "loss": 0.8774441719055176, "memory(GiB)": 91.52, "step": 49705, "token_acc": 0.7483443708609272, "train_speed(iter/s)": 0.149567 }, { "epoch": 0.6450210692609377, "grad_norm": 0.7622784376144409, "learning_rate": 8.103446601632772e-05, "loss": 0.9193469047546386, "memory(GiB)": 91.52, "step": 49710, "token_acc": 0.7715140104108982, "train_speed(iter/s)": 0.149564 }, { "epoch": 0.6450859476625934, "grad_norm": 0.843925416469574, "learning_rate": 8.10302603522907e-05, "loss": 0.9532188415527344, "memory(GiB)": 91.52, "step": 49715, "token_acc": 0.7430970572660565, "train_speed(iter/s)": 0.149561 }, { "epoch": 0.6451508260642491, "grad_norm": 0.7860636115074158, "learning_rate": 8.102605433115944e-05, "loss": 0.9298406600952148, "memory(GiB)": 91.52, "step": 49720, "token_acc": 0.7482152974504249, "train_speed(iter/s)": 0.149558 }, { "epoch": 0.6452157044659048, "grad_norm": 0.8038303852081299, "learning_rate": 8.102184795298233e-05, "loss": 0.9524885177612304, "memory(GiB)": 91.52, "step": 49725, "token_acc": 0.752457127363926, "train_speed(iter/s)": 0.149555 }, { "epoch": 0.6452805828675605, "grad_norm": 0.7793116569519043, "learning_rate": 8.101764121780781e-05, "loss": 0.869835090637207, "memory(GiB)": 91.52, "step": 49730, "token_acc": 0.7638860541738345, "train_speed(iter/s)": 0.149551 }, { "epoch": 0.6453454612692162, "grad_norm": 0.819462239742279, "learning_rate": 8.101343412568427e-05, "loss": 0.9136237144470215, "memory(GiB)": 91.52, "step": 49735, "token_acc": 0.7609682881773399, "train_speed(iter/s)": 0.149548 }, { "epoch": 0.6454103396708719, "grad_norm": 0.8123688697814941, "learning_rate": 8.10092266766601e-05, "loss": 0.9029814720153808, "memory(GiB)": 91.52, "step": 49740, "token_acc": 0.7616117003945987, "train_speed(iter/s)": 0.149544 }, { "epoch": 0.6454752180725276, "grad_norm": 0.8042370676994324, "learning_rate": 8.100501887078379e-05, "loss": 0.882938003540039, "memory(GiB)": 91.52, "step": 49745, "token_acc": 0.7637772571634529, "train_speed(iter/s)": 0.14954 }, { "epoch": 0.6455400964741833, "grad_norm": 0.7980333566665649, "learning_rate": 8.10008107081037e-05, "loss": 0.9437199592590332, "memory(GiB)": 91.52, "step": 49750, "token_acc": 0.7448697291216976, "train_speed(iter/s)": 0.149536 }, { "epoch": 0.645604974875839, "grad_norm": 0.7344643473625183, "learning_rate": 8.099660218866826e-05, "loss": 0.9031504631042481, "memory(GiB)": 91.52, "step": 49755, "token_acc": 0.7650081214572214, "train_speed(iter/s)": 0.149534 }, { "epoch": 0.6456698532774947, "grad_norm": 0.8118243217468262, "learning_rate": 8.099239331252593e-05, "loss": 0.8931957244873047, "memory(GiB)": 91.52, "step": 49760, "token_acc": 0.7607803588816247, "train_speed(iter/s)": 0.14953 }, { "epoch": 0.6457347316791503, "grad_norm": 0.7809919714927673, "learning_rate": 8.098818407972513e-05, "loss": 0.8428853988647461, "memory(GiB)": 91.52, "step": 49765, "token_acc": 0.7655705996131528, "train_speed(iter/s)": 0.149526 }, { "epoch": 0.645799610080806, "grad_norm": 0.7749234437942505, "learning_rate": 8.09839744903143e-05, "loss": 0.9226024627685547, "memory(GiB)": 91.52, "step": 49770, "token_acc": 0.7446697867914717, "train_speed(iter/s)": 0.149522 }, { "epoch": 0.6458644884824617, "grad_norm": 0.7439854145050049, "learning_rate": 8.097976454434188e-05, "loss": 0.9684085845947266, "memory(GiB)": 91.52, "step": 49775, "token_acc": 0.7425029660843595, "train_speed(iter/s)": 0.149518 }, { "epoch": 0.6459293668841174, "grad_norm": 0.8020468950271606, "learning_rate": 8.097555424185632e-05, "loss": 0.8873924255371094, "memory(GiB)": 91.52, "step": 49780, "token_acc": 0.7516547159404302, "train_speed(iter/s)": 0.149515 }, { "epoch": 0.6459942452857731, "grad_norm": 0.7793342471122742, "learning_rate": 8.09713435829061e-05, "loss": 0.9176784515380859, "memory(GiB)": 91.52, "step": 49785, "token_acc": 0.7600083330439915, "train_speed(iter/s)": 0.149512 }, { "epoch": 0.6460591236874288, "grad_norm": 0.764305830001831, "learning_rate": 8.096713256753963e-05, "loss": 0.9257417678833008, "memory(GiB)": 91.52, "step": 49790, "token_acc": 0.76764620096252, "train_speed(iter/s)": 0.149509 }, { "epoch": 0.6461240020890845, "grad_norm": 0.7119724750518799, "learning_rate": 8.09629211958054e-05, "loss": 0.9519252777099609, "memory(GiB)": 91.52, "step": 49795, "token_acc": 0.7417989417989418, "train_speed(iter/s)": 0.149506 }, { "epoch": 0.6461888804907402, "grad_norm": 0.7105868458747864, "learning_rate": 8.095870946775187e-05, "loss": 0.9191635131835938, "memory(GiB)": 91.52, "step": 49800, "token_acc": 0.7524732309124768, "train_speed(iter/s)": 0.149502 }, { "epoch": 0.6462537588923959, "grad_norm": 0.872441291809082, "learning_rate": 8.095449738342749e-05, "loss": 0.8813852310180664, "memory(GiB)": 91.52, "step": 49805, "token_acc": 0.7549481945544043, "train_speed(iter/s)": 0.149498 }, { "epoch": 0.6463186372940516, "grad_norm": 0.7580994367599487, "learning_rate": 8.095028494288076e-05, "loss": 0.8757678985595703, "memory(GiB)": 91.52, "step": 49810, "token_acc": 0.7607051878209534, "train_speed(iter/s)": 0.149496 }, { "epoch": 0.6463835156957073, "grad_norm": 0.6574327349662781, "learning_rate": 8.094607214616013e-05, "loss": 0.8765929222106934, "memory(GiB)": 91.52, "step": 49815, "token_acc": 0.768918918918919, "train_speed(iter/s)": 0.149492 }, { "epoch": 0.646448394097363, "grad_norm": 0.8176712989807129, "learning_rate": 8.094185899331411e-05, "loss": 0.9295467376708985, "memory(GiB)": 91.52, "step": 49820, "token_acc": 0.7433087901734305, "train_speed(iter/s)": 0.14949 }, { "epoch": 0.6465132724990187, "grad_norm": 0.8267173171043396, "learning_rate": 8.093764548439116e-05, "loss": 0.9382197380065918, "memory(GiB)": 91.52, "step": 49825, "token_acc": 0.737993853445403, "train_speed(iter/s)": 0.149488 }, { "epoch": 0.6465781509006744, "grad_norm": 0.7990130186080933, "learning_rate": 8.093343161943977e-05, "loss": 0.9418999671936035, "memory(GiB)": 91.52, "step": 49830, "token_acc": 0.7531972984624228, "train_speed(iter/s)": 0.149484 }, { "epoch": 0.6466430293023301, "grad_norm": 0.7034149169921875, "learning_rate": 8.092921739850845e-05, "loss": 0.8771387100219726, "memory(GiB)": 91.52, "step": 49835, "token_acc": 0.763580358760645, "train_speed(iter/s)": 0.149481 }, { "epoch": 0.6467079077039858, "grad_norm": 0.8309453129768372, "learning_rate": 8.092500282164568e-05, "loss": 0.878571605682373, "memory(GiB)": 91.52, "step": 49840, "token_acc": 0.7504637206764866, "train_speed(iter/s)": 0.149478 }, { "epoch": 0.6467727861056415, "grad_norm": 0.8026105761528015, "learning_rate": 8.092078788889996e-05, "loss": 0.9287920951843261, "memory(GiB)": 91.52, "step": 49845, "token_acc": 0.7545421773612112, "train_speed(iter/s)": 0.149475 }, { "epoch": 0.6468376645072972, "grad_norm": 0.78240567445755, "learning_rate": 8.091657260031982e-05, "loss": 0.8649457931518555, "memory(GiB)": 91.52, "step": 49850, "token_acc": 0.7688927943760984, "train_speed(iter/s)": 0.149471 }, { "epoch": 0.6469025429089529, "grad_norm": 0.8009052872657776, "learning_rate": 8.091235695595376e-05, "loss": 0.9558885574340821, "memory(GiB)": 91.52, "step": 49855, "token_acc": 0.7479809748779008, "train_speed(iter/s)": 0.149468 }, { "epoch": 0.6469674213106086, "grad_norm": 0.8247900605201721, "learning_rate": 8.090814095585027e-05, "loss": 0.8944664001464844, "memory(GiB)": 91.52, "step": 49860, "token_acc": 0.7643295896485298, "train_speed(iter/s)": 0.149464 }, { "epoch": 0.6470322997122643, "grad_norm": 0.7782905101776123, "learning_rate": 8.090392460005787e-05, "loss": 0.9022778511047364, "memory(GiB)": 91.52, "step": 49865, "token_acc": 0.7585406859448555, "train_speed(iter/s)": 0.14946 }, { "epoch": 0.64709717811392, "grad_norm": 0.7343138456344604, "learning_rate": 8.08997078886251e-05, "loss": 0.9367918014526367, "memory(GiB)": 91.52, "step": 49870, "token_acc": 0.7572097609070741, "train_speed(iter/s)": 0.149457 }, { "epoch": 0.6471620565155757, "grad_norm": 1.1025704145431519, "learning_rate": 8.08954908216005e-05, "loss": 0.9190423965454102, "memory(GiB)": 91.52, "step": 49875, "token_acc": 0.7573792300656855, "train_speed(iter/s)": 0.149453 }, { "epoch": 0.6472269349172314, "grad_norm": 0.6906318068504333, "learning_rate": 8.089127339903257e-05, "loss": 0.8873078346252441, "memory(GiB)": 91.52, "step": 49880, "token_acc": 0.7625788033781372, "train_speed(iter/s)": 0.149449 }, { "epoch": 0.6472918133188871, "grad_norm": 0.7307195663452148, "learning_rate": 8.088705562096985e-05, "loss": 0.9154319763183594, "memory(GiB)": 91.52, "step": 49885, "token_acc": 0.7509757226963022, "train_speed(iter/s)": 0.149445 }, { "epoch": 0.6473566917205428, "grad_norm": 0.719063401222229, "learning_rate": 8.08828374874609e-05, "loss": 0.8450551986694336, "memory(GiB)": 91.52, "step": 49890, "token_acc": 0.7703200513704417, "train_speed(iter/s)": 0.149442 }, { "epoch": 0.6474215701221985, "grad_norm": 0.8773766756057739, "learning_rate": 8.087861899855422e-05, "loss": 0.9266036987304688, "memory(GiB)": 91.52, "step": 49895, "token_acc": 0.7477652249990491, "train_speed(iter/s)": 0.149439 }, { "epoch": 0.6474864485238542, "grad_norm": 0.8257165551185608, "learning_rate": 8.087440015429838e-05, "loss": 0.9041548728942871, "memory(GiB)": 91.52, "step": 49900, "token_acc": 0.73745989809398, "train_speed(iter/s)": 0.149436 }, { "epoch": 0.6475513269255099, "grad_norm": 0.7525587677955627, "learning_rate": 8.087018095474193e-05, "loss": 0.9184247970581054, "memory(GiB)": 91.52, "step": 49905, "token_acc": 0.7618821010052942, "train_speed(iter/s)": 0.149432 }, { "epoch": 0.6476162053271656, "grad_norm": 0.7251690626144409, "learning_rate": 8.086596139993345e-05, "loss": 0.8722124099731445, "memory(GiB)": 91.52, "step": 49910, "token_acc": 0.7745385544247024, "train_speed(iter/s)": 0.149429 }, { "epoch": 0.6476810837288213, "grad_norm": 0.7712453603744507, "learning_rate": 8.086174148992145e-05, "loss": 0.9285568237304688, "memory(GiB)": 91.52, "step": 49915, "token_acc": 0.7503263504693231, "train_speed(iter/s)": 0.149426 }, { "epoch": 0.647745962130477, "grad_norm": 0.7280023097991943, "learning_rate": 8.085752122475452e-05, "loss": 0.9208690643310546, "memory(GiB)": 91.52, "step": 49920, "token_acc": 0.7444260688472375, "train_speed(iter/s)": 0.149422 }, { "epoch": 0.6478108405321327, "grad_norm": 0.8236974477767944, "learning_rate": 8.085330060448123e-05, "loss": 0.9236909866333007, "memory(GiB)": 91.52, "step": 49925, "token_acc": 0.7491144203322213, "train_speed(iter/s)": 0.149419 }, { "epoch": 0.6478757189337884, "grad_norm": 0.8174692392349243, "learning_rate": 8.084907962915014e-05, "loss": 0.8933017730712891, "memory(GiB)": 91.52, "step": 49930, "token_acc": 0.7671602546360365, "train_speed(iter/s)": 0.149415 }, { "epoch": 0.6479405973354441, "grad_norm": 0.7878639101982117, "learning_rate": 8.08448582988098e-05, "loss": 0.8730754852294922, "memory(GiB)": 91.52, "step": 49935, "token_acc": 0.7778846484146886, "train_speed(iter/s)": 0.149411 }, { "epoch": 0.6480054757370998, "grad_norm": 0.6847003102302551, "learning_rate": 8.084063661350885e-05, "loss": 0.9154720306396484, "memory(GiB)": 91.52, "step": 49940, "token_acc": 0.770963153616906, "train_speed(iter/s)": 0.149407 }, { "epoch": 0.6480703541387555, "grad_norm": 0.7220338582992554, "learning_rate": 8.083641457329583e-05, "loss": 0.8488328933715821, "memory(GiB)": 91.52, "step": 49945, "token_acc": 0.7749425287356322, "train_speed(iter/s)": 0.149404 }, { "epoch": 0.6481352325404112, "grad_norm": 0.8450242877006531, "learning_rate": 8.083219217821933e-05, "loss": 0.9229957580566406, "memory(GiB)": 91.52, "step": 49950, "token_acc": 0.7287074363152929, "train_speed(iter/s)": 0.149401 }, { "epoch": 0.6482001109420669, "grad_norm": 0.7669990658760071, "learning_rate": 8.082796942832794e-05, "loss": 0.863624382019043, "memory(GiB)": 91.52, "step": 49955, "token_acc": 0.7635025654874427, "train_speed(iter/s)": 0.149397 }, { "epoch": 0.6482649893437226, "grad_norm": 0.7436892986297607, "learning_rate": 8.082374632367027e-05, "loss": 0.9195348739624023, "memory(GiB)": 91.52, "step": 49960, "token_acc": 0.751958319670638, "train_speed(iter/s)": 0.149395 }, { "epoch": 0.6483298677453783, "grad_norm": 0.7104483246803284, "learning_rate": 8.081952286429492e-05, "loss": 0.8831453323364258, "memory(GiB)": 91.52, "step": 49965, "token_acc": 0.7814105009831832, "train_speed(iter/s)": 0.149391 }, { "epoch": 0.648394746147034, "grad_norm": 0.9152425527572632, "learning_rate": 8.081529905025047e-05, "loss": 0.8651420593261718, "memory(GiB)": 91.52, "step": 49970, "token_acc": 0.7724816739511994, "train_speed(iter/s)": 0.149388 }, { "epoch": 0.6484596245486897, "grad_norm": 0.8281003832817078, "learning_rate": 8.081107488158555e-05, "loss": 0.9295171737670899, "memory(GiB)": 91.52, "step": 49975, "token_acc": 0.7543650028907304, "train_speed(iter/s)": 0.149384 }, { "epoch": 0.6485245029503454, "grad_norm": 0.669120192527771, "learning_rate": 8.080685035834876e-05, "loss": 0.8772963523864746, "memory(GiB)": 91.52, "step": 49980, "token_acc": 0.7541004572049749, "train_speed(iter/s)": 0.149381 }, { "epoch": 0.6485893813520011, "grad_norm": 0.7748560905456543, "learning_rate": 8.080262548058873e-05, "loss": 0.8436472892761231, "memory(GiB)": 91.52, "step": 49985, "token_acc": 0.7626232399911803, "train_speed(iter/s)": 0.149378 }, { "epoch": 0.6486542597536568, "grad_norm": 0.7690054774284363, "learning_rate": 8.079840024835405e-05, "loss": 0.9035470008850097, "memory(GiB)": 91.52, "step": 49990, "token_acc": 0.7458051828591367, "train_speed(iter/s)": 0.149374 }, { "epoch": 0.6487191381553125, "grad_norm": 0.8035492300987244, "learning_rate": 8.079417466169337e-05, "loss": 0.8970311164855957, "memory(GiB)": 91.52, "step": 49995, "token_acc": 0.7571863375042273, "train_speed(iter/s)": 0.149371 }, { "epoch": 0.6487840165569682, "grad_norm": 0.8012113571166992, "learning_rate": 8.078994872065531e-05, "loss": 0.891107177734375, "memory(GiB)": 91.52, "step": 50000, "token_acc": 0.7675172620834584, "train_speed(iter/s)": 0.149368 }, { "epoch": 0.6487840165569682, "eval_loss": 0.8969811201095581, "eval_runtime": 1730.4986, "eval_samples_per_second": 28.79, "eval_steps_per_second": 1.799, "eval_token_acc": 0.7567947643572003, "step": 50000 }, { "epoch": 0.6488488949586237, "grad_norm": 0.7809414267539978, "learning_rate": 8.07857224252885e-05, "loss": 0.8743377685546875, "memory(GiB)": 91.52, "step": 50005, "token_acc": 0.7582202334892171, "train_speed(iter/s)": 0.148542 }, { "epoch": 0.6489137733602794, "grad_norm": 0.8822034597396851, "learning_rate": 8.078149577564158e-05, "loss": 0.9219618797302246, "memory(GiB)": 91.52, "step": 50010, "token_acc": 0.7553322553322553, "train_speed(iter/s)": 0.148538 }, { "epoch": 0.6489786517619351, "grad_norm": 0.7114306688308716, "learning_rate": 8.077726877176319e-05, "loss": 0.88906888961792, "memory(GiB)": 91.52, "step": 50015, "token_acc": 0.7537795659595221, "train_speed(iter/s)": 0.148534 }, { "epoch": 0.6490435301635908, "grad_norm": 0.7355203628540039, "learning_rate": 8.077304141370197e-05, "loss": 0.9082328796386718, "memory(GiB)": 91.52, "step": 50020, "token_acc": 0.7618772563176895, "train_speed(iter/s)": 0.14853 }, { "epoch": 0.6491084085652465, "grad_norm": 0.8333708047866821, "learning_rate": 8.076881370150658e-05, "loss": 0.9021080017089844, "memory(GiB)": 91.52, "step": 50025, "token_acc": 0.7498265361190347, "train_speed(iter/s)": 0.148526 }, { "epoch": 0.6491732869669022, "grad_norm": 0.8165570497512817, "learning_rate": 8.076458563522565e-05, "loss": 0.9493127822875976, "memory(GiB)": 91.52, "step": 50030, "token_acc": 0.7403465015810805, "train_speed(iter/s)": 0.148524 }, { "epoch": 0.6492381653685579, "grad_norm": 0.8394230008125305, "learning_rate": 8.076035721490785e-05, "loss": 0.8882699966430664, "memory(GiB)": 91.52, "step": 50035, "token_acc": 0.7451316404424365, "train_speed(iter/s)": 0.148521 }, { "epoch": 0.6493030437702136, "grad_norm": 0.6190715432167053, "learning_rate": 8.075612844060183e-05, "loss": 0.8697001457214355, "memory(GiB)": 91.52, "step": 50040, "token_acc": 0.7487583423870867, "train_speed(iter/s)": 0.148519 }, { "epoch": 0.6493679221718693, "grad_norm": 0.8296608924865723, "learning_rate": 8.075189931235627e-05, "loss": 0.9036872863769532, "memory(GiB)": 91.52, "step": 50045, "token_acc": 0.7705478387868618, "train_speed(iter/s)": 0.148515 }, { "epoch": 0.649432800573525, "grad_norm": 0.8372585773468018, "learning_rate": 8.074766983021985e-05, "loss": 0.9083732604980469, "memory(GiB)": 91.52, "step": 50050, "token_acc": 0.7604371272454555, "train_speed(iter/s)": 0.148512 }, { "epoch": 0.6494976789751807, "grad_norm": 0.8183494806289673, "learning_rate": 8.07434399942412e-05, "loss": 0.9075660705566406, "memory(GiB)": 91.52, "step": 50055, "token_acc": 0.7493893502686859, "train_speed(iter/s)": 0.148509 }, { "epoch": 0.6495625573768364, "grad_norm": 0.8560284972190857, "learning_rate": 8.073920980446903e-05, "loss": 0.9098323822021485, "memory(GiB)": 91.52, "step": 50060, "token_acc": 0.7532983914693656, "train_speed(iter/s)": 0.148505 }, { "epoch": 0.6496274357784921, "grad_norm": 0.7871177196502686, "learning_rate": 8.073497926095202e-05, "loss": 0.926630973815918, "memory(GiB)": 91.52, "step": 50065, "token_acc": 0.7438265127667016, "train_speed(iter/s)": 0.148502 }, { "epoch": 0.6496923141801478, "grad_norm": 0.7221119403839111, "learning_rate": 8.073074836373884e-05, "loss": 0.8975900650024414, "memory(GiB)": 91.52, "step": 50070, "token_acc": 0.7766093052899936, "train_speed(iter/s)": 0.148499 }, { "epoch": 0.6497571925818035, "grad_norm": 0.8547495603561401, "learning_rate": 8.072651711287818e-05, "loss": 0.9554193496704102, "memory(GiB)": 91.52, "step": 50075, "token_acc": 0.7294220665499125, "train_speed(iter/s)": 0.148496 }, { "epoch": 0.6498220709834592, "grad_norm": 0.762579619884491, "learning_rate": 8.072228550841874e-05, "loss": 0.915799331665039, "memory(GiB)": 91.52, "step": 50080, "token_acc": 0.7479854288552821, "train_speed(iter/s)": 0.148493 }, { "epoch": 0.6498869493851149, "grad_norm": 0.7034903168678284, "learning_rate": 8.071805355040922e-05, "loss": 0.8907659530639649, "memory(GiB)": 91.52, "step": 50085, "token_acc": 0.7693761814744802, "train_speed(iter/s)": 0.148489 }, { "epoch": 0.6499518277867706, "grad_norm": 0.6925817728042603, "learning_rate": 8.071382123889831e-05, "loss": 0.9145734786987305, "memory(GiB)": 91.52, "step": 50090, "token_acc": 0.7537886298614849, "train_speed(iter/s)": 0.148485 }, { "epoch": 0.6500167061884263, "grad_norm": 0.7679932713508606, "learning_rate": 8.070958857393474e-05, "loss": 0.8923372268676758, "memory(GiB)": 91.52, "step": 50095, "token_acc": 0.7421642213253875, "train_speed(iter/s)": 0.148481 }, { "epoch": 0.650081584590082, "grad_norm": 0.7855610847473145, "learning_rate": 8.070535555556717e-05, "loss": 0.9052095413208008, "memory(GiB)": 91.52, "step": 50100, "token_acc": 0.769878719338611, "train_speed(iter/s)": 0.148478 }, { "epoch": 0.6501464629917377, "grad_norm": 0.7409092783927917, "learning_rate": 8.070112218384438e-05, "loss": 0.8779951095581054, "memory(GiB)": 91.52, "step": 50105, "token_acc": 0.7729055586198443, "train_speed(iter/s)": 0.148475 }, { "epoch": 0.6502113413933934, "grad_norm": 0.8580477833747864, "learning_rate": 8.069688845881502e-05, "loss": 0.9126468658447265, "memory(GiB)": 91.52, "step": 50110, "token_acc": 0.7718220338983051, "train_speed(iter/s)": 0.148473 }, { "epoch": 0.6502762197950491, "grad_norm": 0.86109459400177, "learning_rate": 8.069265438052787e-05, "loss": 0.9013313293457031, "memory(GiB)": 91.52, "step": 50115, "token_acc": 0.7506954846993367, "train_speed(iter/s)": 0.148471 }, { "epoch": 0.6503410981967048, "grad_norm": 0.828089714050293, "learning_rate": 8.06884199490316e-05, "loss": 0.8593656539916992, "memory(GiB)": 91.52, "step": 50120, "token_acc": 0.7662309681126113, "train_speed(iter/s)": 0.148468 }, { "epoch": 0.6504059765983605, "grad_norm": 0.7371439337730408, "learning_rate": 8.068418516437499e-05, "loss": 0.8768285751342774, "memory(GiB)": 91.52, "step": 50125, "token_acc": 0.7443233245444886, "train_speed(iter/s)": 0.148464 }, { "epoch": 0.6504708550000162, "grad_norm": 0.8709443807601929, "learning_rate": 8.067995002660676e-05, "loss": 0.8707813262939453, "memory(GiB)": 91.52, "step": 50130, "token_acc": 0.7684984855041108, "train_speed(iter/s)": 0.148461 }, { "epoch": 0.6505357334016719, "grad_norm": 0.696226179599762, "learning_rate": 8.067571453577561e-05, "loss": 0.8780205726623536, "memory(GiB)": 91.52, "step": 50135, "token_acc": 0.7641651850112382, "train_speed(iter/s)": 0.148459 }, { "epoch": 0.6506006118033276, "grad_norm": 0.7895345687866211, "learning_rate": 8.067147869193032e-05, "loss": 0.8632966995239257, "memory(GiB)": 91.52, "step": 50140, "token_acc": 0.7418397626112759, "train_speed(iter/s)": 0.148455 }, { "epoch": 0.6506654902049833, "grad_norm": 0.7383675575256348, "learning_rate": 8.066724249511965e-05, "loss": 0.9342191696166993, "memory(GiB)": 91.52, "step": 50145, "token_acc": 0.7424490573224147, "train_speed(iter/s)": 0.148452 }, { "epoch": 0.650730368606639, "grad_norm": 0.7908023595809937, "learning_rate": 8.066300594539231e-05, "loss": 0.8891054153442383, "memory(GiB)": 91.52, "step": 50150, "token_acc": 0.7660723933009185, "train_speed(iter/s)": 0.148449 }, { "epoch": 0.6507952470082947, "grad_norm": 0.8011146187782288, "learning_rate": 8.065876904279708e-05, "loss": 0.8848503112792969, "memory(GiB)": 91.52, "step": 50155, "token_acc": 0.7594158034686669, "train_speed(iter/s)": 0.148445 }, { "epoch": 0.6508601254099504, "grad_norm": 0.7112859487533569, "learning_rate": 8.065453178738269e-05, "loss": 0.9417307853698731, "memory(GiB)": 91.52, "step": 50160, "token_acc": 0.7417947119439656, "train_speed(iter/s)": 0.148442 }, { "epoch": 0.6509250038116061, "grad_norm": 0.7026748657226562, "learning_rate": 8.065029417919794e-05, "loss": 0.9182506561279297, "memory(GiB)": 91.52, "step": 50165, "token_acc": 0.760431500996867, "train_speed(iter/s)": 0.148439 }, { "epoch": 0.6509898822132618, "grad_norm": 0.8927445411682129, "learning_rate": 8.064605621829158e-05, "loss": 0.9441389083862305, "memory(GiB)": 91.52, "step": 50170, "token_acc": 0.7508680555555556, "train_speed(iter/s)": 0.148436 }, { "epoch": 0.6510547606149175, "grad_norm": 0.7876848578453064, "learning_rate": 8.064181790471236e-05, "loss": 0.9091596603393555, "memory(GiB)": 91.52, "step": 50175, "token_acc": 0.7695987654320988, "train_speed(iter/s)": 0.148434 }, { "epoch": 0.6511196390165732, "grad_norm": 0.7779720425605774, "learning_rate": 8.063757923850908e-05, "loss": 0.9034275054931641, "memory(GiB)": 91.52, "step": 50180, "token_acc": 0.7639614682364793, "train_speed(iter/s)": 0.14843 }, { "epoch": 0.6511845174182289, "grad_norm": 0.8298590779304504, "learning_rate": 8.06333402197305e-05, "loss": 0.9065010070800781, "memory(GiB)": 91.52, "step": 50185, "token_acc": 0.7630462294349736, "train_speed(iter/s)": 0.148427 }, { "epoch": 0.6512493958198846, "grad_norm": 0.7828567624092102, "learning_rate": 8.062910084842543e-05, "loss": 0.8466794967651368, "memory(GiB)": 91.52, "step": 50190, "token_acc": 0.78483149454392, "train_speed(iter/s)": 0.148422 }, { "epoch": 0.6513142742215403, "grad_norm": 0.7183248400688171, "learning_rate": 8.062486112464264e-05, "loss": 0.9552367210388184, "memory(GiB)": 91.52, "step": 50195, "token_acc": 0.7387912952281069, "train_speed(iter/s)": 0.14842 }, { "epoch": 0.651379152623196, "grad_norm": 0.7692142128944397, "learning_rate": 8.06206210484309e-05, "loss": 0.9468685150146484, "memory(GiB)": 91.52, "step": 50200, "token_acc": 0.7520271041912521, "train_speed(iter/s)": 0.148418 }, { "epoch": 0.6514440310248517, "grad_norm": 0.7898207306861877, "learning_rate": 8.061638061983903e-05, "loss": 0.93313627243042, "memory(GiB)": 91.52, "step": 50205, "token_acc": 0.7646866520644676, "train_speed(iter/s)": 0.148414 }, { "epoch": 0.6515089094265074, "grad_norm": 0.7134591937065125, "learning_rate": 8.061213983891582e-05, "loss": 0.8630653381347656, "memory(GiB)": 91.52, "step": 50210, "token_acc": 0.7585258127579466, "train_speed(iter/s)": 0.148411 }, { "epoch": 0.6515737878281631, "grad_norm": 0.7920807600021362, "learning_rate": 8.06078987057101e-05, "loss": 0.8773507118225098, "memory(GiB)": 91.52, "step": 50215, "token_acc": 0.7508614605388799, "train_speed(iter/s)": 0.148409 }, { "epoch": 0.6516386662298188, "grad_norm": 0.8546293377876282, "learning_rate": 8.060365722027062e-05, "loss": 0.9089637756347656, "memory(GiB)": 91.52, "step": 50220, "token_acc": 0.7561794324076899, "train_speed(iter/s)": 0.148406 }, { "epoch": 0.6517035446314745, "grad_norm": 0.6959245800971985, "learning_rate": 8.059941538264624e-05, "loss": 0.871709156036377, "memory(GiB)": 91.52, "step": 50225, "token_acc": 0.7590343818580834, "train_speed(iter/s)": 0.148403 }, { "epoch": 0.6517684230331302, "grad_norm": 0.7992310523986816, "learning_rate": 8.059517319288574e-05, "loss": 0.9047691345214843, "memory(GiB)": 91.52, "step": 50230, "token_acc": 0.7690642564180464, "train_speed(iter/s)": 0.1484 }, { "epoch": 0.6518333014347859, "grad_norm": 0.7468308210372925, "learning_rate": 8.059093065103798e-05, "loss": 0.8893978118896484, "memory(GiB)": 91.52, "step": 50235, "token_acc": 0.7661021089487827, "train_speed(iter/s)": 0.148397 }, { "epoch": 0.6518981798364416, "grad_norm": 0.8853887319564819, "learning_rate": 8.058668775715174e-05, "loss": 0.9505510330200195, "memory(GiB)": 91.52, "step": 50240, "token_acc": 0.7635352331419077, "train_speed(iter/s)": 0.148394 }, { "epoch": 0.6519630582380972, "grad_norm": 0.737646222114563, "learning_rate": 8.058244451127587e-05, "loss": 0.9219614028930664, "memory(GiB)": 91.52, "step": 50245, "token_acc": 0.7519415912100498, "train_speed(iter/s)": 0.14839 }, { "epoch": 0.6520279366397529, "grad_norm": 0.8618151545524597, "learning_rate": 8.057820091345921e-05, "loss": 0.8812939643859863, "memory(GiB)": 91.52, "step": 50250, "token_acc": 0.7566637375512595, "train_speed(iter/s)": 0.148386 }, { "epoch": 0.6520928150414086, "grad_norm": 0.8120272755622864, "learning_rate": 8.057395696375057e-05, "loss": 0.9089238166809082, "memory(GiB)": 91.52, "step": 50255, "token_acc": 0.7525548726953468, "train_speed(iter/s)": 0.148384 }, { "epoch": 0.6521576934430643, "grad_norm": 0.7430034279823303, "learning_rate": 8.056971266219879e-05, "loss": 0.8744579315185547, "memory(GiB)": 91.52, "step": 50260, "token_acc": 0.7713444914474894, "train_speed(iter/s)": 0.14838 }, { "epoch": 0.65222257184472, "grad_norm": 0.6758373975753784, "learning_rate": 8.056546800885274e-05, "loss": 0.9152981758117675, "memory(GiB)": 91.52, "step": 50265, "token_acc": 0.7528986016442977, "train_speed(iter/s)": 0.148377 }, { "epoch": 0.6522874502463757, "grad_norm": 0.7271171808242798, "learning_rate": 8.056122300376125e-05, "loss": 0.8867252349853516, "memory(GiB)": 91.52, "step": 50270, "token_acc": 0.7703526082000078, "train_speed(iter/s)": 0.148373 }, { "epoch": 0.6523523286480314, "grad_norm": 0.7285391688346863, "learning_rate": 8.055697764697318e-05, "loss": 0.8829450607299805, "memory(GiB)": 91.52, "step": 50275, "token_acc": 0.7555985915492958, "train_speed(iter/s)": 0.14837 }, { "epoch": 0.6524172070496871, "grad_norm": 0.6947794556617737, "learning_rate": 8.055273193853736e-05, "loss": 0.8594798088073731, "memory(GiB)": 91.52, "step": 50280, "token_acc": 0.754775655264327, "train_speed(iter/s)": 0.148366 }, { "epoch": 0.6524820854513428, "grad_norm": 0.8839151859283447, "learning_rate": 8.054848587850268e-05, "loss": 0.9117395401000976, "memory(GiB)": 91.52, "step": 50285, "token_acc": 0.7607343905220942, "train_speed(iter/s)": 0.148363 }, { "epoch": 0.6525469638529985, "grad_norm": 0.8686055541038513, "learning_rate": 8.054423946691798e-05, "loss": 0.9078523635864257, "memory(GiB)": 91.52, "step": 50290, "token_acc": 0.7586613592805543, "train_speed(iter/s)": 0.14836 }, { "epoch": 0.6526118422546542, "grad_norm": 0.7189289927482605, "learning_rate": 8.053999270383215e-05, "loss": 0.917024803161621, "memory(GiB)": 91.52, "step": 50295, "token_acc": 0.7437507499100108, "train_speed(iter/s)": 0.148358 }, { "epoch": 0.6526767206563099, "grad_norm": 0.8885534405708313, "learning_rate": 8.053574558929404e-05, "loss": 0.8968761444091797, "memory(GiB)": 91.52, "step": 50300, "token_acc": 0.7425689615649117, "train_speed(iter/s)": 0.148354 }, { "epoch": 0.6527415990579656, "grad_norm": 0.7023662328720093, "learning_rate": 8.053149812335255e-05, "loss": 0.8642881393432618, "memory(GiB)": 91.52, "step": 50305, "token_acc": 0.7804093950925075, "train_speed(iter/s)": 0.148351 }, { "epoch": 0.6528064774596213, "grad_norm": 0.6856695413589478, "learning_rate": 8.052725030605652e-05, "loss": 0.8789201736450195, "memory(GiB)": 91.52, "step": 50310, "token_acc": 0.7498186538388119, "train_speed(iter/s)": 0.148347 }, { "epoch": 0.652871355861277, "grad_norm": 0.7585020065307617, "learning_rate": 8.052300213745487e-05, "loss": 0.8986637115478515, "memory(GiB)": 91.52, "step": 50315, "token_acc": 0.753036718261956, "train_speed(iter/s)": 0.148344 }, { "epoch": 0.6529362342629327, "grad_norm": 0.8005415201187134, "learning_rate": 8.051875361759647e-05, "loss": 0.9013477325439453, "memory(GiB)": 91.52, "step": 50320, "token_acc": 0.7478874826304127, "train_speed(iter/s)": 0.148341 }, { "epoch": 0.6530011126645884, "grad_norm": 0.7918955683708191, "learning_rate": 8.051450474653024e-05, "loss": 0.9065361022949219, "memory(GiB)": 91.52, "step": 50325, "token_acc": 0.7610365976949518, "train_speed(iter/s)": 0.148338 }, { "epoch": 0.6530659910662441, "grad_norm": 0.7740020751953125, "learning_rate": 8.051025552430503e-05, "loss": 0.8580790519714355, "memory(GiB)": 91.52, "step": 50330, "token_acc": 0.7791479820627802, "train_speed(iter/s)": 0.148335 }, { "epoch": 0.6531308694678998, "grad_norm": 0.7484338879585266, "learning_rate": 8.050600595096977e-05, "loss": 0.930565357208252, "memory(GiB)": 91.52, "step": 50335, "token_acc": 0.7455627666822044, "train_speed(iter/s)": 0.148332 }, { "epoch": 0.6531957478695555, "grad_norm": 0.8309153914451599, "learning_rate": 8.050175602657336e-05, "loss": 0.9183873176574707, "memory(GiB)": 91.52, "step": 50340, "token_acc": 0.7470182976516779, "train_speed(iter/s)": 0.148329 }, { "epoch": 0.6532606262712112, "grad_norm": 0.6339640617370605, "learning_rate": 8.04975057511647e-05, "loss": 0.8947417259216308, "memory(GiB)": 91.52, "step": 50345, "token_acc": 0.7418492154009517, "train_speed(iter/s)": 0.148325 }, { "epoch": 0.6533255046728669, "grad_norm": 0.8073968291282654, "learning_rate": 8.049325512479271e-05, "loss": 0.8578238487243652, "memory(GiB)": 91.52, "step": 50350, "token_acc": 0.7644063630298136, "train_speed(iter/s)": 0.148322 }, { "epoch": 0.6533903830745226, "grad_norm": 0.6522947549819946, "learning_rate": 8.048900414750631e-05, "loss": 0.8807729721069336, "memory(GiB)": 91.52, "step": 50355, "token_acc": 0.7640949554896143, "train_speed(iter/s)": 0.148318 }, { "epoch": 0.6534552614761783, "grad_norm": 0.8676776885986328, "learning_rate": 8.04847528193544e-05, "loss": 0.8727153778076172, "memory(GiB)": 91.52, "step": 50360, "token_acc": 0.7925799573560768, "train_speed(iter/s)": 0.148315 }, { "epoch": 0.653520139877834, "grad_norm": 0.7921098470687866, "learning_rate": 8.048050114038592e-05, "loss": 0.8590356826782226, "memory(GiB)": 91.52, "step": 50365, "token_acc": 0.7442631310555838, "train_speed(iter/s)": 0.148312 }, { "epoch": 0.6535850182794897, "grad_norm": 0.7808356285095215, "learning_rate": 8.04762491106498e-05, "loss": 0.8813166618347168, "memory(GiB)": 91.52, "step": 50370, "token_acc": 0.7769761273209549, "train_speed(iter/s)": 0.148308 }, { "epoch": 0.6536498966811454, "grad_norm": 0.8517569303512573, "learning_rate": 8.047199673019497e-05, "loss": 0.9457327842712402, "memory(GiB)": 91.52, "step": 50375, "token_acc": 0.7430680951807712, "train_speed(iter/s)": 0.148305 }, { "epoch": 0.6537147750828011, "grad_norm": 0.6481422185897827, "learning_rate": 8.046774399907035e-05, "loss": 0.8445764541625976, "memory(GiB)": 91.52, "step": 50380, "token_acc": 0.798769799534691, "train_speed(iter/s)": 0.148301 }, { "epoch": 0.6537796534844568, "grad_norm": 0.7650035619735718, "learning_rate": 8.046349091732489e-05, "loss": 0.9142040252685547, "memory(GiB)": 91.52, "step": 50385, "token_acc": 0.7433139428529616, "train_speed(iter/s)": 0.148297 }, { "epoch": 0.6538445318861125, "grad_norm": 0.7806304097175598, "learning_rate": 8.045923748500754e-05, "loss": 0.8509232521057128, "memory(GiB)": 91.52, "step": 50390, "token_acc": 0.7572129734031969, "train_speed(iter/s)": 0.148294 }, { "epoch": 0.6539094102877682, "grad_norm": 0.7818281054496765, "learning_rate": 8.045498370216726e-05, "loss": 0.8950337409973145, "memory(GiB)": 91.52, "step": 50395, "token_acc": 0.7771910190949151, "train_speed(iter/s)": 0.14829 }, { "epoch": 0.6539742886894239, "grad_norm": 0.7051566243171692, "learning_rate": 8.045072956885298e-05, "loss": 0.8345758438110351, "memory(GiB)": 91.52, "step": 50400, "token_acc": 0.7763559021623537, "train_speed(iter/s)": 0.148287 }, { "epoch": 0.6540391670910796, "grad_norm": 0.7143688797950745, "learning_rate": 8.044647508511365e-05, "loss": 0.8637938499450684, "memory(GiB)": 91.52, "step": 50405, "token_acc": 0.7611367993684448, "train_speed(iter/s)": 0.148284 }, { "epoch": 0.6541040454927353, "grad_norm": 0.7000609636306763, "learning_rate": 8.044222025099825e-05, "loss": 0.8862998962402344, "memory(GiB)": 91.52, "step": 50410, "token_acc": 0.7617337247452944, "train_speed(iter/s)": 0.148281 }, { "epoch": 0.654168923894391, "grad_norm": 0.743195116519928, "learning_rate": 8.043796506655575e-05, "loss": 0.9359047889709473, "memory(GiB)": 91.52, "step": 50415, "token_acc": 0.7705196258982414, "train_speed(iter/s)": 0.148277 }, { "epoch": 0.6542338022960467, "grad_norm": 0.8042847514152527, "learning_rate": 8.043370953183511e-05, "loss": 0.9030946731567383, "memory(GiB)": 91.52, "step": 50420, "token_acc": 0.7576478585995922, "train_speed(iter/s)": 0.148273 }, { "epoch": 0.6542986806977024, "grad_norm": 0.7485685348510742, "learning_rate": 8.042945364688529e-05, "loss": 0.900410270690918, "memory(GiB)": 91.52, "step": 50425, "token_acc": 0.754060500511112, "train_speed(iter/s)": 0.148269 }, { "epoch": 0.6543635590993581, "grad_norm": 0.7169348001480103, "learning_rate": 8.042519741175528e-05, "loss": 0.8861944198608398, "memory(GiB)": 91.52, "step": 50430, "token_acc": 0.7597922848664689, "train_speed(iter/s)": 0.148266 }, { "epoch": 0.6544284375010138, "grad_norm": 0.9155146479606628, "learning_rate": 8.042094082649406e-05, "loss": 0.8918484687805176, "memory(GiB)": 91.52, "step": 50435, "token_acc": 0.7610144068092232, "train_speed(iter/s)": 0.148263 }, { "epoch": 0.6544933159026695, "grad_norm": 0.7336634397506714, "learning_rate": 8.041668389115059e-05, "loss": 0.917431640625, "memory(GiB)": 91.52, "step": 50440, "token_acc": 0.7388845964739763, "train_speed(iter/s)": 0.14826 }, { "epoch": 0.6545581943043252, "grad_norm": 0.8626350164413452, "learning_rate": 8.041242660577389e-05, "loss": 0.9221380233764649, "memory(GiB)": 91.52, "step": 50445, "token_acc": 0.7617066316288915, "train_speed(iter/s)": 0.148257 }, { "epoch": 0.6546230727059809, "grad_norm": 0.6639999747276306, "learning_rate": 8.040816897041294e-05, "loss": 0.8732640266418457, "memory(GiB)": 91.52, "step": 50450, "token_acc": 0.745950457114677, "train_speed(iter/s)": 0.148253 }, { "epoch": 0.6546879511076366, "grad_norm": 0.7156017422676086, "learning_rate": 8.040391098511676e-05, "loss": 0.9112949371337891, "memory(GiB)": 91.52, "step": 50455, "token_acc": 0.7396545768566494, "train_speed(iter/s)": 0.14825 }, { "epoch": 0.6547528295092923, "grad_norm": 0.7848214507102966, "learning_rate": 8.039965264993431e-05, "loss": 0.9174216270446778, "memory(GiB)": 91.52, "step": 50460, "token_acc": 0.7320503573536161, "train_speed(iter/s)": 0.148247 }, { "epoch": 0.654817707910948, "grad_norm": 0.7547958493232727, "learning_rate": 8.039539396491462e-05, "loss": 0.8608461380004883, "memory(GiB)": 91.52, "step": 50465, "token_acc": 0.739627450245493, "train_speed(iter/s)": 0.148244 }, { "epoch": 0.6548825863126037, "grad_norm": 0.8024955987930298, "learning_rate": 8.039113493010668e-05, "loss": 0.9229740142822266, "memory(GiB)": 91.52, "step": 50470, "token_acc": 0.7534246575342466, "train_speed(iter/s)": 0.14824 }, { "epoch": 0.6549474647142594, "grad_norm": 0.7402758598327637, "learning_rate": 8.038687554555953e-05, "loss": 0.9304183006286622, "memory(GiB)": 91.52, "step": 50475, "token_acc": 0.7457156257892268, "train_speed(iter/s)": 0.148238 }, { "epoch": 0.6550123431159149, "grad_norm": 0.7633282542228699, "learning_rate": 8.038261581132216e-05, "loss": 0.8629024505615235, "memory(GiB)": 91.52, "step": 50480, "token_acc": 0.767576957029359, "train_speed(iter/s)": 0.148234 }, { "epoch": 0.6550772215175706, "grad_norm": 0.7256039977073669, "learning_rate": 8.03783557274436e-05, "loss": 0.8971832275390625, "memory(GiB)": 91.52, "step": 50485, "token_acc": 0.7518859725360681, "train_speed(iter/s)": 0.148231 }, { "epoch": 0.6551420999192263, "grad_norm": 0.7672391533851624, "learning_rate": 8.037409529397291e-05, "loss": 0.8838998794555664, "memory(GiB)": 91.52, "step": 50490, "token_acc": 0.7526344789586059, "train_speed(iter/s)": 0.148227 }, { "epoch": 0.655206978320882, "grad_norm": 0.8072904944419861, "learning_rate": 8.036983451095905e-05, "loss": 0.8609855651855469, "memory(GiB)": 91.52, "step": 50495, "token_acc": 0.7815659445077917, "train_speed(iter/s)": 0.148224 }, { "epoch": 0.6552718567225377, "grad_norm": 0.7423928380012512, "learning_rate": 8.03655733784511e-05, "loss": 0.9495590209960938, "memory(GiB)": 91.52, "step": 50500, "token_acc": 0.7394429759525498, "train_speed(iter/s)": 0.148221 }, { "epoch": 0.6553367351241934, "grad_norm": 0.7120556235313416, "learning_rate": 8.036131189649809e-05, "loss": 0.8987237930297851, "memory(GiB)": 91.52, "step": 50505, "token_acc": 0.7903347691377617, "train_speed(iter/s)": 0.148219 }, { "epoch": 0.6554016135258491, "grad_norm": 0.85496586561203, "learning_rate": 8.035705006514906e-05, "loss": 0.9337382316589355, "memory(GiB)": 91.52, "step": 50510, "token_acc": 0.7453780492738379, "train_speed(iter/s)": 0.148215 }, { "epoch": 0.6554664919275048, "grad_norm": 0.7734994888305664, "learning_rate": 8.035278788445304e-05, "loss": 0.8768400192260742, "memory(GiB)": 91.52, "step": 50515, "token_acc": 0.7634165053154568, "train_speed(iter/s)": 0.148211 }, { "epoch": 0.6555313703291605, "grad_norm": 0.7100785374641418, "learning_rate": 8.03485253544591e-05, "loss": 0.9169897079467774, "memory(GiB)": 91.52, "step": 50520, "token_acc": 0.7442358242583552, "train_speed(iter/s)": 0.148208 }, { "epoch": 0.6555962487308162, "grad_norm": 0.745781660079956, "learning_rate": 8.034426247521629e-05, "loss": 0.9076272964477539, "memory(GiB)": 91.52, "step": 50525, "token_acc": 0.7494001877673077, "train_speed(iter/s)": 0.148205 }, { "epoch": 0.6556611271324719, "grad_norm": 0.7160028219223022, "learning_rate": 8.033999924677364e-05, "loss": 0.8977216720581055, "memory(GiB)": 91.52, "step": 50530, "token_acc": 0.7508058966279112, "train_speed(iter/s)": 0.148201 }, { "epoch": 0.6557260055341276, "grad_norm": 0.7903403043746948, "learning_rate": 8.033573566918025e-05, "loss": 0.8893209457397461, "memory(GiB)": 91.52, "step": 50535, "token_acc": 0.7660533485821924, "train_speed(iter/s)": 0.148198 }, { "epoch": 0.6557908839357833, "grad_norm": 0.7686652541160583, "learning_rate": 8.033147174248515e-05, "loss": 0.9056264877319335, "memory(GiB)": 91.52, "step": 50540, "token_acc": 0.7371951801100054, "train_speed(iter/s)": 0.148196 }, { "epoch": 0.655855762337439, "grad_norm": 0.7838018536567688, "learning_rate": 8.032720746673743e-05, "loss": 0.8890921592712402, "memory(GiB)": 91.52, "step": 50545, "token_acc": 0.7612297241092472, "train_speed(iter/s)": 0.148193 }, { "epoch": 0.6559206407390947, "grad_norm": 0.7729532122612, "learning_rate": 8.032294284198616e-05, "loss": 0.8739499092102051, "memory(GiB)": 91.52, "step": 50550, "token_acc": 0.7482520169035728, "train_speed(iter/s)": 0.14819 }, { "epoch": 0.6559855191407504, "grad_norm": 0.7855203747749329, "learning_rate": 8.031867786828043e-05, "loss": 0.897034740447998, "memory(GiB)": 91.52, "step": 50555, "token_acc": 0.7715615070358602, "train_speed(iter/s)": 0.148186 }, { "epoch": 0.6560503975424061, "grad_norm": 0.7445172071456909, "learning_rate": 8.031441254566929e-05, "loss": 0.9177944183349609, "memory(GiB)": 91.52, "step": 50560, "token_acc": 0.7355675675675676, "train_speed(iter/s)": 0.148183 }, { "epoch": 0.6561152759440618, "grad_norm": 0.72953200340271, "learning_rate": 8.031014687420184e-05, "loss": 0.9104145050048829, "memory(GiB)": 91.52, "step": 50565, "token_acc": 0.7610125420617926, "train_speed(iter/s)": 0.14818 }, { "epoch": 0.6561801543457175, "grad_norm": 0.7218270897865295, "learning_rate": 8.030588085392718e-05, "loss": 0.8638893127441406, "memory(GiB)": 91.52, "step": 50570, "token_acc": 0.7773545822057398, "train_speed(iter/s)": 0.148176 }, { "epoch": 0.6562450327473732, "grad_norm": 0.9384924173355103, "learning_rate": 8.03016144848944e-05, "loss": 0.911567497253418, "memory(GiB)": 91.52, "step": 50575, "token_acc": 0.756547428936442, "train_speed(iter/s)": 0.148173 }, { "epoch": 0.6563099111490289, "grad_norm": 0.7221001982688904, "learning_rate": 8.029734776715257e-05, "loss": 0.8943002700805665, "memory(GiB)": 91.52, "step": 50580, "token_acc": 0.7657967236425037, "train_speed(iter/s)": 0.148169 }, { "epoch": 0.6563747895506846, "grad_norm": 0.8325796127319336, "learning_rate": 8.029308070075081e-05, "loss": 0.9134997367858887, "memory(GiB)": 91.52, "step": 50585, "token_acc": 0.7794537301263759, "train_speed(iter/s)": 0.148166 }, { "epoch": 0.6564396679523403, "grad_norm": 0.813718855381012, "learning_rate": 8.028881328573825e-05, "loss": 0.861055564880371, "memory(GiB)": 91.52, "step": 50590, "token_acc": 0.7885511277943493, "train_speed(iter/s)": 0.148162 }, { "epoch": 0.656504546353996, "grad_norm": 0.7069897055625916, "learning_rate": 8.028454552216396e-05, "loss": 0.8958562850952149, "memory(GiB)": 91.52, "step": 50595, "token_acc": 0.7484580316438724, "train_speed(iter/s)": 0.148159 }, { "epoch": 0.6565694247556517, "grad_norm": 0.7129019498825073, "learning_rate": 8.028027741007707e-05, "loss": 0.9131908416748047, "memory(GiB)": 91.52, "step": 50600, "token_acc": 0.7565030602636534, "train_speed(iter/s)": 0.148156 }, { "epoch": 0.6566343031573074, "grad_norm": 0.761988639831543, "learning_rate": 8.027600894952671e-05, "loss": 0.9440458297729493, "memory(GiB)": 91.52, "step": 50605, "token_acc": 0.7460894870862131, "train_speed(iter/s)": 0.148154 }, { "epoch": 0.6566991815589631, "grad_norm": 0.7560537457466125, "learning_rate": 8.027174014056198e-05, "loss": 0.8851730346679687, "memory(GiB)": 91.52, "step": 50610, "token_acc": 0.7480903882877148, "train_speed(iter/s)": 0.148151 }, { "epoch": 0.6567640599606188, "grad_norm": 0.783940851688385, "learning_rate": 8.0267470983232e-05, "loss": 0.9561257362365723, "memory(GiB)": 91.52, "step": 50615, "token_acc": 0.7498846331333641, "train_speed(iter/s)": 0.148148 }, { "epoch": 0.6568289383622745, "grad_norm": 0.7945412397384644, "learning_rate": 8.026320147758593e-05, "loss": 0.9140327453613282, "memory(GiB)": 91.52, "step": 50620, "token_acc": 0.7432984332070989, "train_speed(iter/s)": 0.148145 }, { "epoch": 0.6568938167639302, "grad_norm": 0.8242651224136353, "learning_rate": 8.025893162367288e-05, "loss": 0.9083069801330567, "memory(GiB)": 91.52, "step": 50625, "token_acc": 0.7494495382694141, "train_speed(iter/s)": 0.148142 }, { "epoch": 0.6569586951655859, "grad_norm": 0.840344250202179, "learning_rate": 8.025466142154199e-05, "loss": 0.8686830520629882, "memory(GiB)": 91.52, "step": 50630, "token_acc": 0.7540038436899423, "train_speed(iter/s)": 0.148139 }, { "epoch": 0.6570235735672416, "grad_norm": 0.73629230260849, "learning_rate": 8.02503908712424e-05, "loss": 0.8298900604248047, "memory(GiB)": 91.52, "step": 50635, "token_acc": 0.7815679672059146, "train_speed(iter/s)": 0.148136 }, { "epoch": 0.6570884519688973, "grad_norm": 0.7759674787521362, "learning_rate": 8.024611997282325e-05, "loss": 0.8642833709716797, "memory(GiB)": 91.52, "step": 50640, "token_acc": 0.7673626036229659, "train_speed(iter/s)": 0.148133 }, { "epoch": 0.657153330370553, "grad_norm": 0.6139113306999207, "learning_rate": 8.024184872633373e-05, "loss": 0.858802604675293, "memory(GiB)": 91.52, "step": 50645, "token_acc": 0.7664767823234939, "train_speed(iter/s)": 0.148129 }, { "epoch": 0.6572182087722087, "grad_norm": 0.723126232624054, "learning_rate": 8.023757713182293e-05, "loss": 0.9038064002990722, "memory(GiB)": 91.52, "step": 50650, "token_acc": 0.7500694080266527, "train_speed(iter/s)": 0.148125 }, { "epoch": 0.6572830871738644, "grad_norm": 0.762412965297699, "learning_rate": 8.023330518934004e-05, "loss": 0.9208209037780761, "memory(GiB)": 91.52, "step": 50655, "token_acc": 0.7239329876346231, "train_speed(iter/s)": 0.148122 }, { "epoch": 0.6573479655755201, "grad_norm": 0.7190480828285217, "learning_rate": 8.022903289893425e-05, "loss": 0.9287843704223633, "memory(GiB)": 91.52, "step": 50660, "token_acc": 0.7362864119407092, "train_speed(iter/s)": 0.148119 }, { "epoch": 0.6574128439771758, "grad_norm": 0.8247026205062866, "learning_rate": 8.022476026065467e-05, "loss": 0.8655404090881348, "memory(GiB)": 91.52, "step": 50665, "token_acc": 0.764669347193619, "train_speed(iter/s)": 0.148116 }, { "epoch": 0.6574777223788315, "grad_norm": 0.7931404709815979, "learning_rate": 8.022048727455049e-05, "loss": 0.8592764854431152, "memory(GiB)": 91.52, "step": 50670, "token_acc": 0.7726297577854672, "train_speed(iter/s)": 0.148112 }, { "epoch": 0.6575426007804872, "grad_norm": 0.697435736656189, "learning_rate": 8.021621394067089e-05, "loss": 0.9158878326416016, "memory(GiB)": 91.52, "step": 50675, "token_acc": 0.7512161020651889, "train_speed(iter/s)": 0.148109 }, { "epoch": 0.6576074791821429, "grad_norm": 0.7765949964523315, "learning_rate": 8.021194025906506e-05, "loss": 0.9007854461669922, "memory(GiB)": 91.52, "step": 50680, "token_acc": 0.7571051000097889, "train_speed(iter/s)": 0.148106 }, { "epoch": 0.6576723575837986, "grad_norm": 0.7627663016319275, "learning_rate": 8.020766622978216e-05, "loss": 0.9278185844421387, "memory(GiB)": 91.52, "step": 50685, "token_acc": 0.7345593108018555, "train_speed(iter/s)": 0.148103 }, { "epoch": 0.6577372359854543, "grad_norm": 0.7949556708335876, "learning_rate": 8.020339185287136e-05, "loss": 0.9170883178710938, "memory(GiB)": 91.52, "step": 50690, "token_acc": 0.7419270149645576, "train_speed(iter/s)": 0.148099 }, { "epoch": 0.65780211438711, "grad_norm": 0.799872875213623, "learning_rate": 8.019911712838191e-05, "loss": 0.886808967590332, "memory(GiB)": 91.52, "step": 50695, "token_acc": 0.7612609329446064, "train_speed(iter/s)": 0.148096 }, { "epoch": 0.6578669927887657, "grad_norm": 0.7757912278175354, "learning_rate": 8.019484205636294e-05, "loss": 0.9278881072998046, "memory(GiB)": 91.52, "step": 50700, "token_acc": 0.733910367378342, "train_speed(iter/s)": 0.148094 }, { "epoch": 0.6579318711904214, "grad_norm": 0.7243258357048035, "learning_rate": 8.019056663686367e-05, "loss": 0.9015087127685547, "memory(GiB)": 91.52, "step": 50705, "token_acc": 0.7716947590202792, "train_speed(iter/s)": 0.14809 }, { "epoch": 0.6579967495920771, "grad_norm": 0.7851359248161316, "learning_rate": 8.01862908699333e-05, "loss": 0.8796022415161133, "memory(GiB)": 91.52, "step": 50710, "token_acc": 0.7668983700862896, "train_speed(iter/s)": 0.148087 }, { "epoch": 0.6580616279937328, "grad_norm": 0.7188015580177307, "learning_rate": 8.018201475562104e-05, "loss": 0.9099761009216308, "memory(GiB)": 91.52, "step": 50715, "token_acc": 0.7573899787910923, "train_speed(iter/s)": 0.148083 }, { "epoch": 0.6581265063953884, "grad_norm": 0.7896707057952881, "learning_rate": 8.01777382939761e-05, "loss": 0.857166862487793, "memory(GiB)": 91.52, "step": 50720, "token_acc": 0.7751399776035834, "train_speed(iter/s)": 0.14808 }, { "epoch": 0.6581913847970441, "grad_norm": 0.8142056465148926, "learning_rate": 8.017346148504769e-05, "loss": 0.9080168724060058, "memory(GiB)": 91.52, "step": 50725, "token_acc": 0.752949533633949, "train_speed(iter/s)": 0.148077 }, { "epoch": 0.6582562631986998, "grad_norm": 0.706994891166687, "learning_rate": 8.016918432888503e-05, "loss": 0.8654029846191407, "memory(GiB)": 91.52, "step": 50730, "token_acc": 0.7659416799436808, "train_speed(iter/s)": 0.148074 }, { "epoch": 0.6583211416003555, "grad_norm": 0.7740163207054138, "learning_rate": 8.016490682553734e-05, "loss": 0.8799840927124023, "memory(GiB)": 91.52, "step": 50735, "token_acc": 0.7462464835477447, "train_speed(iter/s)": 0.14807 }, { "epoch": 0.6583860200020112, "grad_norm": 0.7819677591323853, "learning_rate": 8.016062897505384e-05, "loss": 0.8671241760253906, "memory(GiB)": 91.52, "step": 50740, "token_acc": 0.7477978485894053, "train_speed(iter/s)": 0.148067 }, { "epoch": 0.6584508984036669, "grad_norm": 0.7151374220848083, "learning_rate": 8.015635077748376e-05, "loss": 0.8773082733154297, "memory(GiB)": 91.52, "step": 50745, "token_acc": 0.7604935933850913, "train_speed(iter/s)": 0.148063 }, { "epoch": 0.6585157768053226, "grad_norm": 0.7449100017547607, "learning_rate": 8.015207223287632e-05, "loss": 0.8927478790283203, "memory(GiB)": 91.52, "step": 50750, "token_acc": 0.7573858875815088, "train_speed(iter/s)": 0.14806 }, { "epoch": 0.6585806552069783, "grad_norm": 0.7610750794410706, "learning_rate": 8.01477933412808e-05, "loss": 0.9051106452941895, "memory(GiB)": 91.52, "step": 50755, "token_acc": 0.7438409132756398, "train_speed(iter/s)": 0.148058 }, { "epoch": 0.658645533608634, "grad_norm": 0.8503799438476562, "learning_rate": 8.014351410274641e-05, "loss": 0.9279773712158204, "memory(GiB)": 91.52, "step": 50760, "token_acc": 0.7563122204588083, "train_speed(iter/s)": 0.148055 }, { "epoch": 0.6587104120102897, "grad_norm": 0.804980993270874, "learning_rate": 8.013923451732239e-05, "loss": 0.8850383758544922, "memory(GiB)": 91.52, "step": 50765, "token_acc": 0.775545549012816, "train_speed(iter/s)": 0.148053 }, { "epoch": 0.6587752904119454, "grad_norm": 0.799437940120697, "learning_rate": 8.0134954585058e-05, "loss": 0.9208406448364258, "memory(GiB)": 91.52, "step": 50770, "token_acc": 0.7632540160998617, "train_speed(iter/s)": 0.148049 }, { "epoch": 0.6588401688136011, "grad_norm": 0.7434966564178467, "learning_rate": 8.01306743060025e-05, "loss": 0.8752664566040039, "memory(GiB)": 91.52, "step": 50775, "token_acc": 0.7811373290972049, "train_speed(iter/s)": 0.148046 }, { "epoch": 0.6589050472152568, "grad_norm": 0.7787259221076965, "learning_rate": 8.012639368020513e-05, "loss": 0.9356895446777344, "memory(GiB)": 91.52, "step": 50780, "token_acc": 0.7485380116959064, "train_speed(iter/s)": 0.148043 }, { "epoch": 0.6589699256169125, "grad_norm": 0.7640370726585388, "learning_rate": 8.012211270771517e-05, "loss": 0.8659345626831054, "memory(GiB)": 91.52, "step": 50785, "token_acc": 0.759575062901873, "train_speed(iter/s)": 0.14804 }, { "epoch": 0.6590348040185682, "grad_norm": 0.733830451965332, "learning_rate": 8.011783138858186e-05, "loss": 0.951468563079834, "memory(GiB)": 91.52, "step": 50790, "token_acc": 0.7509348170749948, "train_speed(iter/s)": 0.148037 }, { "epoch": 0.6590996824202239, "grad_norm": 0.7392653822898865, "learning_rate": 8.01135497228545e-05, "loss": 0.8493153572082519, "memory(GiB)": 91.52, "step": 50795, "token_acc": 0.7451372889677833, "train_speed(iter/s)": 0.148034 }, { "epoch": 0.6591645608218796, "grad_norm": 0.7588629722595215, "learning_rate": 8.010926771058234e-05, "loss": 0.8931184768676758, "memory(GiB)": 91.52, "step": 50800, "token_acc": 0.7649755805840726, "train_speed(iter/s)": 0.14803 }, { "epoch": 0.6592294392235353, "grad_norm": 0.821148693561554, "learning_rate": 8.010498535181467e-05, "loss": 0.9139820098876953, "memory(GiB)": 91.52, "step": 50805, "token_acc": 0.7679424977538185, "train_speed(iter/s)": 0.148027 }, { "epoch": 0.659294317625191, "grad_norm": 0.7393717169761658, "learning_rate": 8.010070264660077e-05, "loss": 0.9339117050170899, "memory(GiB)": 91.52, "step": 50810, "token_acc": 0.7221770917952883, "train_speed(iter/s)": 0.148025 }, { "epoch": 0.6593591960268467, "grad_norm": 0.7807075381278992, "learning_rate": 8.009641959498992e-05, "loss": 0.8351432800292968, "memory(GiB)": 91.52, "step": 50815, "token_acc": 0.7794456865041343, "train_speed(iter/s)": 0.148021 }, { "epoch": 0.6594240744285024, "grad_norm": 0.7053194046020508, "learning_rate": 8.00921361970314e-05, "loss": 0.9115943908691406, "memory(GiB)": 91.52, "step": 50820, "token_acc": 0.7600483262611976, "train_speed(iter/s)": 0.148018 }, { "epoch": 0.6594889528301581, "grad_norm": 0.6969436407089233, "learning_rate": 8.008785245277453e-05, "loss": 0.8617433547973633, "memory(GiB)": 91.52, "step": 50825, "token_acc": 0.7716953231465105, "train_speed(iter/s)": 0.148013 }, { "epoch": 0.6595538312318138, "grad_norm": 0.7106108665466309, "learning_rate": 8.008356836226858e-05, "loss": 0.8554893493652344, "memory(GiB)": 91.52, "step": 50830, "token_acc": 0.7492637778712663, "train_speed(iter/s)": 0.148011 }, { "epoch": 0.6596187096334695, "grad_norm": 0.7998678684234619, "learning_rate": 8.007928392556286e-05, "loss": 0.9271121978759765, "memory(GiB)": 91.52, "step": 50835, "token_acc": 0.7485364453797589, "train_speed(iter/s)": 0.148008 }, { "epoch": 0.6596835880351252, "grad_norm": 0.6811630129814148, "learning_rate": 8.007499914270667e-05, "loss": 0.873759651184082, "memory(GiB)": 91.52, "step": 50840, "token_acc": 0.7663043478260869, "train_speed(iter/s)": 0.148004 }, { "epoch": 0.6597484664367809, "grad_norm": 0.8391227126121521, "learning_rate": 8.007071401374934e-05, "loss": 0.9214502334594726, "memory(GiB)": 91.52, "step": 50845, "token_acc": 0.7640579337807893, "train_speed(iter/s)": 0.148001 }, { "epoch": 0.6598133448384366, "grad_norm": 0.7322748303413391, "learning_rate": 8.006642853874015e-05, "loss": 0.8324623107910156, "memory(GiB)": 91.52, "step": 50850, "token_acc": 0.7744705415449002, "train_speed(iter/s)": 0.147998 }, { "epoch": 0.6598782232400923, "grad_norm": 0.6444587707519531, "learning_rate": 8.006214271772844e-05, "loss": 0.8900568008422851, "memory(GiB)": 91.52, "step": 50855, "token_acc": 0.7550702961411906, "train_speed(iter/s)": 0.147995 }, { "epoch": 0.659943101641748, "grad_norm": 0.7840049862861633, "learning_rate": 8.005785655076354e-05, "loss": 0.9155744552612305, "memory(GiB)": 91.52, "step": 50860, "token_acc": 0.7496848251609051, "train_speed(iter/s)": 0.147993 }, { "epoch": 0.6600079800434037, "grad_norm": 0.706825852394104, "learning_rate": 8.005357003789478e-05, "loss": 0.9014110565185547, "memory(GiB)": 91.52, "step": 50865, "token_acc": 0.7403688085197627, "train_speed(iter/s)": 0.14799 }, { "epoch": 0.6600728584450594, "grad_norm": 0.7458767294883728, "learning_rate": 8.004928317917143e-05, "loss": 0.8987216949462891, "memory(GiB)": 91.52, "step": 50870, "token_acc": 0.7696952773220705, "train_speed(iter/s)": 0.147986 }, { "epoch": 0.6601377368467151, "grad_norm": 0.7085191607475281, "learning_rate": 8.004499597464288e-05, "loss": 0.8719350814819335, "memory(GiB)": 91.52, "step": 50875, "token_acc": 0.775019991388325, "train_speed(iter/s)": 0.147982 }, { "epoch": 0.6602026152483708, "grad_norm": 0.8226125240325928, "learning_rate": 8.004070842435844e-05, "loss": 0.887995719909668, "memory(GiB)": 91.52, "step": 50880, "token_acc": 0.7645676998368679, "train_speed(iter/s)": 0.14798 }, { "epoch": 0.6602674936500265, "grad_norm": 0.7705634832382202, "learning_rate": 8.003642052836749e-05, "loss": 0.8936735153198242, "memory(GiB)": 91.52, "step": 50885, "token_acc": 0.7606967645780289, "train_speed(iter/s)": 0.147977 }, { "epoch": 0.6603323720516822, "grad_norm": 0.7842330932617188, "learning_rate": 8.003213228671931e-05, "loss": 0.8916032791137696, "memory(GiB)": 91.52, "step": 50890, "token_acc": 0.7598418657187469, "train_speed(iter/s)": 0.147974 }, { "epoch": 0.6603972504533379, "grad_norm": 0.8005832433700562, "learning_rate": 8.00278436994633e-05, "loss": 0.9122075080871582, "memory(GiB)": 91.52, "step": 50895, "token_acc": 0.7499514154304593, "train_speed(iter/s)": 0.14797 }, { "epoch": 0.6604621288549936, "grad_norm": 0.7089784145355225, "learning_rate": 8.00235547666488e-05, "loss": 0.8526170730590821, "memory(GiB)": 91.52, "step": 50900, "token_acc": 0.7764669141885562, "train_speed(iter/s)": 0.147966 }, { "epoch": 0.6605270072566493, "grad_norm": 0.841748833656311, "learning_rate": 8.001926548832517e-05, "loss": 0.906214714050293, "memory(GiB)": 91.52, "step": 50905, "token_acc": 0.7673590995613672, "train_speed(iter/s)": 0.147963 }, { "epoch": 0.660591885658305, "grad_norm": 0.7957319021224976, "learning_rate": 8.001497586454175e-05, "loss": 0.9213194847106934, "memory(GiB)": 91.52, "step": 50910, "token_acc": 0.7635259356533158, "train_speed(iter/s)": 0.14796 }, { "epoch": 0.6606567640599607, "grad_norm": 0.7191647887229919, "learning_rate": 8.001068589534793e-05, "loss": 0.8762949943542481, "memory(GiB)": 91.52, "step": 50915, "token_acc": 0.7692603857852385, "train_speed(iter/s)": 0.147956 }, { "epoch": 0.6607216424616164, "grad_norm": 0.7728008031845093, "learning_rate": 8.000639558079306e-05, "loss": 0.924686050415039, "memory(GiB)": 91.52, "step": 50920, "token_acc": 0.7441379087292603, "train_speed(iter/s)": 0.147954 }, { "epoch": 0.660786520863272, "grad_norm": 0.7700210213661194, "learning_rate": 8.000210492092652e-05, "loss": 0.9270416259765625, "memory(GiB)": 91.52, "step": 50925, "token_acc": 0.7568053596614951, "train_speed(iter/s)": 0.14795 }, { "epoch": 0.6608513992649278, "grad_norm": 0.838199257850647, "learning_rate": 7.999781391579769e-05, "loss": 0.8886819839477539, "memory(GiB)": 91.52, "step": 50930, "token_acc": 0.7489111400475955, "train_speed(iter/s)": 0.147947 }, { "epoch": 0.6609162776665835, "grad_norm": 0.6831446290016174, "learning_rate": 7.999352256545596e-05, "loss": 0.9012411117553711, "memory(GiB)": 91.52, "step": 50935, "token_acc": 0.7465556978233034, "train_speed(iter/s)": 0.147943 }, { "epoch": 0.6609811560682392, "grad_norm": 0.7779280543327332, "learning_rate": 7.998923086995068e-05, "loss": 0.8609136581420899, "memory(GiB)": 91.52, "step": 50940, "token_acc": 0.7590000696330339, "train_speed(iter/s)": 0.147941 }, { "epoch": 0.6610460344698948, "grad_norm": 0.7704397439956665, "learning_rate": 7.998493882933126e-05, "loss": 0.8985439300537109, "memory(GiB)": 91.52, "step": 50945, "token_acc": 0.7513825438807405, "train_speed(iter/s)": 0.147937 }, { "epoch": 0.6611109128715505, "grad_norm": 0.8238296508789062, "learning_rate": 7.998064644364712e-05, "loss": 0.8797980308532715, "memory(GiB)": 91.52, "step": 50950, "token_acc": 0.7560421952196555, "train_speed(iter/s)": 0.147934 }, { "epoch": 0.6611757912732062, "grad_norm": 0.7167235016822815, "learning_rate": 7.997635371294763e-05, "loss": 0.9058063507080079, "memory(GiB)": 91.52, "step": 50955, "token_acc": 0.7495348323819736, "train_speed(iter/s)": 0.147931 }, { "epoch": 0.6612406696748618, "grad_norm": 0.7198654413223267, "learning_rate": 7.997206063728218e-05, "loss": 0.909029197692871, "memory(GiB)": 91.52, "step": 50960, "token_acc": 0.7582964411956484, "train_speed(iter/s)": 0.147928 }, { "epoch": 0.6613055480765175, "grad_norm": 0.7138741612434387, "learning_rate": 7.996776721670018e-05, "loss": 0.8508724212646485, "memory(GiB)": 91.52, "step": 50965, "token_acc": 0.7821527777777778, "train_speed(iter/s)": 0.147925 }, { "epoch": 0.6613704264781732, "grad_norm": 0.8045297861099243, "learning_rate": 7.996347345125104e-05, "loss": 0.8821613311767578, "memory(GiB)": 91.52, "step": 50970, "token_acc": 0.7546554934823091, "train_speed(iter/s)": 0.147922 }, { "epoch": 0.6614353048798289, "grad_norm": 0.8681796193122864, "learning_rate": 7.995917934098419e-05, "loss": 0.9029597282409668, "memory(GiB)": 91.52, "step": 50975, "token_acc": 0.7659627241159379, "train_speed(iter/s)": 0.147919 }, { "epoch": 0.6615001832814846, "grad_norm": 0.8679269552230835, "learning_rate": 7.995488488594902e-05, "loss": 0.9121462821960449, "memory(GiB)": 91.52, "step": 50980, "token_acc": 0.7588116410670979, "train_speed(iter/s)": 0.147916 }, { "epoch": 0.6615650616831403, "grad_norm": 0.8103169202804565, "learning_rate": 7.995059008619496e-05, "loss": 0.929680347442627, "memory(GiB)": 91.52, "step": 50985, "token_acc": 0.7328337776496866, "train_speed(iter/s)": 0.147914 }, { "epoch": 0.661629940084796, "grad_norm": 0.6913601756095886, "learning_rate": 7.994629494177146e-05, "loss": 0.9166494369506836, "memory(GiB)": 91.52, "step": 50990, "token_acc": 0.7663010896780337, "train_speed(iter/s)": 0.147912 }, { "epoch": 0.6616948184864517, "grad_norm": 0.7734089493751526, "learning_rate": 7.994199945272791e-05, "loss": 0.8858105659484863, "memory(GiB)": 91.52, "step": 50995, "token_acc": 0.7771060203226864, "train_speed(iter/s)": 0.147908 }, { "epoch": 0.6617596968881074, "grad_norm": 0.6700993180274963, "learning_rate": 7.993770361911374e-05, "loss": 0.8889297485351563, "memory(GiB)": 91.52, "step": 51000, "token_acc": 0.7537291510265366, "train_speed(iter/s)": 0.147904 }, { "epoch": 0.6618245752897631, "grad_norm": 0.7700000405311584, "learning_rate": 7.993340744097844e-05, "loss": 0.8640925407409668, "memory(GiB)": 91.52, "step": 51005, "token_acc": 0.7505617223917637, "train_speed(iter/s)": 0.1479 }, { "epoch": 0.6618894536914188, "grad_norm": 0.7830845713615417, "learning_rate": 7.992911091837138e-05, "loss": 0.8670249938964844, "memory(GiB)": 91.52, "step": 51010, "token_acc": 0.7580260110457866, "train_speed(iter/s)": 0.147897 }, { "epoch": 0.6619543320930745, "grad_norm": 0.7212663292884827, "learning_rate": 7.992481405134207e-05, "loss": 0.9211673736572266, "memory(GiB)": 91.52, "step": 51015, "token_acc": 0.7730833703944443, "train_speed(iter/s)": 0.147894 }, { "epoch": 0.6620192104947302, "grad_norm": 0.7297347784042358, "learning_rate": 7.992051683993991e-05, "loss": 0.8748966217041015, "memory(GiB)": 91.52, "step": 51020, "token_acc": 0.7658230107450112, "train_speed(iter/s)": 0.147891 }, { "epoch": 0.6620840888963859, "grad_norm": 0.671404242515564, "learning_rate": 7.991621928421436e-05, "loss": 0.8851778030395507, "memory(GiB)": 91.52, "step": 51025, "token_acc": 0.7562719355258026, "train_speed(iter/s)": 0.147888 }, { "epoch": 0.6621489672980416, "grad_norm": 0.7616829872131348, "learning_rate": 7.991192138421488e-05, "loss": 0.9538524627685547, "memory(GiB)": 91.52, "step": 51030, "token_acc": 0.7187638164293925, "train_speed(iter/s)": 0.147885 }, { "epoch": 0.6622138456996973, "grad_norm": 0.8351172804832458, "learning_rate": 7.990762313999095e-05, "loss": 0.8867448806762696, "memory(GiB)": 91.52, "step": 51035, "token_acc": 0.7667157791342174, "train_speed(iter/s)": 0.147882 }, { "epoch": 0.662278724101353, "grad_norm": 0.8080635070800781, "learning_rate": 7.9903324551592e-05, "loss": 0.9351293563842773, "memory(GiB)": 91.52, "step": 51040, "token_acc": 0.7503297117377379, "train_speed(iter/s)": 0.147879 }, { "epoch": 0.6623436025030087, "grad_norm": 0.6952025294303894, "learning_rate": 7.989902561906755e-05, "loss": 0.8653353691101074, "memory(GiB)": 91.52, "step": 51045, "token_acc": 0.76634375, "train_speed(iter/s)": 0.147877 }, { "epoch": 0.6624084809046644, "grad_norm": 0.7215629816055298, "learning_rate": 7.9894726342467e-05, "loss": 0.9385337829589844, "memory(GiB)": 91.52, "step": 51050, "token_acc": 0.765129090610097, "train_speed(iter/s)": 0.147874 }, { "epoch": 0.6624733593063201, "grad_norm": 0.7708696722984314, "learning_rate": 7.989042672183988e-05, "loss": 0.9146385192871094, "memory(GiB)": 91.52, "step": 51055, "token_acc": 0.7701765897355434, "train_speed(iter/s)": 0.147871 }, { "epoch": 0.6625382377079758, "grad_norm": 0.734661340713501, "learning_rate": 7.988612675723566e-05, "loss": 0.8809655189514161, "memory(GiB)": 91.52, "step": 51060, "token_acc": 0.7563399839216546, "train_speed(iter/s)": 0.147868 }, { "epoch": 0.6626031161096315, "grad_norm": 0.7733895778656006, "learning_rate": 7.988182644870381e-05, "loss": 0.8758681297302247, "memory(GiB)": 91.52, "step": 51065, "token_acc": 0.7676203768318214, "train_speed(iter/s)": 0.147864 }, { "epoch": 0.6626679945112872, "grad_norm": 0.7409138083457947, "learning_rate": 7.987752579629381e-05, "loss": 0.8520444869995117, "memory(GiB)": 91.52, "step": 51070, "token_acc": 0.7442151834305626, "train_speed(iter/s)": 0.147861 }, { "epoch": 0.6627328729129429, "grad_norm": 0.808195948600769, "learning_rate": 7.987322480005518e-05, "loss": 0.9288660049438476, "memory(GiB)": 91.52, "step": 51075, "token_acc": 0.7523278981824096, "train_speed(iter/s)": 0.147857 }, { "epoch": 0.6627977513145986, "grad_norm": 0.7153565287590027, "learning_rate": 7.98689234600374e-05, "loss": 0.9163577079772949, "memory(GiB)": 91.52, "step": 51080, "token_acc": 0.7258537886872999, "train_speed(iter/s)": 0.147853 }, { "epoch": 0.6628626297162543, "grad_norm": 0.7210107445716858, "learning_rate": 7.986462177628997e-05, "loss": 0.9157697677612304, "memory(GiB)": 91.52, "step": 51085, "token_acc": 0.7696274630541872, "train_speed(iter/s)": 0.14785 }, { "epoch": 0.66292750811791, "grad_norm": 0.8210057020187378, "learning_rate": 7.98603197488624e-05, "loss": 0.8944253921508789, "memory(GiB)": 91.52, "step": 51090, "token_acc": 0.7312382149591452, "train_speed(iter/s)": 0.147847 }, { "epoch": 0.6629923865195657, "grad_norm": 0.7177830934524536, "learning_rate": 7.985601737780418e-05, "loss": 0.8819689750671387, "memory(GiB)": 91.52, "step": 51095, "token_acc": 0.7608438273598106, "train_speed(iter/s)": 0.147844 }, { "epoch": 0.6630572649212214, "grad_norm": 0.7829980850219727, "learning_rate": 7.985171466316486e-05, "loss": 0.8755941390991211, "memory(GiB)": 91.52, "step": 51100, "token_acc": 0.7588755399568035, "train_speed(iter/s)": 0.147841 }, { "epoch": 0.6631221433228771, "grad_norm": 0.8502321839332581, "learning_rate": 7.98474116049939e-05, "loss": 0.8553784370422364, "memory(GiB)": 91.52, "step": 51105, "token_acc": 0.7579565067311012, "train_speed(iter/s)": 0.147837 }, { "epoch": 0.6631870217245328, "grad_norm": 0.7508909702301025, "learning_rate": 7.984310820334086e-05, "loss": 0.9328758239746093, "memory(GiB)": 91.52, "step": 51110, "token_acc": 0.7488542329726289, "train_speed(iter/s)": 0.147834 }, { "epoch": 0.6632519001261885, "grad_norm": 0.6748330593109131, "learning_rate": 7.983880445825525e-05, "loss": 0.8351292610168457, "memory(GiB)": 91.52, "step": 51115, "token_acc": 0.7670895522388059, "train_speed(iter/s)": 0.147831 }, { "epoch": 0.6633167785278442, "grad_norm": 0.7223950028419495, "learning_rate": 7.983450036978659e-05, "loss": 0.9001043319702149, "memory(GiB)": 91.52, "step": 51120, "token_acc": 0.7583596214511041, "train_speed(iter/s)": 0.147827 }, { "epoch": 0.6633816569294999, "grad_norm": 0.8367787599563599, "learning_rate": 7.983019593798444e-05, "loss": 0.9226831436157227, "memory(GiB)": 91.52, "step": 51125, "token_acc": 0.7432262129804663, "train_speed(iter/s)": 0.147823 }, { "epoch": 0.6634465353311556, "grad_norm": 0.788733720779419, "learning_rate": 7.982589116289829e-05, "loss": 0.9199312210083008, "memory(GiB)": 91.52, "step": 51130, "token_acc": 0.7533341841618276, "train_speed(iter/s)": 0.14782 }, { "epoch": 0.6635114137328113, "grad_norm": 0.7091754674911499, "learning_rate": 7.982158604457773e-05, "loss": 0.9248420715332031, "memory(GiB)": 91.52, "step": 51135, "token_acc": 0.753314894655495, "train_speed(iter/s)": 0.147817 }, { "epoch": 0.663576292134467, "grad_norm": 0.6791051626205444, "learning_rate": 7.981728058307226e-05, "loss": 0.874294662475586, "memory(GiB)": 91.52, "step": 51140, "token_acc": 0.7646625992580556, "train_speed(iter/s)": 0.147814 }, { "epoch": 0.6636411705361227, "grad_norm": 0.6676329970359802, "learning_rate": 7.981297477843145e-05, "loss": 0.9403031349182129, "memory(GiB)": 91.52, "step": 51145, "token_acc": 0.7533193322179903, "train_speed(iter/s)": 0.147811 }, { "epoch": 0.6637060489377784, "grad_norm": 0.787092387676239, "learning_rate": 7.980866863070485e-05, "loss": 0.9383237838745118, "memory(GiB)": 91.52, "step": 51150, "token_acc": 0.7433317889999338, "train_speed(iter/s)": 0.147808 }, { "epoch": 0.6637709273394341, "grad_norm": 0.8185890316963196, "learning_rate": 7.9804362139942e-05, "loss": 0.915530776977539, "memory(GiB)": 91.52, "step": 51155, "token_acc": 0.7384255878486928, "train_speed(iter/s)": 0.147805 }, { "epoch": 0.6638358057410898, "grad_norm": 0.715718150138855, "learning_rate": 7.980005530619246e-05, "loss": 0.9343931198120117, "memory(GiB)": 91.52, "step": 51160, "token_acc": 0.7430862482636696, "train_speed(iter/s)": 0.147802 }, { "epoch": 0.6639006841427455, "grad_norm": 0.6761821508407593, "learning_rate": 7.979574812950582e-05, "loss": 0.8632917404174805, "memory(GiB)": 91.52, "step": 51165, "token_acc": 0.7683258242151183, "train_speed(iter/s)": 0.147799 }, { "epoch": 0.6639655625444012, "grad_norm": 0.7746480107307434, "learning_rate": 7.979144060993162e-05, "loss": 0.8838428497314453, "memory(GiB)": 91.52, "step": 51170, "token_acc": 0.749523843748877, "train_speed(iter/s)": 0.147796 }, { "epoch": 0.6640304409460569, "grad_norm": 0.7256913781166077, "learning_rate": 7.978713274751944e-05, "loss": 0.9170181274414062, "memory(GiB)": 91.52, "step": 51175, "token_acc": 0.7655436284408546, "train_speed(iter/s)": 0.147792 }, { "epoch": 0.6640953193477126, "grad_norm": 0.7678770422935486, "learning_rate": 7.978282454231885e-05, "loss": 0.9232831001281738, "memory(GiB)": 91.52, "step": 51180, "token_acc": 0.7619585687382298, "train_speed(iter/s)": 0.147788 }, { "epoch": 0.6641601977493683, "grad_norm": 0.7149384021759033, "learning_rate": 7.977851599437945e-05, "loss": 0.8401632308959961, "memory(GiB)": 91.52, "step": 51185, "token_acc": 0.7721468208865414, "train_speed(iter/s)": 0.147786 }, { "epoch": 0.664225076151024, "grad_norm": 0.8048232793807983, "learning_rate": 7.977420710375079e-05, "loss": 0.9004226684570312, "memory(GiB)": 91.52, "step": 51190, "token_acc": 0.7431786001301136, "train_speed(iter/s)": 0.147783 }, { "epoch": 0.6642899545526797, "grad_norm": 0.8192530274391174, "learning_rate": 7.976989787048246e-05, "loss": 0.9277506828308105, "memory(GiB)": 91.52, "step": 51195, "token_acc": 0.7263779527559056, "train_speed(iter/s)": 0.14778 }, { "epoch": 0.6643548329543353, "grad_norm": 0.7329156994819641, "learning_rate": 7.976558829462408e-05, "loss": 0.9056656837463379, "memory(GiB)": 91.52, "step": 51200, "token_acc": 0.7450980392156863, "train_speed(iter/s)": 0.147778 }, { "epoch": 0.664419711355991, "grad_norm": 0.7294777631759644, "learning_rate": 7.976127837622523e-05, "loss": 0.907099723815918, "memory(GiB)": 91.52, "step": 51205, "token_acc": 0.7439741667627725, "train_speed(iter/s)": 0.147775 }, { "epoch": 0.6644845897576467, "grad_norm": 0.8341232538223267, "learning_rate": 7.975696811533548e-05, "loss": 0.8886520385742187, "memory(GiB)": 91.52, "step": 51210, "token_acc": 0.764339736296098, "train_speed(iter/s)": 0.147771 }, { "epoch": 0.6645494681593024, "grad_norm": 0.7275635004043579, "learning_rate": 7.975265751200447e-05, "loss": 0.8686254501342774, "memory(GiB)": 91.52, "step": 51215, "token_acc": 0.7784197549037648, "train_speed(iter/s)": 0.147768 }, { "epoch": 0.6646143465609581, "grad_norm": 0.8136693239212036, "learning_rate": 7.974834656628179e-05, "loss": 0.8981273651123047, "memory(GiB)": 91.52, "step": 51220, "token_acc": 0.7694723827981421, "train_speed(iter/s)": 0.147765 }, { "epoch": 0.6646792249626138, "grad_norm": 0.8371150493621826, "learning_rate": 7.974403527821706e-05, "loss": 0.9402200698852539, "memory(GiB)": 91.52, "step": 51225, "token_acc": 0.7573596571348276, "train_speed(iter/s)": 0.147763 }, { "epoch": 0.6647441033642695, "grad_norm": 0.7715064883232117, "learning_rate": 7.973972364785986e-05, "loss": 0.8986841201782226, "memory(GiB)": 91.52, "step": 51230, "token_acc": 0.7488343368843601, "train_speed(iter/s)": 0.14776 }, { "epoch": 0.6648089817659252, "grad_norm": 0.7035665512084961, "learning_rate": 7.973541167525986e-05, "loss": 0.8670446395874023, "memory(GiB)": 91.52, "step": 51235, "token_acc": 0.7706215979597275, "train_speed(iter/s)": 0.147756 }, { "epoch": 0.6648738601675809, "grad_norm": 0.8776551485061646, "learning_rate": 7.973109936046666e-05, "loss": 0.9151103973388672, "memory(GiB)": 91.52, "step": 51240, "token_acc": 0.7568818131911222, "train_speed(iter/s)": 0.147754 }, { "epoch": 0.6649387385692366, "grad_norm": 0.7798179984092712, "learning_rate": 7.972678670352986e-05, "loss": 0.882728385925293, "memory(GiB)": 91.52, "step": 51245, "token_acc": 0.7715459245095957, "train_speed(iter/s)": 0.147751 }, { "epoch": 0.6650036169708923, "grad_norm": 0.7687311768531799, "learning_rate": 7.972247370449912e-05, "loss": 0.9018451690673828, "memory(GiB)": 91.52, "step": 51250, "token_acc": 0.7565641410352588, "train_speed(iter/s)": 0.147748 }, { "epoch": 0.665068495372548, "grad_norm": 0.7425766587257385, "learning_rate": 7.971816036342408e-05, "loss": 0.9037395477294922, "memory(GiB)": 91.52, "step": 51255, "token_acc": 0.7637586054188977, "train_speed(iter/s)": 0.147744 }, { "epoch": 0.6651333737742037, "grad_norm": 0.7335182428359985, "learning_rate": 7.971384668035435e-05, "loss": 0.8606956481933594, "memory(GiB)": 91.52, "step": 51260, "token_acc": 0.757745152843567, "train_speed(iter/s)": 0.147741 }, { "epoch": 0.6651982521758594, "grad_norm": 0.7053656578063965, "learning_rate": 7.970953265533959e-05, "loss": 0.8802329063415527, "memory(GiB)": 91.52, "step": 51265, "token_acc": 0.7657614628820961, "train_speed(iter/s)": 0.147738 }, { "epoch": 0.6652631305775151, "grad_norm": 0.7870234847068787, "learning_rate": 7.970521828842944e-05, "loss": 0.9055032730102539, "memory(GiB)": 91.52, "step": 51270, "token_acc": 0.7649514532401405, "train_speed(iter/s)": 0.147735 }, { "epoch": 0.6653280089791708, "grad_norm": 0.7685430645942688, "learning_rate": 7.970090357967355e-05, "loss": 0.929997444152832, "memory(GiB)": 91.52, "step": 51275, "token_acc": 0.7675348535610073, "train_speed(iter/s)": 0.147732 }, { "epoch": 0.6653928873808265, "grad_norm": 0.7874779105186462, "learning_rate": 7.969658852912156e-05, "loss": 0.8709808349609375, "memory(GiB)": 91.52, "step": 51280, "token_acc": 0.7639405917795602, "train_speed(iter/s)": 0.147729 }, { "epoch": 0.6654577657824822, "grad_norm": 0.6810978055000305, "learning_rate": 7.969227313682315e-05, "loss": 0.9369590759277344, "memory(GiB)": 91.52, "step": 51285, "token_acc": 0.7495791095031701, "train_speed(iter/s)": 0.147726 }, { "epoch": 0.6655226441841379, "grad_norm": 0.8094559907913208, "learning_rate": 7.968795740282797e-05, "loss": 0.8719577789306641, "memory(GiB)": 91.52, "step": 51290, "token_acc": 0.7792400179143556, "train_speed(iter/s)": 0.147723 }, { "epoch": 0.6655875225857936, "grad_norm": 0.6953410506248474, "learning_rate": 7.96836413271857e-05, "loss": 0.8676185607910156, "memory(GiB)": 91.52, "step": 51295, "token_acc": 0.7841504129327007, "train_speed(iter/s)": 0.14772 }, { "epoch": 0.6656524009874493, "grad_norm": 0.7939466834068298, "learning_rate": 7.967932490994599e-05, "loss": 0.930429744720459, "memory(GiB)": 91.52, "step": 51300, "token_acc": 0.7607286297081054, "train_speed(iter/s)": 0.147716 }, { "epoch": 0.665717279389105, "grad_norm": 0.8446753621101379, "learning_rate": 7.967500815115851e-05, "loss": 0.8846834182739258, "memory(GiB)": 91.52, "step": 51305, "token_acc": 0.7811868070566906, "train_speed(iter/s)": 0.147713 }, { "epoch": 0.6657821577907607, "grad_norm": 0.7113127708435059, "learning_rate": 7.967069105087295e-05, "loss": 0.8891157150268555, "memory(GiB)": 91.52, "step": 51310, "token_acc": 0.7676922076232601, "train_speed(iter/s)": 0.147709 }, { "epoch": 0.6658470361924164, "grad_norm": 0.7977184057235718, "learning_rate": 7.9666373609139e-05, "loss": 0.8873104095458985, "memory(GiB)": 91.52, "step": 51315, "token_acc": 0.7459410703547805, "train_speed(iter/s)": 0.147706 }, { "epoch": 0.6659119145940721, "grad_norm": 0.8662819266319275, "learning_rate": 7.96620558260063e-05, "loss": 0.9234498977661133, "memory(GiB)": 91.52, "step": 51320, "token_acc": 0.7744193677762788, "train_speed(iter/s)": 0.147704 }, { "epoch": 0.6659767929957278, "grad_norm": 0.8889005184173584, "learning_rate": 7.96577377015246e-05, "loss": 0.8703556060791016, "memory(GiB)": 91.52, "step": 51325, "token_acc": 0.7735299472055344, "train_speed(iter/s)": 0.147701 }, { "epoch": 0.6660416713973835, "grad_norm": 0.7289795875549316, "learning_rate": 7.965341923574355e-05, "loss": 0.8555441856384277, "memory(GiB)": 91.52, "step": 51330, "token_acc": 0.7529614837879917, "train_speed(iter/s)": 0.147697 }, { "epoch": 0.6661065497990392, "grad_norm": 0.7316770553588867, "learning_rate": 7.964910042871285e-05, "loss": 0.8627181053161621, "memory(GiB)": 91.52, "step": 51335, "token_acc": 0.7371187102730727, "train_speed(iter/s)": 0.147693 }, { "epoch": 0.6661714282006949, "grad_norm": 0.7943626046180725, "learning_rate": 7.964478128048223e-05, "loss": 0.9448450088500977, "memory(GiB)": 91.52, "step": 51340, "token_acc": 0.746134772462077, "train_speed(iter/s)": 0.147691 }, { "epoch": 0.6662363066023506, "grad_norm": 0.6965889930725098, "learning_rate": 7.964046179110137e-05, "loss": 0.902951717376709, "memory(GiB)": 91.52, "step": 51345, "token_acc": 0.7465533998056225, "train_speed(iter/s)": 0.147688 }, { "epoch": 0.6663011850040063, "grad_norm": 0.7411513328552246, "learning_rate": 7.963614196062e-05, "loss": 0.8680357933044434, "memory(GiB)": 91.52, "step": 51350, "token_acc": 0.7522553782095767, "train_speed(iter/s)": 0.147685 }, { "epoch": 0.666366063405662, "grad_norm": 0.7672621607780457, "learning_rate": 7.963182178908779e-05, "loss": 0.873628044128418, "memory(GiB)": 91.52, "step": 51355, "token_acc": 0.7839366515837104, "train_speed(iter/s)": 0.147681 }, { "epoch": 0.6664309418073177, "grad_norm": 0.7922499775886536, "learning_rate": 7.962750127655447e-05, "loss": 0.8663886070251465, "memory(GiB)": 91.52, "step": 51360, "token_acc": 0.7592763901737075, "train_speed(iter/s)": 0.147678 }, { "epoch": 0.6664958202089734, "grad_norm": 0.8109846115112305, "learning_rate": 7.96231804230698e-05, "loss": 0.8932502746582032, "memory(GiB)": 91.52, "step": 51365, "token_acc": 0.7636947791164659, "train_speed(iter/s)": 0.147676 }, { "epoch": 0.666560698610629, "grad_norm": 0.6861647963523865, "learning_rate": 7.961885922868348e-05, "loss": 0.8886936187744141, "memory(GiB)": 91.52, "step": 51370, "token_acc": 0.7512975949918776, "train_speed(iter/s)": 0.147673 }, { "epoch": 0.6666255770122848, "grad_norm": 0.7777910828590393, "learning_rate": 7.961453769344522e-05, "loss": 0.8944879531860351, "memory(GiB)": 91.52, "step": 51375, "token_acc": 0.7630704029812575, "train_speed(iter/s)": 0.14767 }, { "epoch": 0.6666904554139405, "grad_norm": 0.8657256960868835, "learning_rate": 7.961021581740475e-05, "loss": 0.9287569046020507, "memory(GiB)": 91.52, "step": 51380, "token_acc": 0.7480827960637937, "train_speed(iter/s)": 0.147666 }, { "epoch": 0.6667553338155962, "grad_norm": 0.7623460292816162, "learning_rate": 7.960589360061186e-05, "loss": 0.9012062072753906, "memory(GiB)": 91.52, "step": 51385, "token_acc": 0.7649552703228316, "train_speed(iter/s)": 0.147664 }, { "epoch": 0.6668202122172519, "grad_norm": 0.8504171371459961, "learning_rate": 7.960157104311623e-05, "loss": 0.8709527969360351, "memory(GiB)": 91.52, "step": 51390, "token_acc": 0.7633975852573607, "train_speed(iter/s)": 0.147662 }, { "epoch": 0.6668850906189076, "grad_norm": 0.7239764332771301, "learning_rate": 7.959724814496763e-05, "loss": 0.8795412063598633, "memory(GiB)": 91.52, "step": 51395, "token_acc": 0.7645263312739041, "train_speed(iter/s)": 0.147659 }, { "epoch": 0.6669499690205632, "grad_norm": 0.7500227093696594, "learning_rate": 7.959292490621579e-05, "loss": 0.8790473937988281, "memory(GiB)": 91.52, "step": 51400, "token_acc": 0.7514334977344721, "train_speed(iter/s)": 0.147656 }, { "epoch": 0.667014847422219, "grad_norm": 0.820641040802002, "learning_rate": 7.958860132691048e-05, "loss": 0.9188254356384278, "memory(GiB)": 91.52, "step": 51405, "token_acc": 0.7370352640816977, "train_speed(iter/s)": 0.147654 }, { "epoch": 0.6670797258238746, "grad_norm": 0.8361276984214783, "learning_rate": 7.958427740710146e-05, "loss": 0.9389589309692383, "memory(GiB)": 91.52, "step": 51410, "token_acc": 0.7402869451800325, "train_speed(iter/s)": 0.14765 }, { "epoch": 0.6671446042255303, "grad_norm": 0.7064334750175476, "learning_rate": 7.957995314683848e-05, "loss": 0.9048888206481933, "memory(GiB)": 91.52, "step": 51415, "token_acc": 0.7678734100340767, "train_speed(iter/s)": 0.147647 }, { "epoch": 0.667209482627186, "grad_norm": 0.768838107585907, "learning_rate": 7.957562854617131e-05, "loss": 0.9243875503540039, "memory(GiB)": 91.52, "step": 51420, "token_acc": 0.7406960041786367, "train_speed(iter/s)": 0.147644 }, { "epoch": 0.6672743610288417, "grad_norm": 0.8109314441680908, "learning_rate": 7.957130360514969e-05, "loss": 0.8931087493896485, "memory(GiB)": 91.52, "step": 51425, "token_acc": 0.7707208847975099, "train_speed(iter/s)": 0.147641 }, { "epoch": 0.6673392394304974, "grad_norm": 0.7002323269844055, "learning_rate": 7.956697832382342e-05, "loss": 0.8783302307128906, "memory(GiB)": 91.52, "step": 51430, "token_acc": 0.7475870751154008, "train_speed(iter/s)": 0.147638 }, { "epoch": 0.667404117832153, "grad_norm": 0.7591006755828857, "learning_rate": 7.956265270224225e-05, "loss": 0.9176057815551758, "memory(GiB)": 91.52, "step": 51435, "token_acc": 0.7525812988516839, "train_speed(iter/s)": 0.147635 }, { "epoch": 0.6674689962338087, "grad_norm": 0.7630364298820496, "learning_rate": 7.955832674045601e-05, "loss": 0.9086130142211915, "memory(GiB)": 91.52, "step": 51440, "token_acc": 0.7754802812135795, "train_speed(iter/s)": 0.147632 }, { "epoch": 0.6675338746354644, "grad_norm": 0.7458800673484802, "learning_rate": 7.955400043851442e-05, "loss": 0.8350180625915528, "memory(GiB)": 91.52, "step": 51445, "token_acc": 0.7817991349013247, "train_speed(iter/s)": 0.147628 }, { "epoch": 0.6675987530371201, "grad_norm": 0.7694719433784485, "learning_rate": 7.954967379646732e-05, "loss": 0.9416975021362305, "memory(GiB)": 91.52, "step": 51450, "token_acc": 0.7330210772833724, "train_speed(iter/s)": 0.147626 }, { "epoch": 0.6676636314387758, "grad_norm": 0.673618733882904, "learning_rate": 7.954534681436446e-05, "loss": 0.8966796875, "memory(GiB)": 91.52, "step": 51455, "token_acc": 0.7503503791281848, "train_speed(iter/s)": 0.147622 }, { "epoch": 0.6677285098404315, "grad_norm": 0.8142496943473816, "learning_rate": 7.954101949225566e-05, "loss": 0.8765321731567383, "memory(GiB)": 91.52, "step": 51460, "token_acc": 0.7701190093277581, "train_speed(iter/s)": 0.147619 }, { "epoch": 0.6677933882420872, "grad_norm": 0.6779868006706238, "learning_rate": 7.953669183019068e-05, "loss": 0.8707345008850098, "memory(GiB)": 91.52, "step": 51465, "token_acc": 0.7412401422607082, "train_speed(iter/s)": 0.147616 }, { "epoch": 0.6678582666437429, "grad_norm": 0.8099383115768433, "learning_rate": 7.953236382821938e-05, "loss": 0.8661914825439453, "memory(GiB)": 91.52, "step": 51470, "token_acc": 0.776657458563536, "train_speed(iter/s)": 0.147613 }, { "epoch": 0.6679231450453986, "grad_norm": 0.7530177235603333, "learning_rate": 7.952803548639152e-05, "loss": 0.878116226196289, "memory(GiB)": 91.52, "step": 51475, "token_acc": 0.7483266948630559, "train_speed(iter/s)": 0.147611 }, { "epoch": 0.6679880234470543, "grad_norm": 0.7663429975509644, "learning_rate": 7.952370680475694e-05, "loss": 0.8738166809082031, "memory(GiB)": 91.52, "step": 51480, "token_acc": 0.7594165377450826, "train_speed(iter/s)": 0.147608 }, { "epoch": 0.66805290184871, "grad_norm": 0.7398840188980103, "learning_rate": 7.951937778336544e-05, "loss": 0.8817197799682617, "memory(GiB)": 91.52, "step": 51485, "token_acc": 0.7411289696405, "train_speed(iter/s)": 0.147605 }, { "epoch": 0.6681177802503657, "grad_norm": 0.8023437261581421, "learning_rate": 7.951504842226685e-05, "loss": 0.9368828773498535, "memory(GiB)": 91.52, "step": 51490, "token_acc": 0.7562143588243895, "train_speed(iter/s)": 0.147602 }, { "epoch": 0.6681826586520214, "grad_norm": 0.7306193709373474, "learning_rate": 7.951071872151097e-05, "loss": 0.8984333038330078, "memory(GiB)": 91.52, "step": 51495, "token_acc": 0.7503419122218078, "train_speed(iter/s)": 0.147599 }, { "epoch": 0.6682475370536771, "grad_norm": 0.6879367828369141, "learning_rate": 7.950638868114763e-05, "loss": 0.8600591659545899, "memory(GiB)": 91.52, "step": 51500, "token_acc": 0.7567290039518312, "train_speed(iter/s)": 0.147595 }, { "epoch": 0.6683124154553328, "grad_norm": 0.7307636737823486, "learning_rate": 7.950205830122668e-05, "loss": 0.9298833847045899, "memory(GiB)": 91.52, "step": 51505, "token_acc": 0.7753916422933611, "train_speed(iter/s)": 0.147592 }, { "epoch": 0.6683772938569885, "grad_norm": 0.7607160806655884, "learning_rate": 7.949772758179794e-05, "loss": 0.909552001953125, "memory(GiB)": 91.52, "step": 51510, "token_acc": 0.7452256481382695, "train_speed(iter/s)": 0.147589 }, { "epoch": 0.6684421722586442, "grad_norm": 0.7046086192131042, "learning_rate": 7.949339652291126e-05, "loss": 0.9249383926391601, "memory(GiB)": 91.52, "step": 51515, "token_acc": 0.7633007600434311, "train_speed(iter/s)": 0.147584 }, { "epoch": 0.6685070506602999, "grad_norm": 0.6858674883842468, "learning_rate": 7.948906512461646e-05, "loss": 0.9023493766784668, "memory(GiB)": 91.52, "step": 51520, "token_acc": 0.764090559964888, "train_speed(iter/s)": 0.147581 }, { "epoch": 0.6685719290619556, "grad_norm": 0.7368748188018799, "learning_rate": 7.948473338696338e-05, "loss": 0.8738628387451172, "memory(GiB)": 91.52, "step": 51525, "token_acc": 0.7539534562530212, "train_speed(iter/s)": 0.147578 }, { "epoch": 0.6686368074636113, "grad_norm": 0.7588543891906738, "learning_rate": 7.948040131000192e-05, "loss": 0.9474279403686523, "memory(GiB)": 91.52, "step": 51530, "token_acc": 0.7447900586600803, "train_speed(iter/s)": 0.147576 }, { "epoch": 0.668701685865267, "grad_norm": 0.8085831999778748, "learning_rate": 7.947606889378187e-05, "loss": 0.9309341430664062, "memory(GiB)": 91.52, "step": 51535, "token_acc": 0.754162186213801, "train_speed(iter/s)": 0.147573 }, { "epoch": 0.6687665642669227, "grad_norm": 0.7752970457077026, "learning_rate": 7.947173613835313e-05, "loss": 0.9487566947937012, "memory(GiB)": 91.52, "step": 51540, "token_acc": 0.7556606621716816, "train_speed(iter/s)": 0.14757 }, { "epoch": 0.6688314426685784, "grad_norm": 0.7526517510414124, "learning_rate": 7.946740304376555e-05, "loss": 0.9231204986572266, "memory(GiB)": 91.52, "step": 51545, "token_acc": 0.7492619810313422, "train_speed(iter/s)": 0.147567 }, { "epoch": 0.6688963210702341, "grad_norm": 0.7253316640853882, "learning_rate": 7.946306961006899e-05, "loss": 0.8962081909179688, "memory(GiB)": 91.52, "step": 51550, "token_acc": 0.7521957128916109, "train_speed(iter/s)": 0.147564 }, { "epoch": 0.6689611994718898, "grad_norm": 0.8282967805862427, "learning_rate": 7.945873583731332e-05, "loss": 0.8939901351928711, "memory(GiB)": 91.52, "step": 51555, "token_acc": 0.7603730424071793, "train_speed(iter/s)": 0.147561 }, { "epoch": 0.6690260778735455, "grad_norm": 0.7120015025138855, "learning_rate": 7.94544017255484e-05, "loss": 0.8535076141357422, "memory(GiB)": 91.52, "step": 51560, "token_acc": 0.7729917321629071, "train_speed(iter/s)": 0.147558 }, { "epoch": 0.6690909562752012, "grad_norm": 0.8590087890625, "learning_rate": 7.945006727482415e-05, "loss": 0.9417558670043945, "memory(GiB)": 91.52, "step": 51565, "token_acc": 0.7492154896315227, "train_speed(iter/s)": 0.147554 }, { "epoch": 0.6691558346768569, "grad_norm": 0.7951195240020752, "learning_rate": 7.94457324851904e-05, "loss": 0.8789226531982421, "memory(GiB)": 91.52, "step": 51570, "token_acc": 0.7725880551301685, "train_speed(iter/s)": 0.147551 }, { "epoch": 0.6692207130785126, "grad_norm": 0.7224968671798706, "learning_rate": 7.944139735669705e-05, "loss": 0.9140131950378418, "memory(GiB)": 91.52, "step": 51575, "token_acc": 0.7435590825089684, "train_speed(iter/s)": 0.147549 }, { "epoch": 0.6692855914801683, "grad_norm": 0.741193950176239, "learning_rate": 7.943706188939402e-05, "loss": 0.8961685180664063, "memory(GiB)": 91.52, "step": 51580, "token_acc": 0.762824990298797, "train_speed(iter/s)": 0.147546 }, { "epoch": 0.669350469881824, "grad_norm": 0.7543414235115051, "learning_rate": 7.943272608333115e-05, "loss": 0.881431770324707, "memory(GiB)": 91.52, "step": 51585, "token_acc": 0.7684308330057535, "train_speed(iter/s)": 0.147544 }, { "epoch": 0.6694153482834797, "grad_norm": 0.8703237175941467, "learning_rate": 7.942838993855837e-05, "loss": 0.8947540283203125, "memory(GiB)": 91.52, "step": 51590, "token_acc": 0.7462791819767658, "train_speed(iter/s)": 0.14754 }, { "epoch": 0.6694802266851354, "grad_norm": 0.7400676608085632, "learning_rate": 7.942405345512558e-05, "loss": 0.8281105041503907, "memory(GiB)": 91.52, "step": 51595, "token_acc": 0.7595967546621458, "train_speed(iter/s)": 0.147537 }, { "epoch": 0.6695451050867911, "grad_norm": 0.7680293321609497, "learning_rate": 7.941971663308268e-05, "loss": 0.9081510543823242, "memory(GiB)": 91.52, "step": 51600, "token_acc": 0.7432913524480343, "train_speed(iter/s)": 0.147535 }, { "epoch": 0.6696099834884468, "grad_norm": 0.706049382686615, "learning_rate": 7.941537947247957e-05, "loss": 0.8781854629516601, "memory(GiB)": 91.52, "step": 51605, "token_acc": 0.7667705311414348, "train_speed(iter/s)": 0.147531 }, { "epoch": 0.6696748618901025, "grad_norm": 0.9387079477310181, "learning_rate": 7.941104197336615e-05, "loss": 0.9364274978637696, "memory(GiB)": 91.52, "step": 51610, "token_acc": 0.7504438888284959, "train_speed(iter/s)": 0.147529 }, { "epoch": 0.6697397402917582, "grad_norm": 0.764300525188446, "learning_rate": 7.940670413579236e-05, "loss": 0.8550230026245117, "memory(GiB)": 91.52, "step": 51615, "token_acc": 0.7707651086632926, "train_speed(iter/s)": 0.147526 }, { "epoch": 0.6698046186934139, "grad_norm": 0.7405839562416077, "learning_rate": 7.940236595980812e-05, "loss": 0.9300508499145508, "memory(GiB)": 91.52, "step": 51620, "token_acc": 0.7571443031141402, "train_speed(iter/s)": 0.147523 }, { "epoch": 0.6698694970950696, "grad_norm": 0.8764616250991821, "learning_rate": 7.939802744546334e-05, "loss": 0.9252866744995117, "memory(GiB)": 91.52, "step": 51625, "token_acc": 0.7600983233664746, "train_speed(iter/s)": 0.14752 }, { "epoch": 0.6699343754967253, "grad_norm": 0.7091623544692993, "learning_rate": 7.939368859280795e-05, "loss": 0.8985880851745606, "memory(GiB)": 91.52, "step": 51630, "token_acc": 0.7524271844660194, "train_speed(iter/s)": 0.147518 }, { "epoch": 0.669999253898381, "grad_norm": 0.7400396466255188, "learning_rate": 7.938934940189188e-05, "loss": 0.9357268333435058, "memory(GiB)": 91.52, "step": 51635, "token_acc": 0.7661996119853417, "train_speed(iter/s)": 0.147515 }, { "epoch": 0.6700641323000367, "grad_norm": 0.8315743803977966, "learning_rate": 7.93850098727651e-05, "loss": 0.9298471450805664, "memory(GiB)": 91.52, "step": 51640, "token_acc": 0.7469679400796013, "train_speed(iter/s)": 0.147512 }, { "epoch": 0.6701290107016924, "grad_norm": 0.707496166229248, "learning_rate": 7.938067000547749e-05, "loss": 0.8756002426147461, "memory(GiB)": 91.52, "step": 51645, "token_acc": 0.7703360552763819, "train_speed(iter/s)": 0.147509 }, { "epoch": 0.6701938891033481, "grad_norm": 0.7059122920036316, "learning_rate": 7.937632980007902e-05, "loss": 0.9109806060791016, "memory(GiB)": 91.52, "step": 51650, "token_acc": 0.756501857673621, "train_speed(iter/s)": 0.147506 }, { "epoch": 0.6702587675050038, "grad_norm": 0.7990753054618835, "learning_rate": 7.937198925661964e-05, "loss": 0.8975894927978516, "memory(GiB)": 91.52, "step": 51655, "token_acc": 0.7429203108235524, "train_speed(iter/s)": 0.147503 }, { "epoch": 0.6703236459066595, "grad_norm": 0.7945113778114319, "learning_rate": 7.93676483751493e-05, "loss": 0.919953441619873, "memory(GiB)": 91.52, "step": 51660, "token_acc": 0.7592481785224033, "train_speed(iter/s)": 0.1475 }, { "epoch": 0.6703885243083152, "grad_norm": 0.7897790670394897, "learning_rate": 7.936330715571795e-05, "loss": 0.9007355690002441, "memory(GiB)": 91.52, "step": 51665, "token_acc": 0.7535199711181857, "train_speed(iter/s)": 0.147498 }, { "epoch": 0.6704534027099709, "grad_norm": 0.7188977003097534, "learning_rate": 7.935896559837554e-05, "loss": 0.8501707077026367, "memory(GiB)": 91.52, "step": 51670, "token_acc": 0.7854914762422923, "train_speed(iter/s)": 0.147495 }, { "epoch": 0.6705182811116265, "grad_norm": 0.7618928551673889, "learning_rate": 7.935462370317208e-05, "loss": 0.8591882705688476, "memory(GiB)": 91.52, "step": 51675, "token_acc": 0.7660196040745724, "train_speed(iter/s)": 0.147492 }, { "epoch": 0.6705831595132822, "grad_norm": 0.8030906319618225, "learning_rate": 7.935028147015746e-05, "loss": 0.8785125732421875, "memory(GiB)": 91.52, "step": 51680, "token_acc": 0.7538415084211233, "train_speed(iter/s)": 0.147488 }, { "epoch": 0.6706480379149379, "grad_norm": 0.7290438413619995, "learning_rate": 7.934593889938171e-05, "loss": 0.855274772644043, "memory(GiB)": 91.52, "step": 51685, "token_acc": 0.7767173893922069, "train_speed(iter/s)": 0.147484 }, { "epoch": 0.6707129163165936, "grad_norm": 0.7672454714775085, "learning_rate": 7.934159599089479e-05, "loss": 0.881141471862793, "memory(GiB)": 91.52, "step": 51690, "token_acc": 0.7773299218394027, "train_speed(iter/s)": 0.147482 }, { "epoch": 0.6707777947182493, "grad_norm": 0.7603110671043396, "learning_rate": 7.933725274474666e-05, "loss": 0.9277385711669922, "memory(GiB)": 91.52, "step": 51695, "token_acc": 0.7753030036887406, "train_speed(iter/s)": 0.147479 }, { "epoch": 0.670842673119905, "grad_norm": 0.6635980606079102, "learning_rate": 7.93329091609873e-05, "loss": 0.8653841972351074, "memory(GiB)": 91.52, "step": 51700, "token_acc": 0.7580276608538785, "train_speed(iter/s)": 0.147476 }, { "epoch": 0.6709075515215607, "grad_norm": 0.7354798913002014, "learning_rate": 7.932856523966673e-05, "loss": 0.9036012649536133, "memory(GiB)": 91.52, "step": 51705, "token_acc": 0.7409273828082514, "train_speed(iter/s)": 0.147474 }, { "epoch": 0.6709724299232164, "grad_norm": 0.7050058841705322, "learning_rate": 7.932422098083492e-05, "loss": 0.9252302169799804, "memory(GiB)": 91.52, "step": 51710, "token_acc": 0.7367488034036518, "train_speed(iter/s)": 0.14747 }, { "epoch": 0.6710373083248721, "grad_norm": 0.7570156455039978, "learning_rate": 7.931987638454185e-05, "loss": 0.934564208984375, "memory(GiB)": 91.52, "step": 51715, "token_acc": 0.7500092988655384, "train_speed(iter/s)": 0.147467 }, { "epoch": 0.6711021867265278, "grad_norm": 0.8157822489738464, "learning_rate": 7.931553145083752e-05, "loss": 0.9224704742431641, "memory(GiB)": 91.52, "step": 51720, "token_acc": 0.7546928746928747, "train_speed(iter/s)": 0.147464 }, { "epoch": 0.6711670651281835, "grad_norm": 0.7663443684577942, "learning_rate": 7.931118617977196e-05, "loss": 0.863949966430664, "memory(GiB)": 91.52, "step": 51725, "token_acc": 0.7734243661440285, "train_speed(iter/s)": 0.147461 }, { "epoch": 0.6712319435298392, "grad_norm": 0.7010363340377808, "learning_rate": 7.930684057139515e-05, "loss": 0.9203327178955079, "memory(GiB)": 91.52, "step": 51730, "token_acc": 0.7591743119266054, "train_speed(iter/s)": 0.147459 }, { "epoch": 0.6712968219314949, "grad_norm": 0.790912926197052, "learning_rate": 7.930249462575712e-05, "loss": 0.8568927764892578, "memory(GiB)": 91.52, "step": 51735, "token_acc": 0.7652994892431512, "train_speed(iter/s)": 0.147454 }, { "epoch": 0.6713617003331506, "grad_norm": 0.6429159641265869, "learning_rate": 7.929814834290785e-05, "loss": 0.8672140121459961, "memory(GiB)": 91.52, "step": 51740, "token_acc": 0.7769694765239005, "train_speed(iter/s)": 0.147451 }, { "epoch": 0.6714265787348063, "grad_norm": 0.6985365748405457, "learning_rate": 7.929380172289738e-05, "loss": 0.9003240585327148, "memory(GiB)": 91.52, "step": 51745, "token_acc": 0.7528809621000409, "train_speed(iter/s)": 0.147448 }, { "epoch": 0.671491457136462, "grad_norm": 0.8336625099182129, "learning_rate": 7.928945476577572e-05, "loss": 0.9014280319213868, "memory(GiB)": 91.52, "step": 51750, "token_acc": 0.7524965595116271, "train_speed(iter/s)": 0.147445 }, { "epoch": 0.6715563355381177, "grad_norm": 0.8345074653625488, "learning_rate": 7.92851074715929e-05, "loss": 0.8494073867797851, "memory(GiB)": 91.52, "step": 51755, "token_acc": 0.7492597373016475, "train_speed(iter/s)": 0.147442 }, { "epoch": 0.6716212139397734, "grad_norm": 0.7110345959663391, "learning_rate": 7.928075984039896e-05, "loss": 0.9066704750061035, "memory(GiB)": 91.52, "step": 51760, "token_acc": 0.7634701208229019, "train_speed(iter/s)": 0.147439 }, { "epoch": 0.6716860923414291, "grad_norm": 0.6231620907783508, "learning_rate": 7.92764118722439e-05, "loss": 0.8885280609130859, "memory(GiB)": 91.52, "step": 51765, "token_acc": 0.7738722373599758, "train_speed(iter/s)": 0.147436 }, { "epoch": 0.6717509707430848, "grad_norm": 0.740996778011322, "learning_rate": 7.92720635671778e-05, "loss": 0.886495304107666, "memory(GiB)": 91.52, "step": 51770, "token_acc": 0.7700477079444099, "train_speed(iter/s)": 0.147433 }, { "epoch": 0.6718158491447405, "grad_norm": 0.8137699365615845, "learning_rate": 7.926771492525064e-05, "loss": 0.8821773529052734, "memory(GiB)": 91.52, "step": 51775, "token_acc": 0.7456779915814793, "train_speed(iter/s)": 0.14743 }, { "epoch": 0.6718807275463962, "grad_norm": 0.6816971302032471, "learning_rate": 7.926336594651254e-05, "loss": 0.8773023605346679, "memory(GiB)": 91.52, "step": 51780, "token_acc": 0.7802999720522933, "train_speed(iter/s)": 0.147427 }, { "epoch": 0.6719456059480519, "grad_norm": 0.874369204044342, "learning_rate": 7.92590166310135e-05, "loss": 0.9477087020874023, "memory(GiB)": 91.52, "step": 51785, "token_acc": 0.7465413923207812, "train_speed(iter/s)": 0.147423 }, { "epoch": 0.6720104843497076, "grad_norm": 0.8169035315513611, "learning_rate": 7.925466697880356e-05, "loss": 0.8664106369018555, "memory(GiB)": 91.52, "step": 51790, "token_acc": 0.7533667093569294, "train_speed(iter/s)": 0.147421 }, { "epoch": 0.6720753627513633, "grad_norm": 0.7446280121803284, "learning_rate": 7.92503169899328e-05, "loss": 0.8715091705322265, "memory(GiB)": 91.52, "step": 51795, "token_acc": 0.7690994005297644, "train_speed(iter/s)": 0.147417 }, { "epoch": 0.672140241153019, "grad_norm": 0.8022758364677429, "learning_rate": 7.92459666644513e-05, "loss": 0.9044246673583984, "memory(GiB)": 91.52, "step": 51800, "token_acc": 0.7786893687137414, "train_speed(iter/s)": 0.147414 }, { "epoch": 0.6722051195546747, "grad_norm": 0.9745907187461853, "learning_rate": 7.924161600240907e-05, "loss": 0.9662385940551758, "memory(GiB)": 91.52, "step": 51805, "token_acc": 0.7547052133396925, "train_speed(iter/s)": 0.147412 }, { "epoch": 0.6722699979563304, "grad_norm": 0.6724207401275635, "learning_rate": 7.923726500385622e-05, "loss": 0.8839971542358398, "memory(GiB)": 91.52, "step": 51810, "token_acc": 0.7537188574209838, "train_speed(iter/s)": 0.147407 }, { "epoch": 0.672334876357986, "grad_norm": 0.7255055904388428, "learning_rate": 7.92329136688428e-05, "loss": 0.9709698677062988, "memory(GiB)": 91.52, "step": 51815, "token_acc": 0.7383208980305169, "train_speed(iter/s)": 0.147404 }, { "epoch": 0.6723997547596418, "grad_norm": 0.8033302426338196, "learning_rate": 7.922856199741889e-05, "loss": 0.8632207870483398, "memory(GiB)": 91.52, "step": 51820, "token_acc": 0.7519913756962329, "train_speed(iter/s)": 0.1474 }, { "epoch": 0.6724646331612975, "grad_norm": 0.7910556197166443, "learning_rate": 7.922420998963457e-05, "loss": 0.8604720115661622, "memory(GiB)": 91.52, "step": 51825, "token_acc": 0.7620122417750573, "train_speed(iter/s)": 0.147398 }, { "epoch": 0.6725295115629532, "grad_norm": 0.7426855564117432, "learning_rate": 7.921985764553992e-05, "loss": 0.8790438652038575, "memory(GiB)": 91.52, "step": 51830, "token_acc": 0.7653311246850951, "train_speed(iter/s)": 0.147395 }, { "epoch": 0.6725943899646089, "grad_norm": 0.7409521341323853, "learning_rate": 7.921550496518504e-05, "loss": 0.8714414596557617, "memory(GiB)": 91.52, "step": 51835, "token_acc": 0.7681626873303702, "train_speed(iter/s)": 0.147392 }, { "epoch": 0.6726592683662646, "grad_norm": 0.7606309652328491, "learning_rate": 7.921115194862e-05, "loss": 0.8735294342041016, "memory(GiB)": 91.52, "step": 51840, "token_acc": 0.7720931842569836, "train_speed(iter/s)": 0.147388 }, { "epoch": 0.6727241467679203, "grad_norm": 0.7133328914642334, "learning_rate": 7.92067985958949e-05, "loss": 0.9127800941467286, "memory(GiB)": 91.52, "step": 51845, "token_acc": 0.7398338972623808, "train_speed(iter/s)": 0.147385 }, { "epoch": 0.672789025169576, "grad_norm": 0.666662335395813, "learning_rate": 7.920244490705985e-05, "loss": 0.8718963623046875, "memory(GiB)": 91.52, "step": 51850, "token_acc": 0.7700706044429138, "train_speed(iter/s)": 0.147382 }, { "epoch": 0.6728539035712316, "grad_norm": 0.7520981431007385, "learning_rate": 7.919809088216495e-05, "loss": 0.9037657737731933, "memory(GiB)": 91.52, "step": 51855, "token_acc": 0.755039361739683, "train_speed(iter/s)": 0.147379 }, { "epoch": 0.6729187819728873, "grad_norm": 0.8097866177558899, "learning_rate": 7.919373652126028e-05, "loss": 0.8789122581481934, "memory(GiB)": 91.52, "step": 51860, "token_acc": 0.7559462608814861, "train_speed(iter/s)": 0.147376 }, { "epoch": 0.672983660374543, "grad_norm": 0.7758339643478394, "learning_rate": 7.918938182439597e-05, "loss": 0.8912571907043457, "memory(GiB)": 91.52, "step": 51865, "token_acc": 0.7892068965517242, "train_speed(iter/s)": 0.147374 }, { "epoch": 0.6730485387761987, "grad_norm": 0.7878441214561462, "learning_rate": 7.918502679162215e-05, "loss": 0.8957514762878418, "memory(GiB)": 91.52, "step": 51870, "token_acc": 0.7513765353663702, "train_speed(iter/s)": 0.147371 }, { "epoch": 0.6731134171778544, "grad_norm": 0.7717692255973816, "learning_rate": 7.91806714229889e-05, "loss": 0.8719328880310059, "memory(GiB)": 91.52, "step": 51875, "token_acc": 0.7779609329926647, "train_speed(iter/s)": 0.147368 }, { "epoch": 0.6731782955795101, "grad_norm": 0.7975730299949646, "learning_rate": 7.917631571854637e-05, "loss": 0.9048432350158692, "memory(GiB)": 91.52, "step": 51880, "token_acc": 0.7664142316210617, "train_speed(iter/s)": 0.147365 }, { "epoch": 0.6732431739811658, "grad_norm": 0.8848854899406433, "learning_rate": 7.917195967834469e-05, "loss": 0.8912508964538575, "memory(GiB)": 91.52, "step": 51885, "token_acc": 0.7468145351580935, "train_speed(iter/s)": 0.147363 }, { "epoch": 0.6733080523828215, "grad_norm": 0.8097860217094421, "learning_rate": 7.916760330243397e-05, "loss": 0.9462116241455079, "memory(GiB)": 91.52, "step": 51890, "token_acc": 0.7399381822964619, "train_speed(iter/s)": 0.147361 }, { "epoch": 0.6733729307844772, "grad_norm": 0.7110071778297424, "learning_rate": 7.916324659086433e-05, "loss": 0.8871657371520996, "memory(GiB)": 91.52, "step": 51895, "token_acc": 0.7577872955834909, "train_speed(iter/s)": 0.147357 }, { "epoch": 0.6734378091861329, "grad_norm": 0.7291156053543091, "learning_rate": 7.915888954368595e-05, "loss": 0.8933048248291016, "memory(GiB)": 91.52, "step": 51900, "token_acc": 0.7709131388534197, "train_speed(iter/s)": 0.147355 }, { "epoch": 0.6735026875877886, "grad_norm": 0.7870292663574219, "learning_rate": 7.915453216094893e-05, "loss": 0.8786022186279296, "memory(GiB)": 91.52, "step": 51905, "token_acc": 0.7624725822532403, "train_speed(iter/s)": 0.147352 }, { "epoch": 0.6735675659894443, "grad_norm": 0.706119179725647, "learning_rate": 7.915017444270343e-05, "loss": 0.9204560279846191, "memory(GiB)": 91.52, "step": 51910, "token_acc": 0.7339498018494055, "train_speed(iter/s)": 0.147349 }, { "epoch": 0.6736324443910999, "grad_norm": 0.7248126864433289, "learning_rate": 7.91458163889996e-05, "loss": 0.9024005889892578, "memory(GiB)": 91.52, "step": 51915, "token_acc": 0.7601423212282941, "train_speed(iter/s)": 0.147347 }, { "epoch": 0.6736973227927556, "grad_norm": 0.7111963629722595, "learning_rate": 7.914145799988759e-05, "loss": 0.9328771591186523, "memory(GiB)": 91.52, "step": 51920, "token_acc": 0.7538167938931297, "train_speed(iter/s)": 0.147344 }, { "epoch": 0.6737622011944113, "grad_norm": 0.7095168828964233, "learning_rate": 7.913709927541757e-05, "loss": 0.8556161880493164, "memory(GiB)": 91.52, "step": 51925, "token_acc": 0.7760855263157894, "train_speed(iter/s)": 0.147341 }, { "epoch": 0.673827079596067, "grad_norm": 0.752150297164917, "learning_rate": 7.913274021563968e-05, "loss": 0.8975658416748047, "memory(GiB)": 91.52, "step": 51930, "token_acc": 0.7666982395401247, "train_speed(iter/s)": 0.147338 }, { "epoch": 0.6738919579977227, "grad_norm": 0.6858881711959839, "learning_rate": 7.912838082060407e-05, "loss": 0.903532886505127, "memory(GiB)": 91.52, "step": 51935, "token_acc": 0.7688236561124958, "train_speed(iter/s)": 0.147335 }, { "epoch": 0.6739568363993784, "grad_norm": 0.7612240314483643, "learning_rate": 7.912402109036094e-05, "loss": 0.877630615234375, "memory(GiB)": 91.52, "step": 51940, "token_acc": 0.7573663889453364, "train_speed(iter/s)": 0.147331 }, { "epoch": 0.6740217148010341, "grad_norm": 0.8009293079376221, "learning_rate": 7.911966102496046e-05, "loss": 0.907040786743164, "memory(GiB)": 91.52, "step": 51945, "token_acc": 0.7467294610151753, "train_speed(iter/s)": 0.147329 }, { "epoch": 0.6740865932026898, "grad_norm": 0.7461677193641663, "learning_rate": 7.91153006244528e-05, "loss": 0.8475946426391602, "memory(GiB)": 91.52, "step": 51950, "token_acc": 0.749435580975316, "train_speed(iter/s)": 0.147325 }, { "epoch": 0.6741514716043455, "grad_norm": 0.7694382071495056, "learning_rate": 7.911093988888812e-05, "loss": 0.880006980895996, "memory(GiB)": 91.52, "step": 51955, "token_acc": 0.7389693499494779, "train_speed(iter/s)": 0.147322 }, { "epoch": 0.6742163500060012, "grad_norm": 0.7610581517219543, "learning_rate": 7.910657881831664e-05, "loss": 0.8887812614440918, "memory(GiB)": 91.52, "step": 51960, "token_acc": 0.7765119549929677, "train_speed(iter/s)": 0.147318 }, { "epoch": 0.6742812284076569, "grad_norm": 0.7040845155715942, "learning_rate": 7.91022174127885e-05, "loss": 0.8784204483032226, "memory(GiB)": 91.52, "step": 51965, "token_acc": 0.7696375454445301, "train_speed(iter/s)": 0.147315 }, { "epoch": 0.6743461068093126, "grad_norm": 0.7641312479972839, "learning_rate": 7.909785567235391e-05, "loss": 0.900124454498291, "memory(GiB)": 91.52, "step": 51970, "token_acc": 0.7381136907369343, "train_speed(iter/s)": 0.147311 }, { "epoch": 0.6744109852109683, "grad_norm": 0.7388913631439209, "learning_rate": 7.909349359706308e-05, "loss": 0.8971887588500976, "memory(GiB)": 91.52, "step": 51975, "token_acc": 0.7642071410298579, "train_speed(iter/s)": 0.147309 }, { "epoch": 0.674475863612624, "grad_norm": 0.7133058905601501, "learning_rate": 7.908913118696621e-05, "loss": 0.9142415046691894, "memory(GiB)": 91.52, "step": 51980, "token_acc": 0.7546231210122202, "train_speed(iter/s)": 0.147306 }, { "epoch": 0.6745407420142797, "grad_norm": 0.7099557518959045, "learning_rate": 7.908476844211348e-05, "loss": 0.8965106964111328, "memory(GiB)": 91.52, "step": 51985, "token_acc": 0.7771490446749799, "train_speed(iter/s)": 0.147304 }, { "epoch": 0.6746056204159354, "grad_norm": 0.7491530776023865, "learning_rate": 7.90804053625551e-05, "loss": 0.8888007164001465, "memory(GiB)": 91.52, "step": 51990, "token_acc": 0.7609204653545035, "train_speed(iter/s)": 0.147301 }, { "epoch": 0.6746704988175911, "grad_norm": 0.6880276799201965, "learning_rate": 7.907604194834128e-05, "loss": 0.8795598983764649, "memory(GiB)": 91.52, "step": 51995, "token_acc": 0.759545923632611, "train_speed(iter/s)": 0.147298 }, { "epoch": 0.6747353772192468, "grad_norm": 0.7124196887016296, "learning_rate": 7.907167819952227e-05, "loss": 0.8840483665466309, "memory(GiB)": 91.52, "step": 52000, "token_acc": 0.7543986363505042, "train_speed(iter/s)": 0.147294 }, { "epoch": 0.6748002556209025, "grad_norm": 0.7507632970809937, "learning_rate": 7.906731411614823e-05, "loss": 0.9366642951965332, "memory(GiB)": 91.52, "step": 52005, "token_acc": 0.7507927559720949, "train_speed(iter/s)": 0.147292 }, { "epoch": 0.6748651340225582, "grad_norm": 0.7691929340362549, "learning_rate": 7.906294969826943e-05, "loss": 0.8878318786621093, "memory(GiB)": 91.52, "step": 52010, "token_acc": 0.7594403395692502, "train_speed(iter/s)": 0.147289 }, { "epoch": 0.6749300124242139, "grad_norm": 0.6336284279823303, "learning_rate": 7.905858494593608e-05, "loss": 0.8960746765136719, "memory(GiB)": 91.52, "step": 52015, "token_acc": 0.7676301837460963, "train_speed(iter/s)": 0.147285 }, { "epoch": 0.6749948908258696, "grad_norm": 0.7396957278251648, "learning_rate": 7.905421985919838e-05, "loss": 0.9171060562133789, "memory(GiB)": 91.52, "step": 52020, "token_acc": 0.7528906736827529, "train_speed(iter/s)": 0.147282 }, { "epoch": 0.6750597692275253, "grad_norm": 0.7824103236198425, "learning_rate": 7.90498544381066e-05, "loss": 0.90074462890625, "memory(GiB)": 91.52, "step": 52025, "token_acc": 0.7578162348216847, "train_speed(iter/s)": 0.14728 }, { "epoch": 0.675124647629181, "grad_norm": 0.7610507607460022, "learning_rate": 7.904548868271096e-05, "loss": 0.9044670104980469, "memory(GiB)": 91.52, "step": 52030, "token_acc": 0.7571498840559342, "train_speed(iter/s)": 0.147277 }, { "epoch": 0.6751895260308367, "grad_norm": 0.827656626701355, "learning_rate": 7.904112259306171e-05, "loss": 0.915042781829834, "memory(GiB)": 91.52, "step": 52035, "token_acc": 0.7446926930110519, "train_speed(iter/s)": 0.147275 }, { "epoch": 0.6752544044324924, "grad_norm": 0.795409619808197, "learning_rate": 7.90367561692091e-05, "loss": 0.9097906112670898, "memory(GiB)": 91.52, "step": 52040, "token_acc": 0.7735751295336788, "train_speed(iter/s)": 0.147272 }, { "epoch": 0.6753192828341481, "grad_norm": 0.7762647867202759, "learning_rate": 7.903238941120334e-05, "loss": 0.8665876388549805, "memory(GiB)": 91.52, "step": 52045, "token_acc": 0.7458327397065109, "train_speed(iter/s)": 0.147268 }, { "epoch": 0.6753841612358038, "grad_norm": 0.7100045680999756, "learning_rate": 7.902802231909474e-05, "loss": 0.9035148620605469, "memory(GiB)": 91.52, "step": 52050, "token_acc": 0.7705137848709814, "train_speed(iter/s)": 0.147265 }, { "epoch": 0.6754490396374595, "grad_norm": 0.8444089889526367, "learning_rate": 7.902365489293352e-05, "loss": 0.8589815139770508, "memory(GiB)": 91.52, "step": 52055, "token_acc": 0.7656456253173126, "train_speed(iter/s)": 0.147262 }, { "epoch": 0.6755139180391152, "grad_norm": 0.769036054611206, "learning_rate": 7.901928713276994e-05, "loss": 0.9124046325683594, "memory(GiB)": 91.52, "step": 52060, "token_acc": 0.742314313821938, "train_speed(iter/s)": 0.147259 }, { "epoch": 0.6755787964407709, "grad_norm": 0.7125909924507141, "learning_rate": 7.901491903865427e-05, "loss": 0.8706523895263671, "memory(GiB)": 91.52, "step": 52065, "token_acc": 0.7649941726683203, "train_speed(iter/s)": 0.147256 }, { "epoch": 0.6756436748424266, "grad_norm": 0.753011167049408, "learning_rate": 7.90105506106368e-05, "loss": 0.9011751174926758, "memory(GiB)": 91.52, "step": 52070, "token_acc": 0.7539139374357468, "train_speed(iter/s)": 0.147253 }, { "epoch": 0.6757085532440823, "grad_norm": 0.7740077376365662, "learning_rate": 7.900618184876776e-05, "loss": 0.8793741226196289, "memory(GiB)": 91.52, "step": 52075, "token_acc": 0.756468483347096, "train_speed(iter/s)": 0.14725 }, { "epoch": 0.675773431645738, "grad_norm": 0.7434563040733337, "learning_rate": 7.900181275309745e-05, "loss": 0.9181370735168457, "memory(GiB)": 91.52, "step": 52080, "token_acc": 0.7608412687538535, "train_speed(iter/s)": 0.147247 }, { "epoch": 0.6758383100473937, "grad_norm": 0.7626206874847412, "learning_rate": 7.899744332367615e-05, "loss": 0.8925235748291016, "memory(GiB)": 91.52, "step": 52085, "token_acc": 0.76487199494668, "train_speed(iter/s)": 0.147244 }, { "epoch": 0.6759031884490494, "grad_norm": 0.7904987931251526, "learning_rate": 7.899307356055414e-05, "loss": 0.9326404571533203, "memory(GiB)": 91.52, "step": 52090, "token_acc": 0.7619945602901178, "train_speed(iter/s)": 0.147241 }, { "epoch": 0.6759680668507051, "grad_norm": 0.7862908840179443, "learning_rate": 7.89887034637817e-05, "loss": 0.8441532135009766, "memory(GiB)": 91.52, "step": 52095, "token_acc": 0.7390253364767674, "train_speed(iter/s)": 0.147238 }, { "epoch": 0.6760329452523608, "grad_norm": 0.8234080672264099, "learning_rate": 7.898433303340913e-05, "loss": 0.8946393966674805, "memory(GiB)": 91.52, "step": 52100, "token_acc": 0.7675285775414145, "train_speed(iter/s)": 0.147235 }, { "epoch": 0.6760978236540165, "grad_norm": 0.6908161044120789, "learning_rate": 7.897996226948671e-05, "loss": 0.8620190620422363, "memory(GiB)": 91.52, "step": 52105, "token_acc": 0.781590752807885, "train_speed(iter/s)": 0.147231 }, { "epoch": 0.6761627020556722, "grad_norm": 0.8245747685432434, "learning_rate": 7.897559117206476e-05, "loss": 0.9012348175048828, "memory(GiB)": 91.52, "step": 52110, "token_acc": 0.7696060345180995, "train_speed(iter/s)": 0.147229 }, { "epoch": 0.6762275804573279, "grad_norm": 0.7212178707122803, "learning_rate": 7.897121974119357e-05, "loss": 0.8791119575500488, "memory(GiB)": 91.52, "step": 52115, "token_acc": 0.7840263656125096, "train_speed(iter/s)": 0.147226 }, { "epoch": 0.6762924588589836, "grad_norm": 0.7002944350242615, "learning_rate": 7.896684797692344e-05, "loss": 0.9207769393920898, "memory(GiB)": 91.52, "step": 52120, "token_acc": 0.7652522183475469, "train_speed(iter/s)": 0.147223 }, { "epoch": 0.6763573372606393, "grad_norm": 0.7721481323242188, "learning_rate": 7.896247587930469e-05, "loss": 0.9081598281860351, "memory(GiB)": 91.52, "step": 52125, "token_acc": 0.745131828364639, "train_speed(iter/s)": 0.14722 }, { "epoch": 0.676422215662295, "grad_norm": 0.9165604114532471, "learning_rate": 7.895810344838766e-05, "loss": 0.9373016357421875, "memory(GiB)": 91.52, "step": 52130, "token_acc": 0.7421967895362663, "train_speed(iter/s)": 0.147217 }, { "epoch": 0.6764870940639507, "grad_norm": 0.7496211528778076, "learning_rate": 7.89537306842226e-05, "loss": 0.8869186401367187, "memory(GiB)": 91.52, "step": 52135, "token_acc": 0.7555378433130749, "train_speed(iter/s)": 0.147214 }, { "epoch": 0.6765519724656064, "grad_norm": 0.724766194820404, "learning_rate": 7.89493575868599e-05, "loss": 0.8487050056457519, "memory(GiB)": 91.52, "step": 52140, "token_acc": 0.7731345397882211, "train_speed(iter/s)": 0.147211 }, { "epoch": 0.6766168508672621, "grad_norm": 0.7186470031738281, "learning_rate": 7.894498415634983e-05, "loss": 0.9057001113891602, "memory(GiB)": 91.52, "step": 52145, "token_acc": 0.7691210433365032, "train_speed(iter/s)": 0.147208 }, { "epoch": 0.6766817292689177, "grad_norm": 0.7575815916061401, "learning_rate": 7.894061039274277e-05, "loss": 0.8943489074707032, "memory(GiB)": 91.52, "step": 52150, "token_acc": 0.771587646193711, "train_speed(iter/s)": 0.147205 }, { "epoch": 0.6767466076705734, "grad_norm": 0.7030508518218994, "learning_rate": 7.8936236296089e-05, "loss": 0.8661604881286621, "memory(GiB)": 91.52, "step": 52155, "token_acc": 0.740713970204393, "train_speed(iter/s)": 0.147202 }, { "epoch": 0.6768114860722291, "grad_norm": 0.6223227977752686, "learning_rate": 7.89318618664389e-05, "loss": 0.8770134925842286, "memory(GiB)": 91.52, "step": 52160, "token_acc": 0.7551329219083788, "train_speed(iter/s)": 0.147198 }, { "epoch": 0.6768763644738848, "grad_norm": 0.7498036623001099, "learning_rate": 7.892748710384281e-05, "loss": 0.8286113739013672, "memory(GiB)": 91.52, "step": 52165, "token_acc": 0.7720006394372951, "train_speed(iter/s)": 0.147196 }, { "epoch": 0.6769412428755405, "grad_norm": 0.7434698343276978, "learning_rate": 7.892311200835104e-05, "loss": 0.8880685806274414, "memory(GiB)": 91.52, "step": 52170, "token_acc": 0.7698275862068965, "train_speed(iter/s)": 0.147192 }, { "epoch": 0.6770061212771962, "grad_norm": 0.8483487367630005, "learning_rate": 7.891873658001396e-05, "loss": 0.905793571472168, "memory(GiB)": 91.52, "step": 52175, "token_acc": 0.767033641228399, "train_speed(iter/s)": 0.147189 }, { "epoch": 0.6770709996788519, "grad_norm": 0.7386360168457031, "learning_rate": 7.891436081888193e-05, "loss": 0.8706013679504394, "memory(GiB)": 91.52, "step": 52180, "token_acc": 0.7474284977267598, "train_speed(iter/s)": 0.147186 }, { "epoch": 0.6771358780805076, "grad_norm": 0.7586742639541626, "learning_rate": 7.890998472500529e-05, "loss": 0.8762999534606933, "memory(GiB)": 91.52, "step": 52185, "token_acc": 0.7544073411262994, "train_speed(iter/s)": 0.147182 }, { "epoch": 0.6772007564821633, "grad_norm": 0.712735116481781, "learning_rate": 7.890560829843441e-05, "loss": 0.9148056030273437, "memory(GiB)": 91.52, "step": 52190, "token_acc": 0.7654725138974676, "train_speed(iter/s)": 0.147179 }, { "epoch": 0.677265634883819, "grad_norm": 0.7571211457252502, "learning_rate": 7.890123153921965e-05, "loss": 0.8829893112182617, "memory(GiB)": 91.52, "step": 52195, "token_acc": 0.7603610108303249, "train_speed(iter/s)": 0.147176 }, { "epoch": 0.6773305132854747, "grad_norm": 0.797225832939148, "learning_rate": 7.889685444741137e-05, "loss": 0.9216110229492187, "memory(GiB)": 91.52, "step": 52200, "token_acc": 0.7410699084005857, "train_speed(iter/s)": 0.147174 }, { "epoch": 0.6773953916871304, "grad_norm": 0.7953428030014038, "learning_rate": 7.889247702305996e-05, "loss": 0.9309114456176758, "memory(GiB)": 91.52, "step": 52205, "token_acc": 0.7408774669884992, "train_speed(iter/s)": 0.14717 }, { "epoch": 0.6774602700887861, "grad_norm": 0.7672902345657349, "learning_rate": 7.888809926621579e-05, "loss": 0.9071792602539063, "memory(GiB)": 91.52, "step": 52210, "token_acc": 0.7452677503342481, "train_speed(iter/s)": 0.147167 }, { "epoch": 0.6775251484904418, "grad_norm": 0.8052794933319092, "learning_rate": 7.888372117692922e-05, "loss": 0.8768827438354492, "memory(GiB)": 91.52, "step": 52215, "token_acc": 0.7614022998406478, "train_speed(iter/s)": 0.147164 }, { "epoch": 0.6775900268920975, "grad_norm": 0.7402389049530029, "learning_rate": 7.887934275525064e-05, "loss": 0.8862964630126953, "memory(GiB)": 91.52, "step": 52220, "token_acc": 0.764897200604019, "train_speed(iter/s)": 0.147162 }, { "epoch": 0.6776549052937532, "grad_norm": 0.7028520703315735, "learning_rate": 7.887496400123045e-05, "loss": 0.903862190246582, "memory(GiB)": 91.52, "step": 52225, "token_acc": 0.7413284705517188, "train_speed(iter/s)": 0.147159 }, { "epoch": 0.6777197836954089, "grad_norm": 0.7434000968933105, "learning_rate": 7.887058491491903e-05, "loss": 0.8926937103271484, "memory(GiB)": 91.52, "step": 52230, "token_acc": 0.7491979407595315, "train_speed(iter/s)": 0.147158 }, { "epoch": 0.6777846620970646, "grad_norm": 0.6889826059341431, "learning_rate": 7.886620549636679e-05, "loss": 0.852366828918457, "memory(GiB)": 91.52, "step": 52235, "token_acc": 0.7645822947083541, "train_speed(iter/s)": 0.147155 }, { "epoch": 0.6778495404987203, "grad_norm": 0.7248494625091553, "learning_rate": 7.88618257456241e-05, "loss": 0.921774959564209, "memory(GiB)": 91.52, "step": 52240, "token_acc": 0.7479032063886993, "train_speed(iter/s)": 0.147153 }, { "epoch": 0.677914418900376, "grad_norm": 0.7177848219871521, "learning_rate": 7.885744566274138e-05, "loss": 0.8824767112731934, "memory(GiB)": 91.52, "step": 52245, "token_acc": 0.77304258355737, "train_speed(iter/s)": 0.14715 }, { "epoch": 0.6779792973020317, "grad_norm": 0.8398004174232483, "learning_rate": 7.885306524776905e-05, "loss": 0.9216236114501953, "memory(GiB)": 91.52, "step": 52250, "token_acc": 0.7379601864229199, "train_speed(iter/s)": 0.147148 }, { "epoch": 0.6780441757036874, "grad_norm": 0.7545298337936401, "learning_rate": 7.884868450075747e-05, "loss": 0.8653646469116211, "memory(GiB)": 91.52, "step": 52255, "token_acc": 0.7653895178825822, "train_speed(iter/s)": 0.147145 }, { "epoch": 0.6781090541053431, "grad_norm": 0.7860035300254822, "learning_rate": 7.88443034217571e-05, "loss": 0.8938383102416992, "memory(GiB)": 91.52, "step": 52260, "token_acc": 0.7729637688014379, "train_speed(iter/s)": 0.147142 }, { "epoch": 0.6781739325069988, "grad_norm": 0.730043888092041, "learning_rate": 7.883992201081834e-05, "loss": 0.890138816833496, "memory(GiB)": 91.52, "step": 52265, "token_acc": 0.7502051779452361, "train_speed(iter/s)": 0.147139 }, { "epoch": 0.6782388109086545, "grad_norm": 0.7725951671600342, "learning_rate": 7.883554026799163e-05, "loss": 0.9352428436279296, "memory(GiB)": 91.52, "step": 52270, "token_acc": 0.7658218125960061, "train_speed(iter/s)": 0.147136 }, { "epoch": 0.6783036893103102, "grad_norm": 0.8373186588287354, "learning_rate": 7.883115819332735e-05, "loss": 0.901953125, "memory(GiB)": 91.52, "step": 52275, "token_acc": 0.7615181570948648, "train_speed(iter/s)": 0.147133 }, { "epoch": 0.6783685677119659, "grad_norm": 0.7523190379142761, "learning_rate": 7.882677578687599e-05, "loss": 0.9291017532348633, "memory(GiB)": 91.52, "step": 52280, "token_acc": 0.751602787456446, "train_speed(iter/s)": 0.14713 }, { "epoch": 0.6784334461136216, "grad_norm": 0.7089213728904724, "learning_rate": 7.882239304868793e-05, "loss": 0.9324926376342774, "memory(GiB)": 91.52, "step": 52285, "token_acc": 0.7285679365855362, "train_speed(iter/s)": 0.147127 }, { "epoch": 0.6784983245152773, "grad_norm": 0.750968337059021, "learning_rate": 7.881800997881364e-05, "loss": 0.8710432052612305, "memory(GiB)": 91.52, "step": 52290, "token_acc": 0.7751212171877613, "train_speed(iter/s)": 0.147125 }, { "epoch": 0.678563202916933, "grad_norm": 0.7741295099258423, "learning_rate": 7.881362657730352e-05, "loss": 0.8701254844665527, "memory(GiB)": 91.52, "step": 52295, "token_acc": 0.7570910908409346, "train_speed(iter/s)": 0.147123 }, { "epoch": 0.6786280813185886, "grad_norm": 0.7390232086181641, "learning_rate": 7.880924284420805e-05, "loss": 0.8550238609313965, "memory(GiB)": 91.52, "step": 52300, "token_acc": 0.7711278834910174, "train_speed(iter/s)": 0.147119 }, { "epoch": 0.6786929597202443, "grad_norm": 0.8087198138237, "learning_rate": 7.880485877957768e-05, "loss": 0.9047069549560547, "memory(GiB)": 91.52, "step": 52305, "token_acc": 0.7520438787126151, "train_speed(iter/s)": 0.147117 }, { "epoch": 0.6787578381219, "grad_norm": 0.6697655320167542, "learning_rate": 7.880047438346284e-05, "loss": 0.8956016540527344, "memory(GiB)": 91.52, "step": 52310, "token_acc": 0.7792385359697097, "train_speed(iter/s)": 0.147114 }, { "epoch": 0.6788227165235557, "grad_norm": 0.6756124496459961, "learning_rate": 7.879608965591398e-05, "loss": 0.8746307373046875, "memory(GiB)": 91.52, "step": 52315, "token_acc": 0.7575893716706245, "train_speed(iter/s)": 0.14711 }, { "epoch": 0.6788875949252114, "grad_norm": 0.7037419676780701, "learning_rate": 7.87917045969816e-05, "loss": 0.871366310119629, "memory(GiB)": 91.52, "step": 52320, "token_acc": 0.7607563303525728, "train_speed(iter/s)": 0.147107 }, { "epoch": 0.6789524733268671, "grad_norm": 0.7274475693702698, "learning_rate": 7.878731920671613e-05, "loss": 0.8674191474914551, "memory(GiB)": 91.52, "step": 52325, "token_acc": 0.7546971396522715, "train_speed(iter/s)": 0.147104 }, { "epoch": 0.6790173517285228, "grad_norm": 0.7044307589530945, "learning_rate": 7.878293348516804e-05, "loss": 0.8909321784973144, "memory(GiB)": 91.52, "step": 52330, "token_acc": 0.7705670855434125, "train_speed(iter/s)": 0.147101 }, { "epoch": 0.6790822301301785, "grad_norm": 0.771282434463501, "learning_rate": 7.877854743238779e-05, "loss": 0.9191123962402343, "memory(GiB)": 91.52, "step": 52335, "token_acc": 0.7844044221369287, "train_speed(iter/s)": 0.147098 }, { "epoch": 0.6791471085318342, "grad_norm": 0.7331180572509766, "learning_rate": 7.877416104842589e-05, "loss": 0.8825441360473633, "memory(GiB)": 91.52, "step": 52340, "token_acc": 0.7600759721864537, "train_speed(iter/s)": 0.147096 }, { "epoch": 0.6792119869334899, "grad_norm": 0.8348448276519775, "learning_rate": 7.87697743333328e-05, "loss": 0.9381783485412598, "memory(GiB)": 91.52, "step": 52345, "token_acc": 0.7521940036536877, "train_speed(iter/s)": 0.147093 }, { "epoch": 0.6792768653351456, "grad_norm": 0.7421362400054932, "learning_rate": 7.876538728715896e-05, "loss": 0.9146846771240235, "memory(GiB)": 91.52, "step": 52350, "token_acc": 0.7373766500064078, "train_speed(iter/s)": 0.14709 }, { "epoch": 0.6793417437368013, "grad_norm": 0.8085320591926575, "learning_rate": 7.876099990995494e-05, "loss": 0.9079141616821289, "memory(GiB)": 91.52, "step": 52355, "token_acc": 0.7381072874493927, "train_speed(iter/s)": 0.147087 }, { "epoch": 0.679406622138457, "grad_norm": 0.7145738005638123, "learning_rate": 7.875661220177115e-05, "loss": 0.8854429244995117, "memory(GiB)": 91.52, "step": 52360, "token_acc": 0.7378831400210886, "train_speed(iter/s)": 0.147084 }, { "epoch": 0.6794715005401127, "grad_norm": 0.7364093661308289, "learning_rate": 7.875222416265814e-05, "loss": 0.9039079666137695, "memory(GiB)": 91.52, "step": 52365, "token_acc": 0.7465987849747583, "train_speed(iter/s)": 0.147081 }, { "epoch": 0.6795363789417684, "grad_norm": 0.7674380540847778, "learning_rate": 7.874783579266637e-05, "loss": 0.8764608383178711, "memory(GiB)": 91.52, "step": 52370, "token_acc": 0.7837008411002501, "train_speed(iter/s)": 0.147079 }, { "epoch": 0.6796012573434241, "grad_norm": 0.8400480151176453, "learning_rate": 7.874344709184636e-05, "loss": 0.9112069129943847, "memory(GiB)": 91.52, "step": 52375, "token_acc": 0.745381984036488, "train_speed(iter/s)": 0.147077 }, { "epoch": 0.6796661357450798, "grad_norm": 0.6281062364578247, "learning_rate": 7.873905806024862e-05, "loss": 0.8714043617248535, "memory(GiB)": 91.52, "step": 52380, "token_acc": 0.7721199727334697, "train_speed(iter/s)": 0.147074 }, { "epoch": 0.6797310141467355, "grad_norm": 0.7427282333374023, "learning_rate": 7.873466869792363e-05, "loss": 0.8984037399291992, "memory(GiB)": 91.52, "step": 52385, "token_acc": 0.7434188192451316, "train_speed(iter/s)": 0.147071 }, { "epoch": 0.6797958925483911, "grad_norm": 0.7806950211524963, "learning_rate": 7.873027900492195e-05, "loss": 0.9119287490844726, "memory(GiB)": 91.52, "step": 52390, "token_acc": 0.7612378284647192, "train_speed(iter/s)": 0.147067 }, { "epoch": 0.6798607709500468, "grad_norm": 0.7589160799980164, "learning_rate": 7.872588898129404e-05, "loss": 0.9055453300476074, "memory(GiB)": 91.52, "step": 52395, "token_acc": 0.7706169931841701, "train_speed(iter/s)": 0.147064 }, { "epoch": 0.6799256493517025, "grad_norm": 0.7549651861190796, "learning_rate": 7.872149862709047e-05, "loss": 0.8607938766479493, "memory(GiB)": 91.52, "step": 52400, "token_acc": 0.7648332109311584, "train_speed(iter/s)": 0.147062 }, { "epoch": 0.6799905277533582, "grad_norm": 0.6930170059204102, "learning_rate": 7.871710794236174e-05, "loss": 0.8976028442382813, "memory(GiB)": 91.52, "step": 52405, "token_acc": 0.768876454331434, "train_speed(iter/s)": 0.147058 }, { "epoch": 0.6800554061550139, "grad_norm": 0.7772046327590942, "learning_rate": 7.871271692715837e-05, "loss": 0.9265779495239258, "memory(GiB)": 91.52, "step": 52410, "token_acc": 0.7581814965505042, "train_speed(iter/s)": 0.147055 }, { "epoch": 0.6801202845566696, "grad_norm": 0.8350245952606201, "learning_rate": 7.870832558153091e-05, "loss": 0.9331499099731445, "memory(GiB)": 91.52, "step": 52415, "token_acc": 0.7424003094677799, "train_speed(iter/s)": 0.147052 }, { "epoch": 0.6801851629583253, "grad_norm": 0.6780168414115906, "learning_rate": 7.870393390552988e-05, "loss": 0.8820781707763672, "memory(GiB)": 91.52, "step": 52420, "token_acc": 0.7573740306646365, "train_speed(iter/s)": 0.14705 }, { "epoch": 0.680250041359981, "grad_norm": 0.7629152536392212, "learning_rate": 7.869954189920583e-05, "loss": 0.9010278701782226, "memory(GiB)": 91.52, "step": 52425, "token_acc": 0.7285563555114201, "train_speed(iter/s)": 0.147047 }, { "epoch": 0.6803149197616367, "grad_norm": 0.7003121972084045, "learning_rate": 7.86951495626093e-05, "loss": 0.8629267692565918, "memory(GiB)": 91.52, "step": 52430, "token_acc": 0.7593918016506284, "train_speed(iter/s)": 0.147044 }, { "epoch": 0.6803797981632924, "grad_norm": 0.728834867477417, "learning_rate": 7.869075689579083e-05, "loss": 0.8659072875976562, "memory(GiB)": 91.52, "step": 52435, "token_acc": 0.7422347099803702, "train_speed(iter/s)": 0.147041 }, { "epoch": 0.6804446765649481, "grad_norm": 0.7059594988822937, "learning_rate": 7.868636389880097e-05, "loss": 0.9116175651550293, "memory(GiB)": 91.52, "step": 52440, "token_acc": 0.7626339440027653, "train_speed(iter/s)": 0.147038 }, { "epoch": 0.6805095549666038, "grad_norm": 0.7215242981910706, "learning_rate": 7.868197057169029e-05, "loss": 0.8878414154052734, "memory(GiB)": 91.52, "step": 52445, "token_acc": 0.783229028178605, "train_speed(iter/s)": 0.147035 }, { "epoch": 0.6805744333682595, "grad_norm": 0.7154508233070374, "learning_rate": 7.867757691450935e-05, "loss": 0.8495329856872559, "memory(GiB)": 91.52, "step": 52450, "token_acc": 0.784058174664512, "train_speed(iter/s)": 0.147033 }, { "epoch": 0.6806393117699152, "grad_norm": 0.8007758259773254, "learning_rate": 7.867318292730867e-05, "loss": 0.8843306541442871, "memory(GiB)": 91.52, "step": 52455, "token_acc": 0.7509893455098935, "train_speed(iter/s)": 0.14703 }, { "epoch": 0.6807041901715709, "grad_norm": 0.7056261301040649, "learning_rate": 7.866878861013887e-05, "loss": 0.8928422927856445, "memory(GiB)": 91.52, "step": 52460, "token_acc": 0.7531333657824288, "train_speed(iter/s)": 0.147027 }, { "epoch": 0.6807690685732266, "grad_norm": 0.7866972088813782, "learning_rate": 7.86643939630505e-05, "loss": 0.8855436325073243, "memory(GiB)": 91.52, "step": 52465, "token_acc": 0.7689747234839291, "train_speed(iter/s)": 0.147025 }, { "epoch": 0.6808339469748823, "grad_norm": 0.7596246004104614, "learning_rate": 7.865999898609412e-05, "loss": 0.9290483474731446, "memory(GiB)": 91.52, "step": 52470, "token_acc": 0.7726183615904886, "train_speed(iter/s)": 0.147022 }, { "epoch": 0.680898825376538, "grad_norm": 0.7259009480476379, "learning_rate": 7.865560367932032e-05, "loss": 0.8963394165039062, "memory(GiB)": 91.52, "step": 52475, "token_acc": 0.7449048311062276, "train_speed(iter/s)": 0.147019 }, { "epoch": 0.6809637037781937, "grad_norm": 0.7119123339653015, "learning_rate": 7.865120804277966e-05, "loss": 0.8677487373352051, "memory(GiB)": 91.52, "step": 52480, "token_acc": 0.7611607142857143, "train_speed(iter/s)": 0.147015 }, { "epoch": 0.6810285821798494, "grad_norm": 0.7470839619636536, "learning_rate": 7.864681207652276e-05, "loss": 0.900844955444336, "memory(GiB)": 91.52, "step": 52485, "token_acc": 0.7592300813354341, "train_speed(iter/s)": 0.147013 }, { "epoch": 0.6810934605815051, "grad_norm": 0.8182344436645508, "learning_rate": 7.864241578060019e-05, "loss": 0.878694725036621, "memory(GiB)": 91.52, "step": 52490, "token_acc": 0.7672145362815813, "train_speed(iter/s)": 0.14701 }, { "epoch": 0.6811583389831608, "grad_norm": 0.7525925040245056, "learning_rate": 7.863801915506252e-05, "loss": 0.9106642723083496, "memory(GiB)": 91.52, "step": 52495, "token_acc": 0.7965383772730592, "train_speed(iter/s)": 0.147008 }, { "epoch": 0.6812232173848165, "grad_norm": 0.7028717994689941, "learning_rate": 7.86336221999604e-05, "loss": 0.8988543510437011, "memory(GiB)": 91.52, "step": 52500, "token_acc": 0.7516992905694017, "train_speed(iter/s)": 0.147004 }, { "epoch": 0.6812880957864722, "grad_norm": 0.6958602666854858, "learning_rate": 7.862922491534439e-05, "loss": 0.8632348060607911, "memory(GiB)": 91.52, "step": 52505, "token_acc": 0.7570552574733468, "train_speed(iter/s)": 0.147001 }, { "epoch": 0.6813529741881279, "grad_norm": 0.7355443835258484, "learning_rate": 7.862482730126509e-05, "loss": 0.8323688507080078, "memory(GiB)": 91.52, "step": 52510, "token_acc": 0.7701348626165868, "train_speed(iter/s)": 0.146998 }, { "epoch": 0.6814178525897836, "grad_norm": 0.7032412886619568, "learning_rate": 7.862042935777313e-05, "loss": 0.8813589096069336, "memory(GiB)": 91.52, "step": 52515, "token_acc": 0.7709606020362992, "train_speed(iter/s)": 0.146995 }, { "epoch": 0.6814827309914393, "grad_norm": 0.7327859401702881, "learning_rate": 7.861603108491911e-05, "loss": 0.9281144142150879, "memory(GiB)": 91.52, "step": 52520, "token_acc": 0.7461796267257018, "train_speed(iter/s)": 0.146991 }, { "epoch": 0.681547609393095, "grad_norm": 0.772486686706543, "learning_rate": 7.861163248275365e-05, "loss": 0.9385141372680664, "memory(GiB)": 91.52, "step": 52525, "token_acc": 0.7390734706218296, "train_speed(iter/s)": 0.14699 }, { "epoch": 0.6816124877947507, "grad_norm": 0.8280343413352966, "learning_rate": 7.860723355132735e-05, "loss": 0.8844381332397461, "memory(GiB)": 91.52, "step": 52530, "token_acc": 0.7742459198496112, "train_speed(iter/s)": 0.146987 }, { "epoch": 0.6816773661964064, "grad_norm": 0.799575924873352, "learning_rate": 7.860283429069086e-05, "loss": 0.8775568008422852, "memory(GiB)": 91.52, "step": 52535, "token_acc": 0.7661706650470695, "train_speed(iter/s)": 0.146984 }, { "epoch": 0.6817422445980621, "grad_norm": 0.7593026161193848, "learning_rate": 7.859843470089478e-05, "loss": 0.9155901908874512, "memory(GiB)": 91.52, "step": 52540, "token_acc": 0.7342089824652891, "train_speed(iter/s)": 0.146981 }, { "epoch": 0.6818071229997178, "grad_norm": 0.8845236897468567, "learning_rate": 7.859403478198976e-05, "loss": 0.9068210601806641, "memory(GiB)": 91.52, "step": 52545, "token_acc": 0.7795155044577192, "train_speed(iter/s)": 0.146978 }, { "epoch": 0.6818720014013735, "grad_norm": 0.8478663563728333, "learning_rate": 7.858963453402644e-05, "loss": 0.8909255981445312, "memory(GiB)": 91.52, "step": 52550, "token_acc": 0.7594032805429864, "train_speed(iter/s)": 0.146975 }, { "epoch": 0.6819368798030292, "grad_norm": 0.7840487360954285, "learning_rate": 7.858523395705544e-05, "loss": 0.8918308258056641, "memory(GiB)": 91.52, "step": 52555, "token_acc": 0.7680881821467294, "train_speed(iter/s)": 0.146972 }, { "epoch": 0.6820017582046849, "grad_norm": 0.7361009120941162, "learning_rate": 7.858083305112741e-05, "loss": 0.8948295593261719, "memory(GiB)": 91.52, "step": 52560, "token_acc": 0.7755232578799616, "train_speed(iter/s)": 0.146969 }, { "epoch": 0.6820666366063406, "grad_norm": 0.8174592852592468, "learning_rate": 7.857643181629298e-05, "loss": 0.8837123870849609, "memory(GiB)": 91.52, "step": 52565, "token_acc": 0.7463830922061471, "train_speed(iter/s)": 0.146966 }, { "epoch": 0.6821315150079963, "grad_norm": 0.8347399830818176, "learning_rate": 7.857203025260282e-05, "loss": 0.9200532913208008, "memory(GiB)": 91.52, "step": 52570, "token_acc": 0.7759152215799615, "train_speed(iter/s)": 0.146963 }, { "epoch": 0.682196393409652, "grad_norm": 0.8194341659545898, "learning_rate": 7.856762836010758e-05, "loss": 0.9143094062805176, "memory(GiB)": 91.52, "step": 52575, "token_acc": 0.7785920796073421, "train_speed(iter/s)": 0.146961 }, { "epoch": 0.6822612718113077, "grad_norm": 0.798202633857727, "learning_rate": 7.856322613885791e-05, "loss": 0.8919048309326172, "memory(GiB)": 91.52, "step": 52580, "token_acc": 0.7468311644414399, "train_speed(iter/s)": 0.146958 }, { "epoch": 0.6823261502129634, "grad_norm": 0.8584627509117126, "learning_rate": 7.855882358890445e-05, "loss": 0.8908964157104492, "memory(GiB)": 91.52, "step": 52585, "token_acc": 0.7929700961059846, "train_speed(iter/s)": 0.146955 }, { "epoch": 0.6823910286146191, "grad_norm": 0.7818546891212463, "learning_rate": 7.855442071029792e-05, "loss": 0.8646865844726562, "memory(GiB)": 91.52, "step": 52590, "token_acc": 0.766354556803995, "train_speed(iter/s)": 0.146953 }, { "epoch": 0.6824559070162748, "grad_norm": 0.7620815634727478, "learning_rate": 7.855001750308894e-05, "loss": 0.9371365547180176, "memory(GiB)": 91.52, "step": 52595, "token_acc": 0.756160394265233, "train_speed(iter/s)": 0.14695 }, { "epoch": 0.6825207854179305, "grad_norm": 0.6506043672561646, "learning_rate": 7.854561396732819e-05, "loss": 0.8574673652648925, "memory(GiB)": 91.52, "step": 52600, "token_acc": 0.7602490498908385, "train_speed(iter/s)": 0.146948 }, { "epoch": 0.6825856638195862, "grad_norm": 0.7349735498428345, "learning_rate": 7.854121010306635e-05, "loss": 0.8868972778320312, "memory(GiB)": 91.52, "step": 52605, "token_acc": 0.7553603811826619, "train_speed(iter/s)": 0.146945 }, { "epoch": 0.6826505422212419, "grad_norm": 0.7806969285011292, "learning_rate": 7.853680591035411e-05, "loss": 0.9642436027526855, "memory(GiB)": 91.52, "step": 52610, "token_acc": 0.7144225312145289, "train_speed(iter/s)": 0.146943 }, { "epoch": 0.6827154206228976, "grad_norm": 0.7920486927032471, "learning_rate": 7.853240138924214e-05, "loss": 0.891960334777832, "memory(GiB)": 91.52, "step": 52615, "token_acc": 0.76389644468623, "train_speed(iter/s)": 0.146941 }, { "epoch": 0.6827802990245533, "grad_norm": 0.827782928943634, "learning_rate": 7.852799653978113e-05, "loss": 0.8692796707153321, "memory(GiB)": 91.52, "step": 52620, "token_acc": 0.7586994655896871, "train_speed(iter/s)": 0.146937 }, { "epoch": 0.682845177426209, "grad_norm": 0.8254592418670654, "learning_rate": 7.852359136202178e-05, "loss": 0.8584278106689454, "memory(GiB)": 91.52, "step": 52625, "token_acc": 0.767070254677952, "train_speed(iter/s)": 0.146934 }, { "epoch": 0.6829100558278646, "grad_norm": 0.7778552770614624, "learning_rate": 7.851918585601477e-05, "loss": 0.8704526901245118, "memory(GiB)": 91.52, "step": 52630, "token_acc": 0.7663258452123293, "train_speed(iter/s)": 0.146931 }, { "epoch": 0.6829749342295203, "grad_norm": 0.6824995875358582, "learning_rate": 7.85147800218108e-05, "loss": 0.8962245941162109, "memory(GiB)": 91.52, "step": 52635, "token_acc": 0.7698605566163281, "train_speed(iter/s)": 0.146928 }, { "epoch": 0.683039812631176, "grad_norm": 0.7817044854164124, "learning_rate": 7.851037385946058e-05, "loss": 0.9453998565673828, "memory(GiB)": 91.52, "step": 52640, "token_acc": 0.757518652411353, "train_speed(iter/s)": 0.146926 }, { "epoch": 0.6831046910328317, "grad_norm": 0.8178317546844482, "learning_rate": 7.850596736901483e-05, "loss": 0.9214133262634278, "memory(GiB)": 91.52, "step": 52645, "token_acc": 0.7258791097962751, "train_speed(iter/s)": 0.146923 }, { "epoch": 0.6831695694344874, "grad_norm": 0.7096356153488159, "learning_rate": 7.850156055052422e-05, "loss": 0.8524030685424805, "memory(GiB)": 91.52, "step": 52650, "token_acc": 0.7635585467851542, "train_speed(iter/s)": 0.14692 }, { "epoch": 0.6832344478361431, "grad_norm": 0.8161994814872742, "learning_rate": 7.849715340403949e-05, "loss": 0.890574836730957, "memory(GiB)": 91.52, "step": 52655, "token_acc": 0.7551721643518519, "train_speed(iter/s)": 0.146916 }, { "epoch": 0.6832993262377988, "grad_norm": 0.8019680380821228, "learning_rate": 7.849274592961135e-05, "loss": 0.8685657501220703, "memory(GiB)": 91.52, "step": 52660, "token_acc": 0.7550046710262912, "train_speed(iter/s)": 0.146913 }, { "epoch": 0.6833642046394545, "grad_norm": 0.8266753554344177, "learning_rate": 7.848833812729053e-05, "loss": 0.9682591438293457, "memory(GiB)": 91.52, "step": 52665, "token_acc": 0.7363304981773997, "train_speed(iter/s)": 0.14691 }, { "epoch": 0.6834290830411102, "grad_norm": 0.736306369304657, "learning_rate": 7.848392999712774e-05, "loss": 0.84891357421875, "memory(GiB)": 91.52, "step": 52670, "token_acc": 0.7769825077761798, "train_speed(iter/s)": 0.146907 }, { "epoch": 0.6834939614427659, "grad_norm": 0.7201395034790039, "learning_rate": 7.847952153917372e-05, "loss": 0.8379700660705567, "memory(GiB)": 91.52, "step": 52675, "token_acc": 0.7625764573711147, "train_speed(iter/s)": 0.146904 }, { "epoch": 0.6835588398444216, "grad_norm": 0.6513803005218506, "learning_rate": 7.847511275347921e-05, "loss": 0.9531154632568359, "memory(GiB)": 91.52, "step": 52680, "token_acc": 0.733836267113719, "train_speed(iter/s)": 0.1469 }, { "epoch": 0.6836237182460773, "grad_norm": 0.7512143850326538, "learning_rate": 7.847070364009493e-05, "loss": 0.8388360977172852, "memory(GiB)": 91.52, "step": 52685, "token_acc": 0.7707088921447086, "train_speed(iter/s)": 0.146897 }, { "epoch": 0.683688596647733, "grad_norm": 0.8808886408805847, "learning_rate": 7.846629419907163e-05, "loss": 0.8933328628540039, "memory(GiB)": 91.52, "step": 52690, "token_acc": 0.7552439666190512, "train_speed(iter/s)": 0.146894 }, { "epoch": 0.6837534750493887, "grad_norm": 0.8488366007804871, "learning_rate": 7.846188443046003e-05, "loss": 0.8736274719238282, "memory(GiB)": 91.52, "step": 52695, "token_acc": 0.7728231186698469, "train_speed(iter/s)": 0.146892 }, { "epoch": 0.6838183534510444, "grad_norm": 0.7086341381072998, "learning_rate": 7.845747433431092e-05, "loss": 0.8811317443847656, "memory(GiB)": 91.52, "step": 52700, "token_acc": 0.7582717266305241, "train_speed(iter/s)": 0.146888 }, { "epoch": 0.6838832318527001, "grad_norm": 0.6579958200454712, "learning_rate": 7.845306391067501e-05, "loss": 0.9067527770996093, "memory(GiB)": 91.52, "step": 52705, "token_acc": 0.756044957472661, "train_speed(iter/s)": 0.146885 }, { "epoch": 0.6839481102543558, "grad_norm": 0.7162171602249146, "learning_rate": 7.844865315960308e-05, "loss": 0.8722820281982422, "memory(GiB)": 91.52, "step": 52710, "token_acc": 0.7598246674727932, "train_speed(iter/s)": 0.146883 }, { "epoch": 0.6840129886560115, "grad_norm": 0.7924575805664062, "learning_rate": 7.844424208114587e-05, "loss": 0.9186459541320801, "memory(GiB)": 91.52, "step": 52715, "token_acc": 0.7496367831809247, "train_speed(iter/s)": 0.14688 }, { "epoch": 0.6840778670576672, "grad_norm": 0.6589076519012451, "learning_rate": 7.843983067535417e-05, "loss": 0.8623578071594238, "memory(GiB)": 91.52, "step": 52720, "token_acc": 0.7665995975855131, "train_speed(iter/s)": 0.146876 }, { "epoch": 0.6841427454593229, "grad_norm": 0.7270414233207703, "learning_rate": 7.84354189422787e-05, "loss": 0.8628097534179687, "memory(GiB)": 91.52, "step": 52725, "token_acc": 0.7648560901761938, "train_speed(iter/s)": 0.146873 }, { "epoch": 0.6842076238609786, "grad_norm": 0.7596540451049805, "learning_rate": 7.843100688197028e-05, "loss": 0.9445301055908203, "memory(GiB)": 91.52, "step": 52730, "token_acc": 0.763399477438034, "train_speed(iter/s)": 0.146871 }, { "epoch": 0.6842725022626343, "grad_norm": 0.7434234023094177, "learning_rate": 7.842659449447967e-05, "loss": 0.9030994415283203, "memory(GiB)": 91.52, "step": 52735, "token_acc": 0.7684804082976622, "train_speed(iter/s)": 0.146868 }, { "epoch": 0.68433738066429, "grad_norm": 0.7609714269638062, "learning_rate": 7.842218177985763e-05, "loss": 0.8968521118164062, "memory(GiB)": 91.52, "step": 52740, "token_acc": 0.7594994073764205, "train_speed(iter/s)": 0.146865 }, { "epoch": 0.6844022590659457, "grad_norm": 0.8218116760253906, "learning_rate": 7.841776873815494e-05, "loss": 0.918303394317627, "memory(GiB)": 91.52, "step": 52745, "token_acc": 0.7413137986512854, "train_speed(iter/s)": 0.146862 }, { "epoch": 0.6844671374676013, "grad_norm": 0.760473370552063, "learning_rate": 7.84133553694224e-05, "loss": 0.9224198341369629, "memory(GiB)": 91.52, "step": 52750, "token_acc": 0.7400581217497706, "train_speed(iter/s)": 0.14686 }, { "epoch": 0.684532015869257, "grad_norm": 0.8139374256134033, "learning_rate": 7.840894167371081e-05, "loss": 0.9540048599243164, "memory(GiB)": 91.52, "step": 52755, "token_acc": 0.7475707496985602, "train_speed(iter/s)": 0.146858 }, { "epoch": 0.6845968942709127, "grad_norm": 0.753373920917511, "learning_rate": 7.840452765107094e-05, "loss": 0.8657997131347657, "memory(GiB)": 91.52, "step": 52760, "token_acc": 0.7642894018732945, "train_speed(iter/s)": 0.146854 }, { "epoch": 0.6846617726725684, "grad_norm": 0.7902403473854065, "learning_rate": 7.840011330155359e-05, "loss": 0.9194788932800293, "memory(GiB)": 91.52, "step": 52765, "token_acc": 0.7442832992259097, "train_speed(iter/s)": 0.146852 }, { "epoch": 0.6847266510742241, "grad_norm": 0.804527759552002, "learning_rate": 7.839569862520957e-05, "loss": 0.9076534271240234, "memory(GiB)": 91.52, "step": 52770, "token_acc": 0.7494151686776656, "train_speed(iter/s)": 0.146849 }, { "epoch": 0.6847915294758798, "grad_norm": 0.7864199280738831, "learning_rate": 7.839128362208967e-05, "loss": 0.944795799255371, "memory(GiB)": 91.52, "step": 52775, "token_acc": 0.7346093778375041, "train_speed(iter/s)": 0.146847 }, { "epoch": 0.6848564078775355, "grad_norm": 0.6992807388305664, "learning_rate": 7.838686829224471e-05, "loss": 0.9096799850463867, "memory(GiB)": 91.52, "step": 52780, "token_acc": 0.7559525753666264, "train_speed(iter/s)": 0.146845 }, { "epoch": 0.6849212862791912, "grad_norm": 0.7311533093452454, "learning_rate": 7.83824526357255e-05, "loss": 0.8874481201171875, "memory(GiB)": 91.52, "step": 52785, "token_acc": 0.7442298167409609, "train_speed(iter/s)": 0.146842 }, { "epoch": 0.6849861646808469, "grad_norm": 0.7305067181587219, "learning_rate": 7.837803665258285e-05, "loss": 0.9149899482727051, "memory(GiB)": 91.52, "step": 52790, "token_acc": 0.7430644629310661, "train_speed(iter/s)": 0.146839 }, { "epoch": 0.6850510430825026, "grad_norm": 0.7987533807754517, "learning_rate": 7.837362034286759e-05, "loss": 0.8785176277160645, "memory(GiB)": 91.52, "step": 52795, "token_acc": 0.7686188724271652, "train_speed(iter/s)": 0.146836 }, { "epoch": 0.6851159214841583, "grad_norm": 0.6639001369476318, "learning_rate": 7.83692037066305e-05, "loss": 0.8905490875244141, "memory(GiB)": 91.52, "step": 52800, "token_acc": 0.7408515614252305, "train_speed(iter/s)": 0.146832 }, { "epoch": 0.685180799885814, "grad_norm": 0.7175312638282776, "learning_rate": 7.836478674392248e-05, "loss": 0.9055941581726075, "memory(GiB)": 91.52, "step": 52805, "token_acc": 0.7742640874684609, "train_speed(iter/s)": 0.14683 }, { "epoch": 0.6852456782874697, "grad_norm": 0.8379146456718445, "learning_rate": 7.83603694547943e-05, "loss": 0.9266815185546875, "memory(GiB)": 91.52, "step": 52810, "token_acc": 0.7454357834576917, "train_speed(iter/s)": 0.146827 }, { "epoch": 0.6853105566891254, "grad_norm": 0.7854076623916626, "learning_rate": 7.83559518392968e-05, "loss": 0.9117303848266601, "memory(GiB)": 91.52, "step": 52815, "token_acc": 0.7534395120298204, "train_speed(iter/s)": 0.146824 }, { "epoch": 0.6853754350907811, "grad_norm": 0.7561267018318176, "learning_rate": 7.835153389748085e-05, "loss": 0.9191631317138672, "memory(GiB)": 91.52, "step": 52820, "token_acc": 0.7509745188954084, "train_speed(iter/s)": 0.146821 }, { "epoch": 0.6854403134924368, "grad_norm": 0.779404878616333, "learning_rate": 7.834711562939728e-05, "loss": 0.8984826087951661, "memory(GiB)": 91.52, "step": 52825, "token_acc": 0.7572995898909881, "train_speed(iter/s)": 0.146818 }, { "epoch": 0.6855051918940925, "grad_norm": 0.777435302734375, "learning_rate": 7.834269703509691e-05, "loss": 0.9106966018676758, "memory(GiB)": 91.52, "step": 52830, "token_acc": 0.7537206732956392, "train_speed(iter/s)": 0.146815 }, { "epoch": 0.6855700702957482, "grad_norm": 0.7266409397125244, "learning_rate": 7.833827811463062e-05, "loss": 0.8822023391723632, "memory(GiB)": 91.52, "step": 52835, "token_acc": 0.7444191343963553, "train_speed(iter/s)": 0.146813 }, { "epoch": 0.6856349486974039, "grad_norm": 0.7400366067886353, "learning_rate": 7.833385886804924e-05, "loss": 0.8657123565673828, "memory(GiB)": 91.52, "step": 52840, "token_acc": 0.7776932826362484, "train_speed(iter/s)": 0.14681 }, { "epoch": 0.6856998270990596, "grad_norm": 0.65495765209198, "learning_rate": 7.832943929540364e-05, "loss": 0.9028184890747071, "memory(GiB)": 91.52, "step": 52845, "token_acc": 0.7400841346153846, "train_speed(iter/s)": 0.146807 }, { "epoch": 0.6857647055007153, "grad_norm": 0.719551682472229, "learning_rate": 7.832501939674467e-05, "loss": 0.909945297241211, "memory(GiB)": 91.52, "step": 52850, "token_acc": 0.7476891365261361, "train_speed(iter/s)": 0.146804 }, { "epoch": 0.685829583902371, "grad_norm": 0.731586754322052, "learning_rate": 7.83205991721232e-05, "loss": 0.9281535148620605, "memory(GiB)": 91.52, "step": 52855, "token_acc": 0.746239837398374, "train_speed(iter/s)": 0.146801 }, { "epoch": 0.6858944623040267, "grad_norm": 0.760845959186554, "learning_rate": 7.831617862159011e-05, "loss": 0.9039603233337402, "memory(GiB)": 91.52, "step": 52860, "token_acc": 0.7670669811995139, "train_speed(iter/s)": 0.146798 }, { "epoch": 0.6859593407056824, "grad_norm": 0.7508227825164795, "learning_rate": 7.831175774519626e-05, "loss": 0.9162050247192383, "memory(GiB)": 91.52, "step": 52865, "token_acc": 0.7523286556843604, "train_speed(iter/s)": 0.146796 }, { "epoch": 0.686024219107338, "grad_norm": 0.752794086933136, "learning_rate": 7.830733654299251e-05, "loss": 0.9290068626403809, "memory(GiB)": 91.52, "step": 52870, "token_acc": 0.7451247015123375, "train_speed(iter/s)": 0.146793 }, { "epoch": 0.6860890975089937, "grad_norm": 0.8734341859817505, "learning_rate": 7.830291501502975e-05, "loss": 0.908657455444336, "memory(GiB)": 91.52, "step": 52875, "token_acc": 0.7601397244323695, "train_speed(iter/s)": 0.14679 }, { "epoch": 0.6861539759106494, "grad_norm": 0.8149822354316711, "learning_rate": 7.829849316135888e-05, "loss": 0.9329640388488769, "memory(GiB)": 91.52, "step": 52880, "token_acc": 0.7582232893157262, "train_speed(iter/s)": 0.146787 }, { "epoch": 0.6862188543123051, "grad_norm": 0.8233280777931213, "learning_rate": 7.829407098203077e-05, "loss": 0.9588438034057617, "memory(GiB)": 91.52, "step": 52885, "token_acc": 0.7479687812687612, "train_speed(iter/s)": 0.146785 }, { "epoch": 0.6862837327139608, "grad_norm": 0.781002938747406, "learning_rate": 7.82896484770963e-05, "loss": 0.8882855415344239, "memory(GiB)": 91.52, "step": 52890, "token_acc": 0.7530129730559607, "train_speed(iter/s)": 0.146783 }, { "epoch": 0.6863486111156165, "grad_norm": 0.7826159596443176, "learning_rate": 7.828522564660638e-05, "loss": 0.9409215927124024, "memory(GiB)": 91.52, "step": 52895, "token_acc": 0.7547277936962751, "train_speed(iter/s)": 0.146781 }, { "epoch": 0.6864134895172722, "grad_norm": 0.799563467502594, "learning_rate": 7.828080249061191e-05, "loss": 0.8571962356567383, "memory(GiB)": 91.52, "step": 52900, "token_acc": 0.7563531569295258, "train_speed(iter/s)": 0.146779 }, { "epoch": 0.6864783679189279, "grad_norm": 0.7231101393699646, "learning_rate": 7.827637900916378e-05, "loss": 0.8779491424560547, "memory(GiB)": 91.52, "step": 52905, "token_acc": 0.7455349606671393, "train_speed(iter/s)": 0.146775 }, { "epoch": 0.6865432463205836, "grad_norm": 0.7392633557319641, "learning_rate": 7.82719552023129e-05, "loss": 0.9102073669433594, "memory(GiB)": 91.52, "step": 52910, "token_acc": 0.7406635778475699, "train_speed(iter/s)": 0.146772 }, { "epoch": 0.6866081247222393, "grad_norm": 0.6717745065689087, "learning_rate": 7.826753107011018e-05, "loss": 0.9011273384094238, "memory(GiB)": 91.52, "step": 52915, "token_acc": 0.7561209964412812, "train_speed(iter/s)": 0.146769 }, { "epoch": 0.686673003123895, "grad_norm": 0.7623025178909302, "learning_rate": 7.826310661260654e-05, "loss": 0.9033254623413086, "memory(GiB)": 91.52, "step": 52920, "token_acc": 0.755073071587573, "train_speed(iter/s)": 0.146766 }, { "epoch": 0.6867378815255507, "grad_norm": 0.736443817615509, "learning_rate": 7.825868182985288e-05, "loss": 0.8518270492553711, "memory(GiB)": 91.52, "step": 52925, "token_acc": 0.7560357223946114, "train_speed(iter/s)": 0.146763 }, { "epoch": 0.6868027599272064, "grad_norm": 0.6233806014060974, "learning_rate": 7.825425672190013e-05, "loss": 0.8426219940185546, "memory(GiB)": 91.52, "step": 52930, "token_acc": 0.7947844634809234, "train_speed(iter/s)": 0.14676 }, { "epoch": 0.6868676383288621, "grad_norm": 0.7381251454353333, "learning_rate": 7.824983128879922e-05, "loss": 0.8937581062316895, "memory(GiB)": 91.52, "step": 52935, "token_acc": 0.7638722619471318, "train_speed(iter/s)": 0.146757 }, { "epoch": 0.6869325167305178, "grad_norm": 0.7581550478935242, "learning_rate": 7.824540553060105e-05, "loss": 0.8798245429992676, "memory(GiB)": 91.52, "step": 52940, "token_acc": 0.7767014764133957, "train_speed(iter/s)": 0.146753 }, { "epoch": 0.6869973951321735, "grad_norm": 0.7967044711112976, "learning_rate": 7.824097944735658e-05, "loss": 0.9202600479125976, "memory(GiB)": 91.52, "step": 52945, "token_acc": 0.7526026290437863, "train_speed(iter/s)": 0.14675 }, { "epoch": 0.6870622735338292, "grad_norm": 0.7675424218177795, "learning_rate": 7.823655303911673e-05, "loss": 0.8741280555725097, "memory(GiB)": 91.52, "step": 52950, "token_acc": 0.7566164710142741, "train_speed(iter/s)": 0.146747 }, { "epoch": 0.6871271519354849, "grad_norm": 0.7837675213813782, "learning_rate": 7.823212630593246e-05, "loss": 0.9278427124023437, "memory(GiB)": 91.52, "step": 52955, "token_acc": 0.7588341985114766, "train_speed(iter/s)": 0.146745 }, { "epoch": 0.6871920303371406, "grad_norm": 0.8055779337882996, "learning_rate": 7.82276992478547e-05, "loss": 0.9276312828063965, "memory(GiB)": 91.52, "step": 52960, "token_acc": 0.7566097406704617, "train_speed(iter/s)": 0.146741 }, { "epoch": 0.6872569087387963, "grad_norm": 0.7750841379165649, "learning_rate": 7.822327186493437e-05, "loss": 0.8895898818969726, "memory(GiB)": 91.52, "step": 52965, "token_acc": 0.7661360736113909, "train_speed(iter/s)": 0.146738 }, { "epoch": 0.687321787140452, "grad_norm": 0.7390322089195251, "learning_rate": 7.821884415722247e-05, "loss": 0.8956563949584961, "memory(GiB)": 91.52, "step": 52970, "token_acc": 0.7521213936308276, "train_speed(iter/s)": 0.146735 }, { "epoch": 0.6873866655421077, "grad_norm": 0.7136190533638, "learning_rate": 7.82144161247699e-05, "loss": 0.9320444107055664, "memory(GiB)": 91.52, "step": 52975, "token_acc": 0.7576508752601298, "train_speed(iter/s)": 0.146732 }, { "epoch": 0.6874515439437634, "grad_norm": 0.860022246837616, "learning_rate": 7.820998776762767e-05, "loss": 0.8687767028808594, "memory(GiB)": 91.52, "step": 52980, "token_acc": 0.7640757427770251, "train_speed(iter/s)": 0.146729 }, { "epoch": 0.6875164223454191, "grad_norm": 0.7829903960227966, "learning_rate": 7.82055590858467e-05, "loss": 0.9063728332519532, "memory(GiB)": 91.52, "step": 52985, "token_acc": 0.7512052477673279, "train_speed(iter/s)": 0.146726 }, { "epoch": 0.6875813007470748, "grad_norm": 0.8638331294059753, "learning_rate": 7.820113007947798e-05, "loss": 0.8938888549804688, "memory(GiB)": 91.52, "step": 52990, "token_acc": 0.7726092505391013, "train_speed(iter/s)": 0.146724 }, { "epoch": 0.6876461791487305, "grad_norm": 0.7195773720741272, "learning_rate": 7.819670074857247e-05, "loss": 0.9091549873352051, "memory(GiB)": 91.52, "step": 52995, "token_acc": 0.762052611825719, "train_speed(iter/s)": 0.14672 }, { "epoch": 0.6877110575503862, "grad_norm": 0.8055015206336975, "learning_rate": 7.819227109318115e-05, "loss": 0.9089571952819824, "memory(GiB)": 91.52, "step": 53000, "token_acc": 0.7560409556313993, "train_speed(iter/s)": 0.146717 }, { "epoch": 0.6877759359520419, "grad_norm": 0.7346208691596985, "learning_rate": 7.818784111335498e-05, "loss": 0.8357851028442382, "memory(GiB)": 91.52, "step": 53005, "token_acc": 0.7674241923762964, "train_speed(iter/s)": 0.146714 }, { "epoch": 0.6878408143536976, "grad_norm": 0.7391648292541504, "learning_rate": 7.818341080914494e-05, "loss": 0.9210269927978516, "memory(GiB)": 91.52, "step": 53010, "token_acc": 0.7375170141127588, "train_speed(iter/s)": 0.146712 }, { "epoch": 0.6879056927553533, "grad_norm": 0.8432331085205078, "learning_rate": 7.817898018060203e-05, "loss": 0.9147100448608398, "memory(GiB)": 91.52, "step": 53015, "token_acc": 0.7512622512622512, "train_speed(iter/s)": 0.146709 }, { "epoch": 0.687970571157009, "grad_norm": 0.7325670123100281, "learning_rate": 7.817454922777723e-05, "loss": 0.8979608535766601, "memory(GiB)": 91.52, "step": 53020, "token_acc": 0.7462386466061153, "train_speed(iter/s)": 0.146706 }, { "epoch": 0.6880354495586647, "grad_norm": 0.7267540097236633, "learning_rate": 7.817011795072153e-05, "loss": 0.8902153015136719, "memory(GiB)": 91.52, "step": 53025, "token_acc": 0.7509352462493732, "train_speed(iter/s)": 0.146704 }, { "epoch": 0.6881003279603204, "grad_norm": 0.8200505971908569, "learning_rate": 7.816568634948593e-05, "loss": 0.8835861206054687, "memory(GiB)": 91.52, "step": 53030, "token_acc": 0.7471841081302478, "train_speed(iter/s)": 0.146702 }, { "epoch": 0.6881652063619761, "grad_norm": 0.7310588359832764, "learning_rate": 7.81612544241214e-05, "loss": 0.923075008392334, "memory(GiB)": 91.52, "step": 53035, "token_acc": 0.7515314177455005, "train_speed(iter/s)": 0.146699 }, { "epoch": 0.6882300847636318, "grad_norm": 0.7489011287689209, "learning_rate": 7.815682217467901e-05, "loss": 0.8726493835449218, "memory(GiB)": 91.52, "step": 53040, "token_acc": 0.763532101205006, "train_speed(iter/s)": 0.146696 }, { "epoch": 0.6882949631652875, "grad_norm": 0.6950169205665588, "learning_rate": 7.81523896012097e-05, "loss": 0.8871644973754883, "memory(GiB)": 91.52, "step": 53045, "token_acc": 0.7498329323710238, "train_speed(iter/s)": 0.146694 }, { "epoch": 0.6883598415669432, "grad_norm": 0.7174971699714661, "learning_rate": 7.814795670376449e-05, "loss": 0.9275074958801269, "memory(GiB)": 91.52, "step": 53050, "token_acc": 0.7498369636102779, "train_speed(iter/s)": 0.14669 }, { "epoch": 0.6884247199685989, "grad_norm": 0.7651169896125793, "learning_rate": 7.814352348239442e-05, "loss": 0.8979488372802734, "memory(GiB)": 91.52, "step": 53055, "token_acc": 0.7629491898455584, "train_speed(iter/s)": 0.146687 }, { "epoch": 0.6884895983702546, "grad_norm": 0.7390053272247314, "learning_rate": 7.81390899371505e-05, "loss": 0.9309833526611329, "memory(GiB)": 91.52, "step": 53060, "token_acc": 0.7516661377340527, "train_speed(iter/s)": 0.146684 }, { "epoch": 0.6885544767719103, "grad_norm": 0.747600257396698, "learning_rate": 7.813465606808374e-05, "loss": 0.8849846839904785, "memory(GiB)": 91.52, "step": 53065, "token_acc": 0.7387198693599347, "train_speed(iter/s)": 0.146681 }, { "epoch": 0.688619355173566, "grad_norm": 0.8403436541557312, "learning_rate": 7.813022187524516e-05, "loss": 0.8931191444396973, "memory(GiB)": 91.52, "step": 53070, "token_acc": 0.7565168774896918, "train_speed(iter/s)": 0.146679 }, { "epoch": 0.6886842335752217, "grad_norm": 0.7146639227867126, "learning_rate": 7.812578735868579e-05, "loss": 0.8403548240661621, "memory(GiB)": 91.52, "step": 53075, "token_acc": 0.7567017305734646, "train_speed(iter/s)": 0.146676 }, { "epoch": 0.6887491119768774, "grad_norm": 0.8551980257034302, "learning_rate": 7.812135251845669e-05, "loss": 0.8931134223937989, "memory(GiB)": 91.52, "step": 53080, "token_acc": 0.7778853860539674, "train_speed(iter/s)": 0.146672 }, { "epoch": 0.6888139903785331, "grad_norm": 0.6757500171661377, "learning_rate": 7.811691735460887e-05, "loss": 0.8581117630004883, "memory(GiB)": 91.52, "step": 53085, "token_acc": 0.7677186712193254, "train_speed(iter/s)": 0.146669 }, { "epoch": 0.6888788687801888, "grad_norm": 0.7643688321113586, "learning_rate": 7.811248186719336e-05, "loss": 0.9105674743652343, "memory(GiB)": 91.52, "step": 53090, "token_acc": 0.7579642443187805, "train_speed(iter/s)": 0.146666 }, { "epoch": 0.6889437471818445, "grad_norm": 0.8039963245391846, "learning_rate": 7.810804605626123e-05, "loss": 0.944854736328125, "memory(GiB)": 91.52, "step": 53095, "token_acc": 0.7646252421397706, "train_speed(iter/s)": 0.146664 }, { "epoch": 0.6890086255835002, "grad_norm": 0.719937264919281, "learning_rate": 7.810360992186352e-05, "loss": 0.9285686492919922, "memory(GiB)": 91.52, "step": 53100, "token_acc": 0.7602296941023511, "train_speed(iter/s)": 0.146662 }, { "epoch": 0.6890735039851558, "grad_norm": 0.7484157085418701, "learning_rate": 7.809917346405127e-05, "loss": 0.8580427169799805, "memory(GiB)": 91.52, "step": 53105, "token_acc": 0.7591192257963405, "train_speed(iter/s)": 0.146658 }, { "epoch": 0.6891383823868115, "grad_norm": 0.7283434271812439, "learning_rate": 7.809473668287552e-05, "loss": 0.8940988540649414, "memory(GiB)": 91.52, "step": 53110, "token_acc": 0.7723769783090861, "train_speed(iter/s)": 0.146656 }, { "epoch": 0.6892032607884672, "grad_norm": 0.7882735729217529, "learning_rate": 7.809029957838739e-05, "loss": 0.9001537322998047, "memory(GiB)": 91.52, "step": 53115, "token_acc": 0.7307756631100111, "train_speed(iter/s)": 0.146653 }, { "epoch": 0.6892681391901229, "grad_norm": 0.8904600739479065, "learning_rate": 7.808586215063786e-05, "loss": 0.898440170288086, "memory(GiB)": 91.52, "step": 53120, "token_acc": 0.7575484614830295, "train_speed(iter/s)": 0.146651 }, { "epoch": 0.6893330175917786, "grad_norm": 0.664584755897522, "learning_rate": 7.808142439967807e-05, "loss": 0.8554093360900878, "memory(GiB)": 91.52, "step": 53125, "token_acc": 0.7471052724761871, "train_speed(iter/s)": 0.146648 }, { "epoch": 0.6893978959934343, "grad_norm": 0.8433821201324463, "learning_rate": 7.807698632555903e-05, "loss": 0.8888032913208008, "memory(GiB)": 91.52, "step": 53130, "token_acc": 0.7705462154556606, "train_speed(iter/s)": 0.146644 }, { "epoch": 0.68946277439509, "grad_norm": 0.7236877083778381, "learning_rate": 7.807254792833185e-05, "loss": 0.9193913459777832, "memory(GiB)": 91.52, "step": 53135, "token_acc": 0.7559199696087122, "train_speed(iter/s)": 0.146641 }, { "epoch": 0.6895276527967457, "grad_norm": 0.7424578666687012, "learning_rate": 7.806810920804759e-05, "loss": 0.8849760055541992, "memory(GiB)": 91.52, "step": 53140, "token_acc": 0.7516310066643284, "train_speed(iter/s)": 0.146639 }, { "epoch": 0.6895925311984014, "grad_norm": 0.8448354601860046, "learning_rate": 7.806367016475733e-05, "loss": 0.877581787109375, "memory(GiB)": 91.52, "step": 53145, "token_acc": 0.7748539143222012, "train_speed(iter/s)": 0.146635 }, { "epoch": 0.6896574096000571, "grad_norm": 0.72123122215271, "learning_rate": 7.805923079851218e-05, "loss": 0.9042821884155273, "memory(GiB)": 91.52, "step": 53150, "token_acc": 0.7374501155219492, "train_speed(iter/s)": 0.146633 }, { "epoch": 0.6897222880017128, "grad_norm": 0.7711699604988098, "learning_rate": 7.805479110936321e-05, "loss": 0.8962089538574218, "memory(GiB)": 91.52, "step": 53155, "token_acc": 0.7645448560241173, "train_speed(iter/s)": 0.14663 }, { "epoch": 0.6897871664033685, "grad_norm": 0.6985358595848083, "learning_rate": 7.805035109736147e-05, "loss": 0.8536133766174316, "memory(GiB)": 91.52, "step": 53160, "token_acc": 0.7511122456421852, "train_speed(iter/s)": 0.146628 }, { "epoch": 0.6898520448050242, "grad_norm": 0.7541661262512207, "learning_rate": 7.804591076255813e-05, "loss": 0.9214390754699707, "memory(GiB)": 91.52, "step": 53165, "token_acc": 0.7711488791811486, "train_speed(iter/s)": 0.146625 }, { "epoch": 0.6899169232066799, "grad_norm": 0.7162015438079834, "learning_rate": 7.804147010500425e-05, "loss": 0.8719827651977539, "memory(GiB)": 91.52, "step": 53170, "token_acc": 0.7714014619061006, "train_speed(iter/s)": 0.146622 }, { "epoch": 0.6899818016083356, "grad_norm": 0.7842923998832703, "learning_rate": 7.803702912475092e-05, "loss": 0.9135128021240234, "memory(GiB)": 91.52, "step": 53175, "token_acc": 0.7595353947574949, "train_speed(iter/s)": 0.14662 }, { "epoch": 0.6900466800099913, "grad_norm": 0.6643728613853455, "learning_rate": 7.803258782184928e-05, "loss": 0.9061439514160157, "memory(GiB)": 91.52, "step": 53180, "token_acc": 0.7607983738919316, "train_speed(iter/s)": 0.146617 }, { "epoch": 0.690111558411647, "grad_norm": 0.7115989327430725, "learning_rate": 7.802814619635041e-05, "loss": 0.9180892944335938, "memory(GiB)": 91.52, "step": 53185, "token_acc": 0.7413392586607414, "train_speed(iter/s)": 0.146614 }, { "epoch": 0.6901764368133027, "grad_norm": 0.7112090587615967, "learning_rate": 7.802370424830545e-05, "loss": 0.9070938110351563, "memory(GiB)": 91.52, "step": 53190, "token_acc": 0.7599908130454754, "train_speed(iter/s)": 0.146612 }, { "epoch": 0.6902413152149584, "grad_norm": 0.7996149659156799, "learning_rate": 7.80192619777655e-05, "loss": 0.9225704193115234, "memory(GiB)": 91.52, "step": 53195, "token_acc": 0.7375366568914956, "train_speed(iter/s)": 0.14661 }, { "epoch": 0.690306193616614, "grad_norm": 0.7184674739837646, "learning_rate": 7.80148193847817e-05, "loss": 0.9095334053039551, "memory(GiB)": 91.52, "step": 53200, "token_acc": 0.7689239912417892, "train_speed(iter/s)": 0.146608 }, { "epoch": 0.6903710720182697, "grad_norm": 0.7048489451408386, "learning_rate": 7.801037646940515e-05, "loss": 0.8740556716918946, "memory(GiB)": 91.52, "step": 53205, "token_acc": 0.7389043271139341, "train_speed(iter/s)": 0.146605 }, { "epoch": 0.6904359504199254, "grad_norm": 0.8357012271881104, "learning_rate": 7.800593323168699e-05, "loss": 0.8548256874084472, "memory(GiB)": 91.52, "step": 53210, "token_acc": 0.7628266158891357, "train_speed(iter/s)": 0.146603 }, { "epoch": 0.6905008288215811, "grad_norm": 0.7835836410522461, "learning_rate": 7.800148967167834e-05, "loss": 0.934748363494873, "memory(GiB)": 91.52, "step": 53215, "token_acc": 0.7668207853150231, "train_speed(iter/s)": 0.1466 }, { "epoch": 0.6905657072232368, "grad_norm": 0.7273356318473816, "learning_rate": 7.799704578943036e-05, "loss": 0.8950252532958984, "memory(GiB)": 91.52, "step": 53220, "token_acc": 0.7527765805788549, "train_speed(iter/s)": 0.146597 }, { "epoch": 0.6906305856248925, "grad_norm": 0.8128920197486877, "learning_rate": 7.799260158499419e-05, "loss": 0.9453206062316895, "memory(GiB)": 91.52, "step": 53225, "token_acc": 0.7466297259857418, "train_speed(iter/s)": 0.146595 }, { "epoch": 0.6906954640265482, "grad_norm": 0.7833540439605713, "learning_rate": 7.798815705842094e-05, "loss": 0.9439162254333496, "memory(GiB)": 91.52, "step": 53230, "token_acc": 0.7583974232564255, "train_speed(iter/s)": 0.146592 }, { "epoch": 0.6907603424282039, "grad_norm": 0.8358412981033325, "learning_rate": 7.798371220976179e-05, "loss": 0.8938985824584961, "memory(GiB)": 91.52, "step": 53235, "token_acc": 0.7592466087635406, "train_speed(iter/s)": 0.146589 }, { "epoch": 0.6908252208298596, "grad_norm": 0.7265239357948303, "learning_rate": 7.797926703906788e-05, "loss": 0.9082565307617188, "memory(GiB)": 91.52, "step": 53240, "token_acc": 0.7517757009345795, "train_speed(iter/s)": 0.146586 }, { "epoch": 0.6908900992315153, "grad_norm": 0.7666524648666382, "learning_rate": 7.797482154639037e-05, "loss": 0.8776180267333984, "memory(GiB)": 91.52, "step": 53245, "token_acc": 0.7421489835888431, "train_speed(iter/s)": 0.146584 }, { "epoch": 0.690954977633171, "grad_norm": 0.7112113237380981, "learning_rate": 7.797037573178042e-05, "loss": 0.8875692367553711, "memory(GiB)": 91.52, "step": 53250, "token_acc": 0.7598603044452037, "train_speed(iter/s)": 0.146582 }, { "epoch": 0.6910198560348267, "grad_norm": 0.7956194281578064, "learning_rate": 7.796592959528918e-05, "loss": 0.9302855491638183, "memory(GiB)": 91.52, "step": 53255, "token_acc": 0.7610260685056077, "train_speed(iter/s)": 0.14658 }, { "epoch": 0.6910847344364824, "grad_norm": 0.7539960741996765, "learning_rate": 7.796148313696783e-05, "loss": 0.8757846832275391, "memory(GiB)": 91.52, "step": 53260, "token_acc": 0.7831249314467479, "train_speed(iter/s)": 0.146576 }, { "epoch": 0.6911496128381381, "grad_norm": 0.6503876447677612, "learning_rate": 7.795703635686751e-05, "loss": 0.9021034240722656, "memory(GiB)": 91.52, "step": 53265, "token_acc": 0.7551653674411565, "train_speed(iter/s)": 0.146574 }, { "epoch": 0.6912144912397938, "grad_norm": 0.7835174202919006, "learning_rate": 7.795258925503943e-05, "loss": 0.8770236968994141, "memory(GiB)": 91.52, "step": 53270, "token_acc": 0.7698335725646894, "train_speed(iter/s)": 0.146571 }, { "epoch": 0.6912793696414495, "grad_norm": 0.7671531438827515, "learning_rate": 7.794814183153475e-05, "loss": 0.8436361312866211, "memory(GiB)": 91.52, "step": 53275, "token_acc": 0.7808088818398097, "train_speed(iter/s)": 0.146568 }, { "epoch": 0.6913442480431052, "grad_norm": 0.7342175841331482, "learning_rate": 7.794369408640465e-05, "loss": 0.9085982322692872, "memory(GiB)": 91.52, "step": 53280, "token_acc": 0.7711858356536018, "train_speed(iter/s)": 0.146566 }, { "epoch": 0.6914091264447609, "grad_norm": 0.8096250891685486, "learning_rate": 7.793924601970031e-05, "loss": 0.8580753326416015, "memory(GiB)": 91.52, "step": 53285, "token_acc": 0.7693938729253491, "train_speed(iter/s)": 0.146564 }, { "epoch": 0.6914740048464166, "grad_norm": 0.7715618014335632, "learning_rate": 7.793479763147293e-05, "loss": 0.9408533096313476, "memory(GiB)": 91.52, "step": 53290, "token_acc": 0.755536680508368, "train_speed(iter/s)": 0.146562 }, { "epoch": 0.6915388832480723, "grad_norm": 0.7645044326782227, "learning_rate": 7.793034892177371e-05, "loss": 0.8811956405639648, "memory(GiB)": 91.52, "step": 53295, "token_acc": 0.7817760572467594, "train_speed(iter/s)": 0.146558 }, { "epoch": 0.691603761649728, "grad_norm": 0.7340522408485413, "learning_rate": 7.792589989065383e-05, "loss": 0.9727206230163574, "memory(GiB)": 91.52, "step": 53300, "token_acc": 0.7409164545244157, "train_speed(iter/s)": 0.146555 }, { "epoch": 0.6916686400513837, "grad_norm": 0.7829154133796692, "learning_rate": 7.792145053816449e-05, "loss": 0.8872323036193848, "memory(GiB)": 91.52, "step": 53305, "token_acc": 0.7600679053098179, "train_speed(iter/s)": 0.146552 }, { "epoch": 0.6917335184530394, "grad_norm": 0.7983848452568054, "learning_rate": 7.791700086435686e-05, "loss": 0.8324012756347656, "memory(GiB)": 91.52, "step": 53310, "token_acc": 0.77672687540547, "train_speed(iter/s)": 0.14655 }, { "epoch": 0.6917983968546951, "grad_norm": 0.7576152086257935, "learning_rate": 7.791255086928223e-05, "loss": 0.8906431198120117, "memory(GiB)": 91.52, "step": 53315, "token_acc": 0.7740315325151766, "train_speed(iter/s)": 0.146547 }, { "epoch": 0.6918632752563508, "grad_norm": 0.733917236328125, "learning_rate": 7.790810055299174e-05, "loss": 0.8670305252075196, "memory(GiB)": 91.52, "step": 53320, "token_acc": 0.7528727627272094, "train_speed(iter/s)": 0.146543 }, { "epoch": 0.6919281536580065, "grad_norm": 0.7111856937408447, "learning_rate": 7.790364991553662e-05, "loss": 0.9359514236450195, "memory(GiB)": 91.52, "step": 53325, "token_acc": 0.742544982511809, "train_speed(iter/s)": 0.146541 }, { "epoch": 0.6919930320596622, "grad_norm": 0.8350604176521301, "learning_rate": 7.789919895696811e-05, "loss": 0.9204187393188477, "memory(GiB)": 91.52, "step": 53330, "token_acc": 0.7346160281077464, "train_speed(iter/s)": 0.14654 }, { "epoch": 0.6920579104613179, "grad_norm": 0.7508881688117981, "learning_rate": 7.78947476773374e-05, "loss": 0.9163004875183105, "memory(GiB)": 91.52, "step": 53335, "token_acc": 0.7471174516318155, "train_speed(iter/s)": 0.146536 }, { "epoch": 0.6921227888629736, "grad_norm": 0.7277004718780518, "learning_rate": 7.789029607669573e-05, "loss": 0.9389284133911133, "memory(GiB)": 91.52, "step": 53340, "token_acc": 0.7335327183626406, "train_speed(iter/s)": 0.146534 }, { "epoch": 0.6921876672646292, "grad_norm": 0.7757799029350281, "learning_rate": 7.788584415509433e-05, "loss": 0.8859577178955078, "memory(GiB)": 91.52, "step": 53345, "token_acc": 0.764534679851322, "train_speed(iter/s)": 0.14653 }, { "epoch": 0.6922525456662849, "grad_norm": 0.6674938201904297, "learning_rate": 7.788139191258444e-05, "loss": 0.8452274322509765, "memory(GiB)": 91.52, "step": 53350, "token_acc": 0.7871313144583854, "train_speed(iter/s)": 0.146527 }, { "epoch": 0.6923174240679406, "grad_norm": 0.7432336807250977, "learning_rate": 7.787693934921726e-05, "loss": 0.9117427825927734, "memory(GiB)": 91.52, "step": 53355, "token_acc": 0.7493151600187078, "train_speed(iter/s)": 0.146525 }, { "epoch": 0.6923823024695963, "grad_norm": 0.719257652759552, "learning_rate": 7.787248646504406e-05, "loss": 0.90501708984375, "memory(GiB)": 91.52, "step": 53360, "token_acc": 0.7585370714855169, "train_speed(iter/s)": 0.146522 }, { "epoch": 0.692447180871252, "grad_norm": 0.6901386976242065, "learning_rate": 7.78680332601161e-05, "loss": 0.9156641960144043, "memory(GiB)": 91.52, "step": 53365, "token_acc": 0.7519670002291651, "train_speed(iter/s)": 0.14652 }, { "epoch": 0.6925120592729077, "grad_norm": 0.7644883990287781, "learning_rate": 7.786357973448459e-05, "loss": 0.8989822387695312, "memory(GiB)": 91.52, "step": 53370, "token_acc": 0.7539783454706585, "train_speed(iter/s)": 0.146517 }, { "epoch": 0.6925769376745634, "grad_norm": 0.8531234264373779, "learning_rate": 7.785912588820081e-05, "loss": 0.9209033012390136, "memory(GiB)": 91.52, "step": 53375, "token_acc": 0.7709244802836636, "train_speed(iter/s)": 0.146513 }, { "epoch": 0.6926418160762191, "grad_norm": 0.7465393543243408, "learning_rate": 7.7854671721316e-05, "loss": 0.9087778091430664, "memory(GiB)": 91.52, "step": 53380, "token_acc": 0.7563333833008544, "train_speed(iter/s)": 0.146511 }, { "epoch": 0.6927066944778748, "grad_norm": 0.6883150935173035, "learning_rate": 7.785021723388142e-05, "loss": 0.9626394271850586, "memory(GiB)": 91.52, "step": 53385, "token_acc": 0.7501722610493159, "train_speed(iter/s)": 0.146509 }, { "epoch": 0.6927715728795305, "grad_norm": 0.7860466241836548, "learning_rate": 7.784576242594833e-05, "loss": 0.9237466812133789, "memory(GiB)": 91.52, "step": 53390, "token_acc": 0.7558112415455182, "train_speed(iter/s)": 0.146506 }, { "epoch": 0.6928364512811862, "grad_norm": 0.7084850072860718, "learning_rate": 7.7841307297568e-05, "loss": 0.8665014266967773, "memory(GiB)": 91.52, "step": 53395, "token_acc": 0.7876966363985164, "train_speed(iter/s)": 0.146504 }, { "epoch": 0.6929013296828419, "grad_norm": 0.7729653716087341, "learning_rate": 7.783685184879168e-05, "loss": 0.9150044441223144, "memory(GiB)": 91.52, "step": 53400, "token_acc": 0.7479788439743106, "train_speed(iter/s)": 0.146501 }, { "epoch": 0.6929662080844976, "grad_norm": 0.7231447100639343, "learning_rate": 7.783239607967069e-05, "loss": 0.8954082489013672, "memory(GiB)": 91.52, "step": 53405, "token_acc": 0.7689852047393947, "train_speed(iter/s)": 0.146498 }, { "epoch": 0.6930310864861533, "grad_norm": 0.7512059211730957, "learning_rate": 7.782793999025626e-05, "loss": 0.8502394676208496, "memory(GiB)": 91.52, "step": 53410, "token_acc": 0.7689559223423765, "train_speed(iter/s)": 0.146495 }, { "epoch": 0.693095964887809, "grad_norm": 0.753679096698761, "learning_rate": 7.78234835805997e-05, "loss": 0.8515054702758789, "memory(GiB)": 91.52, "step": 53415, "token_acc": 0.7639562157935887, "train_speed(iter/s)": 0.146493 }, { "epoch": 0.6931608432894647, "grad_norm": 0.8661779761314392, "learning_rate": 7.781902685075227e-05, "loss": 0.9415061950683594, "memory(GiB)": 91.52, "step": 53420, "token_acc": 0.7359833983326773, "train_speed(iter/s)": 0.14649 }, { "epoch": 0.6932257216911204, "grad_norm": 0.7144066691398621, "learning_rate": 7.781456980076527e-05, "loss": 0.8883797645568847, "memory(GiB)": 91.52, "step": 53425, "token_acc": 0.7646279926994238, "train_speed(iter/s)": 0.146488 }, { "epoch": 0.6932906000927761, "grad_norm": 0.8148586750030518, "learning_rate": 7.781011243068999e-05, "loss": 0.8764608383178711, "memory(GiB)": 91.52, "step": 53430, "token_acc": 0.7672181156869646, "train_speed(iter/s)": 0.146485 }, { "epoch": 0.6933554784944318, "grad_norm": 0.7746326923370361, "learning_rate": 7.780565474057773e-05, "loss": 0.9055703163146973, "memory(GiB)": 91.52, "step": 53435, "token_acc": 0.7581319158894126, "train_speed(iter/s)": 0.146482 }, { "epoch": 0.6934203568960875, "grad_norm": 0.761220395565033, "learning_rate": 7.780119673047979e-05, "loss": 0.8604103088378906, "memory(GiB)": 91.52, "step": 53440, "token_acc": 0.7663426209430496, "train_speed(iter/s)": 0.14648 }, { "epoch": 0.6934852352977432, "grad_norm": 0.8072898983955383, "learning_rate": 7.779673840044746e-05, "loss": 0.9114573478698731, "memory(GiB)": 91.52, "step": 53445, "token_acc": 0.7430404158980379, "train_speed(iter/s)": 0.146477 }, { "epoch": 0.6935501136993989, "grad_norm": 0.8000059723854065, "learning_rate": 7.779227975053206e-05, "loss": 0.9451595306396484, "memory(GiB)": 91.52, "step": 53450, "token_acc": 0.7501112366088236, "train_speed(iter/s)": 0.146475 }, { "epoch": 0.6936149921010546, "grad_norm": 0.7834551930427551, "learning_rate": 7.778782078078488e-05, "loss": 0.9176169395446777, "memory(GiB)": 91.52, "step": 53455, "token_acc": 0.7569998494656028, "train_speed(iter/s)": 0.146473 }, { "epoch": 0.6936798705027103, "grad_norm": 0.6930696368217468, "learning_rate": 7.778336149125727e-05, "loss": 0.8791957855224609, "memory(GiB)": 91.52, "step": 53460, "token_acc": 0.7674536585365853, "train_speed(iter/s)": 0.146471 }, { "epoch": 0.693744748904366, "grad_norm": 0.7803640365600586, "learning_rate": 7.77789018820005e-05, "loss": 0.9346985816955566, "memory(GiB)": 91.52, "step": 53465, "token_acc": 0.7278620737989278, "train_speed(iter/s)": 0.146469 }, { "epoch": 0.6938096273060217, "grad_norm": 0.6941721439361572, "learning_rate": 7.777444195306592e-05, "loss": 0.9151090621948242, "memory(GiB)": 91.52, "step": 53470, "token_acc": 0.7537045261840352, "train_speed(iter/s)": 0.146467 }, { "epoch": 0.6938745057076774, "grad_norm": 0.7373433709144592, "learning_rate": 7.776998170450486e-05, "loss": 0.8682818412780762, "memory(GiB)": 91.52, "step": 53475, "token_acc": 0.7852045147294621, "train_speed(iter/s)": 0.146464 }, { "epoch": 0.6939393841093331, "grad_norm": 0.8015912175178528, "learning_rate": 7.776552113636862e-05, "loss": 0.8816440582275391, "memory(GiB)": 91.52, "step": 53480, "token_acc": 0.7628413647023224, "train_speed(iter/s)": 0.146461 }, { "epoch": 0.6940042625109888, "grad_norm": 0.8060932755470276, "learning_rate": 7.776106024870857e-05, "loss": 0.9070097923278808, "memory(GiB)": 91.52, "step": 53485, "token_acc": 0.771860166582082, "train_speed(iter/s)": 0.146458 }, { "epoch": 0.6940691409126445, "grad_norm": 0.8135383129119873, "learning_rate": 7.775659904157602e-05, "loss": 0.9126496315002441, "memory(GiB)": 91.52, "step": 53490, "token_acc": 0.7596062133805696, "train_speed(iter/s)": 0.146456 }, { "epoch": 0.6941340193143002, "grad_norm": 0.6765661239624023, "learning_rate": 7.77521375150223e-05, "loss": 0.8964282989501953, "memory(GiB)": 91.52, "step": 53495, "token_acc": 0.7579319790002282, "train_speed(iter/s)": 0.146453 }, { "epoch": 0.6941988977159559, "grad_norm": 0.7593896389007568, "learning_rate": 7.774767566909878e-05, "loss": 0.9083954811096191, "memory(GiB)": 91.52, "step": 53500, "token_acc": 0.7558067406437383, "train_speed(iter/s)": 0.146451 }, { "epoch": 0.6942637761176116, "grad_norm": 0.7206456661224365, "learning_rate": 7.77432135038568e-05, "loss": 0.8755525588989258, "memory(GiB)": 91.52, "step": 53505, "token_acc": 0.7780125132038678, "train_speed(iter/s)": 0.146448 }, { "epoch": 0.6943286545192673, "grad_norm": 0.8402963280677795, "learning_rate": 7.77387510193477e-05, "loss": 0.9273963928222656, "memory(GiB)": 91.52, "step": 53510, "token_acc": 0.7481192532738925, "train_speed(iter/s)": 0.146446 }, { "epoch": 0.694393532920923, "grad_norm": 0.7454958558082581, "learning_rate": 7.773428821562283e-05, "loss": 0.8863550186157226, "memory(GiB)": 91.52, "step": 53515, "token_acc": 0.755964275518943, "train_speed(iter/s)": 0.146443 }, { "epoch": 0.6944584113225787, "grad_norm": 0.7079200148582458, "learning_rate": 7.772982509273357e-05, "loss": 0.8847857475280761, "memory(GiB)": 91.52, "step": 53520, "token_acc": 0.7696469248291572, "train_speed(iter/s)": 0.14644 }, { "epoch": 0.6945232897242344, "grad_norm": 0.6682504415512085, "learning_rate": 7.772536165073126e-05, "loss": 0.8450469017028809, "memory(GiB)": 91.52, "step": 53525, "token_acc": 0.766248064682608, "train_speed(iter/s)": 0.146437 }, { "epoch": 0.6945881681258901, "grad_norm": 0.7176511883735657, "learning_rate": 7.772089788966727e-05, "loss": 0.9102169036865234, "memory(GiB)": 91.52, "step": 53530, "token_acc": 0.7444829415987764, "train_speed(iter/s)": 0.146433 }, { "epoch": 0.6946530465275458, "grad_norm": 0.8985588550567627, "learning_rate": 7.771643380959297e-05, "loss": 0.9061719894409179, "memory(GiB)": 91.52, "step": 53535, "token_acc": 0.7519898926089703, "train_speed(iter/s)": 0.146431 }, { "epoch": 0.6947179249292015, "grad_norm": 0.6989828944206238, "learning_rate": 7.771196941055974e-05, "loss": 0.9056573867797851, "memory(GiB)": 91.52, "step": 53540, "token_acc": 0.7582873868354775, "train_speed(iter/s)": 0.146428 }, { "epoch": 0.6947828033308572, "grad_norm": 0.680871307849884, "learning_rate": 7.770750469261895e-05, "loss": 0.9106094360351562, "memory(GiB)": 91.52, "step": 53545, "token_acc": 0.7799646174259177, "train_speed(iter/s)": 0.146425 }, { "epoch": 0.6948476817325129, "grad_norm": 0.7364054322242737, "learning_rate": 7.770303965582197e-05, "loss": 0.9017575263977051, "memory(GiB)": 91.52, "step": 53550, "token_acc": 0.7477450188476037, "train_speed(iter/s)": 0.146422 }, { "epoch": 0.6949125601341686, "grad_norm": 0.7632703185081482, "learning_rate": 7.76985743002202e-05, "loss": 0.8378776550292969, "memory(GiB)": 91.52, "step": 53555, "token_acc": 0.7692740330535606, "train_speed(iter/s)": 0.146418 }, { "epoch": 0.6949774385358243, "grad_norm": 0.7031559944152832, "learning_rate": 7.769410862586501e-05, "loss": 0.8992437362670899, "memory(GiB)": 91.52, "step": 53560, "token_acc": 0.7476956506501244, "train_speed(iter/s)": 0.146416 }, { "epoch": 0.69504231693748, "grad_norm": 0.8915713429450989, "learning_rate": 7.768964263280783e-05, "loss": 0.9699882507324219, "memory(GiB)": 91.52, "step": 53565, "token_acc": 0.7365401445658163, "train_speed(iter/s)": 0.146413 }, { "epoch": 0.6951071953391357, "grad_norm": 0.7016175389289856, "learning_rate": 7.76851763211e-05, "loss": 0.8152925491333007, "memory(GiB)": 91.52, "step": 53570, "token_acc": 0.7729781440871293, "train_speed(iter/s)": 0.14641 }, { "epoch": 0.6951720737407914, "grad_norm": 0.7789980173110962, "learning_rate": 7.768070969079292e-05, "loss": 0.8863493919372558, "memory(GiB)": 91.52, "step": 53575, "token_acc": 0.7518754820164061, "train_speed(iter/s)": 0.146408 }, { "epoch": 0.6952369521424471, "grad_norm": 0.7450786232948303, "learning_rate": 7.767624274193806e-05, "loss": 0.9037319183349609, "memory(GiB)": 91.52, "step": 53580, "token_acc": 0.7445861865202127, "train_speed(iter/s)": 0.146405 }, { "epoch": 0.6953018305441027, "grad_norm": 0.7917845845222473, "learning_rate": 7.767177547458675e-05, "loss": 0.8851578712463379, "memory(GiB)": 91.52, "step": 53585, "token_acc": 0.7575638366354338, "train_speed(iter/s)": 0.146403 }, { "epoch": 0.6953667089457584, "grad_norm": 0.7755045294761658, "learning_rate": 7.766730788879045e-05, "loss": 0.900912094116211, "memory(GiB)": 91.52, "step": 53590, "token_acc": 0.764360243581069, "train_speed(iter/s)": 0.146401 }, { "epoch": 0.6954315873474141, "grad_norm": 0.7321529984474182, "learning_rate": 7.766283998460053e-05, "loss": 0.9230715751647949, "memory(GiB)": 91.52, "step": 53595, "token_acc": 0.7637000746244443, "train_speed(iter/s)": 0.146398 }, { "epoch": 0.6954964657490698, "grad_norm": 0.7897908687591553, "learning_rate": 7.765837176206844e-05, "loss": 0.9162971496582031, "memory(GiB)": 91.52, "step": 53600, "token_acc": 0.761522462562396, "train_speed(iter/s)": 0.146396 }, { "epoch": 0.6955613441507255, "grad_norm": 0.8465884923934937, "learning_rate": 7.76539032212456e-05, "loss": 0.9053167343139649, "memory(GiB)": 91.52, "step": 53605, "token_acc": 0.7569240057354162, "train_speed(iter/s)": 0.146394 }, { "epoch": 0.6956262225523812, "grad_norm": 0.7536446452140808, "learning_rate": 7.764943436218339e-05, "loss": 0.9229844093322754, "memory(GiB)": 91.52, "step": 53610, "token_acc": 0.7532411005751594, "train_speed(iter/s)": 0.146391 }, { "epoch": 0.6956911009540369, "grad_norm": 0.7471176385879517, "learning_rate": 7.764496518493328e-05, "loss": 0.9016476631164551, "memory(GiB)": 91.52, "step": 53615, "token_acc": 0.765746717115513, "train_speed(iter/s)": 0.146388 }, { "epoch": 0.6957559793556926, "grad_norm": 0.6899093389511108, "learning_rate": 7.76404956895467e-05, "loss": 0.8768751144409179, "memory(GiB)": 91.52, "step": 53620, "token_acc": 0.7447552447552448, "train_speed(iter/s)": 0.146386 }, { "epoch": 0.6958208577573483, "grad_norm": 0.683629035949707, "learning_rate": 7.763602587607509e-05, "loss": 0.8990234375, "memory(GiB)": 91.52, "step": 53625, "token_acc": 0.7312323203017335, "train_speed(iter/s)": 0.146383 }, { "epoch": 0.695885736159004, "grad_norm": 0.7496470808982849, "learning_rate": 7.763155574456985e-05, "loss": 0.8369812965393066, "memory(GiB)": 91.52, "step": 53630, "token_acc": 0.7733149521219121, "train_speed(iter/s)": 0.146381 }, { "epoch": 0.6959506145606597, "grad_norm": 0.790137767791748, "learning_rate": 7.762708529508245e-05, "loss": 0.8414175033569335, "memory(GiB)": 91.52, "step": 53635, "token_acc": 0.7767535219120544, "train_speed(iter/s)": 0.146377 }, { "epoch": 0.6960154929623154, "grad_norm": 0.7687087655067444, "learning_rate": 7.762261452766433e-05, "loss": 0.867215347290039, "memory(GiB)": 91.52, "step": 53640, "token_acc": 0.7583156404812456, "train_speed(iter/s)": 0.146375 }, { "epoch": 0.696080371363971, "grad_norm": 0.7823753952980042, "learning_rate": 7.761814344236693e-05, "loss": 0.9028949737548828, "memory(GiB)": 91.52, "step": 53645, "token_acc": 0.7413638804367281, "train_speed(iter/s)": 0.146371 }, { "epoch": 0.6961452497656268, "grad_norm": 0.7389623522758484, "learning_rate": 7.761367203924171e-05, "loss": 0.9324735641479492, "memory(GiB)": 91.52, "step": 53650, "token_acc": 0.7552252289245678, "train_speed(iter/s)": 0.146369 }, { "epoch": 0.6962101281672824, "grad_norm": 0.8211053609848022, "learning_rate": 7.760920031834016e-05, "loss": 0.8706626892089844, "memory(GiB)": 91.52, "step": 53655, "token_acc": 0.7527155532566745, "train_speed(iter/s)": 0.146366 }, { "epoch": 0.6962750065689381, "grad_norm": 0.7384882569313049, "learning_rate": 7.760472827971367e-05, "loss": 0.9087047576904297, "memory(GiB)": 91.52, "step": 53660, "token_acc": 0.7689991202544495, "train_speed(iter/s)": 0.146364 }, { "epoch": 0.6963398849705938, "grad_norm": 0.717290997505188, "learning_rate": 7.760025592341378e-05, "loss": 0.9130319595336914, "memory(GiB)": 91.52, "step": 53665, "token_acc": 0.7648510270803776, "train_speed(iter/s)": 0.146361 }, { "epoch": 0.6964047633722495, "grad_norm": 0.7254205346107483, "learning_rate": 7.75957832494919e-05, "loss": 0.8933332443237305, "memory(GiB)": 91.52, "step": 53670, "token_acc": 0.7610130674929274, "train_speed(iter/s)": 0.146358 }, { "epoch": 0.6964696417739052, "grad_norm": 0.7340741157531738, "learning_rate": 7.759131025799953e-05, "loss": 0.8418723106384277, "memory(GiB)": 91.52, "step": 53675, "token_acc": 0.7589446808510638, "train_speed(iter/s)": 0.146356 }, { "epoch": 0.696534520175561, "grad_norm": 0.7120253443717957, "learning_rate": 7.758683694898813e-05, "loss": 0.9132732391357422, "memory(GiB)": 91.52, "step": 53680, "token_acc": 0.7425390885160635, "train_speed(iter/s)": 0.146353 }, { "epoch": 0.6965993985772166, "grad_norm": 0.7159050703048706, "learning_rate": 7.75823633225092e-05, "loss": 0.8939361572265625, "memory(GiB)": 91.52, "step": 53685, "token_acc": 0.7422567753215936, "train_speed(iter/s)": 0.14635 }, { "epoch": 0.6966642769788723, "grad_norm": 0.7903721928596497, "learning_rate": 7.757788937861419e-05, "loss": 0.8769167900085449, "memory(GiB)": 91.52, "step": 53690, "token_acc": 0.734422539907473, "train_speed(iter/s)": 0.146347 }, { "epoch": 0.696729155380528, "grad_norm": 0.8281651139259338, "learning_rate": 7.75734151173546e-05, "loss": 0.9204058647155762, "memory(GiB)": 91.52, "step": 53695, "token_acc": 0.7711768851303735, "train_speed(iter/s)": 0.146345 }, { "epoch": 0.6967940337821837, "grad_norm": 0.7640950083732605, "learning_rate": 7.756894053878195e-05, "loss": 0.9166794776916504, "memory(GiB)": 91.52, "step": 53700, "token_acc": 0.7532495824558856, "train_speed(iter/s)": 0.146343 }, { "epoch": 0.6968589121838394, "grad_norm": 0.68284672498703, "learning_rate": 7.756446564294768e-05, "loss": 0.8705703735351562, "memory(GiB)": 91.52, "step": 53705, "token_acc": 0.7567686924618677, "train_speed(iter/s)": 0.14634 }, { "epoch": 0.6969237905854951, "grad_norm": 0.6825991272926331, "learning_rate": 7.755999042990334e-05, "loss": 0.8935880661010742, "memory(GiB)": 91.52, "step": 53710, "token_acc": 0.7587581885502706, "train_speed(iter/s)": 0.146337 }, { "epoch": 0.6969886689871508, "grad_norm": 0.7411912083625793, "learning_rate": 7.755551489970039e-05, "loss": 0.8731321334838867, "memory(GiB)": 91.52, "step": 53715, "token_acc": 0.7632160521977058, "train_speed(iter/s)": 0.146335 }, { "epoch": 0.6970535473888065, "grad_norm": 0.7085652351379395, "learning_rate": 7.755103905239034e-05, "loss": 0.9337291717529297, "memory(GiB)": 91.52, "step": 53720, "token_acc": 0.7355926312130692, "train_speed(iter/s)": 0.146332 }, { "epoch": 0.6971184257904622, "grad_norm": 0.8427890539169312, "learning_rate": 7.754656288802472e-05, "loss": 0.8752987861633301, "memory(GiB)": 91.52, "step": 53725, "token_acc": 0.7628178087970242, "train_speed(iter/s)": 0.146329 }, { "epoch": 0.6971833041921179, "grad_norm": 0.8124474883079529, "learning_rate": 7.754208640665502e-05, "loss": 0.8795448303222656, "memory(GiB)": 91.52, "step": 53730, "token_acc": 0.7749340186136964, "train_speed(iter/s)": 0.146326 }, { "epoch": 0.6972481825937736, "grad_norm": 0.800024688243866, "learning_rate": 7.753760960833277e-05, "loss": 0.8829583168029785, "memory(GiB)": 91.52, "step": 53735, "token_acc": 0.7732949409046774, "train_speed(iter/s)": 0.146323 }, { "epoch": 0.6973130609954293, "grad_norm": 0.7824346423149109, "learning_rate": 7.753313249310948e-05, "loss": 0.8782392501831054, "memory(GiB)": 91.52, "step": 53740, "token_acc": 0.7739180834621329, "train_speed(iter/s)": 0.14632 }, { "epoch": 0.697377939397085, "grad_norm": 0.7943106293678284, "learning_rate": 7.752865506103666e-05, "loss": 0.8952309608459472, "memory(GiB)": 91.52, "step": 53745, "token_acc": 0.7674182256182482, "train_speed(iter/s)": 0.146319 }, { "epoch": 0.6974428177987407, "grad_norm": 0.7891039848327637, "learning_rate": 7.752417731216586e-05, "loss": 0.8766963958740235, "memory(GiB)": 91.52, "step": 53750, "token_acc": 0.7609449668567905, "train_speed(iter/s)": 0.146316 }, { "epoch": 0.6975076962003964, "grad_norm": 0.7934595346450806, "learning_rate": 7.751969924654858e-05, "loss": 0.9271906852722168, "memory(GiB)": 91.52, "step": 53755, "token_acc": 0.7605462710741367, "train_speed(iter/s)": 0.146314 }, { "epoch": 0.6975725746020521, "grad_norm": 0.7314755916595459, "learning_rate": 7.75152208642364e-05, "loss": 0.8867061614990235, "memory(GiB)": 91.52, "step": 53760, "token_acc": 0.7421995410487379, "train_speed(iter/s)": 0.146311 }, { "epoch": 0.6976374530037078, "grad_norm": 0.7107586860656738, "learning_rate": 7.751074216528082e-05, "loss": 0.927365779876709, "memory(GiB)": 91.52, "step": 53765, "token_acc": 0.7567324604260309, "train_speed(iter/s)": 0.146308 }, { "epoch": 0.6977023314053635, "grad_norm": 0.736884593963623, "learning_rate": 7.750626314973338e-05, "loss": 0.9191997528076172, "memory(GiB)": 91.52, "step": 53770, "token_acc": 0.7566465256797583, "train_speed(iter/s)": 0.146306 }, { "epoch": 0.6977672098070192, "grad_norm": 0.7426024079322815, "learning_rate": 7.750178381764564e-05, "loss": 0.9206480979919434, "memory(GiB)": 91.52, "step": 53775, "token_acc": 0.7486004761598353, "train_speed(iter/s)": 0.146303 }, { "epoch": 0.6978320882086749, "grad_norm": 0.6964646577835083, "learning_rate": 7.749730416906916e-05, "loss": 0.8917004585266113, "memory(GiB)": 91.52, "step": 53780, "token_acc": 0.7596978061891617, "train_speed(iter/s)": 0.1463 }, { "epoch": 0.6978969666103306, "grad_norm": 0.7782198786735535, "learning_rate": 7.749282420405544e-05, "loss": 0.9091310501098633, "memory(GiB)": 91.52, "step": 53785, "token_acc": 0.7462869553033488, "train_speed(iter/s)": 0.146297 }, { "epoch": 0.6979618450119863, "grad_norm": 0.7659790515899658, "learning_rate": 7.748834392265607e-05, "loss": 0.9023472785949707, "memory(GiB)": 91.52, "step": 53790, "token_acc": 0.7587026099580955, "train_speed(iter/s)": 0.146294 }, { "epoch": 0.698026723413642, "grad_norm": 0.8103017807006836, "learning_rate": 7.748386332492264e-05, "loss": 0.8591837882995605, "memory(GiB)": 91.52, "step": 53795, "token_acc": 0.7384634153780462, "train_speed(iter/s)": 0.146292 }, { "epoch": 0.6980916018152977, "grad_norm": 0.7751795649528503, "learning_rate": 7.747938241090666e-05, "loss": 0.9586940765380859, "memory(GiB)": 91.52, "step": 53800, "token_acc": 0.753710055966781, "train_speed(iter/s)": 0.146289 }, { "epoch": 0.6981564802169534, "grad_norm": 0.7646636366844177, "learning_rate": 7.747490118065972e-05, "loss": 0.9111344337463378, "memory(GiB)": 91.52, "step": 53805, "token_acc": 0.7494529868417109, "train_speed(iter/s)": 0.146287 }, { "epoch": 0.6982213586186091, "grad_norm": 0.725765585899353, "learning_rate": 7.747041963423338e-05, "loss": 0.8281795501708984, "memory(GiB)": 91.52, "step": 53810, "token_acc": 0.7712709662935762, "train_speed(iter/s)": 0.146283 }, { "epoch": 0.6982862370202648, "grad_norm": 0.7607229351997375, "learning_rate": 7.746593777167923e-05, "loss": 0.8905176162719727, "memory(GiB)": 91.52, "step": 53815, "token_acc": 0.7691795313846878, "train_speed(iter/s)": 0.146281 }, { "epoch": 0.6983511154219205, "grad_norm": 0.8247074484825134, "learning_rate": 7.746145559304882e-05, "loss": 0.9152181625366211, "memory(GiB)": 91.52, "step": 53820, "token_acc": 0.7547487464773268, "train_speed(iter/s)": 0.146279 }, { "epoch": 0.6984159938235761, "grad_norm": 0.8820284008979797, "learning_rate": 7.745697309839375e-05, "loss": 0.9367575645446777, "memory(GiB)": 91.52, "step": 53825, "token_acc": 0.757069134644091, "train_speed(iter/s)": 0.146276 }, { "epoch": 0.6984808722252318, "grad_norm": 0.7774538397789001, "learning_rate": 7.74524902877656e-05, "loss": 0.8540488243103027, "memory(GiB)": 91.52, "step": 53830, "token_acc": 0.7447330817425714, "train_speed(iter/s)": 0.146273 }, { "epoch": 0.6985457506268875, "grad_norm": 0.8264269232749939, "learning_rate": 7.744800716121596e-05, "loss": 0.9034941673278809, "memory(GiB)": 91.52, "step": 53835, "token_acc": 0.7673415093115308, "train_speed(iter/s)": 0.146271 }, { "epoch": 0.6986106290285432, "grad_norm": 0.822274923324585, "learning_rate": 7.744352371879642e-05, "loss": 0.8776729583740235, "memory(GiB)": 91.52, "step": 53840, "token_acc": 0.7656136598036595, "train_speed(iter/s)": 0.146268 }, { "epoch": 0.6986755074301989, "grad_norm": 0.8325669765472412, "learning_rate": 7.743903996055859e-05, "loss": 0.8753570556640625, "memory(GiB)": 91.52, "step": 53845, "token_acc": 0.7600572798721194, "train_speed(iter/s)": 0.146266 }, { "epoch": 0.6987403858318546, "grad_norm": 0.6948492527008057, "learning_rate": 7.743455588655404e-05, "loss": 0.9135536193847656, "memory(GiB)": 91.52, "step": 53850, "token_acc": 0.7347918183623485, "train_speed(iter/s)": 0.146263 }, { "epoch": 0.6988052642335103, "grad_norm": 0.8054111003875732, "learning_rate": 7.74300714968344e-05, "loss": 0.9237194061279297, "memory(GiB)": 91.52, "step": 53855, "token_acc": 0.7580426619178506, "train_speed(iter/s)": 0.14626 }, { "epoch": 0.698870142635166, "grad_norm": 0.8031889200210571, "learning_rate": 7.742558679145124e-05, "loss": 0.860353660583496, "memory(GiB)": 91.52, "step": 53860, "token_acc": 0.7781323055538004, "train_speed(iter/s)": 0.146257 }, { "epoch": 0.6989350210368217, "grad_norm": 0.7296024560928345, "learning_rate": 7.74211017704562e-05, "loss": 0.8982122421264649, "memory(GiB)": 91.52, "step": 53865, "token_acc": 0.7472, "train_speed(iter/s)": 0.146255 }, { "epoch": 0.6989998994384774, "grad_norm": 0.7247301340103149, "learning_rate": 7.74166164339009e-05, "loss": 0.8829669952392578, "memory(GiB)": 91.52, "step": 53870, "token_acc": 0.7623968995329424, "train_speed(iter/s)": 0.146252 }, { "epoch": 0.6990647778401331, "grad_norm": 0.7404378056526184, "learning_rate": 7.741213078183694e-05, "loss": 0.8657157897949219, "memory(GiB)": 91.52, "step": 53875, "token_acc": 0.7723500208435973, "train_speed(iter/s)": 0.14625 }, { "epoch": 0.6991296562417888, "grad_norm": 0.7263686060905457, "learning_rate": 7.740764481431593e-05, "loss": 0.8624448776245117, "memory(GiB)": 91.52, "step": 53880, "token_acc": 0.7822855791642739, "train_speed(iter/s)": 0.146247 }, { "epoch": 0.6991945346434445, "grad_norm": 0.7950077652931213, "learning_rate": 7.74031585313895e-05, "loss": 0.8809633255004883, "memory(GiB)": 91.52, "step": 53885, "token_acc": 0.7710268731090941, "train_speed(iter/s)": 0.146244 }, { "epoch": 0.6992594130451002, "grad_norm": 0.7948315143585205, "learning_rate": 7.73986719331093e-05, "loss": 0.9226963996887207, "memory(GiB)": 91.52, "step": 53890, "token_acc": 0.7430519322411214, "train_speed(iter/s)": 0.146241 }, { "epoch": 0.6993242914467559, "grad_norm": 0.6137893199920654, "learning_rate": 7.739418501952696e-05, "loss": 0.8120065689086914, "memory(GiB)": 91.52, "step": 53895, "token_acc": 0.7649252521321789, "train_speed(iter/s)": 0.146237 }, { "epoch": 0.6993891698484116, "grad_norm": 0.755157470703125, "learning_rate": 7.738969779069407e-05, "loss": 0.8692893028259278, "memory(GiB)": 91.52, "step": 53900, "token_acc": 0.743389398363285, "train_speed(iter/s)": 0.146234 }, { "epoch": 0.6994540482500673, "grad_norm": 0.6954901814460754, "learning_rate": 7.738521024666232e-05, "loss": 0.8626710891723632, "memory(GiB)": 91.52, "step": 53905, "token_acc": 0.7571117644812707, "train_speed(iter/s)": 0.146231 }, { "epoch": 0.699518926651723, "grad_norm": 0.765807569026947, "learning_rate": 7.738072238748332e-05, "loss": 0.9045963287353516, "memory(GiB)": 91.52, "step": 53910, "token_acc": 0.7369171030600976, "train_speed(iter/s)": 0.146229 }, { "epoch": 0.6995838050533787, "grad_norm": 0.8271274566650391, "learning_rate": 7.737623421320875e-05, "loss": 0.852077865600586, "memory(GiB)": 91.52, "step": 53915, "token_acc": 0.7792586591047195, "train_speed(iter/s)": 0.146226 }, { "epoch": 0.6996486834550344, "grad_norm": 0.7517957091331482, "learning_rate": 7.737174572389023e-05, "loss": 0.9459522247314454, "memory(GiB)": 91.52, "step": 53920, "token_acc": 0.7456440098792164, "train_speed(iter/s)": 0.146224 }, { "epoch": 0.6997135618566901, "grad_norm": 0.6395292282104492, "learning_rate": 7.736725691957942e-05, "loss": 0.8616965293884278, "memory(GiB)": 91.52, "step": 53925, "token_acc": 0.7754389276949216, "train_speed(iter/s)": 0.146221 }, { "epoch": 0.6997784402583458, "grad_norm": 0.7145859003067017, "learning_rate": 7.736276780032798e-05, "loss": 0.9078187942504883, "memory(GiB)": 91.52, "step": 53930, "token_acc": 0.7620286576168929, "train_speed(iter/s)": 0.146219 }, { "epoch": 0.6998433186600015, "grad_norm": 0.6578958034515381, "learning_rate": 7.735827836618756e-05, "loss": 0.8459921836853027, "memory(GiB)": 91.52, "step": 53935, "token_acc": 0.7685255920550038, "train_speed(iter/s)": 0.146216 }, { "epoch": 0.6999081970616572, "grad_norm": 0.705193281173706, "learning_rate": 7.735378861720984e-05, "loss": 0.8553781509399414, "memory(GiB)": 91.52, "step": 53940, "token_acc": 0.7621545226130654, "train_speed(iter/s)": 0.146213 }, { "epoch": 0.6999730754633129, "grad_norm": 0.7602111101150513, "learning_rate": 7.734929855344649e-05, "loss": 0.8931209564208984, "memory(GiB)": 91.52, "step": 53945, "token_acc": 0.7597682652075958, "train_speed(iter/s)": 0.146209 }, { "epoch": 0.7000379538649686, "grad_norm": 0.7798507213592529, "learning_rate": 7.734480817494917e-05, "loss": 0.9216478347778321, "memory(GiB)": 91.52, "step": 53950, "token_acc": 0.7471766579035065, "train_speed(iter/s)": 0.146207 }, { "epoch": 0.7001028322666243, "grad_norm": 0.7559962868690491, "learning_rate": 7.734031748176955e-05, "loss": 0.9161865234375, "memory(GiB)": 91.52, "step": 53955, "token_acc": 0.7476669450597335, "train_speed(iter/s)": 0.146205 }, { "epoch": 0.70016771066828, "grad_norm": 0.7600463032722473, "learning_rate": 7.733582647395932e-05, "loss": 0.8604895591735839, "memory(GiB)": 91.52, "step": 53960, "token_acc": 0.7645717602188873, "train_speed(iter/s)": 0.146203 }, { "epoch": 0.7002325890699357, "grad_norm": 0.7286834120750427, "learning_rate": 7.733133515157015e-05, "loss": 0.9209321975708008, "memory(GiB)": 91.52, "step": 53965, "token_acc": 0.7603280114307014, "train_speed(iter/s)": 0.146199 }, { "epoch": 0.7002974674715914, "grad_norm": 0.7516180276870728, "learning_rate": 7.732684351465374e-05, "loss": 0.8955400466918946, "memory(GiB)": 91.52, "step": 53970, "token_acc": 0.7615780445969125, "train_speed(iter/s)": 0.146197 }, { "epoch": 0.7003623458732471, "grad_norm": 0.746374249458313, "learning_rate": 7.732235156326177e-05, "loss": 0.8901376724243164, "memory(GiB)": 91.52, "step": 53975, "token_acc": 0.7525591086149108, "train_speed(iter/s)": 0.146193 }, { "epoch": 0.7004272242749028, "grad_norm": 0.6959747672080994, "learning_rate": 7.731785929744595e-05, "loss": 0.9059144973754882, "memory(GiB)": 91.52, "step": 53980, "token_acc": 0.753977102765616, "train_speed(iter/s)": 0.14619 }, { "epoch": 0.7004921026765585, "grad_norm": 0.8044013381004333, "learning_rate": 7.731336671725796e-05, "loss": 0.8598932266235352, "memory(GiB)": 91.52, "step": 53985, "token_acc": 0.7554147870876924, "train_speed(iter/s)": 0.146187 }, { "epoch": 0.7005569810782142, "grad_norm": 0.7892688512802124, "learning_rate": 7.73088738227495e-05, "loss": 0.8676968574523926, "memory(GiB)": 91.52, "step": 53990, "token_acc": 0.7627118644067796, "train_speed(iter/s)": 0.146185 }, { "epoch": 0.7006218594798699, "grad_norm": 0.750129222869873, "learning_rate": 7.730438061397227e-05, "loss": 0.8686948776245117, "memory(GiB)": 91.52, "step": 53995, "token_acc": 0.7529746428899152, "train_speed(iter/s)": 0.146182 }, { "epoch": 0.7006867378815256, "grad_norm": 0.7125178575515747, "learning_rate": 7.729988709097798e-05, "loss": 0.9027766227722168, "memory(GiB)": 91.52, "step": 54000, "token_acc": 0.7414818086082141, "train_speed(iter/s)": 0.146179 }, { "epoch": 0.7007516162831813, "grad_norm": 0.7358165383338928, "learning_rate": 7.729539325381836e-05, "loss": 0.8613929748535156, "memory(GiB)": 91.52, "step": 54005, "token_acc": 0.7731410724965114, "train_speed(iter/s)": 0.146175 }, { "epoch": 0.700816494684837, "grad_norm": 0.7781529426574707, "learning_rate": 7.72908991025451e-05, "loss": 0.8884073257446289, "memory(GiB)": 91.52, "step": 54010, "token_acc": 0.7588800668616799, "train_speed(iter/s)": 0.146173 }, { "epoch": 0.7008813730864927, "grad_norm": 0.7893421053886414, "learning_rate": 7.728640463720995e-05, "loss": 0.8611909866333007, "memory(GiB)": 91.52, "step": 54015, "token_acc": 0.7694307800421645, "train_speed(iter/s)": 0.14617 }, { "epoch": 0.7009462514881484, "grad_norm": 0.7270882725715637, "learning_rate": 7.728190985786458e-05, "loss": 0.9002159118652344, "memory(GiB)": 91.52, "step": 54020, "token_acc": 0.7325015119202979, "train_speed(iter/s)": 0.146167 }, { "epoch": 0.7010111298898041, "grad_norm": 0.721987783908844, "learning_rate": 7.727741476456079e-05, "loss": 0.8666222572326661, "memory(GiB)": 91.52, "step": 54025, "token_acc": 0.7697983635300993, "train_speed(iter/s)": 0.146164 }, { "epoch": 0.7010760082914598, "grad_norm": 0.719038724899292, "learning_rate": 7.727291935735023e-05, "loss": 0.9337343215942383, "memory(GiB)": 91.52, "step": 54030, "token_acc": 0.7506603081438005, "train_speed(iter/s)": 0.146162 }, { "epoch": 0.7011408866931155, "grad_norm": 0.7476770877838135, "learning_rate": 7.726842363628468e-05, "loss": 0.8877063751220703, "memory(GiB)": 91.52, "step": 54035, "token_acc": 0.752148108778457, "train_speed(iter/s)": 0.146159 }, { "epoch": 0.7012057650947712, "grad_norm": 0.8603448867797852, "learning_rate": 7.726392760141588e-05, "loss": 0.8877543449401856, "memory(GiB)": 91.52, "step": 54040, "token_acc": 0.765175983436853, "train_speed(iter/s)": 0.146157 }, { "epoch": 0.7012706434964269, "grad_norm": 0.7487446069717407, "learning_rate": 7.725943125279553e-05, "loss": 0.8981338500976562, "memory(GiB)": 91.52, "step": 54045, "token_acc": 0.7702427564604541, "train_speed(iter/s)": 0.146154 }, { "epoch": 0.7013355218980826, "grad_norm": 0.7749736309051514, "learning_rate": 7.725493459047542e-05, "loss": 0.857541847229004, "memory(GiB)": 91.52, "step": 54050, "token_acc": 0.7676158068761838, "train_speed(iter/s)": 0.14615 }, { "epoch": 0.7014004002997383, "grad_norm": 0.7157365679740906, "learning_rate": 7.725043761450727e-05, "loss": 0.8810329437255859, "memory(GiB)": 91.52, "step": 54055, "token_acc": 0.7617895559608442, "train_speed(iter/s)": 0.146147 }, { "epoch": 0.7014652787013939, "grad_norm": 0.7200555801391602, "learning_rate": 7.724594032494284e-05, "loss": 0.8822587013244629, "memory(GiB)": 91.52, "step": 54060, "token_acc": 0.7753204515018175, "train_speed(iter/s)": 0.146144 }, { "epoch": 0.7015301571030496, "grad_norm": 0.7659052014350891, "learning_rate": 7.724144272183387e-05, "loss": 0.902064323425293, "memory(GiB)": 91.52, "step": 54065, "token_acc": 0.7460478181343089, "train_speed(iter/s)": 0.146142 }, { "epoch": 0.7015950355047053, "grad_norm": 0.689656674861908, "learning_rate": 7.723694480523216e-05, "loss": 0.8248091697692871, "memory(GiB)": 91.52, "step": 54070, "token_acc": 0.7565206692913385, "train_speed(iter/s)": 0.146139 }, { "epoch": 0.701659913906361, "grad_norm": 0.817532479763031, "learning_rate": 7.723244657518941e-05, "loss": 0.8963088989257812, "memory(GiB)": 91.52, "step": 54075, "token_acc": 0.7525, "train_speed(iter/s)": 0.146137 }, { "epoch": 0.7017247923080167, "grad_norm": 0.7366395592689514, "learning_rate": 7.722794803175744e-05, "loss": 0.9053360939025878, "memory(GiB)": 91.52, "step": 54080, "token_acc": 0.7578236130867709, "train_speed(iter/s)": 0.146134 }, { "epoch": 0.7017896707096724, "grad_norm": 0.764865517616272, "learning_rate": 7.722344917498798e-05, "loss": 0.8840587615966797, "memory(GiB)": 91.52, "step": 54085, "token_acc": 0.7546264564770391, "train_speed(iter/s)": 0.146132 }, { "epoch": 0.701854549111328, "grad_norm": 0.7168295979499817, "learning_rate": 7.721895000493283e-05, "loss": 0.8789714813232422, "memory(GiB)": 91.52, "step": 54090, "token_acc": 0.7595161015505196, "train_speed(iter/s)": 0.146129 }, { "epoch": 0.7019194275129838, "grad_norm": 0.824459969997406, "learning_rate": 7.721445052164375e-05, "loss": 0.8867136001586914, "memory(GiB)": 91.52, "step": 54095, "token_acc": 0.7452531117562652, "train_speed(iter/s)": 0.146126 }, { "epoch": 0.7019843059146395, "grad_norm": 0.7839032411575317, "learning_rate": 7.720995072517252e-05, "loss": 0.8773544311523438, "memory(GiB)": 91.52, "step": 54100, "token_acc": 0.7662860208461066, "train_speed(iter/s)": 0.146124 }, { "epoch": 0.7020491843162951, "grad_norm": 0.7685402035713196, "learning_rate": 7.720545061557094e-05, "loss": 0.8508203506469727, "memory(GiB)": 91.52, "step": 54105, "token_acc": 0.7675826171165718, "train_speed(iter/s)": 0.146121 }, { "epoch": 0.7021140627179508, "grad_norm": 0.8464041352272034, "learning_rate": 7.720095019289078e-05, "loss": 0.8892606735229492, "memory(GiB)": 91.52, "step": 54110, "token_acc": 0.7258324209441618, "train_speed(iter/s)": 0.146119 }, { "epoch": 0.7021789411196065, "grad_norm": 0.6947054862976074, "learning_rate": 7.719644945718383e-05, "loss": 0.8984484672546387, "memory(GiB)": 91.52, "step": 54115, "token_acc": 0.7722197072581862, "train_speed(iter/s)": 0.146116 }, { "epoch": 0.7022438195212622, "grad_norm": 0.7096384167671204, "learning_rate": 7.719194840850189e-05, "loss": 0.8735164642333985, "memory(GiB)": 91.52, "step": 54120, "token_acc": 0.7847298682561981, "train_speed(iter/s)": 0.146113 }, { "epoch": 0.702308697922918, "grad_norm": 0.7937431335449219, "learning_rate": 7.718744704689677e-05, "loss": 0.8708955764770507, "memory(GiB)": 91.52, "step": 54125, "token_acc": 0.775141297992594, "train_speed(iter/s)": 0.146111 }, { "epoch": 0.7023735763245736, "grad_norm": 0.8633432984352112, "learning_rate": 7.718294537242026e-05, "loss": 0.9016188621520996, "memory(GiB)": 91.52, "step": 54130, "token_acc": 0.7698018803211879, "train_speed(iter/s)": 0.146108 }, { "epoch": 0.7024384547262293, "grad_norm": 0.7785668969154358, "learning_rate": 7.717844338512415e-05, "loss": 0.9267971038818359, "memory(GiB)": 91.52, "step": 54135, "token_acc": 0.7639855632895076, "train_speed(iter/s)": 0.146106 }, { "epoch": 0.702503333127885, "grad_norm": 0.7728915214538574, "learning_rate": 7.717394108506026e-05, "loss": 0.8956727981567383, "memory(GiB)": 91.52, "step": 54140, "token_acc": 0.7586585853203677, "train_speed(iter/s)": 0.146103 }, { "epoch": 0.7025682115295407, "grad_norm": 0.665302574634552, "learning_rate": 7.716943847228042e-05, "loss": 0.8165545463562012, "memory(GiB)": 91.52, "step": 54145, "token_acc": 0.7810757981611526, "train_speed(iter/s)": 0.1461 }, { "epoch": 0.7026330899311964, "grad_norm": 0.8220252394676208, "learning_rate": 7.716493554683642e-05, "loss": 0.9234831809997559, "memory(GiB)": 91.52, "step": 54150, "token_acc": 0.7424253225921438, "train_speed(iter/s)": 0.146098 }, { "epoch": 0.7026979683328521, "grad_norm": 0.8468509912490845, "learning_rate": 7.71604323087801e-05, "loss": 0.9334251403808593, "memory(GiB)": 91.52, "step": 54155, "token_acc": 0.7628332939069311, "train_speed(iter/s)": 0.146096 }, { "epoch": 0.7027628467345078, "grad_norm": 0.8264437317848206, "learning_rate": 7.715592875816326e-05, "loss": 0.8661252021789551, "memory(GiB)": 91.52, "step": 54160, "token_acc": 0.7753094255791813, "train_speed(iter/s)": 0.146094 }, { "epoch": 0.7028277251361635, "grad_norm": 0.7449429035186768, "learning_rate": 7.715142489503775e-05, "loss": 0.8994081497192383, "memory(GiB)": 91.52, "step": 54165, "token_acc": 0.753513671375899, "train_speed(iter/s)": 0.146091 }, { "epoch": 0.7028926035378192, "grad_norm": 0.7356656789779663, "learning_rate": 7.714692071945539e-05, "loss": 0.8642400741577149, "memory(GiB)": 91.52, "step": 54170, "token_acc": 0.7724557910762384, "train_speed(iter/s)": 0.146089 }, { "epoch": 0.7029574819394749, "grad_norm": 0.6983250975608826, "learning_rate": 7.714241623146801e-05, "loss": 0.8705534934997559, "memory(GiB)": 91.52, "step": 54175, "token_acc": 0.7488080808080808, "train_speed(iter/s)": 0.146087 }, { "epoch": 0.7030223603411306, "grad_norm": 0.7071316838264465, "learning_rate": 7.713791143112745e-05, "loss": 0.8754317283630371, "memory(GiB)": 91.52, "step": 54180, "token_acc": 0.7829023041816631, "train_speed(iter/s)": 0.146083 }, { "epoch": 0.7030872387427863, "grad_norm": 0.7451266050338745, "learning_rate": 7.713340631848555e-05, "loss": 0.9113042831420899, "memory(GiB)": 91.52, "step": 54185, "token_acc": 0.7653130671506352, "train_speed(iter/s)": 0.146081 }, { "epoch": 0.703152117144442, "grad_norm": 0.7580178380012512, "learning_rate": 7.712890089359415e-05, "loss": 0.8746459007263183, "memory(GiB)": 91.52, "step": 54190, "token_acc": 0.7518134404412206, "train_speed(iter/s)": 0.146077 }, { "epoch": 0.7032169955460977, "grad_norm": 0.6559738516807556, "learning_rate": 7.712439515650512e-05, "loss": 0.9032633781433106, "memory(GiB)": 91.52, "step": 54195, "token_acc": 0.7280767399142799, "train_speed(iter/s)": 0.146075 }, { "epoch": 0.7032818739477534, "grad_norm": 0.7674257159233093, "learning_rate": 7.711988910727029e-05, "loss": 0.8899913787841797, "memory(GiB)": 91.52, "step": 54200, "token_acc": 0.755417722476457, "train_speed(iter/s)": 0.146072 }, { "epoch": 0.7033467523494091, "grad_norm": 0.7238020896911621, "learning_rate": 7.711538274594152e-05, "loss": 0.8872247695922851, "memory(GiB)": 91.52, "step": 54205, "token_acc": 0.7685533200678016, "train_speed(iter/s)": 0.146069 }, { "epoch": 0.7034116307510648, "grad_norm": 0.6996908187866211, "learning_rate": 7.711087607257069e-05, "loss": 0.905352783203125, "memory(GiB)": 91.52, "step": 54210, "token_acc": 0.7458689246148681, "train_speed(iter/s)": 0.146067 }, { "epoch": 0.7034765091527205, "grad_norm": 0.8214237689971924, "learning_rate": 7.710636908720961e-05, "loss": 0.9245771408081055, "memory(GiB)": 91.52, "step": 54215, "token_acc": 0.7478333906005421, "train_speed(iter/s)": 0.146064 }, { "epoch": 0.7035413875543762, "grad_norm": 0.8379713296890259, "learning_rate": 7.710186178991019e-05, "loss": 0.9056541442871093, "memory(GiB)": 91.52, "step": 54220, "token_acc": 0.7612970570408781, "train_speed(iter/s)": 0.146062 }, { "epoch": 0.7036062659560319, "grad_norm": 0.74573814868927, "learning_rate": 7.70973541807243e-05, "loss": 0.9059549331665039, "memory(GiB)": 91.52, "step": 54225, "token_acc": 0.7606305122912367, "train_speed(iter/s)": 0.146059 }, { "epoch": 0.7036711443576876, "grad_norm": 0.756354808807373, "learning_rate": 7.70928462597038e-05, "loss": 0.9019828796386719, "memory(GiB)": 91.52, "step": 54230, "token_acc": 0.7601350187717425, "train_speed(iter/s)": 0.146056 }, { "epoch": 0.7037360227593433, "grad_norm": 0.7121095657348633, "learning_rate": 7.708833802690057e-05, "loss": 0.9081769943237304, "memory(GiB)": 91.52, "step": 54235, "token_acc": 0.7456651718983558, "train_speed(iter/s)": 0.146053 }, { "epoch": 0.703800901160999, "grad_norm": 0.7498867511749268, "learning_rate": 7.708382948236648e-05, "loss": 0.8663358688354492, "memory(GiB)": 91.52, "step": 54240, "token_acc": 0.7755397206152109, "train_speed(iter/s)": 0.14605 }, { "epoch": 0.7038657795626547, "grad_norm": 0.7161219120025635, "learning_rate": 7.707932062615342e-05, "loss": 0.903719139099121, "memory(GiB)": 91.52, "step": 54245, "token_acc": 0.7710625167695198, "train_speed(iter/s)": 0.146047 }, { "epoch": 0.7039306579643104, "grad_norm": 0.7312635183334351, "learning_rate": 7.707481145831329e-05, "loss": 0.8485733032226562, "memory(GiB)": 91.52, "step": 54250, "token_acc": 0.7936883337644581, "train_speed(iter/s)": 0.146044 }, { "epoch": 0.7039955363659661, "grad_norm": 0.7868760228157043, "learning_rate": 7.707030197889798e-05, "loss": 0.877650260925293, "memory(GiB)": 91.52, "step": 54255, "token_acc": 0.7626939932122969, "train_speed(iter/s)": 0.146041 }, { "epoch": 0.7040604147676218, "grad_norm": 0.7945958375930786, "learning_rate": 7.706579218795935e-05, "loss": 0.9045171737670898, "memory(GiB)": 91.52, "step": 54260, "token_acc": 0.7599407783417935, "train_speed(iter/s)": 0.146039 }, { "epoch": 0.7041252931692775, "grad_norm": 0.802069365978241, "learning_rate": 7.706128208554935e-05, "loss": 0.9044326782226563, "memory(GiB)": 91.52, "step": 54265, "token_acc": 0.7490706920515887, "train_speed(iter/s)": 0.146037 }, { "epoch": 0.7041901715709332, "grad_norm": 0.8587349653244019, "learning_rate": 7.705677167171985e-05, "loss": 0.9077414512634278, "memory(GiB)": 91.52, "step": 54270, "token_acc": 0.7465923408804551, "train_speed(iter/s)": 0.146035 }, { "epoch": 0.7042550499725889, "grad_norm": 0.6807632446289062, "learning_rate": 7.705226094652275e-05, "loss": 0.8956746101379395, "memory(GiB)": 91.52, "step": 54275, "token_acc": 0.7704123788777886, "train_speed(iter/s)": 0.146033 }, { "epoch": 0.7043199283742446, "grad_norm": 0.7932922840118408, "learning_rate": 7.704774991001e-05, "loss": 0.9455747604370117, "memory(GiB)": 91.52, "step": 54280, "token_acc": 0.725729695431472, "train_speed(iter/s)": 0.146031 }, { "epoch": 0.7043848067759003, "grad_norm": 0.6634858250617981, "learning_rate": 7.704323856223347e-05, "loss": 0.8850291252136231, "memory(GiB)": 91.52, "step": 54285, "token_acc": 0.7536206287530908, "train_speed(iter/s)": 0.146028 }, { "epoch": 0.704449685177556, "grad_norm": 0.79167240858078, "learning_rate": 7.703872690324508e-05, "loss": 0.9156950950622559, "memory(GiB)": 91.52, "step": 54290, "token_acc": 0.7385595920968769, "train_speed(iter/s)": 0.146026 }, { "epoch": 0.7045145635792117, "grad_norm": 0.7607991099357605, "learning_rate": 7.703421493309677e-05, "loss": 0.9083815574645996, "memory(GiB)": 91.52, "step": 54295, "token_acc": 0.7408939835854303, "train_speed(iter/s)": 0.146023 }, { "epoch": 0.7045794419808673, "grad_norm": 0.8290340304374695, "learning_rate": 7.702970265184046e-05, "loss": 0.9174125671386719, "memory(GiB)": 91.52, "step": 54300, "token_acc": 0.7441410781963029, "train_speed(iter/s)": 0.14602 }, { "epoch": 0.704644320382523, "grad_norm": 0.7382249236106873, "learning_rate": 7.702519005952807e-05, "loss": 0.9000587463378906, "memory(GiB)": 91.52, "step": 54305, "token_acc": 0.7704868051085741, "train_speed(iter/s)": 0.146018 }, { "epoch": 0.7047091987841787, "grad_norm": 0.7737845778465271, "learning_rate": 7.702067715621153e-05, "loss": 0.9337186813354492, "memory(GiB)": 91.52, "step": 54310, "token_acc": 0.7534013605442177, "train_speed(iter/s)": 0.146015 }, { "epoch": 0.7047740771858344, "grad_norm": 0.7495740652084351, "learning_rate": 7.701616394194278e-05, "loss": 0.9460212707519531, "memory(GiB)": 91.52, "step": 54315, "token_acc": 0.7330769504682548, "train_speed(iter/s)": 0.146013 }, { "epoch": 0.7048389555874901, "grad_norm": 0.7536289095878601, "learning_rate": 7.701165041677376e-05, "loss": 0.8638809204101563, "memory(GiB)": 91.52, "step": 54320, "token_acc": 0.7742429516185172, "train_speed(iter/s)": 0.14601 }, { "epoch": 0.7049038339891458, "grad_norm": 0.6913391351699829, "learning_rate": 7.700713658075638e-05, "loss": 0.905216121673584, "memory(GiB)": 91.52, "step": 54325, "token_acc": 0.7488367059842008, "train_speed(iter/s)": 0.146008 }, { "epoch": 0.7049687123908015, "grad_norm": 0.785637378692627, "learning_rate": 7.700262243394262e-05, "loss": 0.9231376647949219, "memory(GiB)": 91.52, "step": 54330, "token_acc": 0.7460978552800591, "train_speed(iter/s)": 0.146005 }, { "epoch": 0.7050335907924572, "grad_norm": 0.7677556276321411, "learning_rate": 7.699810797638442e-05, "loss": 0.91305513381958, "memory(GiB)": 91.52, "step": 54335, "token_acc": 0.7585023976392475, "train_speed(iter/s)": 0.146003 }, { "epoch": 0.7050984691941129, "grad_norm": 0.7649911642074585, "learning_rate": 7.699359320813375e-05, "loss": 0.8429755210876465, "memory(GiB)": 91.52, "step": 54340, "token_acc": 0.748613531928379, "train_speed(iter/s)": 0.146001 }, { "epoch": 0.7051633475957686, "grad_norm": 0.8470963835716248, "learning_rate": 7.698907812924253e-05, "loss": 0.9428890228271485, "memory(GiB)": 91.52, "step": 54345, "token_acc": 0.7560541849553504, "train_speed(iter/s)": 0.145998 }, { "epoch": 0.7052282259974243, "grad_norm": 0.6973371505737305, "learning_rate": 7.698456273976274e-05, "loss": 0.9123800277709961, "memory(GiB)": 91.52, "step": 54350, "token_acc": 0.761618514483225, "train_speed(iter/s)": 0.145995 }, { "epoch": 0.70529310439908, "grad_norm": 0.7459604144096375, "learning_rate": 7.698004703974635e-05, "loss": 0.9202624320983886, "memory(GiB)": 91.52, "step": 54355, "token_acc": 0.7523559988167181, "train_speed(iter/s)": 0.145993 }, { "epoch": 0.7053579828007357, "grad_norm": 0.7501559257507324, "learning_rate": 7.697553102924529e-05, "loss": 0.8241765022277832, "memory(GiB)": 91.52, "step": 54360, "token_acc": 0.7669994087162185, "train_speed(iter/s)": 0.14599 }, { "epoch": 0.7054228612023914, "grad_norm": 0.8844005465507507, "learning_rate": 7.697101470831159e-05, "loss": 0.9242738723754883, "memory(GiB)": 91.52, "step": 54365, "token_acc": 0.7408416708841048, "train_speed(iter/s)": 0.145988 }, { "epoch": 0.7054877396040471, "grad_norm": 0.7364212274551392, "learning_rate": 7.696649807699713e-05, "loss": 0.8500990867614746, "memory(GiB)": 91.52, "step": 54370, "token_acc": 0.7633570430998083, "train_speed(iter/s)": 0.145984 }, { "epoch": 0.7055526180057028, "grad_norm": 0.7329116463661194, "learning_rate": 7.6961981135354e-05, "loss": 0.9172947883605957, "memory(GiB)": 91.52, "step": 54375, "token_acc": 0.7633268125687737, "train_speed(iter/s)": 0.145981 }, { "epoch": 0.7056174964073585, "grad_norm": 0.7755626440048218, "learning_rate": 7.69574638834341e-05, "loss": 0.9157971382141114, "memory(GiB)": 91.52, "step": 54380, "token_acc": 0.7762997776006042, "train_speed(iter/s)": 0.145978 }, { "epoch": 0.7056823748090142, "grad_norm": 0.7420703172683716, "learning_rate": 7.695294632128947e-05, "loss": 0.9225342750549317, "memory(GiB)": 91.52, "step": 54385, "token_acc": 0.7481957559872177, "train_speed(iter/s)": 0.145976 }, { "epoch": 0.7057472532106699, "grad_norm": 0.7879441976547241, "learning_rate": 7.694842844897203e-05, "loss": 0.8802230834960938, "memory(GiB)": 91.52, "step": 54390, "token_acc": 0.77099636799341, "train_speed(iter/s)": 0.145974 }, { "epoch": 0.7058121316123256, "grad_norm": 0.7747098803520203, "learning_rate": 7.694391026653383e-05, "loss": 0.8464677810668946, "memory(GiB)": 91.52, "step": 54395, "token_acc": 0.7483888474323316, "train_speed(iter/s)": 0.145972 }, { "epoch": 0.7058770100139813, "grad_norm": 0.7498361468315125, "learning_rate": 7.693939177402685e-05, "loss": 0.878938102722168, "memory(GiB)": 91.52, "step": 54400, "token_acc": 0.7502392344497608, "train_speed(iter/s)": 0.145969 }, { "epoch": 0.705941888415637, "grad_norm": 0.9054911136627197, "learning_rate": 7.693487297150306e-05, "loss": 0.9017091751098633, "memory(GiB)": 91.52, "step": 54405, "token_acc": 0.757208350699049, "train_speed(iter/s)": 0.145966 }, { "epoch": 0.7060067668172927, "grad_norm": 0.7432294487953186, "learning_rate": 7.693035385901452e-05, "loss": 0.8781959533691406, "memory(GiB)": 91.52, "step": 54410, "token_acc": 0.7437255279542975, "train_speed(iter/s)": 0.145964 }, { "epoch": 0.7060716452189484, "grad_norm": 0.8045774102210999, "learning_rate": 7.692583443661316e-05, "loss": 0.9123592376708984, "memory(GiB)": 91.52, "step": 54415, "token_acc": 0.7510879621697564, "train_speed(iter/s)": 0.145962 }, { "epoch": 0.7061365236206041, "grad_norm": 0.7506025433540344, "learning_rate": 7.692131470435107e-05, "loss": 0.8990158081054688, "memory(GiB)": 91.52, "step": 54420, "token_acc": 0.7600708130135614, "train_speed(iter/s)": 0.145959 }, { "epoch": 0.7062014020222598, "grad_norm": 0.7537420392036438, "learning_rate": 7.69167946622802e-05, "loss": 0.8338788032531739, "memory(GiB)": 91.52, "step": 54425, "token_acc": 0.7713690370633091, "train_speed(iter/s)": 0.145956 }, { "epoch": 0.7062662804239155, "grad_norm": 0.8347910642623901, "learning_rate": 7.69122743104526e-05, "loss": 0.8728078842163086, "memory(GiB)": 91.52, "step": 54430, "token_acc": 0.7548792590142243, "train_speed(iter/s)": 0.145954 }, { "epoch": 0.7063311588255712, "grad_norm": 0.7010890245437622, "learning_rate": 7.690775364892026e-05, "loss": 0.8653932571411133, "memory(GiB)": 91.52, "step": 54435, "token_acc": 0.7369832754812243, "train_speed(iter/s)": 0.145951 }, { "epoch": 0.7063960372272269, "grad_norm": 0.7608519196510315, "learning_rate": 7.690323267773524e-05, "loss": 0.9110269546508789, "memory(GiB)": 91.52, "step": 54440, "token_acc": 0.7259676701764793, "train_speed(iter/s)": 0.145947 }, { "epoch": 0.7064609156288826, "grad_norm": 0.8993145823478699, "learning_rate": 7.689871139694955e-05, "loss": 0.9093482971191407, "memory(GiB)": 91.52, "step": 54445, "token_acc": 0.7606670500287521, "train_speed(iter/s)": 0.145945 }, { "epoch": 0.7065257940305383, "grad_norm": 0.7350745797157288, "learning_rate": 7.689418980661521e-05, "loss": 0.893185806274414, "memory(GiB)": 91.52, "step": 54450, "token_acc": 0.7579626870965454, "train_speed(iter/s)": 0.145942 }, { "epoch": 0.706590672432194, "grad_norm": 0.7314597964286804, "learning_rate": 7.688966790678427e-05, "loss": 0.8813041687011719, "memory(GiB)": 91.52, "step": 54455, "token_acc": 0.7629392971246006, "train_speed(iter/s)": 0.145939 }, { "epoch": 0.7066555508338497, "grad_norm": 0.7931748628616333, "learning_rate": 7.688514569750875e-05, "loss": 0.9450810432434082, "memory(GiB)": 91.52, "step": 54460, "token_acc": 0.7444690265486725, "train_speed(iter/s)": 0.145938 }, { "epoch": 0.7067204292355054, "grad_norm": 0.7630658745765686, "learning_rate": 7.688062317884073e-05, "loss": 0.9015462875366211, "memory(GiB)": 91.52, "step": 54465, "token_acc": 0.7529787234042553, "train_speed(iter/s)": 0.145935 }, { "epoch": 0.7067853076371611, "grad_norm": 0.7080581784248352, "learning_rate": 7.687610035083223e-05, "loss": 0.8704610824584961, "memory(GiB)": 91.52, "step": 54470, "token_acc": 0.7619975389663659, "train_speed(iter/s)": 0.145932 }, { "epoch": 0.7068501860388168, "grad_norm": 0.7163351774215698, "learning_rate": 7.687157721353527e-05, "loss": 0.898958969116211, "memory(GiB)": 91.52, "step": 54475, "token_acc": 0.7543386039336676, "train_speed(iter/s)": 0.145929 }, { "epoch": 0.7069150644404725, "grad_norm": 0.7363319993019104, "learning_rate": 7.686705376700194e-05, "loss": 0.882170581817627, "memory(GiB)": 91.52, "step": 54480, "token_acc": 0.7679386887979986, "train_speed(iter/s)": 0.145927 }, { "epoch": 0.7069799428421282, "grad_norm": 0.7589579820632935, "learning_rate": 7.686253001128429e-05, "loss": 0.8288490295410156, "memory(GiB)": 91.52, "step": 54485, "token_acc": 0.76221928665786, "train_speed(iter/s)": 0.145924 }, { "epoch": 0.7070448212437839, "grad_norm": 0.8033698201179504, "learning_rate": 7.685800594643437e-05, "loss": 0.9290990829467773, "memory(GiB)": 91.52, "step": 54490, "token_acc": 0.7500687884068605, "train_speed(iter/s)": 0.145922 }, { "epoch": 0.7071096996454396, "grad_norm": 0.7556756734848022, "learning_rate": 7.685348157250424e-05, "loss": 0.9230076789855957, "memory(GiB)": 91.52, "step": 54495, "token_acc": 0.760915066279881, "train_speed(iter/s)": 0.14592 }, { "epoch": 0.7071745780470953, "grad_norm": 0.911492109298706, "learning_rate": 7.684895688954599e-05, "loss": 0.9232486724853516, "memory(GiB)": 91.52, "step": 54500, "token_acc": 0.7539319748973425, "train_speed(iter/s)": 0.145917 }, { "epoch": 0.707239456448751, "grad_norm": 0.816974937915802, "learning_rate": 7.684443189761165e-05, "loss": 0.8860230445861816, "memory(GiB)": 91.52, "step": 54505, "token_acc": 0.7809032085023636, "train_speed(iter/s)": 0.145914 }, { "epoch": 0.7073043348504067, "grad_norm": 0.7757140398025513, "learning_rate": 7.683990659675333e-05, "loss": 0.872065258026123, "memory(GiB)": 91.52, "step": 54510, "token_acc": 0.7490887191353133, "train_speed(iter/s)": 0.14591 }, { "epoch": 0.7073692132520624, "grad_norm": 0.7296913862228394, "learning_rate": 7.683538098702309e-05, "loss": 0.8883567810058594, "memory(GiB)": 91.52, "step": 54515, "token_acc": 0.7630024698428607, "train_speed(iter/s)": 0.145907 }, { "epoch": 0.7074340916537181, "grad_norm": 0.7689297795295715, "learning_rate": 7.683085506847301e-05, "loss": 0.9460901260375977, "memory(GiB)": 91.52, "step": 54520, "token_acc": 0.7273185185185185, "train_speed(iter/s)": 0.145905 }, { "epoch": 0.7074989700553738, "grad_norm": 0.7845697402954102, "learning_rate": 7.682632884115517e-05, "loss": 0.901673698425293, "memory(GiB)": 91.52, "step": 54525, "token_acc": 0.7645520650499695, "train_speed(iter/s)": 0.145903 }, { "epoch": 0.7075638484570295, "grad_norm": 0.6995742917060852, "learning_rate": 7.682180230512166e-05, "loss": 0.8952176094055175, "memory(GiB)": 91.52, "step": 54530, "token_acc": 0.7598142414860681, "train_speed(iter/s)": 0.1459 }, { "epoch": 0.7076287268586852, "grad_norm": 0.9070087671279907, "learning_rate": 7.681727546042459e-05, "loss": 0.8912044525146484, "memory(GiB)": 91.52, "step": 54535, "token_acc": 0.7502975565357418, "train_speed(iter/s)": 0.145897 }, { "epoch": 0.7076936052603408, "grad_norm": 0.8499948382377625, "learning_rate": 7.681274830711604e-05, "loss": 0.9008121490478516, "memory(GiB)": 91.52, "step": 54540, "token_acc": 0.7588266363303945, "train_speed(iter/s)": 0.145895 }, { "epoch": 0.7077584836619965, "grad_norm": 0.8184630274772644, "learning_rate": 7.680822084524811e-05, "loss": 0.9272684097290039, "memory(GiB)": 91.52, "step": 54545, "token_acc": 0.7599640499153099, "train_speed(iter/s)": 0.145892 }, { "epoch": 0.7078233620636522, "grad_norm": 0.7532674670219421, "learning_rate": 7.68036930748729e-05, "loss": 0.8654157638549804, "memory(GiB)": 91.52, "step": 54550, "token_acc": 0.7556419172291539, "train_speed(iter/s)": 0.145889 }, { "epoch": 0.7078882404653078, "grad_norm": 0.7693787217140198, "learning_rate": 7.67991649960425e-05, "loss": 0.8915323257446289, "memory(GiB)": 91.52, "step": 54555, "token_acc": 0.7536112822619292, "train_speed(iter/s)": 0.145886 }, { "epoch": 0.7079531188669635, "grad_norm": 0.7242481112480164, "learning_rate": 7.679463660880905e-05, "loss": 0.8877782821655273, "memory(GiB)": 91.52, "step": 54560, "token_acc": 0.7656951024378, "train_speed(iter/s)": 0.145883 }, { "epoch": 0.7080179972686192, "grad_norm": 0.7385399341583252, "learning_rate": 7.679010791322462e-05, "loss": 0.9096328735351562, "memory(GiB)": 91.52, "step": 54565, "token_acc": 0.7685051958433253, "train_speed(iter/s)": 0.145881 }, { "epoch": 0.708082875670275, "grad_norm": 0.6742836833000183, "learning_rate": 7.67855789093414e-05, "loss": 0.8291189193725585, "memory(GiB)": 91.52, "step": 54570, "token_acc": 0.7856814728435634, "train_speed(iter/s)": 0.145878 }, { "epoch": 0.7081477540719306, "grad_norm": 0.7459568977355957, "learning_rate": 7.678104959721142e-05, "loss": 0.9111112594604492, "memory(GiB)": 91.52, "step": 54575, "token_acc": 0.758434047601394, "train_speed(iter/s)": 0.145875 }, { "epoch": 0.7082126324735863, "grad_norm": 0.7385479211807251, "learning_rate": 7.677651997688686e-05, "loss": 0.9075102806091309, "memory(GiB)": 91.52, "step": 54580, "token_acc": 0.7529034467727371, "train_speed(iter/s)": 0.145873 }, { "epoch": 0.708277510875242, "grad_norm": 0.743005096912384, "learning_rate": 7.677199004841983e-05, "loss": 0.867890739440918, "memory(GiB)": 91.52, "step": 54585, "token_acc": 0.7661504424778761, "train_speed(iter/s)": 0.145869 }, { "epoch": 0.7083423892768977, "grad_norm": 0.721270740032196, "learning_rate": 7.676745981186246e-05, "loss": 0.8428176879882813, "memory(GiB)": 91.52, "step": 54590, "token_acc": 0.7471518384454182, "train_speed(iter/s)": 0.145867 }, { "epoch": 0.7084072676785534, "grad_norm": 0.7253745198249817, "learning_rate": 7.676292926726689e-05, "loss": 0.8791732788085938, "memory(GiB)": 91.52, "step": 54595, "token_acc": 0.7670385760560154, "train_speed(iter/s)": 0.145864 }, { "epoch": 0.7084721460802091, "grad_norm": 0.7569942474365234, "learning_rate": 7.675839841468523e-05, "loss": 0.8984375, "memory(GiB)": 91.52, "step": 54600, "token_acc": 0.7648909224609853, "train_speed(iter/s)": 0.145862 }, { "epoch": 0.7085370244818648, "grad_norm": 0.7561184167861938, "learning_rate": 7.675386725416968e-05, "loss": 0.8625127792358398, "memory(GiB)": 91.52, "step": 54605, "token_acc": 0.7735529092553513, "train_speed(iter/s)": 0.145859 }, { "epoch": 0.7086019028835205, "grad_norm": 0.7902596592903137, "learning_rate": 7.674933578577232e-05, "loss": 0.954837417602539, "memory(GiB)": 91.52, "step": 54610, "token_acc": 0.7451251282860978, "train_speed(iter/s)": 0.145858 }, { "epoch": 0.7086667812851762, "grad_norm": 0.77108234167099, "learning_rate": 7.674480400954535e-05, "loss": 0.9171243667602539, "memory(GiB)": 91.52, "step": 54615, "token_acc": 0.7546575215497265, "train_speed(iter/s)": 0.145855 }, { "epoch": 0.7087316596868319, "grad_norm": 0.7191940546035767, "learning_rate": 7.674027192554088e-05, "loss": 0.874118423461914, "memory(GiB)": 91.52, "step": 54620, "token_acc": 0.7762417672886938, "train_speed(iter/s)": 0.145852 }, { "epoch": 0.7087965380884876, "grad_norm": 0.7754733562469482, "learning_rate": 7.673573953381108e-05, "loss": 0.8920413017272949, "memory(GiB)": 91.52, "step": 54625, "token_acc": 0.7465654374548084, "train_speed(iter/s)": 0.145849 }, { "epoch": 0.7088614164901433, "grad_norm": 0.7748826742172241, "learning_rate": 7.673120683440813e-05, "loss": 0.8655426979064942, "memory(GiB)": 91.52, "step": 54630, "token_acc": 0.7522117423411567, "train_speed(iter/s)": 0.145847 }, { "epoch": 0.708926294891799, "grad_norm": 0.8096691370010376, "learning_rate": 7.672667382738415e-05, "loss": 0.9465608596801758, "memory(GiB)": 91.52, "step": 54635, "token_acc": 0.7559193862193018, "train_speed(iter/s)": 0.145845 }, { "epoch": 0.7089911732934547, "grad_norm": 0.7578750252723694, "learning_rate": 7.672214051279133e-05, "loss": 0.8822288513183594, "memory(GiB)": 91.52, "step": 54640, "token_acc": 0.7516352956881591, "train_speed(iter/s)": 0.145843 }, { "epoch": 0.7090560516951104, "grad_norm": 0.7239738702774048, "learning_rate": 7.671760689068184e-05, "loss": 0.867281150817871, "memory(GiB)": 91.52, "step": 54645, "token_acc": 0.7729713714306877, "train_speed(iter/s)": 0.145839 }, { "epoch": 0.7091209300967661, "grad_norm": 0.7536283135414124, "learning_rate": 7.671307296110785e-05, "loss": 0.9034202575683594, "memory(GiB)": 91.52, "step": 54650, "token_acc": 0.7542065564258775, "train_speed(iter/s)": 0.145836 }, { "epoch": 0.7091858084984218, "grad_norm": 0.6801588535308838, "learning_rate": 7.670853872412153e-05, "loss": 0.9163253784179688, "memory(GiB)": 91.52, "step": 54655, "token_acc": 0.728241699256192, "train_speed(iter/s)": 0.145834 }, { "epoch": 0.7092506869000775, "grad_norm": 0.6536757349967957, "learning_rate": 7.670400417977509e-05, "loss": 0.8534992218017579, "memory(GiB)": 91.52, "step": 54660, "token_acc": 0.7492051423831905, "train_speed(iter/s)": 0.14583 }, { "epoch": 0.7093155653017332, "grad_norm": 0.7417237162590027, "learning_rate": 7.669946932812067e-05, "loss": 0.9265830993652344, "memory(GiB)": 91.52, "step": 54665, "token_acc": 0.7442765329164496, "train_speed(iter/s)": 0.145827 }, { "epoch": 0.7093804437033889, "grad_norm": 0.8009272217750549, "learning_rate": 7.669493416921046e-05, "loss": 0.8223031044006348, "memory(GiB)": 91.52, "step": 54670, "token_acc": 0.7737310839186156, "train_speed(iter/s)": 0.145824 }, { "epoch": 0.7094453221050446, "grad_norm": 0.8147794008255005, "learning_rate": 7.669039870309669e-05, "loss": 0.8865597724914551, "memory(GiB)": 91.52, "step": 54675, "token_acc": 0.7625429661631611, "train_speed(iter/s)": 0.145822 }, { "epoch": 0.7095102005067003, "grad_norm": 0.7161038517951965, "learning_rate": 7.668586292983152e-05, "loss": 0.9209802627563477, "memory(GiB)": 91.52, "step": 54680, "token_acc": 0.7595201965459932, "train_speed(iter/s)": 0.14582 }, { "epoch": 0.709575078908356, "grad_norm": 0.7405552864074707, "learning_rate": 7.668132684946716e-05, "loss": 0.9117002487182617, "memory(GiB)": 91.52, "step": 54685, "token_acc": 0.7382346931825922, "train_speed(iter/s)": 0.145818 }, { "epoch": 0.7096399573100117, "grad_norm": 0.8212119340896606, "learning_rate": 7.66767904620558e-05, "loss": 0.8917158126831055, "memory(GiB)": 91.52, "step": 54690, "token_acc": 0.7570593268682259, "train_speed(iter/s)": 0.145815 }, { "epoch": 0.7097048357116674, "grad_norm": 0.7841199636459351, "learning_rate": 7.667225376764965e-05, "loss": 0.9067360877990722, "memory(GiB)": 91.52, "step": 54695, "token_acc": 0.7467805000538078, "train_speed(iter/s)": 0.145813 }, { "epoch": 0.7097697141133231, "grad_norm": 0.7640380859375, "learning_rate": 7.666771676630094e-05, "loss": 0.9078723907470703, "memory(GiB)": 91.52, "step": 54700, "token_acc": 0.7549458434414187, "train_speed(iter/s)": 0.145811 }, { "epoch": 0.7098345925149788, "grad_norm": 0.7144595980644226, "learning_rate": 7.666317945806184e-05, "loss": 0.8994297981262207, "memory(GiB)": 91.52, "step": 54705, "token_acc": 0.7489901558900093, "train_speed(iter/s)": 0.145808 }, { "epoch": 0.7098994709166345, "grad_norm": 0.7871441841125488, "learning_rate": 7.665864184298459e-05, "loss": 0.8898689270019531, "memory(GiB)": 91.52, "step": 54710, "token_acc": 0.7543957189519772, "train_speed(iter/s)": 0.145806 }, { "epoch": 0.7099643493182902, "grad_norm": 0.8191267848014832, "learning_rate": 7.665410392112143e-05, "loss": 0.9235728263854981, "memory(GiB)": 91.52, "step": 54715, "token_acc": 0.7453721867532189, "train_speed(iter/s)": 0.145804 }, { "epoch": 0.7100292277199459, "grad_norm": 0.745606005191803, "learning_rate": 7.664956569252453e-05, "loss": 0.8573055267333984, "memory(GiB)": 91.52, "step": 54720, "token_acc": 0.7621023513139695, "train_speed(iter/s)": 0.145801 }, { "epoch": 0.7100941061216016, "grad_norm": 0.7186140418052673, "learning_rate": 7.664502715724615e-05, "loss": 0.9164140701293946, "memory(GiB)": 91.52, "step": 54725, "token_acc": 0.7532910775231594, "train_speed(iter/s)": 0.145798 }, { "epoch": 0.7101589845232573, "grad_norm": 0.7137096524238586, "learning_rate": 7.664048831533851e-05, "loss": 0.8699546813964844, "memory(GiB)": 91.52, "step": 54730, "token_acc": 0.7605644126809602, "train_speed(iter/s)": 0.145796 }, { "epoch": 0.710223862924913, "grad_norm": 0.7112298011779785, "learning_rate": 7.663594916685387e-05, "loss": 0.9005517959594727, "memory(GiB)": 91.52, "step": 54735, "token_acc": 0.7506027617447213, "train_speed(iter/s)": 0.145793 }, { "epoch": 0.7102887413265687, "grad_norm": 0.7439770698547363, "learning_rate": 7.66314097118444e-05, "loss": 0.893126392364502, "memory(GiB)": 91.52, "step": 54740, "token_acc": 0.7605667876274408, "train_speed(iter/s)": 0.145791 }, { "epoch": 0.7103536197282244, "grad_norm": 0.6362854838371277, "learning_rate": 7.66268699503624e-05, "loss": 0.8694806098937988, "memory(GiB)": 91.52, "step": 54745, "token_acc": 0.7678008821676119, "train_speed(iter/s)": 0.145789 }, { "epoch": 0.7104184981298801, "grad_norm": 0.7497590780258179, "learning_rate": 7.662232988246009e-05, "loss": 0.9194976806640625, "memory(GiB)": 91.52, "step": 54750, "token_acc": 0.7827202181889023, "train_speed(iter/s)": 0.145786 }, { "epoch": 0.7104833765315358, "grad_norm": 0.673490583896637, "learning_rate": 7.661778950818974e-05, "loss": 0.910059928894043, "memory(GiB)": 91.52, "step": 54755, "token_acc": 0.7542663169264002, "train_speed(iter/s)": 0.145784 }, { "epoch": 0.7105482549331915, "grad_norm": 0.7829681634902954, "learning_rate": 7.661324882760356e-05, "loss": 0.8803139686584472, "memory(GiB)": 91.52, "step": 54760, "token_acc": 0.7458188213386668, "train_speed(iter/s)": 0.145781 }, { "epoch": 0.7106131333348472, "grad_norm": 0.717538595199585, "learning_rate": 7.660870784075385e-05, "loss": 0.9276421546936036, "memory(GiB)": 91.52, "step": 54765, "token_acc": 0.7418345084895817, "train_speed(iter/s)": 0.145779 }, { "epoch": 0.7106780117365029, "grad_norm": 0.7498323917388916, "learning_rate": 7.660416654769282e-05, "loss": 0.9267803192138672, "memory(GiB)": 91.52, "step": 54770, "token_acc": 0.7335237258347979, "train_speed(iter/s)": 0.145777 }, { "epoch": 0.7107428901381586, "grad_norm": 0.7469538450241089, "learning_rate": 7.659962494847276e-05, "loss": 0.9243766784667968, "memory(GiB)": 91.52, "step": 54775, "token_acc": 0.7392089453762545, "train_speed(iter/s)": 0.145774 }, { "epoch": 0.7108077685398142, "grad_norm": 0.734585165977478, "learning_rate": 7.659508304314594e-05, "loss": 0.9079862594604492, "memory(GiB)": 91.52, "step": 54780, "token_acc": 0.7833856914231522, "train_speed(iter/s)": 0.145771 }, { "epoch": 0.7108726469414699, "grad_norm": 0.7486295104026794, "learning_rate": 7.65905408317646e-05, "loss": 0.9311337471008301, "memory(GiB)": 91.52, "step": 54785, "token_acc": 0.7338349470907402, "train_speed(iter/s)": 0.145768 }, { "epoch": 0.7109375253431256, "grad_norm": 0.7064527273178101, "learning_rate": 7.658599831438102e-05, "loss": 0.9071207046508789, "memory(GiB)": 91.52, "step": 54790, "token_acc": 0.7572479446127217, "train_speed(iter/s)": 0.145765 }, { "epoch": 0.7110024037447813, "grad_norm": 0.784549355506897, "learning_rate": 7.65814554910475e-05, "loss": 0.87353515625, "memory(GiB)": 91.52, "step": 54795, "token_acc": 0.7691216913028532, "train_speed(iter/s)": 0.145763 }, { "epoch": 0.711067282146437, "grad_norm": 0.7799385786056519, "learning_rate": 7.657691236181628e-05, "loss": 0.8347334861755371, "memory(GiB)": 91.52, "step": 54800, "token_acc": 0.7713747894719462, "train_speed(iter/s)": 0.14576 }, { "epoch": 0.7111321605480927, "grad_norm": 0.7947901487350464, "learning_rate": 7.657236892673969e-05, "loss": 0.8860403060913086, "memory(GiB)": 91.52, "step": 54805, "token_acc": 0.7725007950810983, "train_speed(iter/s)": 0.145757 }, { "epoch": 0.7111970389497484, "grad_norm": 0.6613506078720093, "learning_rate": 7.656782518586998e-05, "loss": 0.8622960090637207, "memory(GiB)": 91.52, "step": 54810, "token_acc": 0.756240612259495, "train_speed(iter/s)": 0.145754 }, { "epoch": 0.7112619173514041, "grad_norm": 0.694787859916687, "learning_rate": 7.656328113925945e-05, "loss": 0.8756053924560547, "memory(GiB)": 91.52, "step": 54815, "token_acc": 0.7665950100970621, "train_speed(iter/s)": 0.145752 }, { "epoch": 0.7113267957530598, "grad_norm": 0.7403460144996643, "learning_rate": 7.655873678696038e-05, "loss": 0.8422178268432617, "memory(GiB)": 91.52, "step": 54820, "token_acc": 0.7442113125551708, "train_speed(iter/s)": 0.145749 }, { "epoch": 0.7113916741547155, "grad_norm": 0.6990185379981995, "learning_rate": 7.655419212902508e-05, "loss": 0.8538496017456054, "memory(GiB)": 91.52, "step": 54825, "token_acc": 0.7819984962944398, "train_speed(iter/s)": 0.145747 }, { "epoch": 0.7114565525563712, "grad_norm": 0.7422199249267578, "learning_rate": 7.654964716550585e-05, "loss": 0.8726350784301757, "memory(GiB)": 91.52, "step": 54830, "token_acc": 0.7629617414248021, "train_speed(iter/s)": 0.145744 }, { "epoch": 0.7115214309580269, "grad_norm": 0.7362030744552612, "learning_rate": 7.654510189645498e-05, "loss": 0.8708715438842773, "memory(GiB)": 91.52, "step": 54835, "token_acc": 0.7411859242642381, "train_speed(iter/s)": 0.145741 }, { "epoch": 0.7115863093596826, "grad_norm": 0.8540611267089844, "learning_rate": 7.654055632192479e-05, "loss": 0.9167288780212403, "memory(GiB)": 91.52, "step": 54840, "token_acc": 0.7581585756768239, "train_speed(iter/s)": 0.145739 }, { "epoch": 0.7116511877613383, "grad_norm": 0.7977991700172424, "learning_rate": 7.653601044196759e-05, "loss": 0.9227802276611328, "memory(GiB)": 91.52, "step": 54845, "token_acc": 0.7617635211834927, "train_speed(iter/s)": 0.145737 }, { "epoch": 0.711716066162994, "grad_norm": 0.7977874875068665, "learning_rate": 7.653146425663569e-05, "loss": 0.9162334442138672, "memory(GiB)": 91.52, "step": 54850, "token_acc": 0.7495315193763586, "train_speed(iter/s)": 0.145734 }, { "epoch": 0.7117809445646497, "grad_norm": 0.7459288835525513, "learning_rate": 7.652691776598141e-05, "loss": 0.8784055709838867, "memory(GiB)": 91.52, "step": 54855, "token_acc": 0.7457868090713642, "train_speed(iter/s)": 0.145732 }, { "epoch": 0.7118458229663054, "grad_norm": 0.7836576104164124, "learning_rate": 7.652237097005705e-05, "loss": 0.8995970726013184, "memory(GiB)": 91.52, "step": 54860, "token_acc": 0.7580825728534112, "train_speed(iter/s)": 0.145729 }, { "epoch": 0.7119107013679611, "grad_norm": 0.8023998141288757, "learning_rate": 7.651782386891497e-05, "loss": 0.8858671188354492, "memory(GiB)": 91.52, "step": 54865, "token_acc": 0.7713288705312633, "train_speed(iter/s)": 0.145727 }, { "epoch": 0.7119755797696168, "grad_norm": 0.6463829278945923, "learning_rate": 7.651327646260746e-05, "loss": 0.8583822250366211, "memory(GiB)": 91.52, "step": 54870, "token_acc": 0.7837584591358667, "train_speed(iter/s)": 0.145724 }, { "epoch": 0.7120404581712725, "grad_norm": 0.727787435054779, "learning_rate": 7.65087287511869e-05, "loss": 0.9135274887084961, "memory(GiB)": 91.52, "step": 54875, "token_acc": 0.7597241277021766, "train_speed(iter/s)": 0.145722 }, { "epoch": 0.7121053365729282, "grad_norm": 0.7938573360443115, "learning_rate": 7.650418073470557e-05, "loss": 0.8703991889953613, "memory(GiB)": 91.52, "step": 54880, "token_acc": 0.7639776845705909, "train_speed(iter/s)": 0.14572 }, { "epoch": 0.7121702149745839, "grad_norm": 0.7167019248008728, "learning_rate": 7.649963241321584e-05, "loss": 0.8334244728088379, "memory(GiB)": 91.52, "step": 54885, "token_acc": 0.7708794788273615, "train_speed(iter/s)": 0.145718 }, { "epoch": 0.7122350933762396, "grad_norm": 0.7579916715621948, "learning_rate": 7.649508378677006e-05, "loss": 0.9023149490356446, "memory(GiB)": 91.52, "step": 54890, "token_acc": 0.7617437305387217, "train_speed(iter/s)": 0.145716 }, { "epoch": 0.7122999717778953, "grad_norm": 0.6961449980735779, "learning_rate": 7.649053485542055e-05, "loss": 0.8585437774658203, "memory(GiB)": 91.52, "step": 54895, "token_acc": 0.763934016495876, "train_speed(iter/s)": 0.145712 }, { "epoch": 0.712364850179551, "grad_norm": 0.7999557256698608, "learning_rate": 7.648598561921967e-05, "loss": 0.8904809951782227, "memory(GiB)": 91.52, "step": 54900, "token_acc": 0.7519901793021353, "train_speed(iter/s)": 0.14571 }, { "epoch": 0.7124297285812067, "grad_norm": 0.6865981817245483, "learning_rate": 7.648143607821979e-05, "loss": 0.9156858444213867, "memory(GiB)": 91.52, "step": 54905, "token_acc": 0.7522516726711271, "train_speed(iter/s)": 0.145707 }, { "epoch": 0.7124946069828624, "grad_norm": 0.755828320980072, "learning_rate": 7.647688623247322e-05, "loss": 0.9029200553894043, "memory(GiB)": 91.52, "step": 54910, "token_acc": 0.7363795530227608, "train_speed(iter/s)": 0.145704 }, { "epoch": 0.7125594853845181, "grad_norm": 0.7547401189804077, "learning_rate": 7.647233608203234e-05, "loss": 0.8925094604492188, "memory(GiB)": 91.52, "step": 54915, "token_acc": 0.7597189046976148, "train_speed(iter/s)": 0.145701 }, { "epoch": 0.7126243637861738, "grad_norm": 0.8156461119651794, "learning_rate": 7.646778562694956e-05, "loss": 0.9076797485351562, "memory(GiB)": 91.52, "step": 54920, "token_acc": 0.7383525499692775, "train_speed(iter/s)": 0.1457 }, { "epoch": 0.7126892421878295, "grad_norm": 0.7103968858718872, "learning_rate": 7.646323486727717e-05, "loss": 0.9059833526611328, "memory(GiB)": 91.52, "step": 54925, "token_acc": 0.7563973760416051, "train_speed(iter/s)": 0.145697 }, { "epoch": 0.7127541205894852, "grad_norm": 0.8633906245231628, "learning_rate": 7.64586838030676e-05, "loss": 0.8859139442443847, "memory(GiB)": 91.52, "step": 54930, "token_acc": 0.7477113731760944, "train_speed(iter/s)": 0.145695 }, { "epoch": 0.7128189989911409, "grad_norm": 0.8080822229385376, "learning_rate": 7.64541324343732e-05, "loss": 0.9110990524291992, "memory(GiB)": 91.52, "step": 54935, "token_acc": 0.7571939586645469, "train_speed(iter/s)": 0.145691 }, { "epoch": 0.7128838773927966, "grad_norm": 0.6847395896911621, "learning_rate": 7.644958076124633e-05, "loss": 0.8805123329162597, "memory(GiB)": 91.52, "step": 54940, "token_acc": 0.7505990347946407, "train_speed(iter/s)": 0.145689 }, { "epoch": 0.7129487557944523, "grad_norm": 0.7913487553596497, "learning_rate": 7.644502878373938e-05, "loss": 0.8650287628173828, "memory(GiB)": 91.52, "step": 54945, "token_acc": 0.7628791065896661, "train_speed(iter/s)": 0.145687 }, { "epoch": 0.713013634196108, "grad_norm": 0.7692379355430603, "learning_rate": 7.644047650190475e-05, "loss": 0.8907999038696289, "memory(GiB)": 91.52, "step": 54950, "token_acc": 0.7467200907737039, "train_speed(iter/s)": 0.145684 }, { "epoch": 0.7130785125977637, "grad_norm": 0.8144122958183289, "learning_rate": 7.643592391579483e-05, "loss": 0.903449821472168, "memory(GiB)": 91.52, "step": 54955, "token_acc": 0.7650429799426934, "train_speed(iter/s)": 0.145681 }, { "epoch": 0.7131433909994194, "grad_norm": 0.7351460456848145, "learning_rate": 7.643137102546198e-05, "loss": 0.8870677947998047, "memory(GiB)": 91.52, "step": 54960, "token_acc": 0.7536930777682823, "train_speed(iter/s)": 0.145678 }, { "epoch": 0.7132082694010751, "grad_norm": 0.741669237613678, "learning_rate": 7.642681783095864e-05, "loss": 0.8956646919250488, "memory(GiB)": 91.52, "step": 54965, "token_acc": 0.7489207523897625, "train_speed(iter/s)": 0.145676 }, { "epoch": 0.7132731478027308, "grad_norm": 0.6676359176635742, "learning_rate": 7.642226433233717e-05, "loss": 0.8769268035888672, "memory(GiB)": 91.52, "step": 54970, "token_acc": 0.7539918809201623, "train_speed(iter/s)": 0.145674 }, { "epoch": 0.7133380262043865, "grad_norm": 0.7364452481269836, "learning_rate": 7.641771052964998e-05, "loss": 0.8515851974487305, "memory(GiB)": 91.52, "step": 54975, "token_acc": 0.7873441619738941, "train_speed(iter/s)": 0.145671 }, { "epoch": 0.7134029046060422, "grad_norm": 0.7371811270713806, "learning_rate": 7.641315642294948e-05, "loss": 0.9256655693054199, "memory(GiB)": 91.52, "step": 54980, "token_acc": 0.7635657494595683, "train_speed(iter/s)": 0.145669 }, { "epoch": 0.7134677830076979, "grad_norm": 0.716718316078186, "learning_rate": 7.640860201228806e-05, "loss": 0.8825353622436524, "memory(GiB)": 91.52, "step": 54985, "token_acc": 0.7501436427809243, "train_speed(iter/s)": 0.145667 }, { "epoch": 0.7135326614093536, "grad_norm": 0.6721152067184448, "learning_rate": 7.640404729771817e-05, "loss": 0.8943659782409668, "memory(GiB)": 91.52, "step": 54990, "token_acc": 0.7619872280549843, "train_speed(iter/s)": 0.145664 }, { "epoch": 0.7135975398110093, "grad_norm": 0.8434205651283264, "learning_rate": 7.639949227929218e-05, "loss": 0.8918634414672851, "memory(GiB)": 91.52, "step": 54995, "token_acc": 0.7878653976886472, "train_speed(iter/s)": 0.14566 }, { "epoch": 0.713662418212665, "grad_norm": 0.7178495526313782, "learning_rate": 7.639493695706254e-05, "loss": 0.8742493629455567, "memory(GiB)": 91.52, "step": 55000, "token_acc": 0.7693423271500843, "train_speed(iter/s)": 0.145657 }, { "epoch": 0.7137272966143207, "grad_norm": 0.7121761441230774, "learning_rate": 7.639038133108167e-05, "loss": 0.9335109710693359, "memory(GiB)": 91.52, "step": 55005, "token_acc": 0.7452946679139383, "train_speed(iter/s)": 0.145655 }, { "epoch": 0.7137921750159764, "grad_norm": 0.8015047907829285, "learning_rate": 7.6385825401402e-05, "loss": 0.9324181556701661, "memory(GiB)": 91.52, "step": 55010, "token_acc": 0.7549383923332681, "train_speed(iter/s)": 0.145652 }, { "epoch": 0.713857053417632, "grad_norm": 0.7210535407066345, "learning_rate": 7.638126916807594e-05, "loss": 0.9136600494384766, "memory(GiB)": 91.52, "step": 55015, "token_acc": 0.7688317462669977, "train_speed(iter/s)": 0.14565 }, { "epoch": 0.7139219318192876, "grad_norm": 0.73549485206604, "learning_rate": 7.637671263115592e-05, "loss": 0.8825016975402832, "memory(GiB)": 91.52, "step": 55020, "token_acc": 0.7698814288950986, "train_speed(iter/s)": 0.145647 }, { "epoch": 0.7139868102209433, "grad_norm": 0.7542923092842102, "learning_rate": 7.637215579069441e-05, "loss": 0.9318606376647949, "memory(GiB)": 91.52, "step": 55025, "token_acc": 0.7640388768898488, "train_speed(iter/s)": 0.145645 }, { "epoch": 0.714051688622599, "grad_norm": 0.6468667387962341, "learning_rate": 7.636759864674381e-05, "loss": 0.8631111145019531, "memory(GiB)": 91.52, "step": 55030, "token_acc": 0.7829117367061064, "train_speed(iter/s)": 0.145642 }, { "epoch": 0.7141165670242547, "grad_norm": 0.6804864406585693, "learning_rate": 7.63630411993566e-05, "loss": 0.8929407119750976, "memory(GiB)": 91.52, "step": 55035, "token_acc": 0.7487110439722834, "train_speed(iter/s)": 0.145639 }, { "epoch": 0.7141814454259104, "grad_norm": 0.7648058533668518, "learning_rate": 7.635848344858519e-05, "loss": 0.8916157722473145, "memory(GiB)": 91.52, "step": 55040, "token_acc": 0.7498553450021698, "train_speed(iter/s)": 0.145637 }, { "epoch": 0.7142463238275661, "grad_norm": 0.6907771229743958, "learning_rate": 7.635392539448206e-05, "loss": 0.905833625793457, "memory(GiB)": 91.52, "step": 55045, "token_acc": 0.7420187304890739, "train_speed(iter/s)": 0.145635 }, { "epoch": 0.7143112022292218, "grad_norm": 0.7829245328903198, "learning_rate": 7.634936703709964e-05, "loss": 0.8703248977661133, "memory(GiB)": 91.52, "step": 55050, "token_acc": 0.7626673499455512, "train_speed(iter/s)": 0.145633 }, { "epoch": 0.7143760806308775, "grad_norm": 0.7323489785194397, "learning_rate": 7.634480837649042e-05, "loss": 0.8545068740844727, "memory(GiB)": 91.52, "step": 55055, "token_acc": 0.7716821931557981, "train_speed(iter/s)": 0.145631 }, { "epoch": 0.7144409590325332, "grad_norm": 0.7225868105888367, "learning_rate": 7.634024941270683e-05, "loss": 0.8575192451477051, "memory(GiB)": 91.52, "step": 55060, "token_acc": 0.7570032573289902, "train_speed(iter/s)": 0.145629 }, { "epoch": 0.7145058374341889, "grad_norm": 0.6922404170036316, "learning_rate": 7.633569014580134e-05, "loss": 0.8901439666748047, "memory(GiB)": 91.52, "step": 55065, "token_acc": 0.7474814568803276, "train_speed(iter/s)": 0.145626 }, { "epoch": 0.7145707158358446, "grad_norm": 0.7086219787597656, "learning_rate": 7.633113057582644e-05, "loss": 0.9061325073242188, "memory(GiB)": 91.52, "step": 55070, "token_acc": 0.7611070082015003, "train_speed(iter/s)": 0.145623 }, { "epoch": 0.7146355942375003, "grad_norm": 0.759113609790802, "learning_rate": 7.632657070283456e-05, "loss": 0.905977725982666, "memory(GiB)": 91.52, "step": 55075, "token_acc": 0.7807946766606754, "train_speed(iter/s)": 0.145621 }, { "epoch": 0.714700472639156, "grad_norm": 0.7954651713371277, "learning_rate": 7.632201052687822e-05, "loss": 0.8814711570739746, "memory(GiB)": 91.52, "step": 55080, "token_acc": 0.7631022823330516, "train_speed(iter/s)": 0.145618 }, { "epoch": 0.7147653510408117, "grad_norm": 0.670133113861084, "learning_rate": 7.631745004800988e-05, "loss": 0.8735807418823243, "memory(GiB)": 91.52, "step": 55085, "token_acc": 0.7744376348135721, "train_speed(iter/s)": 0.145615 }, { "epoch": 0.7148302294424674, "grad_norm": 0.696491539478302, "learning_rate": 7.6312889266282e-05, "loss": 0.8561550140380859, "memory(GiB)": 91.52, "step": 55090, "token_acc": 0.7598784194528876, "train_speed(iter/s)": 0.145612 }, { "epoch": 0.7148951078441231, "grad_norm": 0.8023398518562317, "learning_rate": 7.630832818174709e-05, "loss": 0.925634765625, "memory(GiB)": 91.52, "step": 55095, "token_acc": 0.7457782610172947, "train_speed(iter/s)": 0.14561 }, { "epoch": 0.7149599862457788, "grad_norm": 0.6953173875808716, "learning_rate": 7.630376679445763e-05, "loss": 0.9081310272216797, "memory(GiB)": 91.52, "step": 55100, "token_acc": 0.7524415828691312, "train_speed(iter/s)": 0.145607 }, { "epoch": 0.7150248646474345, "grad_norm": 0.6644576787948608, "learning_rate": 7.629920510446611e-05, "loss": 0.8989233016967774, "memory(GiB)": 91.52, "step": 55105, "token_acc": 0.7419611919611919, "train_speed(iter/s)": 0.145604 }, { "epoch": 0.7150897430490902, "grad_norm": 0.7059266567230225, "learning_rate": 7.629464311182504e-05, "loss": 0.8954841613769531, "memory(GiB)": 91.52, "step": 55110, "token_acc": 0.7547164079351095, "train_speed(iter/s)": 0.145601 }, { "epoch": 0.7151546214507459, "grad_norm": 0.802272617816925, "learning_rate": 7.62900808165869e-05, "loss": 0.8702380180358886, "memory(GiB)": 91.52, "step": 55115, "token_acc": 0.7619388558745614, "train_speed(iter/s)": 0.145599 }, { "epoch": 0.7152194998524016, "grad_norm": 0.7223206162452698, "learning_rate": 7.628551821880422e-05, "loss": 0.9098682403564453, "memory(GiB)": 91.52, "step": 55120, "token_acc": 0.7622103386809269, "train_speed(iter/s)": 0.145596 }, { "epoch": 0.7152843782540573, "grad_norm": 0.7105889320373535, "learning_rate": 7.628095531852946e-05, "loss": 0.8614048004150391, "memory(GiB)": 91.52, "step": 55125, "token_acc": 0.7697420634920635, "train_speed(iter/s)": 0.145594 }, { "epoch": 0.715349256655713, "grad_norm": 0.8351898789405823, "learning_rate": 7.62763921158152e-05, "loss": 0.826060676574707, "memory(GiB)": 91.52, "step": 55130, "token_acc": 0.7809089249863114, "train_speed(iter/s)": 0.145591 }, { "epoch": 0.7154141350573687, "grad_norm": 0.7226097583770752, "learning_rate": 7.627182861071387e-05, "loss": 0.9103075981140136, "memory(GiB)": 91.52, "step": 55135, "token_acc": 0.7273615924642317, "train_speed(iter/s)": 0.145588 }, { "epoch": 0.7154790134590244, "grad_norm": 0.6882258057594299, "learning_rate": 7.626726480327805e-05, "loss": 0.8861429214477539, "memory(GiB)": 91.52, "step": 55140, "token_acc": 0.7708974231748867, "train_speed(iter/s)": 0.145586 }, { "epoch": 0.7155438918606801, "grad_norm": 0.6908299922943115, "learning_rate": 7.626270069356023e-05, "loss": 0.8430538177490234, "memory(GiB)": 91.52, "step": 55145, "token_acc": 0.7704636519748139, "train_speed(iter/s)": 0.145583 }, { "epoch": 0.7156087702623358, "grad_norm": 0.7287586331367493, "learning_rate": 7.625813628161293e-05, "loss": 0.8785133361816406, "memory(GiB)": 91.52, "step": 55150, "token_acc": 0.7415760453361924, "train_speed(iter/s)": 0.145581 }, { "epoch": 0.7156736486639915, "grad_norm": 0.7323039174079895, "learning_rate": 7.625357156748871e-05, "loss": 0.9049185752868653, "memory(GiB)": 91.52, "step": 55155, "token_acc": 0.74987803505085, "train_speed(iter/s)": 0.145579 }, { "epoch": 0.7157385270656472, "grad_norm": 0.7105510830879211, "learning_rate": 7.624900655124005e-05, "loss": 0.891057014465332, "memory(GiB)": 91.52, "step": 55160, "token_acc": 0.7617816593886463, "train_speed(iter/s)": 0.145576 }, { "epoch": 0.7158034054673029, "grad_norm": 0.7134404182434082, "learning_rate": 7.624444123291954e-05, "loss": 0.8712383270263672, "memory(GiB)": 91.52, "step": 55165, "token_acc": 0.7559257563483958, "train_speed(iter/s)": 0.145573 }, { "epoch": 0.7158682838689586, "grad_norm": 0.8267918825149536, "learning_rate": 7.623987561257968e-05, "loss": 0.9359380722045898, "memory(GiB)": 91.52, "step": 55170, "token_acc": 0.7277546230728766, "train_speed(iter/s)": 0.14557 }, { "epoch": 0.7159331622706143, "grad_norm": 0.7297171950340271, "learning_rate": 7.623530969027301e-05, "loss": 0.863780403137207, "memory(GiB)": 91.52, "step": 55175, "token_acc": 0.7674114864626355, "train_speed(iter/s)": 0.145568 }, { "epoch": 0.71599804067227, "grad_norm": 0.7344678640365601, "learning_rate": 7.62307434660521e-05, "loss": 0.8740330696105957, "memory(GiB)": 91.52, "step": 55180, "token_acc": 0.7551012667829781, "train_speed(iter/s)": 0.145566 }, { "epoch": 0.7160629190739257, "grad_norm": 0.705715537071228, "learning_rate": 7.622617693996947e-05, "loss": 0.9386190414428711, "memory(GiB)": 91.52, "step": 55185, "token_acc": 0.7444907000725661, "train_speed(iter/s)": 0.145563 }, { "epoch": 0.7161277974755814, "grad_norm": 0.7107824087142944, "learning_rate": 7.62216101120777e-05, "loss": 0.8768221855163574, "memory(GiB)": 91.52, "step": 55190, "token_acc": 0.747599935784235, "train_speed(iter/s)": 0.145561 }, { "epoch": 0.7161926758772371, "grad_norm": 0.7724515795707703, "learning_rate": 7.621704298242931e-05, "loss": 0.9145780563354492, "memory(GiB)": 91.52, "step": 55195, "token_acc": 0.7541518510771805, "train_speed(iter/s)": 0.145559 }, { "epoch": 0.7162575542788928, "grad_norm": 0.7818777561187744, "learning_rate": 7.621247555107688e-05, "loss": 0.8898393630981445, "memory(GiB)": 91.52, "step": 55200, "token_acc": 0.7578594586718153, "train_speed(iter/s)": 0.145557 }, { "epoch": 0.7163224326805485, "grad_norm": 0.7030651569366455, "learning_rate": 7.620790781807296e-05, "loss": 0.8316084861755371, "memory(GiB)": 91.52, "step": 55205, "token_acc": 0.7661130129749376, "train_speed(iter/s)": 0.145555 }, { "epoch": 0.7163873110822042, "grad_norm": 0.6727153658866882, "learning_rate": 7.620333978347015e-05, "loss": 0.9103885650634765, "memory(GiB)": 91.52, "step": 55210, "token_acc": 0.7627179507640746, "train_speed(iter/s)": 0.145551 }, { "epoch": 0.7164521894838599, "grad_norm": 0.7298685312271118, "learning_rate": 7.619877144732097e-05, "loss": 0.9014907836914062, "memory(GiB)": 91.52, "step": 55215, "token_acc": 0.7638721900389842, "train_speed(iter/s)": 0.145549 }, { "epoch": 0.7165170678855156, "grad_norm": 0.8435607552528381, "learning_rate": 7.619420280967804e-05, "loss": 0.9156038284301757, "memory(GiB)": 91.52, "step": 55220, "token_acc": 0.7670230921946293, "train_speed(iter/s)": 0.145547 }, { "epoch": 0.7165819462871713, "grad_norm": 0.8336969017982483, "learning_rate": 7.618963387059388e-05, "loss": 0.8586444854736328, "memory(GiB)": 91.52, "step": 55225, "token_acc": 0.7592358111127663, "train_speed(iter/s)": 0.145544 }, { "epoch": 0.716646824688827, "grad_norm": 0.8128041625022888, "learning_rate": 7.618506463012112e-05, "loss": 0.8650565147399902, "memory(GiB)": 91.52, "step": 55230, "token_acc": 0.7571915078617492, "train_speed(iter/s)": 0.145541 }, { "epoch": 0.7167117030904827, "grad_norm": 0.7716331481933594, "learning_rate": 7.618049508831231e-05, "loss": 0.8735637664794922, "memory(GiB)": 91.52, "step": 55235, "token_acc": 0.7617403314917127, "train_speed(iter/s)": 0.14554 }, { "epoch": 0.7167765814921384, "grad_norm": 0.7724818587303162, "learning_rate": 7.617592524522003e-05, "loss": 0.8850605964660645, "memory(GiB)": 91.52, "step": 55240, "token_acc": 0.7796219160525473, "train_speed(iter/s)": 0.145537 }, { "epoch": 0.7168414598937941, "grad_norm": 0.7569090127944946, "learning_rate": 7.617135510089693e-05, "loss": 0.8687747955322266, "memory(GiB)": 91.52, "step": 55245, "token_acc": 0.760314932235478, "train_speed(iter/s)": 0.145534 }, { "epoch": 0.7169063382954498, "grad_norm": 0.7338595390319824, "learning_rate": 7.616678465539552e-05, "loss": 0.9353438377380371, "memory(GiB)": 91.52, "step": 55250, "token_acc": 0.7595780877849608, "train_speed(iter/s)": 0.145531 }, { "epoch": 0.7169712166971054, "grad_norm": 0.7528032660484314, "learning_rate": 7.616221390876846e-05, "loss": 0.9371852874755859, "memory(GiB)": 91.52, "step": 55255, "token_acc": 0.7461938791362436, "train_speed(iter/s)": 0.14553 }, { "epoch": 0.7170360950987611, "grad_norm": 0.7203510999679565, "learning_rate": 7.615764286106833e-05, "loss": 0.8554431915283203, "memory(GiB)": 91.52, "step": 55260, "token_acc": 0.763831308077198, "train_speed(iter/s)": 0.145527 }, { "epoch": 0.7171009735004168, "grad_norm": 0.6527000665664673, "learning_rate": 7.615307151234771e-05, "loss": 0.8738271713256835, "memory(GiB)": 91.52, "step": 55265, "token_acc": 0.7709668072502495, "train_speed(iter/s)": 0.145524 }, { "epoch": 0.7171658519020725, "grad_norm": 0.7532554268836975, "learning_rate": 7.614849986265924e-05, "loss": 0.9092377662658692, "memory(GiB)": 91.52, "step": 55270, "token_acc": 0.7649966081984688, "train_speed(iter/s)": 0.145521 }, { "epoch": 0.7172307303037282, "grad_norm": 0.7180954217910767, "learning_rate": 7.614392791205551e-05, "loss": 0.9305354118347168, "memory(GiB)": 91.52, "step": 55275, "token_acc": 0.7366384680962687, "train_speed(iter/s)": 0.145519 }, { "epoch": 0.7172956087053839, "grad_norm": 0.6933351755142212, "learning_rate": 7.613935566058914e-05, "loss": 0.8885404586791992, "memory(GiB)": 91.52, "step": 55280, "token_acc": 0.7584543440807285, "train_speed(iter/s)": 0.145517 }, { "epoch": 0.7173604871070396, "grad_norm": 0.7274402379989624, "learning_rate": 7.613478310831275e-05, "loss": 0.9172370910644532, "memory(GiB)": 91.52, "step": 55285, "token_acc": 0.7590025795356836, "train_speed(iter/s)": 0.145514 }, { "epoch": 0.7174253655086953, "grad_norm": 0.7912365198135376, "learning_rate": 7.613021025527894e-05, "loss": 0.9095859527587891, "memory(GiB)": 91.52, "step": 55290, "token_acc": 0.7383540372670807, "train_speed(iter/s)": 0.145511 }, { "epoch": 0.717490243910351, "grad_norm": 0.8075505495071411, "learning_rate": 7.61256371015404e-05, "loss": 0.8902300834655762, "memory(GiB)": 91.52, "step": 55295, "token_acc": 0.7622202911144905, "train_speed(iter/s)": 0.145509 }, { "epoch": 0.7175551223120067, "grad_norm": 0.7646098732948303, "learning_rate": 7.612106364714966e-05, "loss": 0.902896785736084, "memory(GiB)": 91.52, "step": 55300, "token_acc": 0.7655586017820425, "train_speed(iter/s)": 0.145506 }, { "epoch": 0.7176200007136624, "grad_norm": 0.8229208588600159, "learning_rate": 7.611648989215943e-05, "loss": 0.8726729393005371, "memory(GiB)": 91.52, "step": 55305, "token_acc": 0.7537986186841149, "train_speed(iter/s)": 0.145504 }, { "epoch": 0.7176848791153181, "grad_norm": 0.7638031244277954, "learning_rate": 7.61119158366223e-05, "loss": 0.8865568161010742, "memory(GiB)": 91.52, "step": 55310, "token_acc": 0.7631607085575792, "train_speed(iter/s)": 0.145502 }, { "epoch": 0.7177497575169738, "grad_norm": 0.7155303955078125, "learning_rate": 7.610734148059092e-05, "loss": 0.8819652557373047, "memory(GiB)": 91.52, "step": 55315, "token_acc": 0.7462912130277889, "train_speed(iter/s)": 0.145499 }, { "epoch": 0.7178146359186295, "grad_norm": 0.7153849601745605, "learning_rate": 7.610276682411794e-05, "loss": 0.8908002853393555, "memory(GiB)": 91.52, "step": 55320, "token_acc": 0.7688134160090192, "train_speed(iter/s)": 0.145497 }, { "epoch": 0.7178795143202852, "grad_norm": 0.7080438137054443, "learning_rate": 7.6098191867256e-05, "loss": 0.8557314872741699, "memory(GiB)": 91.52, "step": 55325, "token_acc": 0.7528308109494668, "train_speed(iter/s)": 0.145494 }, { "epoch": 0.7179443927219409, "grad_norm": 0.7627789378166199, "learning_rate": 7.609361661005776e-05, "loss": 0.9037481307983398, "memory(GiB)": 91.52, "step": 55330, "token_acc": 0.761544951032535, "train_speed(iter/s)": 0.145491 }, { "epoch": 0.7180092711235966, "grad_norm": 0.7154427766799927, "learning_rate": 7.608904105257583e-05, "loss": 0.9115837097167969, "memory(GiB)": 91.52, "step": 55335, "token_acc": 0.7676467223429814, "train_speed(iter/s)": 0.145489 }, { "epoch": 0.7180741495252523, "grad_norm": 0.723372220993042, "learning_rate": 7.608446519486292e-05, "loss": 0.9476088523864746, "memory(GiB)": 91.52, "step": 55340, "token_acc": 0.7232321786430003, "train_speed(iter/s)": 0.145487 }, { "epoch": 0.718139027926908, "grad_norm": 0.7668951153755188, "learning_rate": 7.607988903697165e-05, "loss": 0.9234609603881836, "memory(GiB)": 91.52, "step": 55345, "token_acc": 0.7500369658435605, "train_speed(iter/s)": 0.145485 }, { "epoch": 0.7182039063285637, "grad_norm": 0.7394707798957825, "learning_rate": 7.607531257895471e-05, "loss": 0.8420373916625976, "memory(GiB)": 91.52, "step": 55350, "token_acc": 0.742596558599808, "train_speed(iter/s)": 0.145482 }, { "epoch": 0.7182687847302194, "grad_norm": 0.8249367475509644, "learning_rate": 7.607073582086475e-05, "loss": 0.8992502212524414, "memory(GiB)": 91.52, "step": 55355, "token_acc": 0.7475073501214368, "train_speed(iter/s)": 0.145479 }, { "epoch": 0.7183336631318751, "grad_norm": 0.7339231967926025, "learning_rate": 7.606615876275442e-05, "loss": 0.9046069145202636, "memory(GiB)": 91.52, "step": 55360, "token_acc": 0.7547527658613683, "train_speed(iter/s)": 0.145477 }, { "epoch": 0.7183985415335308, "grad_norm": 0.8203449249267578, "learning_rate": 7.606158140467643e-05, "loss": 0.9016044616699219, "memory(GiB)": 91.52, "step": 55365, "token_acc": 0.7630382455201926, "train_speed(iter/s)": 0.145475 }, { "epoch": 0.7184634199351865, "grad_norm": 0.7439699769020081, "learning_rate": 7.605700374668343e-05, "loss": 0.9041875839233399, "memory(GiB)": 91.52, "step": 55370, "token_acc": 0.7489231315777788, "train_speed(iter/s)": 0.145473 }, { "epoch": 0.7185282983368422, "grad_norm": 0.7388259768486023, "learning_rate": 7.605242578882811e-05, "loss": 0.9129735946655273, "memory(GiB)": 91.52, "step": 55375, "token_acc": 0.7467587999863168, "train_speed(iter/s)": 0.145471 }, { "epoch": 0.7185931767384979, "grad_norm": 0.6580604314804077, "learning_rate": 7.604784753116317e-05, "loss": 0.859642219543457, "memory(GiB)": 91.52, "step": 55380, "token_acc": 0.7678641155798516, "train_speed(iter/s)": 0.145468 }, { "epoch": 0.7186580551401536, "grad_norm": 0.8176634311676025, "learning_rate": 7.604326897374126e-05, "loss": 0.9063745498657226, "memory(GiB)": 91.52, "step": 55385, "token_acc": 0.7417665011615725, "train_speed(iter/s)": 0.145466 }, { "epoch": 0.7187229335418093, "grad_norm": 0.82204669713974, "learning_rate": 7.603869011661508e-05, "loss": 0.8911909103393555, "memory(GiB)": 91.52, "step": 55390, "token_acc": 0.7761590851149069, "train_speed(iter/s)": 0.145464 }, { "epoch": 0.718787811943465, "grad_norm": 0.7760581970214844, "learning_rate": 7.603411095983733e-05, "loss": 0.8818025588989258, "memory(GiB)": 91.52, "step": 55395, "token_acc": 0.7686222005842259, "train_speed(iter/s)": 0.145462 }, { "epoch": 0.7188526903451207, "grad_norm": 0.6581293344497681, "learning_rate": 7.602953150346071e-05, "loss": 0.8732808113098145, "memory(GiB)": 91.52, "step": 55400, "token_acc": 0.7623215810662437, "train_speed(iter/s)": 0.14546 }, { "epoch": 0.7189175687467764, "grad_norm": 0.693970263004303, "learning_rate": 7.602495174753793e-05, "loss": 0.8891400337219239, "memory(GiB)": 91.52, "step": 55405, "token_acc": 0.7605549055657697, "train_speed(iter/s)": 0.145457 }, { "epoch": 0.7189824471484321, "grad_norm": 0.6856297254562378, "learning_rate": 7.602037169212167e-05, "loss": 0.8683645248413085, "memory(GiB)": 91.52, "step": 55410, "token_acc": 0.7517784721760521, "train_speed(iter/s)": 0.145454 }, { "epoch": 0.7190473255500878, "grad_norm": 0.7479491233825684, "learning_rate": 7.601579133726465e-05, "loss": 0.8629359245300293, "memory(GiB)": 91.52, "step": 55415, "token_acc": 0.7564738088191773, "train_speed(iter/s)": 0.145451 }, { "epoch": 0.7191122039517435, "grad_norm": 0.7288469672203064, "learning_rate": 7.601121068301957e-05, "loss": 0.8805025100708008, "memory(GiB)": 91.52, "step": 55420, "token_acc": 0.7817881250861, "train_speed(iter/s)": 0.145448 }, { "epoch": 0.7191770823533992, "grad_norm": 0.7376953363418579, "learning_rate": 7.600662972943914e-05, "loss": 0.8447065353393555, "memory(GiB)": 91.52, "step": 55425, "token_acc": 0.7832026309132305, "train_speed(iter/s)": 0.145445 }, { "epoch": 0.7192419607550549, "grad_norm": 0.7792940139770508, "learning_rate": 7.600204847657613e-05, "loss": 0.9675309181213378, "memory(GiB)": 91.52, "step": 55430, "token_acc": 0.7503160351366244, "train_speed(iter/s)": 0.145441 }, { "epoch": 0.7193068391567106, "grad_norm": 0.7789617776870728, "learning_rate": 7.599746692448318e-05, "loss": 0.9046037673950196, "memory(GiB)": 91.52, "step": 55435, "token_acc": 0.7483742066912168, "train_speed(iter/s)": 0.145439 }, { "epoch": 0.7193717175583663, "grad_norm": 0.8292215466499329, "learning_rate": 7.599288507321307e-05, "loss": 0.9183381080627442, "memory(GiB)": 91.52, "step": 55440, "token_acc": 0.7540176257128045, "train_speed(iter/s)": 0.145436 }, { "epoch": 0.719436595960022, "grad_norm": 0.748817503452301, "learning_rate": 7.598830292281852e-05, "loss": 0.8587066650390625, "memory(GiB)": 91.52, "step": 55445, "token_acc": 0.7773119529958994, "train_speed(iter/s)": 0.145433 }, { "epoch": 0.7195014743616777, "grad_norm": 0.746470034122467, "learning_rate": 7.598372047335222e-05, "loss": 0.879489803314209, "memory(GiB)": 91.52, "step": 55450, "token_acc": 0.7471220113194471, "train_speed(iter/s)": 0.145429 }, { "epoch": 0.7195663527633334, "grad_norm": 0.8040305376052856, "learning_rate": 7.597913772486698e-05, "loss": 0.8989408493041993, "memory(GiB)": 91.52, "step": 55455, "token_acc": 0.764058896576193, "train_speed(iter/s)": 0.145427 }, { "epoch": 0.7196312311649891, "grad_norm": 0.6885295510292053, "learning_rate": 7.597455467741548e-05, "loss": 0.8722023010253906, "memory(GiB)": 91.52, "step": 55460, "token_acc": 0.7398207337409672, "train_speed(iter/s)": 0.145426 }, { "epoch": 0.7196961095666448, "grad_norm": 0.7263926863670349, "learning_rate": 7.596997133105045e-05, "loss": 0.8753427505493164, "memory(GiB)": 91.52, "step": 55465, "token_acc": 0.7902721187427241, "train_speed(iter/s)": 0.145422 }, { "epoch": 0.7197609879683005, "grad_norm": 0.7491164803504944, "learning_rate": 7.596538768582471e-05, "loss": 0.9103714942932128, "memory(GiB)": 91.52, "step": 55470, "token_acc": 0.7594147361086918, "train_speed(iter/s)": 0.14542 }, { "epoch": 0.7198258663699562, "grad_norm": 0.7066288590431213, "learning_rate": 7.596080374179092e-05, "loss": 0.8580349922180176, "memory(GiB)": 91.52, "step": 55475, "token_acc": 0.7807060999372503, "train_speed(iter/s)": 0.145417 }, { "epoch": 0.7198907447716119, "grad_norm": 0.7554681301116943, "learning_rate": 7.595621949900189e-05, "loss": 0.9359472274780274, "memory(GiB)": 91.52, "step": 55480, "token_acc": 0.756100175056268, "train_speed(iter/s)": 0.145415 }, { "epoch": 0.7199556231732676, "grad_norm": 0.8244917988777161, "learning_rate": 7.595163495751034e-05, "loss": 0.8707220077514648, "memory(GiB)": 91.52, "step": 55485, "token_acc": 0.7714535568989772, "train_speed(iter/s)": 0.145412 }, { "epoch": 0.7200205015749233, "grad_norm": 0.7579312920570374, "learning_rate": 7.594705011736907e-05, "loss": 0.8951911926269531, "memory(GiB)": 91.52, "step": 55490, "token_acc": 0.751380643379815, "train_speed(iter/s)": 0.14541 }, { "epoch": 0.7200853799765788, "grad_norm": 0.8680738210678101, "learning_rate": 7.594246497863079e-05, "loss": 0.8945375442504883, "memory(GiB)": 91.52, "step": 55495, "token_acc": 0.7632200617844812, "train_speed(iter/s)": 0.145408 }, { "epoch": 0.7201502583782345, "grad_norm": 0.7748192548751831, "learning_rate": 7.59378795413483e-05, "loss": 0.863919448852539, "memory(GiB)": 91.52, "step": 55500, "token_acc": 0.7564918583310555, "train_speed(iter/s)": 0.145406 }, { "epoch": 0.7202151367798902, "grad_norm": 0.8152872920036316, "learning_rate": 7.593329380557437e-05, "loss": 0.910243034362793, "memory(GiB)": 91.52, "step": 55505, "token_acc": 0.7482849822299363, "train_speed(iter/s)": 0.145404 }, { "epoch": 0.7202800151815459, "grad_norm": 0.7777197360992432, "learning_rate": 7.592870777136176e-05, "loss": 0.8675506591796875, "memory(GiB)": 91.52, "step": 55510, "token_acc": 0.7623803578859759, "train_speed(iter/s)": 0.145401 }, { "epoch": 0.7203448935832016, "grad_norm": 0.8483238816261292, "learning_rate": 7.592412143876326e-05, "loss": 0.8975273132324219, "memory(GiB)": 91.52, "step": 55515, "token_acc": 0.7695736146189274, "train_speed(iter/s)": 0.145398 }, { "epoch": 0.7204097719848573, "grad_norm": 0.7830004096031189, "learning_rate": 7.591953480783162e-05, "loss": 0.9295470237731933, "memory(GiB)": 91.52, "step": 55520, "token_acc": 0.7618911685994648, "train_speed(iter/s)": 0.145396 }, { "epoch": 0.720474650386513, "grad_norm": 0.7541098594665527, "learning_rate": 7.591494787861966e-05, "loss": 0.9008047103881835, "memory(GiB)": 91.52, "step": 55525, "token_acc": 0.7524688982942157, "train_speed(iter/s)": 0.145394 }, { "epoch": 0.7205395287881687, "grad_norm": 0.7189998626708984, "learning_rate": 7.591036065118014e-05, "loss": 0.8806554794311523, "memory(GiB)": 91.52, "step": 55530, "token_acc": 0.7590444919721662, "train_speed(iter/s)": 0.145391 }, { "epoch": 0.7206044071898244, "grad_norm": 0.786029577255249, "learning_rate": 7.590577312556586e-05, "loss": 0.913361930847168, "memory(GiB)": 91.52, "step": 55535, "token_acc": 0.7490187023782037, "train_speed(iter/s)": 0.145389 }, { "epoch": 0.7206692855914801, "grad_norm": 0.7632120847702026, "learning_rate": 7.590118530182959e-05, "loss": 0.8561999320983886, "memory(GiB)": 91.52, "step": 55540, "token_acc": 0.7614450327000934, "train_speed(iter/s)": 0.145386 }, { "epoch": 0.7207341639931358, "grad_norm": 0.7221170663833618, "learning_rate": 7.589659718002417e-05, "loss": 0.8703841209411621, "memory(GiB)": 91.52, "step": 55545, "token_acc": 0.7652374874961195, "train_speed(iter/s)": 0.145383 }, { "epoch": 0.7207990423947915, "grad_norm": 0.8054968118667603, "learning_rate": 7.589200876020237e-05, "loss": 0.8558357238769532, "memory(GiB)": 91.52, "step": 55550, "token_acc": 0.7791575091575091, "train_speed(iter/s)": 0.145381 }, { "epoch": 0.7208639207964472, "grad_norm": 0.7368189692497253, "learning_rate": 7.5887420042417e-05, "loss": 0.8524696350097656, "memory(GiB)": 91.52, "step": 55555, "token_acc": 0.7758295948768877, "train_speed(iter/s)": 0.145378 }, { "epoch": 0.7209287991981029, "grad_norm": 0.7276979088783264, "learning_rate": 7.588283102672087e-05, "loss": 0.8861763000488281, "memory(GiB)": 91.52, "step": 55560, "token_acc": 0.7735052600336378, "train_speed(iter/s)": 0.145376 }, { "epoch": 0.7209936775997586, "grad_norm": 0.7735947370529175, "learning_rate": 7.587824171316677e-05, "loss": 0.8710854530334473, "memory(GiB)": 91.52, "step": 55565, "token_acc": 0.7825150884128048, "train_speed(iter/s)": 0.145374 }, { "epoch": 0.7210585560014143, "grad_norm": 0.7219476699829102, "learning_rate": 7.587365210180754e-05, "loss": 0.913088035583496, "memory(GiB)": 91.52, "step": 55570, "token_acc": 0.7480131243164418, "train_speed(iter/s)": 0.145372 }, { "epoch": 0.72112343440307, "grad_norm": 0.7284302115440369, "learning_rate": 7.5869062192696e-05, "loss": 0.8822126388549805, "memory(GiB)": 91.52, "step": 55575, "token_acc": 0.7451431237295478, "train_speed(iter/s)": 0.145369 }, { "epoch": 0.7211883128047257, "grad_norm": 0.6866967678070068, "learning_rate": 7.586447198588495e-05, "loss": 0.900672721862793, "memory(GiB)": 91.52, "step": 55580, "token_acc": 0.7500860262145337, "train_speed(iter/s)": 0.145366 }, { "epoch": 0.7212531912063814, "grad_norm": 0.7319080233573914, "learning_rate": 7.58598814814272e-05, "loss": 0.8663137435913086, "memory(GiB)": 91.52, "step": 55585, "token_acc": 0.7567640876110395, "train_speed(iter/s)": 0.145364 }, { "epoch": 0.7213180696080371, "grad_norm": 0.8091624975204468, "learning_rate": 7.585529067937563e-05, "loss": 0.9058343887329101, "memory(GiB)": 91.52, "step": 55590, "token_acc": 0.748440345642655, "train_speed(iter/s)": 0.145361 }, { "epoch": 0.7213829480096928, "grad_norm": 0.7379459142684937, "learning_rate": 7.585069957978302e-05, "loss": 0.8733406066894531, "memory(GiB)": 91.52, "step": 55595, "token_acc": 0.7534350278208864, "train_speed(iter/s)": 0.145359 }, { "epoch": 0.7214478264113485, "grad_norm": 0.7065644264221191, "learning_rate": 7.584610818270221e-05, "loss": 0.8615419387817382, "memory(GiB)": 91.52, "step": 55600, "token_acc": 0.7576587242859257, "train_speed(iter/s)": 0.145356 }, { "epoch": 0.7215127048130042, "grad_norm": 0.7179928421974182, "learning_rate": 7.584151648818608e-05, "loss": 0.8650816917419434, "memory(GiB)": 91.52, "step": 55605, "token_acc": 0.7633988644195739, "train_speed(iter/s)": 0.145354 }, { "epoch": 0.7215775832146599, "grad_norm": 0.8110395073890686, "learning_rate": 7.583692449628744e-05, "loss": 0.8736638069152832, "memory(GiB)": 91.52, "step": 55610, "token_acc": 0.7566818596171376, "train_speed(iter/s)": 0.145351 }, { "epoch": 0.7216424616163156, "grad_norm": 0.7308327555656433, "learning_rate": 7.58323322070591e-05, "loss": 0.8897890090942383, "memory(GiB)": 91.52, "step": 55615, "token_acc": 0.7728550585388421, "train_speed(iter/s)": 0.145348 }, { "epoch": 0.7217073400179713, "grad_norm": 0.6971051096916199, "learning_rate": 7.582773962055399e-05, "loss": 0.8586977005004883, "memory(GiB)": 91.52, "step": 55620, "token_acc": 0.7673236449822242, "train_speed(iter/s)": 0.145346 }, { "epoch": 0.721772218419627, "grad_norm": 0.8581956624984741, "learning_rate": 7.582314673682486e-05, "loss": 0.8550516128540039, "memory(GiB)": 91.52, "step": 55625, "token_acc": 0.7548717948717949, "train_speed(iter/s)": 0.145343 }, { "epoch": 0.7218370968212827, "grad_norm": 0.7706736326217651, "learning_rate": 7.581855355592466e-05, "loss": 0.8908584594726563, "memory(GiB)": 91.52, "step": 55630, "token_acc": 0.7678217636723861, "train_speed(iter/s)": 0.145341 }, { "epoch": 0.7219019752229384, "grad_norm": 0.7866703867912292, "learning_rate": 7.581396007790621e-05, "loss": 0.887332534790039, "memory(GiB)": 91.52, "step": 55635, "token_acc": 0.76486308263631, "train_speed(iter/s)": 0.145338 }, { "epoch": 0.7219668536245941, "grad_norm": 0.7717531323432922, "learning_rate": 7.580936630282236e-05, "loss": 0.8930909156799316, "memory(GiB)": 91.52, "step": 55640, "token_acc": 0.7776489533011273, "train_speed(iter/s)": 0.145335 }, { "epoch": 0.7220317320262498, "grad_norm": 0.7585973143577576, "learning_rate": 7.580477223072597e-05, "loss": 0.8801702499389649, "memory(GiB)": 91.52, "step": 55645, "token_acc": 0.7403036771517177, "train_speed(iter/s)": 0.145332 }, { "epoch": 0.7220966104279055, "grad_norm": 0.7484157681465149, "learning_rate": 7.580017786166993e-05, "loss": 0.8641645431518554, "memory(GiB)": 91.52, "step": 55650, "token_acc": 0.7574960157147622, "train_speed(iter/s)": 0.14533 }, { "epoch": 0.7221614888295612, "grad_norm": 0.8037655353546143, "learning_rate": 7.579558319570711e-05, "loss": 0.8876239776611328, "memory(GiB)": 91.52, "step": 55655, "token_acc": 0.7652212818314008, "train_speed(iter/s)": 0.145328 }, { "epoch": 0.7222263672312169, "grad_norm": 0.6458654403686523, "learning_rate": 7.579098823289038e-05, "loss": 0.835666275024414, "memory(GiB)": 91.52, "step": 55660, "token_acc": 0.7758412250654846, "train_speed(iter/s)": 0.145325 }, { "epoch": 0.7222912456328726, "grad_norm": 0.7449392676353455, "learning_rate": 7.578639297327261e-05, "loss": 0.864544677734375, "memory(GiB)": 91.52, "step": 55665, "token_acc": 0.7741477272727273, "train_speed(iter/s)": 0.145322 }, { "epoch": 0.7223561240345283, "grad_norm": 0.7441444993019104, "learning_rate": 7.578179741690669e-05, "loss": 0.8810065269470215, "memory(GiB)": 91.52, "step": 55670, "token_acc": 0.7663810101622933, "train_speed(iter/s)": 0.145319 }, { "epoch": 0.722421002436184, "grad_norm": 0.747843325138092, "learning_rate": 7.57772015638455e-05, "loss": 0.9190541267395019, "memory(GiB)": 91.52, "step": 55675, "token_acc": 0.747327436046313, "train_speed(iter/s)": 0.145317 }, { "epoch": 0.7224858808378397, "grad_norm": 0.7694132924079895, "learning_rate": 7.577260541414195e-05, "loss": 0.9066873550415039, "memory(GiB)": 91.52, "step": 55680, "token_acc": 0.7578515842134519, "train_speed(iter/s)": 0.145315 }, { "epoch": 0.7225507592394954, "grad_norm": 0.662255048751831, "learning_rate": 7.576800896784891e-05, "loss": 0.8894463539123535, "memory(GiB)": 91.52, "step": 55685, "token_acc": 0.7594816085382123, "train_speed(iter/s)": 0.145311 }, { "epoch": 0.7226156376411511, "grad_norm": 0.7902615070343018, "learning_rate": 7.576341222501928e-05, "loss": 0.861122989654541, "memory(GiB)": 91.52, "step": 55690, "token_acc": 0.7703310175725377, "train_speed(iter/s)": 0.145309 }, { "epoch": 0.7226805160428068, "grad_norm": 0.8277578353881836, "learning_rate": 7.575881518570595e-05, "loss": 0.8935897827148438, "memory(GiB)": 91.52, "step": 55695, "token_acc": 0.7625894210099728, "train_speed(iter/s)": 0.145306 }, { "epoch": 0.7227453944444625, "grad_norm": 0.7558843493461609, "learning_rate": 7.575421784996185e-05, "loss": 0.860272216796875, "memory(GiB)": 91.52, "step": 55700, "token_acc": 0.7581472421683895, "train_speed(iter/s)": 0.145303 }, { "epoch": 0.7228102728461182, "grad_norm": 0.7042319178581238, "learning_rate": 7.574962021783985e-05, "loss": 0.8587944030761718, "memory(GiB)": 91.52, "step": 55705, "token_acc": 0.7663064658990257, "train_speed(iter/s)": 0.145301 }, { "epoch": 0.7228751512477739, "grad_norm": 0.6992419958114624, "learning_rate": 7.574502228939288e-05, "loss": 0.8844964981079102, "memory(GiB)": 91.52, "step": 55710, "token_acc": 0.7490373405871339, "train_speed(iter/s)": 0.145298 }, { "epoch": 0.7229400296494296, "grad_norm": 0.7291749715805054, "learning_rate": 7.574042406467386e-05, "loss": 0.8676692008972168, "memory(GiB)": 91.52, "step": 55715, "token_acc": 0.7593301745959629, "train_speed(iter/s)": 0.145296 }, { "epoch": 0.7230049080510853, "grad_norm": 0.8244789838790894, "learning_rate": 7.57358255437357e-05, "loss": 0.9090350151062012, "memory(GiB)": 91.52, "step": 55720, "token_acc": 0.7776475858979219, "train_speed(iter/s)": 0.145293 }, { "epoch": 0.723069786452741, "grad_norm": 0.7716279625892639, "learning_rate": 7.573122672663131e-05, "loss": 0.9081341743469238, "memory(GiB)": 91.52, "step": 55725, "token_acc": 0.7652913724032544, "train_speed(iter/s)": 0.145291 }, { "epoch": 0.7231346648543966, "grad_norm": 0.8170616626739502, "learning_rate": 7.572662761341362e-05, "loss": 0.8547327995300293, "memory(GiB)": 91.52, "step": 55730, "token_acc": 0.7532123069668749, "train_speed(iter/s)": 0.14529 }, { "epoch": 0.7231995432560523, "grad_norm": 0.7735037207603455, "learning_rate": 7.572202820413554e-05, "loss": 0.8920950889587402, "memory(GiB)": 91.52, "step": 55735, "token_acc": 0.7757983073750089, "train_speed(iter/s)": 0.145287 }, { "epoch": 0.723264421657708, "grad_norm": 0.7601195573806763, "learning_rate": 7.571742849885003e-05, "loss": 0.9365396499633789, "memory(GiB)": 91.52, "step": 55740, "token_acc": 0.7335741314069487, "train_speed(iter/s)": 0.145285 }, { "epoch": 0.7233293000593637, "grad_norm": 0.7545394897460938, "learning_rate": 7.571282849761002e-05, "loss": 0.9133186340332031, "memory(GiB)": 91.52, "step": 55745, "token_acc": 0.7423296834709817, "train_speed(iter/s)": 0.145283 }, { "epoch": 0.7233941784610194, "grad_norm": 0.713058352470398, "learning_rate": 7.570822820046841e-05, "loss": 0.8778411865234375, "memory(GiB)": 91.52, "step": 55750, "token_acc": 0.7409130757299885, "train_speed(iter/s)": 0.14528 }, { "epoch": 0.7234590568626751, "grad_norm": 0.8130477666854858, "learning_rate": 7.570362760747818e-05, "loss": 0.9088922500610351, "memory(GiB)": 91.52, "step": 55755, "token_acc": 0.7466936778533751, "train_speed(iter/s)": 0.145278 }, { "epoch": 0.7235239352643308, "grad_norm": 0.7083047032356262, "learning_rate": 7.569902671869225e-05, "loss": 0.8692378997802734, "memory(GiB)": 91.52, "step": 55760, "token_acc": 0.7469546345561467, "train_speed(iter/s)": 0.145275 }, { "epoch": 0.7235888136659865, "grad_norm": 0.6660280227661133, "learning_rate": 7.569442553416357e-05, "loss": 0.854473876953125, "memory(GiB)": 91.52, "step": 55765, "token_acc": 0.7661658590999407, "train_speed(iter/s)": 0.145273 }, { "epoch": 0.7236536920676422, "grad_norm": 0.7719893455505371, "learning_rate": 7.568982405394512e-05, "loss": 0.902867317199707, "memory(GiB)": 91.52, "step": 55770, "token_acc": 0.7325311979187837, "train_speed(iter/s)": 0.145271 }, { "epoch": 0.7237185704692979, "grad_norm": 0.7301846742630005, "learning_rate": 7.568522227808979e-05, "loss": 0.8723846435546875, "memory(GiB)": 91.52, "step": 55775, "token_acc": 0.7593756911314018, "train_speed(iter/s)": 0.145268 }, { "epoch": 0.7237834488709536, "grad_norm": 0.7438070774078369, "learning_rate": 7.568062020665061e-05, "loss": 0.8810564041137695, "memory(GiB)": 91.52, "step": 55780, "token_acc": 0.7791637247979917, "train_speed(iter/s)": 0.145266 }, { "epoch": 0.7238483272726093, "grad_norm": 0.8466398119926453, "learning_rate": 7.567601783968048e-05, "loss": 0.8906598091125488, "memory(GiB)": 91.52, "step": 55785, "token_acc": 0.7507828810020877, "train_speed(iter/s)": 0.145264 }, { "epoch": 0.723913205674265, "grad_norm": 0.7646565437316895, "learning_rate": 7.56714151772324e-05, "loss": 0.8787614822387695, "memory(GiB)": 91.52, "step": 55790, "token_acc": 0.7658071095571095, "train_speed(iter/s)": 0.145262 }, { "epoch": 0.7239780840759207, "grad_norm": 0.6952086687088013, "learning_rate": 7.566681221935935e-05, "loss": 0.8798429489135742, "memory(GiB)": 91.52, "step": 55795, "token_acc": 0.7775538279425409, "train_speed(iter/s)": 0.14526 }, { "epoch": 0.7240429624775764, "grad_norm": 0.8696998953819275, "learning_rate": 7.566220896611424e-05, "loss": 0.9034605026245117, "memory(GiB)": 91.52, "step": 55800, "token_acc": 0.7558924205378973, "train_speed(iter/s)": 0.145258 }, { "epoch": 0.7241078408792321, "grad_norm": 0.7781591415405273, "learning_rate": 7.565760541755009e-05, "loss": 0.9223537445068359, "memory(GiB)": 91.52, "step": 55805, "token_acc": 0.7542726202221445, "train_speed(iter/s)": 0.145255 }, { "epoch": 0.7241727192808878, "grad_norm": 0.7228285670280457, "learning_rate": 7.565300157371986e-05, "loss": 0.8568105697631836, "memory(GiB)": 91.52, "step": 55810, "token_acc": 0.7608322351343796, "train_speed(iter/s)": 0.145252 }, { "epoch": 0.7242375976825435, "grad_norm": 0.7615147829055786, "learning_rate": 7.564839743467656e-05, "loss": 0.8731437683105469, "memory(GiB)": 91.52, "step": 55815, "token_acc": 0.7590546347452425, "train_speed(iter/s)": 0.14525 }, { "epoch": 0.7243024760841992, "grad_norm": 0.643882691860199, "learning_rate": 7.564379300047316e-05, "loss": 0.8738714218139648, "memory(GiB)": 91.52, "step": 55820, "token_acc": 0.755289115053501, "train_speed(iter/s)": 0.145247 }, { "epoch": 0.7243673544858549, "grad_norm": 0.7906985282897949, "learning_rate": 7.563918827116262e-05, "loss": 0.8813267707824707, "memory(GiB)": 91.52, "step": 55825, "token_acc": 0.7653500897666068, "train_speed(iter/s)": 0.145245 }, { "epoch": 0.7244322328875106, "grad_norm": 0.7265352010726929, "learning_rate": 7.563458324679795e-05, "loss": 0.8737022399902343, "memory(GiB)": 91.52, "step": 55830, "token_acc": 0.8024420268157169, "train_speed(iter/s)": 0.145243 }, { "epoch": 0.7244971112891663, "grad_norm": 0.7442331910133362, "learning_rate": 7.562997792743215e-05, "loss": 0.8932747840881348, "memory(GiB)": 91.52, "step": 55835, "token_acc": 0.7531126749671333, "train_speed(iter/s)": 0.14524 }, { "epoch": 0.724561989690822, "grad_norm": 0.7547138333320618, "learning_rate": 7.562537231311821e-05, "loss": 0.8865341186523438, "memory(GiB)": 91.52, "step": 55840, "token_acc": 0.779767734713866, "train_speed(iter/s)": 0.145238 }, { "epoch": 0.7246268680924777, "grad_norm": 0.7658701539039612, "learning_rate": 7.562076640390914e-05, "loss": 0.883768367767334, "memory(GiB)": 91.52, "step": 55845, "token_acc": 0.7653391200748872, "train_speed(iter/s)": 0.145235 }, { "epoch": 0.7246917464941334, "grad_norm": 0.7391038537025452, "learning_rate": 7.561616019985794e-05, "loss": 0.907657527923584, "memory(GiB)": 91.52, "step": 55850, "token_acc": 0.7631631832797428, "train_speed(iter/s)": 0.145234 }, { "epoch": 0.7247566248957891, "grad_norm": 0.6932238340377808, "learning_rate": 7.561155370101762e-05, "loss": 0.9359777450561524, "memory(GiB)": 91.52, "step": 55855, "token_acc": 0.7573369109508636, "train_speed(iter/s)": 0.145231 }, { "epoch": 0.7248215032974448, "grad_norm": 0.7233836650848389, "learning_rate": 7.560694690744119e-05, "loss": 0.9343250274658204, "memory(GiB)": 91.52, "step": 55860, "token_acc": 0.7568891836133216, "train_speed(iter/s)": 0.145229 }, { "epoch": 0.7248863816991005, "grad_norm": 0.7174476981163025, "learning_rate": 7.560233981918165e-05, "loss": 0.8713288307189941, "memory(GiB)": 91.52, "step": 55865, "token_acc": 0.7671753089829455, "train_speed(iter/s)": 0.145226 }, { "epoch": 0.7249512601007562, "grad_norm": 0.7544372081756592, "learning_rate": 7.559773243629205e-05, "loss": 0.8968249320983886, "memory(GiB)": 91.52, "step": 55870, "token_acc": 0.7315396113602392, "train_speed(iter/s)": 0.145224 }, { "epoch": 0.7250161385024119, "grad_norm": 0.6972609162330627, "learning_rate": 7.559312475882539e-05, "loss": 0.8811824798583985, "memory(GiB)": 91.52, "step": 55875, "token_acc": 0.751650191583218, "train_speed(iter/s)": 0.145222 }, { "epoch": 0.7250810169040676, "grad_norm": 0.6977288126945496, "learning_rate": 7.558851678683468e-05, "loss": 0.8816205978393554, "memory(GiB)": 91.52, "step": 55880, "token_acc": 0.7479189287006877, "train_speed(iter/s)": 0.145219 }, { "epoch": 0.7251458953057233, "grad_norm": 0.8797891736030579, "learning_rate": 7.558390852037296e-05, "loss": 0.9240005493164063, "memory(GiB)": 91.52, "step": 55885, "token_acc": 0.747229916897507, "train_speed(iter/s)": 0.145217 }, { "epoch": 0.725210773707379, "grad_norm": 0.7456996440887451, "learning_rate": 7.557929995949328e-05, "loss": 0.8675379753112793, "memory(GiB)": 91.52, "step": 55890, "token_acc": 0.7509649212692557, "train_speed(iter/s)": 0.145215 }, { "epoch": 0.7252756521090347, "grad_norm": 0.860043466091156, "learning_rate": 7.557469110424868e-05, "loss": 0.8708665847778321, "memory(GiB)": 91.52, "step": 55895, "token_acc": 0.750552627104234, "train_speed(iter/s)": 0.145213 }, { "epoch": 0.7253405305106904, "grad_norm": 0.7398109436035156, "learning_rate": 7.557008195469215e-05, "loss": 0.9109333038330079, "memory(GiB)": 91.52, "step": 55900, "token_acc": 0.7447709923664122, "train_speed(iter/s)": 0.14521 }, { "epoch": 0.7254054089123461, "grad_norm": 0.8154604434967041, "learning_rate": 7.556547251087679e-05, "loss": 0.9198644638061524, "memory(GiB)": 91.52, "step": 55905, "token_acc": 0.7338960634821845, "train_speed(iter/s)": 0.145207 }, { "epoch": 0.7254702873140018, "grad_norm": 0.7197967767715454, "learning_rate": 7.556086277285562e-05, "loss": 0.8739463806152343, "memory(GiB)": 91.52, "step": 55910, "token_acc": 0.7458584467022559, "train_speed(iter/s)": 0.145205 }, { "epoch": 0.7255351657156575, "grad_norm": 0.7099709510803223, "learning_rate": 7.555625274068166e-05, "loss": 0.9068034172058106, "memory(GiB)": 91.52, "step": 55915, "token_acc": 0.7450905185639767, "train_speed(iter/s)": 0.145202 }, { "epoch": 0.7256000441173132, "grad_norm": 0.7536823749542236, "learning_rate": 7.5551642414408e-05, "loss": 0.9214603424072265, "memory(GiB)": 91.52, "step": 55920, "token_acc": 0.7693781370933752, "train_speed(iter/s)": 0.1452 }, { "epoch": 0.7256649225189689, "grad_norm": 0.7346317172050476, "learning_rate": 7.554703179408768e-05, "loss": 0.8870386123657227, "memory(GiB)": 91.52, "step": 55925, "token_acc": 0.761769710720363, "train_speed(iter/s)": 0.145198 }, { "epoch": 0.7257298009206246, "grad_norm": 0.7569758296012878, "learning_rate": 7.554242087977377e-05, "loss": 0.9218067169189453, "memory(GiB)": 91.52, "step": 55930, "token_acc": 0.7584461447910535, "train_speed(iter/s)": 0.145196 }, { "epoch": 0.7257946793222803, "grad_norm": 0.6549667716026306, "learning_rate": 7.553780967151932e-05, "loss": 0.814250373840332, "memory(GiB)": 91.52, "step": 55935, "token_acc": 0.7761273776447959, "train_speed(iter/s)": 0.145193 }, { "epoch": 0.725859557723936, "grad_norm": 0.7976692318916321, "learning_rate": 7.553319816937739e-05, "loss": 0.8911104202270508, "memory(GiB)": 91.52, "step": 55940, "token_acc": 0.7673480365788058, "train_speed(iter/s)": 0.145191 }, { "epoch": 0.7259244361255917, "grad_norm": 0.7605167627334595, "learning_rate": 7.552858637340107e-05, "loss": 0.8978280067443848, "memory(GiB)": 91.52, "step": 55945, "token_acc": 0.7412970628819239, "train_speed(iter/s)": 0.145189 }, { "epoch": 0.7259893145272474, "grad_norm": 0.7000042200088501, "learning_rate": 7.552397428364342e-05, "loss": 0.8801259994506836, "memory(GiB)": 91.52, "step": 55950, "token_acc": 0.7608661736715654, "train_speed(iter/s)": 0.145186 }, { "epoch": 0.726054192928903, "grad_norm": 0.8228152394294739, "learning_rate": 7.551936190015751e-05, "loss": 0.8781538963317871, "memory(GiB)": 91.52, "step": 55955, "token_acc": 0.7659971463066623, "train_speed(iter/s)": 0.145185 }, { "epoch": 0.7261190713305588, "grad_norm": 0.6731932759284973, "learning_rate": 7.551474922299644e-05, "loss": 0.8772336959838867, "memory(GiB)": 91.52, "step": 55960, "token_acc": 0.7737263650660132, "train_speed(iter/s)": 0.145183 }, { "epoch": 0.7261839497322145, "grad_norm": 0.732695996761322, "learning_rate": 7.551013625221327e-05, "loss": 0.8978362083435059, "memory(GiB)": 91.52, "step": 55965, "token_acc": 0.7453474994924545, "train_speed(iter/s)": 0.14518 }, { "epoch": 0.72624882813387, "grad_norm": 0.6899267435073853, "learning_rate": 7.550552298786108e-05, "loss": 0.869321346282959, "memory(GiB)": 91.52, "step": 55970, "token_acc": 0.7762368411597743, "train_speed(iter/s)": 0.145178 }, { "epoch": 0.7263137065355257, "grad_norm": 0.7422935962677002, "learning_rate": 7.550090942999299e-05, "loss": 0.8317914962768554, "memory(GiB)": 91.52, "step": 55975, "token_acc": 0.7691694159446035, "train_speed(iter/s)": 0.145176 }, { "epoch": 0.7263785849371814, "grad_norm": 0.7372440099716187, "learning_rate": 7.549629557866209e-05, "loss": 0.8692536354064941, "memory(GiB)": 91.52, "step": 55980, "token_acc": 0.764383961472509, "train_speed(iter/s)": 0.145173 }, { "epoch": 0.7264434633388371, "grad_norm": 0.8209224343299866, "learning_rate": 7.549168143392143e-05, "loss": 0.9293843269348144, "memory(GiB)": 91.52, "step": 55985, "token_acc": 0.7483957436438957, "train_speed(iter/s)": 0.145171 }, { "epoch": 0.7265083417404928, "grad_norm": 0.6228094696998596, "learning_rate": 7.548706699582415e-05, "loss": 0.8275051116943359, "memory(GiB)": 91.52, "step": 55990, "token_acc": 0.7858115391161287, "train_speed(iter/s)": 0.145168 }, { "epoch": 0.7265732201421485, "grad_norm": 0.7445549368858337, "learning_rate": 7.548245226442336e-05, "loss": 0.9012614250183105, "memory(GiB)": 91.52, "step": 55995, "token_acc": 0.7468074891982718, "train_speed(iter/s)": 0.145165 }, { "epoch": 0.7266380985438042, "grad_norm": 0.7257344722747803, "learning_rate": 7.547783723977214e-05, "loss": 0.8935400962829589, "memory(GiB)": 91.52, "step": 56000, "token_acc": 0.7588852758286269, "train_speed(iter/s)": 0.145162 }, { "epoch": 0.7267029769454599, "grad_norm": 0.7540980577468872, "learning_rate": 7.547322192192361e-05, "loss": 0.8706101417541504, "memory(GiB)": 91.52, "step": 56005, "token_acc": 0.7683355886332882, "train_speed(iter/s)": 0.145159 }, { "epoch": 0.7267678553471156, "grad_norm": 0.8190032243728638, "learning_rate": 7.546860631093087e-05, "loss": 0.869513988494873, "memory(GiB)": 91.52, "step": 56010, "token_acc": 0.7500592183181998, "train_speed(iter/s)": 0.145157 }, { "epoch": 0.7268327337487713, "grad_norm": 0.7593910694122314, "learning_rate": 7.546399040684706e-05, "loss": 0.8794497489929199, "memory(GiB)": 91.52, "step": 56015, "token_acc": 0.7523527181219621, "train_speed(iter/s)": 0.145155 }, { "epoch": 0.726897612150427, "grad_norm": 0.7292088866233826, "learning_rate": 7.545937420972527e-05, "loss": 0.9410137176513672, "memory(GiB)": 91.52, "step": 56020, "token_acc": 0.7437620647744334, "train_speed(iter/s)": 0.145152 }, { "epoch": 0.7269624905520827, "grad_norm": 0.7393746972084045, "learning_rate": 7.545475771961866e-05, "loss": 0.8719226837158203, "memory(GiB)": 91.52, "step": 56025, "token_acc": 0.7681902686069619, "train_speed(iter/s)": 0.14515 }, { "epoch": 0.7270273689537384, "grad_norm": 0.8587584495544434, "learning_rate": 7.545014093658032e-05, "loss": 0.9443278312683105, "memory(GiB)": 91.52, "step": 56030, "token_acc": 0.7400769230769231, "train_speed(iter/s)": 0.145147 }, { "epoch": 0.7270922473553941, "grad_norm": 0.7499858736991882, "learning_rate": 7.544552386066342e-05, "loss": 0.8911161422729492, "memory(GiB)": 91.52, "step": 56035, "token_acc": 0.7534987689516651, "train_speed(iter/s)": 0.145145 }, { "epoch": 0.7271571257570498, "grad_norm": 0.7256476283073425, "learning_rate": 7.544090649192106e-05, "loss": 0.9065963745117187, "memory(GiB)": 91.52, "step": 56040, "token_acc": 0.7619790416766659, "train_speed(iter/s)": 0.145142 }, { "epoch": 0.7272220041587055, "grad_norm": 0.7612072229385376, "learning_rate": 7.543628883040637e-05, "loss": 0.8975807189941406, "memory(GiB)": 91.52, "step": 56045, "token_acc": 0.748784148904233, "train_speed(iter/s)": 0.14514 }, { "epoch": 0.7272868825603612, "grad_norm": 0.7030619978904724, "learning_rate": 7.543167087617252e-05, "loss": 0.8945745468139649, "memory(GiB)": 91.52, "step": 56050, "token_acc": 0.7562132713026445, "train_speed(iter/s)": 0.145137 }, { "epoch": 0.7273517609620169, "grad_norm": 0.7230603694915771, "learning_rate": 7.542705262927264e-05, "loss": 0.8266239166259766, "memory(GiB)": 91.52, "step": 56055, "token_acc": 0.7711776697506665, "train_speed(iter/s)": 0.145133 }, { "epoch": 0.7274166393636726, "grad_norm": 0.8250481486320496, "learning_rate": 7.542243408975985e-05, "loss": 0.8176468849182129, "memory(GiB)": 91.52, "step": 56060, "token_acc": 0.7760945456874739, "train_speed(iter/s)": 0.145131 }, { "epoch": 0.7274815177653283, "grad_norm": 0.7818765640258789, "learning_rate": 7.541781525768734e-05, "loss": 0.8812696456909179, "memory(GiB)": 91.52, "step": 56065, "token_acc": 0.7504154770528828, "train_speed(iter/s)": 0.145128 }, { "epoch": 0.727546396166984, "grad_norm": 0.7119109034538269, "learning_rate": 7.541319613310827e-05, "loss": 0.8532703399658204, "memory(GiB)": 91.52, "step": 56070, "token_acc": 0.7577994191013155, "train_speed(iter/s)": 0.145126 }, { "epoch": 0.7276112745686397, "grad_norm": 0.7439045310020447, "learning_rate": 7.540857671607575e-05, "loss": 0.885588550567627, "memory(GiB)": 91.52, "step": 56075, "token_acc": 0.75121148111332, "train_speed(iter/s)": 0.145123 }, { "epoch": 0.7276761529702954, "grad_norm": 0.7341557145118713, "learning_rate": 7.540395700664295e-05, "loss": 0.9011354446411133, "memory(GiB)": 91.52, "step": 56080, "token_acc": 0.7435870219014259, "train_speed(iter/s)": 0.145121 }, { "epoch": 0.7277410313719511, "grad_norm": 0.7289822697639465, "learning_rate": 7.539933700486308e-05, "loss": 0.9139591217041015, "memory(GiB)": 91.52, "step": 56085, "token_acc": 0.7570261258290278, "train_speed(iter/s)": 0.145119 }, { "epoch": 0.7278059097736068, "grad_norm": 0.7557833790779114, "learning_rate": 7.539471671078926e-05, "loss": 0.8499582290649415, "memory(GiB)": 91.52, "step": 56090, "token_acc": 0.7628412472506145, "train_speed(iter/s)": 0.145117 }, { "epoch": 0.7278707881752625, "grad_norm": 0.8013666272163391, "learning_rate": 7.539009612447468e-05, "loss": 0.9056483268737793, "memory(GiB)": 91.52, "step": 56095, "token_acc": 0.7544995211518775, "train_speed(iter/s)": 0.145114 }, { "epoch": 0.7279356665769182, "grad_norm": 0.7592178583145142, "learning_rate": 7.53854752459725e-05, "loss": 0.888968563079834, "memory(GiB)": 91.52, "step": 56100, "token_acc": 0.7540894220283533, "train_speed(iter/s)": 0.145111 }, { "epoch": 0.7280005449785739, "grad_norm": 0.802935004234314, "learning_rate": 7.538085407533591e-05, "loss": 0.8922366142272949, "memory(GiB)": 91.52, "step": 56105, "token_acc": 0.7587134696580843, "train_speed(iter/s)": 0.14511 }, { "epoch": 0.7280654233802296, "grad_norm": 0.7955590486526489, "learning_rate": 7.53762326126181e-05, "loss": 0.8702489852905273, "memory(GiB)": 91.52, "step": 56110, "token_acc": 0.7587332173048235, "train_speed(iter/s)": 0.145107 }, { "epoch": 0.7281303017818853, "grad_norm": 0.7205252647399902, "learning_rate": 7.537161085787221e-05, "loss": 0.8621814727783204, "memory(GiB)": 91.52, "step": 56115, "token_acc": 0.7631256054246045, "train_speed(iter/s)": 0.145104 }, { "epoch": 0.728195180183541, "grad_norm": 0.6897310018539429, "learning_rate": 7.536698881115146e-05, "loss": 0.8742008209228516, "memory(GiB)": 91.52, "step": 56120, "token_acc": 0.7575365267246162, "train_speed(iter/s)": 0.145101 }, { "epoch": 0.7282600585851967, "grad_norm": 0.7395153641700745, "learning_rate": 7.536236647250905e-05, "loss": 0.9080616950988769, "memory(GiB)": 91.52, "step": 56125, "token_acc": 0.7755304533658983, "train_speed(iter/s)": 0.145099 }, { "epoch": 0.7283249369868524, "grad_norm": 0.745148241519928, "learning_rate": 7.535774384199815e-05, "loss": 0.8322694778442383, "memory(GiB)": 91.52, "step": 56130, "token_acc": 0.7922465990406542, "train_speed(iter/s)": 0.145097 }, { "epoch": 0.7283898153885081, "grad_norm": 0.7579658627510071, "learning_rate": 7.535312091967196e-05, "loss": 0.8598460197448731, "memory(GiB)": 91.52, "step": 56135, "token_acc": 0.7717361193557323, "train_speed(iter/s)": 0.145094 }, { "epoch": 0.7284546937901638, "grad_norm": 0.7986899614334106, "learning_rate": 7.53484977055837e-05, "loss": 0.9274410247802735, "memory(GiB)": 91.52, "step": 56140, "token_acc": 0.7561363479222845, "train_speed(iter/s)": 0.145092 }, { "epoch": 0.7285195721918195, "grad_norm": 0.7034804224967957, "learning_rate": 7.534387419978655e-05, "loss": 0.8997486114501954, "memory(GiB)": 91.52, "step": 56145, "token_acc": 0.7540378030037479, "train_speed(iter/s)": 0.145089 }, { "epoch": 0.7285844505934752, "grad_norm": 0.7601636648178101, "learning_rate": 7.533925040233373e-05, "loss": 0.9111092567443848, "memory(GiB)": 91.52, "step": 56150, "token_acc": 0.7458715596330275, "train_speed(iter/s)": 0.145087 }, { "epoch": 0.7286493289951309, "grad_norm": 0.7226070761680603, "learning_rate": 7.533462631327844e-05, "loss": 0.8719239234924316, "memory(GiB)": 91.52, "step": 56155, "token_acc": 0.7570833756474054, "train_speed(iter/s)": 0.145084 }, { "epoch": 0.7287142073967866, "grad_norm": 0.8003126382827759, "learning_rate": 7.533000193267392e-05, "loss": 0.8973945617675781, "memory(GiB)": 91.52, "step": 56160, "token_acc": 0.7496730731127402, "train_speed(iter/s)": 0.145082 }, { "epoch": 0.7287790857984423, "grad_norm": 0.7660616040229797, "learning_rate": 7.532537726057335e-05, "loss": 0.9023275375366211, "memory(GiB)": 91.52, "step": 56165, "token_acc": 0.7712496719283116, "train_speed(iter/s)": 0.145079 }, { "epoch": 0.728843964200098, "grad_norm": 0.6881347894668579, "learning_rate": 7.532075229702997e-05, "loss": 0.8688466072082519, "memory(GiB)": 91.52, "step": 56170, "token_acc": 0.7427948175568482, "train_speed(iter/s)": 0.145076 }, { "epoch": 0.7289088426017537, "grad_norm": 0.757980227470398, "learning_rate": 7.531612704209701e-05, "loss": 0.8486427307128906, "memory(GiB)": 91.52, "step": 56175, "token_acc": 0.7617676336746303, "train_speed(iter/s)": 0.145073 }, { "epoch": 0.7289737210034094, "grad_norm": 0.803272545337677, "learning_rate": 7.531150149582769e-05, "loss": 0.9356764793395996, "memory(GiB)": 91.52, "step": 56180, "token_acc": 0.7559500585251658, "train_speed(iter/s)": 0.145071 }, { "epoch": 0.7290385994050651, "grad_norm": 0.7567392587661743, "learning_rate": 7.530687565827523e-05, "loss": 0.8910980224609375, "memory(GiB)": 91.52, "step": 56185, "token_acc": 0.7651063148200056, "train_speed(iter/s)": 0.145069 }, { "epoch": 0.7291034778067208, "grad_norm": 0.7070745825767517, "learning_rate": 7.530224952949289e-05, "loss": 0.9070168495178222, "memory(GiB)": 91.52, "step": 56190, "token_acc": 0.7614491246316519, "train_speed(iter/s)": 0.145067 }, { "epoch": 0.7291683562083765, "grad_norm": 0.7513323426246643, "learning_rate": 7.529762310953388e-05, "loss": 0.8819500923156738, "memory(GiB)": 91.52, "step": 56195, "token_acc": 0.7439794498555068, "train_speed(iter/s)": 0.145065 }, { "epoch": 0.7292332346100322, "grad_norm": 0.7157362699508667, "learning_rate": 7.529299639845144e-05, "loss": 0.8502222061157226, "memory(GiB)": 91.52, "step": 56200, "token_acc": 0.7518596537423474, "train_speed(iter/s)": 0.145062 }, { "epoch": 0.7292981130116879, "grad_norm": 0.7475833892822266, "learning_rate": 7.528836939629884e-05, "loss": 0.8942194938659668, "memory(GiB)": 91.52, "step": 56205, "token_acc": 0.7331646281456489, "train_speed(iter/s)": 0.14506 }, { "epoch": 0.7293629914133435, "grad_norm": 0.6950247287750244, "learning_rate": 7.528374210312932e-05, "loss": 0.9007530212402344, "memory(GiB)": 91.52, "step": 56210, "token_acc": 0.7574792243767313, "train_speed(iter/s)": 0.145057 }, { "epoch": 0.7294278698149992, "grad_norm": 0.7845087647438049, "learning_rate": 7.52791145189961e-05, "loss": 0.8969844818115235, "memory(GiB)": 91.52, "step": 56215, "token_acc": 0.7585344924075663, "train_speed(iter/s)": 0.145054 }, { "epoch": 0.7294927482166549, "grad_norm": 0.7104060053825378, "learning_rate": 7.527448664395247e-05, "loss": 0.8807169914245605, "memory(GiB)": 91.52, "step": 56220, "token_acc": 0.754043892629186, "train_speed(iter/s)": 0.145051 }, { "epoch": 0.7295576266183106, "grad_norm": 0.7998694777488708, "learning_rate": 7.526985847805168e-05, "loss": 0.9135063171386719, "memory(GiB)": 91.52, "step": 56225, "token_acc": 0.7485894831866395, "train_speed(iter/s)": 0.145049 }, { "epoch": 0.7296225050199663, "grad_norm": 0.8343325257301331, "learning_rate": 7.526523002134697e-05, "loss": 0.8594446182250977, "memory(GiB)": 91.52, "step": 56230, "token_acc": 0.7727185176624677, "train_speed(iter/s)": 0.145047 }, { "epoch": 0.729687383421622, "grad_norm": 0.7332031726837158, "learning_rate": 7.526060127389165e-05, "loss": 0.9158674240112304, "memory(GiB)": 91.52, "step": 56235, "token_acc": 0.763187195546277, "train_speed(iter/s)": 0.145045 }, { "epoch": 0.7297522618232777, "grad_norm": 0.7231887578964233, "learning_rate": 7.525597223573892e-05, "loss": 0.8868486404418945, "memory(GiB)": 91.52, "step": 56240, "token_acc": 0.7464185443812541, "train_speed(iter/s)": 0.145043 }, { "epoch": 0.7298171402249334, "grad_norm": 0.7545178532600403, "learning_rate": 7.52513429069421e-05, "loss": 0.8966022491455078, "memory(GiB)": 91.52, "step": 56245, "token_acc": 0.7541448250640009, "train_speed(iter/s)": 0.14504 }, { "epoch": 0.7298820186265891, "grad_norm": 0.6972237229347229, "learning_rate": 7.524671328755446e-05, "loss": 0.9053995132446289, "memory(GiB)": 91.52, "step": 56250, "token_acc": 0.7349586485321209, "train_speed(iter/s)": 0.145038 }, { "epoch": 0.7299468970282448, "grad_norm": 0.7357496619224548, "learning_rate": 7.524208337762926e-05, "loss": 0.9102294921875, "memory(GiB)": 91.52, "step": 56255, "token_acc": 0.7513169909391023, "train_speed(iter/s)": 0.145037 }, { "epoch": 0.7300117754299005, "grad_norm": 0.6401663422584534, "learning_rate": 7.52374531772198e-05, "loss": 0.867553997039795, "memory(GiB)": 91.52, "step": 56260, "token_acc": 0.7681514398684348, "train_speed(iter/s)": 0.145034 }, { "epoch": 0.7300766538315562, "grad_norm": 0.7552361488342285, "learning_rate": 7.523282268637933e-05, "loss": 0.8465188026428223, "memory(GiB)": 91.52, "step": 56265, "token_acc": 0.7590284887210083, "train_speed(iter/s)": 0.145032 }, { "epoch": 0.7301415322332119, "grad_norm": 0.721211314201355, "learning_rate": 7.522819190516117e-05, "loss": 0.8701214790344238, "memory(GiB)": 91.52, "step": 56270, "token_acc": 0.7662564844975268, "train_speed(iter/s)": 0.145029 }, { "epoch": 0.7302064106348676, "grad_norm": 0.750810444355011, "learning_rate": 7.522356083361861e-05, "loss": 0.8622372627258301, "memory(GiB)": 91.52, "step": 56275, "token_acc": 0.7827543763676149, "train_speed(iter/s)": 0.145028 }, { "epoch": 0.7302712890365233, "grad_norm": 0.7025330662727356, "learning_rate": 7.521892947180494e-05, "loss": 0.875132942199707, "memory(GiB)": 91.52, "step": 56280, "token_acc": 0.7680894176089941, "train_speed(iter/s)": 0.145026 }, { "epoch": 0.730336167438179, "grad_norm": 0.697500467300415, "learning_rate": 7.521429781977343e-05, "loss": 0.8635113716125489, "memory(GiB)": 91.52, "step": 56285, "token_acc": 0.7608179904434123, "train_speed(iter/s)": 0.145023 }, { "epoch": 0.7304010458398347, "grad_norm": 0.7716700434684753, "learning_rate": 7.520966587757742e-05, "loss": 0.894159984588623, "memory(GiB)": 91.52, "step": 56290, "token_acc": 0.7423101067168864, "train_speed(iter/s)": 0.145021 }, { "epoch": 0.7304659242414904, "grad_norm": 0.7349469065666199, "learning_rate": 7.52050336452702e-05, "loss": 0.8640214920043945, "memory(GiB)": 91.52, "step": 56295, "token_acc": 0.7694954942169872, "train_speed(iter/s)": 0.145019 }, { "epoch": 0.7305308026431461, "grad_norm": 0.7288000583648682, "learning_rate": 7.520040112290505e-05, "loss": 0.8572893142700195, "memory(GiB)": 91.52, "step": 56300, "token_acc": 0.7874396135265701, "train_speed(iter/s)": 0.145016 }, { "epoch": 0.7305956810448018, "grad_norm": 0.6935621500015259, "learning_rate": 7.519576831053534e-05, "loss": 0.8420230865478515, "memory(GiB)": 91.52, "step": 56305, "token_acc": 0.7681894488695218, "train_speed(iter/s)": 0.145014 }, { "epoch": 0.7306605594464575, "grad_norm": 0.7044703364372253, "learning_rate": 7.519113520821433e-05, "loss": 0.9377347946166992, "memory(GiB)": 91.52, "step": 56310, "token_acc": 0.7452497701501686, "train_speed(iter/s)": 0.145011 }, { "epoch": 0.7307254378481132, "grad_norm": 0.691486120223999, "learning_rate": 7.518650181599535e-05, "loss": 0.8674288749694824, "memory(GiB)": 91.52, "step": 56315, "token_acc": 0.7548652694610778, "train_speed(iter/s)": 0.145009 }, { "epoch": 0.7307903162497689, "grad_norm": 0.789348304271698, "learning_rate": 7.518186813393175e-05, "loss": 0.9102033615112305, "memory(GiB)": 91.52, "step": 56320, "token_acc": 0.7542419067688843, "train_speed(iter/s)": 0.145007 }, { "epoch": 0.7308551946514246, "grad_norm": 0.6640843749046326, "learning_rate": 7.517723416207683e-05, "loss": 0.9006848335266113, "memory(GiB)": 91.52, "step": 56325, "token_acc": 0.7617018122521587, "train_speed(iter/s)": 0.145004 }, { "epoch": 0.7309200730530803, "grad_norm": 0.771034300327301, "learning_rate": 7.517259990048391e-05, "loss": 0.8490372657775879, "memory(GiB)": 91.52, "step": 56330, "token_acc": 0.7549407114624506, "train_speed(iter/s)": 0.145001 }, { "epoch": 0.730984951454736, "grad_norm": 0.6580572724342346, "learning_rate": 7.516796534920631e-05, "loss": 0.8642197608947754, "memory(GiB)": 91.52, "step": 56335, "token_acc": 0.7587348289812431, "train_speed(iter/s)": 0.144999 }, { "epoch": 0.7310498298563917, "grad_norm": 0.8219959735870361, "learning_rate": 7.516333050829742e-05, "loss": 0.9394957542419433, "memory(GiB)": 91.52, "step": 56340, "token_acc": 0.7283941187702884, "train_speed(iter/s)": 0.144997 }, { "epoch": 0.7311147082580474, "grad_norm": 0.7663021087646484, "learning_rate": 7.515869537781052e-05, "loss": 0.8940969467163086, "memory(GiB)": 91.52, "step": 56345, "token_acc": 0.7614932595921189, "train_speed(iter/s)": 0.144994 }, { "epoch": 0.7311795866597031, "grad_norm": 0.8060812950134277, "learning_rate": 7.515405995779899e-05, "loss": 0.8755032539367675, "memory(GiB)": 91.52, "step": 56350, "token_acc": 0.7736167676567001, "train_speed(iter/s)": 0.144992 }, { "epoch": 0.7312444650613588, "grad_norm": 0.7586158514022827, "learning_rate": 7.514942424831615e-05, "loss": 0.8792402267456054, "memory(GiB)": 91.52, "step": 56355, "token_acc": 0.7512980439538909, "train_speed(iter/s)": 0.144989 }, { "epoch": 0.7313093434630145, "grad_norm": 0.7089194655418396, "learning_rate": 7.514478824941535e-05, "loss": 0.9004158020019531, "memory(GiB)": 91.52, "step": 56360, "token_acc": 0.7512380553811816, "train_speed(iter/s)": 0.144987 }, { "epoch": 0.7313742218646702, "grad_norm": 0.6723378300666809, "learning_rate": 7.514015196114996e-05, "loss": 0.8961498260498046, "memory(GiB)": 91.52, "step": 56365, "token_acc": 0.767186302070023, "train_speed(iter/s)": 0.144985 }, { "epoch": 0.7314391002663259, "grad_norm": 0.7858480215072632, "learning_rate": 7.51355153835733e-05, "loss": 0.888454532623291, "memory(GiB)": 91.52, "step": 56370, "token_acc": 0.7681666838293344, "train_speed(iter/s)": 0.144982 }, { "epoch": 0.7315039786679816, "grad_norm": 0.727438747882843, "learning_rate": 7.513087851673876e-05, "loss": 0.8756114959716796, "memory(GiB)": 91.52, "step": 56375, "token_acc": 0.7512268780671951, "train_speed(iter/s)": 0.144979 }, { "epoch": 0.7315688570696373, "grad_norm": 0.711872935295105, "learning_rate": 7.512624136069968e-05, "loss": 0.8795738220214844, "memory(GiB)": 91.52, "step": 56380, "token_acc": 0.7745055093169795, "train_speed(iter/s)": 0.144977 }, { "epoch": 0.731633735471293, "grad_norm": 0.6768965125083923, "learning_rate": 7.512160391550943e-05, "loss": 0.840359878540039, "memory(GiB)": 91.52, "step": 56385, "token_acc": 0.7597895967270601, "train_speed(iter/s)": 0.144974 }, { "epoch": 0.7316986138729487, "grad_norm": 0.8085904121398926, "learning_rate": 7.511696618122138e-05, "loss": 0.905544376373291, "memory(GiB)": 91.52, "step": 56390, "token_acc": 0.7780290543907394, "train_speed(iter/s)": 0.144972 }, { "epoch": 0.7317634922746044, "grad_norm": 0.7795642018318176, "learning_rate": 7.51123281578889e-05, "loss": 0.8979372024536133, "memory(GiB)": 91.52, "step": 56395, "token_acc": 0.7761738578680203, "train_speed(iter/s)": 0.14497 }, { "epoch": 0.73182837067626, "grad_norm": 0.7052120566368103, "learning_rate": 7.510768984556538e-05, "loss": 0.8927536010742188, "memory(GiB)": 91.52, "step": 56400, "token_acc": 0.7378010471204188, "train_speed(iter/s)": 0.144968 }, { "epoch": 0.7318932490779158, "grad_norm": 0.7281783223152161, "learning_rate": 7.510305124430414e-05, "loss": 0.9117483139038086, "memory(GiB)": 91.52, "step": 56405, "token_acc": 0.7723463127080242, "train_speed(iter/s)": 0.144966 }, { "epoch": 0.7319581274795715, "grad_norm": 0.7197475433349609, "learning_rate": 7.509841235415864e-05, "loss": 0.8728755950927735, "memory(GiB)": 91.52, "step": 56410, "token_acc": 0.747155159157312, "train_speed(iter/s)": 0.144963 }, { "epoch": 0.7320230058812272, "grad_norm": 0.7613368034362793, "learning_rate": 7.509377317518221e-05, "loss": 0.8902894020080566, "memory(GiB)": 91.52, "step": 56415, "token_acc": 0.7657560765756076, "train_speed(iter/s)": 0.144961 }, { "epoch": 0.7320878842828829, "grad_norm": 0.7203894257545471, "learning_rate": 7.508913370742827e-05, "loss": 0.8718330383300781, "memory(GiB)": 91.52, "step": 56420, "token_acc": 0.7573830672916189, "train_speed(iter/s)": 0.144958 }, { "epoch": 0.7321527626845385, "grad_norm": 0.6877401471138, "learning_rate": 7.508449395095017e-05, "loss": 0.8331123352050781, "memory(GiB)": 91.52, "step": 56425, "token_acc": 0.7647512533744697, "train_speed(iter/s)": 0.144955 }, { "epoch": 0.7322176410861942, "grad_norm": 0.7834308743476868, "learning_rate": 7.507985390580135e-05, "loss": 0.8731291770935059, "memory(GiB)": 91.52, "step": 56430, "token_acc": 0.7714018960068946, "train_speed(iter/s)": 0.144953 }, { "epoch": 0.73228251948785, "grad_norm": 0.7493939995765686, "learning_rate": 7.507521357203517e-05, "loss": 0.9114599227905273, "memory(GiB)": 91.52, "step": 56435, "token_acc": 0.7430735706132614, "train_speed(iter/s)": 0.14495 }, { "epoch": 0.7323473978895056, "grad_norm": 0.7557680010795593, "learning_rate": 7.507057294970504e-05, "loss": 0.8658781051635742, "memory(GiB)": 91.52, "step": 56440, "token_acc": 0.7671112530948008, "train_speed(iter/s)": 0.144949 }, { "epoch": 0.7324122762911613, "grad_norm": 0.715518593788147, "learning_rate": 7.50659320388644e-05, "loss": 0.8707748413085937, "memory(GiB)": 91.52, "step": 56445, "token_acc": 0.742053343076361, "train_speed(iter/s)": 0.144946 }, { "epoch": 0.7324771546928169, "grad_norm": 0.8709354400634766, "learning_rate": 7.506129083956659e-05, "loss": 0.8972400665283203, "memory(GiB)": 91.52, "step": 56450, "token_acc": 0.7454661593579026, "train_speed(iter/s)": 0.144942 }, { "epoch": 0.7325420330944726, "grad_norm": 0.7059686779975891, "learning_rate": 7.505664935186508e-05, "loss": 0.9028411865234375, "memory(GiB)": 91.52, "step": 56455, "token_acc": 0.7595160413268081, "train_speed(iter/s)": 0.144939 }, { "epoch": 0.7326069114961283, "grad_norm": 0.7840588092803955, "learning_rate": 7.505200757581325e-05, "loss": 0.9265846252441406, "memory(GiB)": 91.52, "step": 56460, "token_acc": 0.7608613809201115, "train_speed(iter/s)": 0.144938 }, { "epoch": 0.732671789897784, "grad_norm": 0.6952821612358093, "learning_rate": 7.504736551146455e-05, "loss": 0.893553352355957, "memory(GiB)": 91.52, "step": 56465, "token_acc": 0.7591041794714198, "train_speed(iter/s)": 0.144935 }, { "epoch": 0.7327366682994397, "grad_norm": 0.833222508430481, "learning_rate": 7.504272315887237e-05, "loss": 0.9141170501708984, "memory(GiB)": 91.52, "step": 56470, "token_acc": 0.7683619344773791, "train_speed(iter/s)": 0.144933 }, { "epoch": 0.7328015467010954, "grad_norm": 0.8722273111343384, "learning_rate": 7.503808051809014e-05, "loss": 0.8947053909301758, "memory(GiB)": 91.52, "step": 56475, "token_acc": 0.762134335675764, "train_speed(iter/s)": 0.144931 }, { "epoch": 0.7328664251027511, "grad_norm": 0.7968506217002869, "learning_rate": 7.503343758917129e-05, "loss": 0.8729615211486816, "memory(GiB)": 91.52, "step": 56480, "token_acc": 0.7710903536599695, "train_speed(iter/s)": 0.144928 }, { "epoch": 0.7329313035044068, "grad_norm": 0.828959584236145, "learning_rate": 7.502879437216925e-05, "loss": 0.912017822265625, "memory(GiB)": 91.52, "step": 56485, "token_acc": 0.7225663149801376, "train_speed(iter/s)": 0.144927 }, { "epoch": 0.7329961819060625, "grad_norm": 0.7663460969924927, "learning_rate": 7.502415086713746e-05, "loss": 0.8817378997802734, "memory(GiB)": 91.52, "step": 56490, "token_acc": 0.7599950527194582, "train_speed(iter/s)": 0.144924 }, { "epoch": 0.7330610603077182, "grad_norm": 0.7310804724693298, "learning_rate": 7.501950707412934e-05, "loss": 0.8466064453125, "memory(GiB)": 91.52, "step": 56495, "token_acc": 0.7587247567984076, "train_speed(iter/s)": 0.144922 }, { "epoch": 0.7331259387093739, "grad_norm": 0.6479007601737976, "learning_rate": 7.501486299319837e-05, "loss": 0.8679749488830566, "memory(GiB)": 91.52, "step": 56500, "token_acc": 0.7704797859472932, "train_speed(iter/s)": 0.144919 }, { "epoch": 0.7331908171110296, "grad_norm": 0.7207817435264587, "learning_rate": 7.501021862439794e-05, "loss": 0.8833662033081054, "memory(GiB)": 91.52, "step": 56505, "token_acc": 0.7552657274375785, "train_speed(iter/s)": 0.144917 }, { "epoch": 0.7332556955126853, "grad_norm": 0.8214595317840576, "learning_rate": 7.500557396778154e-05, "loss": 0.8651138305664062, "memory(GiB)": 91.52, "step": 56510, "token_acc": 0.74937446162681, "train_speed(iter/s)": 0.144915 }, { "epoch": 0.733320573914341, "grad_norm": 0.8288494944572449, "learning_rate": 7.50009290234026e-05, "loss": 0.8720420837402344, "memory(GiB)": 91.52, "step": 56515, "token_acc": 0.7756104506300278, "train_speed(iter/s)": 0.144913 }, { "epoch": 0.7333854523159967, "grad_norm": 0.7970827221870422, "learning_rate": 7.499628379131457e-05, "loss": 0.8824052810668945, "memory(GiB)": 91.52, "step": 56520, "token_acc": 0.7542922831609004, "train_speed(iter/s)": 0.144911 }, { "epoch": 0.7334503307176524, "grad_norm": 0.7497125864028931, "learning_rate": 7.499163827157092e-05, "loss": 0.9147178649902343, "memory(GiB)": 91.52, "step": 56525, "token_acc": 0.7783591667489387, "train_speed(iter/s)": 0.144908 }, { "epoch": 0.7335152091193081, "grad_norm": 0.7111113667488098, "learning_rate": 7.49869924642251e-05, "loss": 0.8773393630981445, "memory(GiB)": 91.52, "step": 56530, "token_acc": 0.7473801389581974, "train_speed(iter/s)": 0.144904 }, { "epoch": 0.7335800875209638, "grad_norm": 0.6196887493133545, "learning_rate": 7.498234636933059e-05, "loss": 0.8346546173095704, "memory(GiB)": 91.52, "step": 56535, "token_acc": 0.745433097404786, "train_speed(iter/s)": 0.144901 }, { "epoch": 0.7336449659226195, "grad_norm": 0.7858136296272278, "learning_rate": 7.497769998694084e-05, "loss": 0.854709243774414, "memory(GiB)": 91.52, "step": 56540, "token_acc": 0.7725907384230288, "train_speed(iter/s)": 0.144898 }, { "epoch": 0.7337098443242752, "grad_norm": 0.7298663258552551, "learning_rate": 7.497305331710933e-05, "loss": 0.8663939476013184, "memory(GiB)": 91.52, "step": 56545, "token_acc": 0.7921904053551506, "train_speed(iter/s)": 0.144896 }, { "epoch": 0.7337747227259309, "grad_norm": 0.7775630354881287, "learning_rate": 7.496840635988952e-05, "loss": 0.8939559936523438, "memory(GiB)": 91.52, "step": 56550, "token_acc": 0.7701732321798045, "train_speed(iter/s)": 0.144894 }, { "epoch": 0.7338396011275866, "grad_norm": 0.74785315990448, "learning_rate": 7.49637591153349e-05, "loss": 0.8904228210449219, "memory(GiB)": 91.52, "step": 56555, "token_acc": 0.7623048707668686, "train_speed(iter/s)": 0.144891 }, { "epoch": 0.7339044795292423, "grad_norm": 0.8100738525390625, "learning_rate": 7.495911158349895e-05, "loss": 0.9073598861694336, "memory(GiB)": 91.52, "step": 56560, "token_acc": 0.7455084473336429, "train_speed(iter/s)": 0.144889 }, { "epoch": 0.733969357930898, "grad_norm": 0.8373385667800903, "learning_rate": 7.495446376443515e-05, "loss": 0.8953845977783204, "memory(GiB)": 91.52, "step": 56565, "token_acc": 0.7566416401963615, "train_speed(iter/s)": 0.144887 }, { "epoch": 0.7340342363325537, "grad_norm": 0.8222880363464355, "learning_rate": 7.494981565819698e-05, "loss": 0.9298542022705079, "memory(GiB)": 91.52, "step": 56570, "token_acc": 0.7472261142878782, "train_speed(iter/s)": 0.144885 }, { "epoch": 0.7340991147342094, "grad_norm": 0.7417625188827515, "learning_rate": 7.494516726483796e-05, "loss": 0.850771141052246, "memory(GiB)": 91.52, "step": 56575, "token_acc": 0.7554615702610742, "train_speed(iter/s)": 0.144882 }, { "epoch": 0.7341639931358651, "grad_norm": 0.797761857509613, "learning_rate": 7.494051858441154e-05, "loss": 0.8746195793151855, "memory(GiB)": 91.52, "step": 56580, "token_acc": 0.7648001508295625, "train_speed(iter/s)": 0.144881 }, { "epoch": 0.7342288715375208, "grad_norm": 0.739099383354187, "learning_rate": 7.493586961697125e-05, "loss": 0.8810880661010743, "memory(GiB)": 91.52, "step": 56585, "token_acc": 0.7442030456852792, "train_speed(iter/s)": 0.144878 }, { "epoch": 0.7342937499391765, "grad_norm": 0.9665683507919312, "learning_rate": 7.493122036257056e-05, "loss": 0.941920280456543, "memory(GiB)": 91.52, "step": 56590, "token_acc": 0.7444612286002014, "train_speed(iter/s)": 0.144877 }, { "epoch": 0.7343586283408322, "grad_norm": 0.7184846997261047, "learning_rate": 7.492657082126301e-05, "loss": 0.8612239837646485, "memory(GiB)": 91.52, "step": 56595, "token_acc": 0.7800315366972477, "train_speed(iter/s)": 0.144874 }, { "epoch": 0.7344235067424879, "grad_norm": 0.7923149466514587, "learning_rate": 7.492192099310206e-05, "loss": 0.9214770317077636, "memory(GiB)": 91.52, "step": 56600, "token_acc": 0.7693341478313989, "train_speed(iter/s)": 0.144872 }, { "epoch": 0.7344883851441436, "grad_norm": 0.733124315738678, "learning_rate": 7.491727087814126e-05, "loss": 0.8653227806091308, "memory(GiB)": 91.52, "step": 56605, "token_acc": 0.7595533888976254, "train_speed(iter/s)": 0.144869 }, { "epoch": 0.7345532635457993, "grad_norm": 0.6691800355911255, "learning_rate": 7.491262047643412e-05, "loss": 0.8680451393127442, "memory(GiB)": 91.52, "step": 56610, "token_acc": 0.7546395353914677, "train_speed(iter/s)": 0.144868 }, { "epoch": 0.734618141947455, "grad_norm": 0.6719748377799988, "learning_rate": 7.490796978803414e-05, "loss": 0.877925968170166, "memory(GiB)": 91.52, "step": 56615, "token_acc": 0.7600173489473859, "train_speed(iter/s)": 0.144865 }, { "epoch": 0.7346830203491107, "grad_norm": 0.7062301635742188, "learning_rate": 7.490331881299485e-05, "loss": 0.8855180740356445, "memory(GiB)": 91.52, "step": 56620, "token_acc": 0.7509881422924901, "train_speed(iter/s)": 0.144863 }, { "epoch": 0.7347478987507664, "grad_norm": 0.7027516961097717, "learning_rate": 7.489866755136979e-05, "loss": 0.9547636032104492, "memory(GiB)": 91.52, "step": 56625, "token_acc": 0.7571822646841149, "train_speed(iter/s)": 0.14486 }, { "epoch": 0.7348127771524221, "grad_norm": 0.7047033905982971, "learning_rate": 7.489401600321243e-05, "loss": 0.9035149574279785, "memory(GiB)": 91.52, "step": 56630, "token_acc": 0.7498751516016265, "train_speed(iter/s)": 0.144857 }, { "epoch": 0.7348776555540778, "grad_norm": 0.749725341796875, "learning_rate": 7.488936416857635e-05, "loss": 0.870991325378418, "memory(GiB)": 91.52, "step": 56635, "token_acc": 0.7607026270791233, "train_speed(iter/s)": 0.144854 }, { "epoch": 0.7349425339557335, "grad_norm": 0.718207597732544, "learning_rate": 7.488471204751508e-05, "loss": 0.8972393989562988, "memory(GiB)": 91.52, "step": 56640, "token_acc": 0.7611886743215032, "train_speed(iter/s)": 0.144852 }, { "epoch": 0.7350074123573892, "grad_norm": 0.7224076390266418, "learning_rate": 7.488005964008213e-05, "loss": 0.8901285171508789, "memory(GiB)": 91.52, "step": 56645, "token_acc": 0.7655322351367171, "train_speed(iter/s)": 0.14485 }, { "epoch": 0.7350722907590449, "grad_norm": 0.7257102727890015, "learning_rate": 7.487540694633105e-05, "loss": 0.8753378868103028, "memory(GiB)": 91.52, "step": 56650, "token_acc": 0.756297383223282, "train_speed(iter/s)": 0.144848 }, { "epoch": 0.7351371691607006, "grad_norm": 0.7782753109931946, "learning_rate": 7.48707539663154e-05, "loss": 0.8973692893981934, "memory(GiB)": 91.52, "step": 56655, "token_acc": 0.752682999037821, "train_speed(iter/s)": 0.144845 }, { "epoch": 0.7352020475623563, "grad_norm": 0.765625536441803, "learning_rate": 7.48661007000887e-05, "loss": 0.88504638671875, "memory(GiB)": 91.52, "step": 56660, "token_acc": 0.741068993727843, "train_speed(iter/s)": 0.144843 }, { "epoch": 0.735266925964012, "grad_norm": 0.8217378258705139, "learning_rate": 7.486144714770454e-05, "loss": 0.9461809158325195, "memory(GiB)": 91.52, "step": 56665, "token_acc": 0.7481229944858095, "train_speed(iter/s)": 0.144841 }, { "epoch": 0.7353318043656677, "grad_norm": 0.8519172072410583, "learning_rate": 7.485679330921642e-05, "loss": 0.924958610534668, "memory(GiB)": 91.52, "step": 56670, "token_acc": 0.7484515423852319, "train_speed(iter/s)": 0.144838 }, { "epoch": 0.7353966827673234, "grad_norm": 0.7253485918045044, "learning_rate": 7.485213918467794e-05, "loss": 0.9061243057250976, "memory(GiB)": 91.52, "step": 56675, "token_acc": 0.7750675878642235, "train_speed(iter/s)": 0.144835 }, { "epoch": 0.7354615611689791, "grad_norm": 0.7243061661720276, "learning_rate": 7.484748477414265e-05, "loss": 0.8925751686096192, "memory(GiB)": 91.52, "step": 56680, "token_acc": 0.7747750989564591, "train_speed(iter/s)": 0.144832 }, { "epoch": 0.7355264395706347, "grad_norm": 0.7785009741783142, "learning_rate": 7.484283007766409e-05, "loss": 0.8439650535583496, "memory(GiB)": 91.52, "step": 56685, "token_acc": 0.7773680731902459, "train_speed(iter/s)": 0.14483 }, { "epoch": 0.7355913179722904, "grad_norm": 0.7111243009567261, "learning_rate": 7.483817509529582e-05, "loss": 0.8587471008300781, "memory(GiB)": 91.52, "step": 56690, "token_acc": 0.7756985470221939, "train_speed(iter/s)": 0.144827 }, { "epoch": 0.7356561963739461, "grad_norm": 0.7120646834373474, "learning_rate": 7.483351982709145e-05, "loss": 0.9532649993896485, "memory(GiB)": 91.52, "step": 56695, "token_acc": 0.7459497384870519, "train_speed(iter/s)": 0.144825 }, { "epoch": 0.7357210747756018, "grad_norm": 0.6820804476737976, "learning_rate": 7.482886427310455e-05, "loss": 0.8300384521484375, "memory(GiB)": 91.52, "step": 56700, "token_acc": 0.7704170056237399, "train_speed(iter/s)": 0.144822 }, { "epoch": 0.7357859531772575, "grad_norm": 0.6637220978736877, "learning_rate": 7.482420843338865e-05, "loss": 0.815589714050293, "memory(GiB)": 91.52, "step": 56705, "token_acc": 0.7654517143855858, "train_speed(iter/s)": 0.144819 }, { "epoch": 0.7358508315789132, "grad_norm": 0.7140228748321533, "learning_rate": 7.481955230799737e-05, "loss": 0.8705798149108886, "memory(GiB)": 91.52, "step": 56710, "token_acc": 0.7578728356314137, "train_speed(iter/s)": 0.144816 }, { "epoch": 0.7359157099805689, "grad_norm": 0.6699581742286682, "learning_rate": 7.481489589698428e-05, "loss": 0.9058712005615235, "memory(GiB)": 91.52, "step": 56715, "token_acc": 0.7484945200529929, "train_speed(iter/s)": 0.144813 }, { "epoch": 0.7359805883822246, "grad_norm": 0.9022549390792847, "learning_rate": 7.481023920040296e-05, "loss": 0.8664647102355957, "memory(GiB)": 91.52, "step": 56720, "token_acc": 0.7678809979420154, "train_speed(iter/s)": 0.144812 }, { "epoch": 0.7360454667838803, "grad_norm": 0.7459667921066284, "learning_rate": 7.4805582218307e-05, "loss": 0.8570018768310547, "memory(GiB)": 91.52, "step": 56725, "token_acc": 0.7593706790238798, "train_speed(iter/s)": 0.144809 }, { "epoch": 0.736110345185536, "grad_norm": 0.8225456476211548, "learning_rate": 7.480092495075e-05, "loss": 0.9297792434692382, "memory(GiB)": 91.52, "step": 56730, "token_acc": 0.7455644528183388, "train_speed(iter/s)": 0.144807 }, { "epoch": 0.7361752235871917, "grad_norm": 0.7681074738502502, "learning_rate": 7.479626739778556e-05, "loss": 0.8385538101196289, "memory(GiB)": 91.52, "step": 56735, "token_acc": 0.7607653778242226, "train_speed(iter/s)": 0.144805 }, { "epoch": 0.7362401019888474, "grad_norm": 0.7740303874015808, "learning_rate": 7.479160955946726e-05, "loss": 0.8770177841186524, "memory(GiB)": 91.52, "step": 56740, "token_acc": 0.7749619316945834, "train_speed(iter/s)": 0.144802 }, { "epoch": 0.7363049803905031, "grad_norm": 0.6770594716072083, "learning_rate": 7.478695143584872e-05, "loss": 0.8920280456542968, "memory(GiB)": 91.52, "step": 56745, "token_acc": 0.7676064376212761, "train_speed(iter/s)": 0.144799 }, { "epoch": 0.7363698587921588, "grad_norm": 0.7957631349563599, "learning_rate": 7.478229302698353e-05, "loss": 0.8664770126342773, "memory(GiB)": 91.52, "step": 56750, "token_acc": 0.7742090949774519, "train_speed(iter/s)": 0.144797 }, { "epoch": 0.7364347371938145, "grad_norm": 0.6994745135307312, "learning_rate": 7.477763433292532e-05, "loss": 0.8671051025390625, "memory(GiB)": 91.52, "step": 56755, "token_acc": 0.7806422865145294, "train_speed(iter/s)": 0.144794 }, { "epoch": 0.7364996155954702, "grad_norm": 0.7518028020858765, "learning_rate": 7.477297535372768e-05, "loss": 0.9193323135375977, "memory(GiB)": 91.52, "step": 56760, "token_acc": 0.7445753700244288, "train_speed(iter/s)": 0.144791 }, { "epoch": 0.7365644939971259, "grad_norm": 0.6679977774620056, "learning_rate": 7.476831608944422e-05, "loss": 0.8427617073059082, "memory(GiB)": 91.52, "step": 56765, "token_acc": 0.7667600065887004, "train_speed(iter/s)": 0.144789 }, { "epoch": 0.7366293723987816, "grad_norm": 0.7077147960662842, "learning_rate": 7.476365654012861e-05, "loss": 0.9200555801391601, "memory(GiB)": 91.52, "step": 56770, "token_acc": 0.7520356864688805, "train_speed(iter/s)": 0.144787 }, { "epoch": 0.7366942508004373, "grad_norm": 0.6659114956855774, "learning_rate": 7.47589967058344e-05, "loss": 0.8920562744140625, "memory(GiB)": 91.52, "step": 56775, "token_acc": 0.7510410767231902, "train_speed(iter/s)": 0.144784 }, { "epoch": 0.736759129202093, "grad_norm": 0.8076251745223999, "learning_rate": 7.475433658661526e-05, "loss": 0.9228164672851562, "memory(GiB)": 91.52, "step": 56780, "token_acc": 0.7327339599509604, "train_speed(iter/s)": 0.144782 }, { "epoch": 0.7368240076037487, "grad_norm": 0.7330808639526367, "learning_rate": 7.474967618252481e-05, "loss": 0.8843873977661133, "memory(GiB)": 91.52, "step": 56785, "token_acc": 0.7534074138380068, "train_speed(iter/s)": 0.14478 }, { "epoch": 0.7368888860054044, "grad_norm": 0.6460529565811157, "learning_rate": 7.474501549361667e-05, "loss": 0.8804185867309571, "memory(GiB)": 91.52, "step": 56790, "token_acc": 0.7671313431803245, "train_speed(iter/s)": 0.144778 }, { "epoch": 0.7369537644070601, "grad_norm": 0.7266380786895752, "learning_rate": 7.47403545199445e-05, "loss": 0.8863391876220703, "memory(GiB)": 91.52, "step": 56795, "token_acc": 0.7779067237252484, "train_speed(iter/s)": 0.144776 }, { "epoch": 0.7370186428087158, "grad_norm": 0.8481109738349915, "learning_rate": 7.473569326156191e-05, "loss": 0.914927864074707, "memory(GiB)": 91.52, "step": 56800, "token_acc": 0.7584214808787633, "train_speed(iter/s)": 0.144774 }, { "epoch": 0.7370835212103715, "grad_norm": 0.8172802329063416, "learning_rate": 7.473103171852257e-05, "loss": 0.892515754699707, "memory(GiB)": 91.52, "step": 56805, "token_acc": 0.7531240264460521, "train_speed(iter/s)": 0.144771 }, { "epoch": 0.7371483996120272, "grad_norm": 0.7218819260597229, "learning_rate": 7.47263698908801e-05, "loss": 0.9016629219055176, "memory(GiB)": 91.52, "step": 56810, "token_acc": 0.7530810180121053, "train_speed(iter/s)": 0.144769 }, { "epoch": 0.7372132780136829, "grad_norm": 0.751276969909668, "learning_rate": 7.472170777868815e-05, "loss": 0.8213990211486817, "memory(GiB)": 91.52, "step": 56815, "token_acc": 0.7712947845106436, "train_speed(iter/s)": 0.144766 }, { "epoch": 0.7372781564153386, "grad_norm": 0.6387531161308289, "learning_rate": 7.471704538200037e-05, "loss": 0.9066036224365235, "memory(GiB)": 91.52, "step": 56820, "token_acc": 0.736464745383324, "train_speed(iter/s)": 0.144763 }, { "epoch": 0.7373430348169943, "grad_norm": 0.6846221089363098, "learning_rate": 7.471238270087044e-05, "loss": 0.8451676368713379, "memory(GiB)": 91.52, "step": 56825, "token_acc": 0.7623431996539578, "train_speed(iter/s)": 0.14476 }, { "epoch": 0.73740791321865, "grad_norm": 0.7705925703048706, "learning_rate": 7.4707719735352e-05, "loss": 0.8612859725952149, "memory(GiB)": 91.52, "step": 56830, "token_acc": 0.7708232249716966, "train_speed(iter/s)": 0.144758 }, { "epoch": 0.7374727916203057, "grad_norm": 0.7630854845046997, "learning_rate": 7.470305648549868e-05, "loss": 0.8486518859863281, "memory(GiB)": 91.52, "step": 56835, "token_acc": 0.7577090716526131, "train_speed(iter/s)": 0.144756 }, { "epoch": 0.7375376700219614, "grad_norm": 0.6766510009765625, "learning_rate": 7.469839295136422e-05, "loss": 0.8374750137329101, "memory(GiB)": 91.52, "step": 56840, "token_acc": 0.7589475459605329, "train_speed(iter/s)": 0.144754 }, { "epoch": 0.737602548423617, "grad_norm": 0.7143470048904419, "learning_rate": 7.469372913300224e-05, "loss": 0.8670612335205078, "memory(GiB)": 91.52, "step": 56845, "token_acc": 0.7912024401990688, "train_speed(iter/s)": 0.144751 }, { "epoch": 0.7376674268252728, "grad_norm": 0.7828647494316101, "learning_rate": 7.468906503046639e-05, "loss": 0.8621408462524414, "memory(GiB)": 91.52, "step": 56850, "token_acc": 0.758283520050671, "train_speed(iter/s)": 0.144749 }, { "epoch": 0.7377323052269285, "grad_norm": 0.7731269001960754, "learning_rate": 7.468440064381038e-05, "loss": 0.8836422920227051, "memory(GiB)": 91.52, "step": 56855, "token_acc": 0.7443553629469123, "train_speed(iter/s)": 0.144746 }, { "epoch": 0.7377971836285842, "grad_norm": 0.7079849243164062, "learning_rate": 7.467973597308787e-05, "loss": 0.9195386886596679, "memory(GiB)": 91.52, "step": 56860, "token_acc": 0.7338185070743211, "train_speed(iter/s)": 0.144744 }, { "epoch": 0.7378620620302399, "grad_norm": 0.7873604893684387, "learning_rate": 7.467507101835256e-05, "loss": 0.8893665313720703, "memory(GiB)": 91.52, "step": 56865, "token_acc": 0.7777824620573356, "train_speed(iter/s)": 0.144742 }, { "epoch": 0.7379269404318956, "grad_norm": 0.7071588635444641, "learning_rate": 7.46704057796581e-05, "loss": 0.9097209930419922, "memory(GiB)": 91.52, "step": 56870, "token_acc": 0.7576385759351059, "train_speed(iter/s)": 0.144739 }, { "epoch": 0.7379918188335512, "grad_norm": 0.7275938391685486, "learning_rate": 7.466574025705821e-05, "loss": 0.9176235198974609, "memory(GiB)": 91.52, "step": 56875, "token_acc": 0.7610944689787221, "train_speed(iter/s)": 0.144737 }, { "epoch": 0.738056697235207, "grad_norm": 0.7594193816184998, "learning_rate": 7.466107445060658e-05, "loss": 0.8990867614746094, "memory(GiB)": 91.52, "step": 56880, "token_acc": 0.7622817229336438, "train_speed(iter/s)": 0.144735 }, { "epoch": 0.7381215756368626, "grad_norm": 0.8297309875488281, "learning_rate": 7.465640836035688e-05, "loss": 0.8963724136352539, "memory(GiB)": 91.52, "step": 56885, "token_acc": 0.7694931773879142, "train_speed(iter/s)": 0.144733 }, { "epoch": 0.7381864540385183, "grad_norm": 0.833094596862793, "learning_rate": 7.465174198636281e-05, "loss": 0.8881234169006348, "memory(GiB)": 91.52, "step": 56890, "token_acc": 0.7584856841316856, "train_speed(iter/s)": 0.144731 }, { "epoch": 0.738251332440174, "grad_norm": 0.6964670419692993, "learning_rate": 7.46470753286781e-05, "loss": 0.8796697616577148, "memory(GiB)": 91.52, "step": 56895, "token_acc": 0.7382950530035336, "train_speed(iter/s)": 0.144728 }, { "epoch": 0.7383162108418297, "grad_norm": 0.7741166353225708, "learning_rate": 7.464240838735644e-05, "loss": 0.9151186943054199, "memory(GiB)": 91.52, "step": 56900, "token_acc": 0.7577932434391764, "train_speed(iter/s)": 0.144726 }, { "epoch": 0.7383810892434854, "grad_norm": 0.7507587671279907, "learning_rate": 7.463774116245149e-05, "loss": 0.8668905258178711, "memory(GiB)": 91.52, "step": 56905, "token_acc": 0.7670305948773299, "train_speed(iter/s)": 0.144724 }, { "epoch": 0.7384459676451411, "grad_norm": 0.724197268486023, "learning_rate": 7.463307365401702e-05, "loss": 0.9337149620056152, "memory(GiB)": 91.52, "step": 56910, "token_acc": 0.7489029566218258, "train_speed(iter/s)": 0.144722 }, { "epoch": 0.7385108460467968, "grad_norm": 0.6450182199478149, "learning_rate": 7.462840586210673e-05, "loss": 0.856199836730957, "memory(GiB)": 91.52, "step": 56915, "token_acc": 0.7645366915422885, "train_speed(iter/s)": 0.144719 }, { "epoch": 0.7385757244484525, "grad_norm": 0.7283540964126587, "learning_rate": 7.462373778677434e-05, "loss": 0.8941133499145508, "memory(GiB)": 91.52, "step": 56920, "token_acc": 0.761919018037572, "train_speed(iter/s)": 0.144717 }, { "epoch": 0.7386406028501081, "grad_norm": 0.644637942314148, "learning_rate": 7.461906942807355e-05, "loss": 0.8790565490722656, "memory(GiB)": 91.52, "step": 56925, "token_acc": 0.7429844961240311, "train_speed(iter/s)": 0.144715 }, { "epoch": 0.7387054812517638, "grad_norm": 0.7938064336776733, "learning_rate": 7.461440078605808e-05, "loss": 0.9355713844299316, "memory(GiB)": 91.52, "step": 56930, "token_acc": 0.7412384037696952, "train_speed(iter/s)": 0.144713 }, { "epoch": 0.7387703596534195, "grad_norm": 0.7497685551643372, "learning_rate": 7.46097318607817e-05, "loss": 0.8736734390258789, "memory(GiB)": 91.52, "step": 56935, "token_acc": 0.7511760923274783, "train_speed(iter/s)": 0.144711 }, { "epoch": 0.7388352380550752, "grad_norm": 0.779864490032196, "learning_rate": 7.460506265229808e-05, "loss": 0.9176531791687011, "memory(GiB)": 91.52, "step": 56940, "token_acc": 0.7691885964912281, "train_speed(iter/s)": 0.144709 }, { "epoch": 0.7389001164567309, "grad_norm": 0.7043200135231018, "learning_rate": 7.4600393160661e-05, "loss": 0.8981372833251953, "memory(GiB)": 91.52, "step": 56945, "token_acc": 0.7626631668558457, "train_speed(iter/s)": 0.144707 }, { "epoch": 0.7389649948583866, "grad_norm": 0.8238042593002319, "learning_rate": 7.459572338592417e-05, "loss": 0.8655126571655274, "memory(GiB)": 91.52, "step": 56950, "token_acc": 0.7567779520501905, "train_speed(iter/s)": 0.144704 }, { "epoch": 0.7390298732600423, "grad_norm": 0.7521518468856812, "learning_rate": 7.459105332814135e-05, "loss": 0.8795077323913574, "memory(GiB)": 91.52, "step": 56955, "token_acc": 0.7558031927199815, "train_speed(iter/s)": 0.144703 }, { "epoch": 0.739094751661698, "grad_norm": 0.7485296130180359, "learning_rate": 7.458638298736626e-05, "loss": 0.8772944450378418, "memory(GiB)": 91.52, "step": 56960, "token_acc": 0.7512647000854084, "train_speed(iter/s)": 0.1447 }, { "epoch": 0.7391596300633537, "grad_norm": 0.683285117149353, "learning_rate": 7.458171236365265e-05, "loss": 0.8634690284729004, "memory(GiB)": 91.52, "step": 56965, "token_acc": 0.7643466573165031, "train_speed(iter/s)": 0.144697 }, { "epoch": 0.7392245084650094, "grad_norm": 0.7448264956474304, "learning_rate": 7.457704145705429e-05, "loss": 0.8832174301147461, "memory(GiB)": 91.52, "step": 56970, "token_acc": 0.7463628512588852, "train_speed(iter/s)": 0.144695 }, { "epoch": 0.7392893868666651, "grad_norm": 0.7487913370132446, "learning_rate": 7.457237026762491e-05, "loss": 0.8703067779541016, "memory(GiB)": 91.52, "step": 56975, "token_acc": 0.7422650214199407, "train_speed(iter/s)": 0.144693 }, { "epoch": 0.7393542652683208, "grad_norm": 0.757897675037384, "learning_rate": 7.456769879541828e-05, "loss": 0.920843505859375, "memory(GiB)": 91.52, "step": 56980, "token_acc": 0.7332418935077724, "train_speed(iter/s)": 0.14469 }, { "epoch": 0.7394191436699765, "grad_norm": 0.703139066696167, "learning_rate": 7.456302704048816e-05, "loss": 0.8714389801025391, "memory(GiB)": 91.52, "step": 56985, "token_acc": 0.7617735995928265, "train_speed(iter/s)": 0.144688 }, { "epoch": 0.7394840220716322, "grad_norm": 0.7663980722427368, "learning_rate": 7.45583550028883e-05, "loss": 0.9045011520385742, "memory(GiB)": 91.52, "step": 56990, "token_acc": 0.755382074763549, "train_speed(iter/s)": 0.144684 }, { "epoch": 0.7395489004732879, "grad_norm": 0.7996866106987, "learning_rate": 7.455368268267247e-05, "loss": 0.8672143936157226, "memory(GiB)": 91.52, "step": 56995, "token_acc": 0.7517107950270503, "train_speed(iter/s)": 0.144682 }, { "epoch": 0.7396137788749436, "grad_norm": 0.7060288190841675, "learning_rate": 7.454901007989443e-05, "loss": 0.8730798721313476, "memory(GiB)": 91.52, "step": 57000, "token_acc": 0.7599613619898575, "train_speed(iter/s)": 0.144679 }, { "epoch": 0.7396786572765993, "grad_norm": 0.7197719216346741, "learning_rate": 7.454433719460797e-05, "loss": 0.893682861328125, "memory(GiB)": 91.52, "step": 57005, "token_acc": 0.7243509474914434, "train_speed(iter/s)": 0.144676 }, { "epoch": 0.739743535678255, "grad_norm": 0.669053316116333, "learning_rate": 7.453966402686686e-05, "loss": 0.8982934951782227, "memory(GiB)": 91.52, "step": 57010, "token_acc": 0.754871130842274, "train_speed(iter/s)": 0.144674 }, { "epoch": 0.7398084140799107, "grad_norm": 0.7085744142532349, "learning_rate": 7.453499057672488e-05, "loss": 0.8735822677612305, "memory(GiB)": 91.52, "step": 57015, "token_acc": 0.7562325972932721, "train_speed(iter/s)": 0.144672 }, { "epoch": 0.7398732924815664, "grad_norm": 0.690913736820221, "learning_rate": 7.453031684423579e-05, "loss": 0.8713234901428223, "memory(GiB)": 91.52, "step": 57020, "token_acc": 0.7539481865284974, "train_speed(iter/s)": 0.14467 }, { "epoch": 0.7399381708832221, "grad_norm": 0.6697738766670227, "learning_rate": 7.452564282945343e-05, "loss": 0.8705614089965821, "memory(GiB)": 91.52, "step": 57025, "token_acc": 0.7789843397563962, "train_speed(iter/s)": 0.144667 }, { "epoch": 0.7400030492848778, "grad_norm": 0.8286255598068237, "learning_rate": 7.45209685324315e-05, "loss": 0.8779019355773926, "memory(GiB)": 91.52, "step": 57030, "token_acc": 0.7651623060199415, "train_speed(iter/s)": 0.144665 }, { "epoch": 0.7400679276865335, "grad_norm": 0.7430498003959656, "learning_rate": 7.451629395322387e-05, "loss": 0.9102877616882324, "memory(GiB)": 91.52, "step": 57035, "token_acc": 0.7552948232559883, "train_speed(iter/s)": 0.144663 }, { "epoch": 0.7401328060881892, "grad_norm": 0.7845720648765564, "learning_rate": 7.451161909188432e-05, "loss": 0.8891315460205078, "memory(GiB)": 91.52, "step": 57040, "token_acc": 0.742109979092689, "train_speed(iter/s)": 0.14466 }, { "epoch": 0.7401976844898449, "grad_norm": 0.7975513935089111, "learning_rate": 7.450694394846661e-05, "loss": 0.8904743194580078, "memory(GiB)": 91.52, "step": 57045, "token_acc": 0.7651120603897243, "train_speed(iter/s)": 0.144657 }, { "epoch": 0.7402625628915006, "grad_norm": 0.7470682263374329, "learning_rate": 7.450226852302458e-05, "loss": 0.9074419021606446, "memory(GiB)": 91.52, "step": 57050, "token_acc": 0.7532249572199552, "train_speed(iter/s)": 0.144655 }, { "epoch": 0.7403274412931563, "grad_norm": 0.6698364615440369, "learning_rate": 7.4497592815612e-05, "loss": 0.9370760917663574, "memory(GiB)": 91.52, "step": 57055, "token_acc": 0.7417506708245704, "train_speed(iter/s)": 0.144653 }, { "epoch": 0.740392319694812, "grad_norm": 0.7336117029190063, "learning_rate": 7.449291682628273e-05, "loss": 0.8686554908752442, "memory(GiB)": 91.52, "step": 57060, "token_acc": 0.763240980524318, "train_speed(iter/s)": 0.144651 }, { "epoch": 0.7404571980964677, "grad_norm": 0.7036046981811523, "learning_rate": 7.448824055509053e-05, "loss": 0.903499698638916, "memory(GiB)": 91.52, "step": 57065, "token_acc": 0.752296035512016, "train_speed(iter/s)": 0.144648 }, { "epoch": 0.7405220764981234, "grad_norm": 0.7576865553855896, "learning_rate": 7.448356400208924e-05, "loss": 0.8367956161499024, "memory(GiB)": 91.52, "step": 57070, "token_acc": 0.7712617415587713, "train_speed(iter/s)": 0.144646 }, { "epoch": 0.7405869548997791, "grad_norm": 0.7009978890419006, "learning_rate": 7.447888716733267e-05, "loss": 0.8530183792114258, "memory(GiB)": 91.52, "step": 57075, "token_acc": 0.7593467011642949, "train_speed(iter/s)": 0.144644 }, { "epoch": 0.7406518333014348, "grad_norm": 0.6365631222724915, "learning_rate": 7.447421005087464e-05, "loss": 0.8684144973754883, "memory(GiB)": 91.52, "step": 57080, "token_acc": 0.7506008626082376, "train_speed(iter/s)": 0.144641 }, { "epoch": 0.7407167117030905, "grad_norm": 0.7132109999656677, "learning_rate": 7.446953265276899e-05, "loss": 0.8442627906799316, "memory(GiB)": 91.52, "step": 57085, "token_acc": 0.789215021691974, "train_speed(iter/s)": 0.144638 }, { "epoch": 0.7407815901047462, "grad_norm": 0.7647785544395447, "learning_rate": 7.44648549730695e-05, "loss": 0.9039666175842285, "memory(GiB)": 91.52, "step": 57090, "token_acc": 0.7503508560888568, "train_speed(iter/s)": 0.144636 }, { "epoch": 0.7408464685064019, "grad_norm": 0.7266863584518433, "learning_rate": 7.446017701183007e-05, "loss": 0.9402195930480957, "memory(GiB)": 91.52, "step": 57095, "token_acc": 0.7572355861015655, "train_speed(iter/s)": 0.144633 }, { "epoch": 0.7409113469080576, "grad_norm": 0.7439108490943909, "learning_rate": 7.445549876910447e-05, "loss": 0.9230321884155274, "memory(GiB)": 91.52, "step": 57100, "token_acc": 0.7570634920634921, "train_speed(iter/s)": 0.144631 }, { "epoch": 0.7409762253097133, "grad_norm": 0.736998975276947, "learning_rate": 7.445082024494657e-05, "loss": 0.8628792762756348, "memory(GiB)": 91.52, "step": 57105, "token_acc": 0.7531751737359214, "train_speed(iter/s)": 0.144629 }, { "epoch": 0.741041103711369, "grad_norm": 0.7430394291877747, "learning_rate": 7.444614143941022e-05, "loss": 0.8954824447631836, "memory(GiB)": 91.52, "step": 57110, "token_acc": 0.7522074142663447, "train_speed(iter/s)": 0.144626 }, { "epoch": 0.7411059821130247, "grad_norm": 0.7592759728431702, "learning_rate": 7.444146235254923e-05, "loss": 0.9013011932373047, "memory(GiB)": 91.52, "step": 57115, "token_acc": 0.7672580299019959, "train_speed(iter/s)": 0.144623 }, { "epoch": 0.7411708605146804, "grad_norm": 0.656640887260437, "learning_rate": 7.443678298441747e-05, "loss": 0.9343717575073243, "memory(GiB)": 91.52, "step": 57120, "token_acc": 0.7417797494780793, "train_speed(iter/s)": 0.144621 }, { "epoch": 0.7412357389163361, "grad_norm": 0.6900983452796936, "learning_rate": 7.443210333506878e-05, "loss": 0.9052244186401367, "memory(GiB)": 91.52, "step": 57125, "token_acc": 0.7465106655129604, "train_speed(iter/s)": 0.144619 }, { "epoch": 0.7413006173179918, "grad_norm": 0.7459966540336609, "learning_rate": 7.442742340455705e-05, "loss": 0.8574673652648925, "memory(GiB)": 91.52, "step": 57130, "token_acc": 0.778837301065369, "train_speed(iter/s)": 0.144616 }, { "epoch": 0.7413654957196475, "grad_norm": 0.8033113479614258, "learning_rate": 7.442274319293605e-05, "loss": 0.9122737884521485, "memory(GiB)": 91.52, "step": 57135, "token_acc": 0.7554498464304442, "train_speed(iter/s)": 0.144615 }, { "epoch": 0.7414303741213032, "grad_norm": 0.7442429661750793, "learning_rate": 7.441806270025973e-05, "loss": 0.8640505790710449, "memory(GiB)": 91.52, "step": 57140, "token_acc": 0.7762914179872402, "train_speed(iter/s)": 0.144612 }, { "epoch": 0.7414952525229589, "grad_norm": 0.8501061201095581, "learning_rate": 7.441338192658191e-05, "loss": 0.8967150688171387, "memory(GiB)": 91.52, "step": 57145, "token_acc": 0.7602165917449939, "train_speed(iter/s)": 0.14461 }, { "epoch": 0.7415601309246146, "grad_norm": 0.71865314245224, "learning_rate": 7.440870087195646e-05, "loss": 0.886744213104248, "memory(GiB)": 91.52, "step": 57150, "token_acc": 0.7767585307026215, "train_speed(iter/s)": 0.144608 }, { "epoch": 0.7416250093262703, "grad_norm": 0.7592803835868835, "learning_rate": 7.440401953643725e-05, "loss": 0.8937021255493164, "memory(GiB)": 91.52, "step": 57155, "token_acc": 0.7591667397084173, "train_speed(iter/s)": 0.144605 }, { "epoch": 0.741689887727926, "grad_norm": 0.7531854510307312, "learning_rate": 7.439933792007814e-05, "loss": 0.9261598587036133, "memory(GiB)": 91.52, "step": 57160, "token_acc": 0.7399834879316206, "train_speed(iter/s)": 0.144604 }, { "epoch": 0.7417547661295816, "grad_norm": 0.747895359992981, "learning_rate": 7.439465602293303e-05, "loss": 0.8403997421264648, "memory(GiB)": 91.52, "step": 57165, "token_acc": 0.7726725923945733, "train_speed(iter/s)": 0.144602 }, { "epoch": 0.7418196445312373, "grad_norm": 0.7377320528030396, "learning_rate": 7.438997384505579e-05, "loss": 0.8657204627990722, "memory(GiB)": 91.52, "step": 57170, "token_acc": 0.7495506291192331, "train_speed(iter/s)": 0.1446 }, { "epoch": 0.741884522932893, "grad_norm": 0.7592939138412476, "learning_rate": 7.438529138650031e-05, "loss": 0.9019721984863281, "memory(GiB)": 91.52, "step": 57175, "token_acc": 0.7672255882767367, "train_speed(iter/s)": 0.144598 }, { "epoch": 0.7419494013345487, "grad_norm": 0.7409451007843018, "learning_rate": 7.438060864732045e-05, "loss": 0.8640894889831543, "memory(GiB)": 91.52, "step": 57180, "token_acc": 0.7527583809437441, "train_speed(iter/s)": 0.144596 }, { "epoch": 0.7420142797362044, "grad_norm": 0.7062248587608337, "learning_rate": 7.437592562757013e-05, "loss": 0.8728549957275391, "memory(GiB)": 91.52, "step": 57185, "token_acc": 0.7624694487191093, "train_speed(iter/s)": 0.144593 }, { "epoch": 0.7420791581378601, "grad_norm": 0.7763222455978394, "learning_rate": 7.437124232730321e-05, "loss": 0.8869370460510254, "memory(GiB)": 91.52, "step": 57190, "token_acc": 0.7558894738864763, "train_speed(iter/s)": 0.14459 }, { "epoch": 0.7421440365395158, "grad_norm": 0.6932972073554993, "learning_rate": 7.436655874657361e-05, "loss": 0.8566974639892578, "memory(GiB)": 91.52, "step": 57195, "token_acc": 0.7560729074961687, "train_speed(iter/s)": 0.144588 }, { "epoch": 0.7422089149411715, "grad_norm": 0.7069047689437866, "learning_rate": 7.436187488543522e-05, "loss": 0.8655166625976562, "memory(GiB)": 91.52, "step": 57200, "token_acc": 0.7432554842266945, "train_speed(iter/s)": 0.144585 }, { "epoch": 0.7422737933428272, "grad_norm": 0.7476593852043152, "learning_rate": 7.435719074394196e-05, "loss": 0.865176010131836, "memory(GiB)": 91.52, "step": 57205, "token_acc": 0.7712010816495155, "train_speed(iter/s)": 0.144582 }, { "epoch": 0.7423386717444829, "grad_norm": 0.7534254193305969, "learning_rate": 7.43525063221477e-05, "loss": 0.8686012268066406, "memory(GiB)": 91.52, "step": 57210, "token_acc": 0.7513715993794695, "train_speed(iter/s)": 0.14458 }, { "epoch": 0.7424035501461386, "grad_norm": 0.7166911959648132, "learning_rate": 7.434782162010637e-05, "loss": 0.8889856338500977, "memory(GiB)": 91.52, "step": 57215, "token_acc": 0.7427042138279013, "train_speed(iter/s)": 0.144578 }, { "epoch": 0.7424684285477943, "grad_norm": 0.7570235133171082, "learning_rate": 7.434313663787186e-05, "loss": 0.8790361404418945, "memory(GiB)": 91.52, "step": 57220, "token_acc": 0.7494651003189213, "train_speed(iter/s)": 0.144576 }, { "epoch": 0.74253330694945, "grad_norm": 0.7876137495040894, "learning_rate": 7.433845137549811e-05, "loss": 0.8433671951293945, "memory(GiB)": 91.52, "step": 57225, "token_acc": 0.7702999741401604, "train_speed(iter/s)": 0.144574 }, { "epoch": 0.7425981853511057, "grad_norm": 0.7152413129806519, "learning_rate": 7.433376583303901e-05, "loss": 0.8481540679931641, "memory(GiB)": 91.52, "step": 57230, "token_acc": 0.7599116426095899, "train_speed(iter/s)": 0.144571 }, { "epoch": 0.7426630637527614, "grad_norm": 0.7401031255722046, "learning_rate": 7.432908001054852e-05, "loss": 0.8685689926147461, "memory(GiB)": 91.52, "step": 57235, "token_acc": 0.7438212224359924, "train_speed(iter/s)": 0.144568 }, { "epoch": 0.7427279421544171, "grad_norm": 0.817700207233429, "learning_rate": 7.432439390808055e-05, "loss": 0.8984460830688477, "memory(GiB)": 91.52, "step": 57240, "token_acc": 0.7441365293415492, "train_speed(iter/s)": 0.144566 }, { "epoch": 0.7427928205560728, "grad_norm": 0.7804964780807495, "learning_rate": 7.431970752568901e-05, "loss": 0.8821267127990723, "memory(GiB)": 91.52, "step": 57245, "token_acc": 0.7489378014886752, "train_speed(iter/s)": 0.144564 }, { "epoch": 0.7428576989577285, "grad_norm": 0.7516489028930664, "learning_rate": 7.431502086342784e-05, "loss": 0.8826639175415039, "memory(GiB)": 91.52, "step": 57250, "token_acc": 0.7670379209758684, "train_speed(iter/s)": 0.144561 }, { "epoch": 0.7429225773593842, "grad_norm": 0.8736623525619507, "learning_rate": 7.431033392135098e-05, "loss": 0.8631772994995117, "memory(GiB)": 91.52, "step": 57255, "token_acc": 0.7373582592706098, "train_speed(iter/s)": 0.144559 }, { "epoch": 0.7429874557610399, "grad_norm": 0.7952685356140137, "learning_rate": 7.430564669951237e-05, "loss": 0.8914800643920898, "memory(GiB)": 91.52, "step": 57260, "token_acc": 0.7357686102788661, "train_speed(iter/s)": 0.144556 }, { "epoch": 0.7430523341626956, "grad_norm": 0.6995980143547058, "learning_rate": 7.430095919796592e-05, "loss": 0.9272619247436523, "memory(GiB)": 91.52, "step": 57265, "token_acc": 0.7604923526094025, "train_speed(iter/s)": 0.144553 }, { "epoch": 0.7431172125643513, "grad_norm": 0.797073245048523, "learning_rate": 7.429627141676559e-05, "loss": 0.941433334350586, "memory(GiB)": 91.52, "step": 57270, "token_acc": 0.741854468740582, "train_speed(iter/s)": 0.144551 }, { "epoch": 0.743182090966007, "grad_norm": 0.7997927665710449, "learning_rate": 7.429158335596535e-05, "loss": 0.8703205108642578, "memory(GiB)": 91.52, "step": 57275, "token_acc": 0.767405579944128, "train_speed(iter/s)": 0.144549 }, { "epoch": 0.7432469693676627, "grad_norm": 0.8041414618492126, "learning_rate": 7.428689501561914e-05, "loss": 0.9226573944091797, "memory(GiB)": 91.52, "step": 57280, "token_acc": 0.7563312510552085, "train_speed(iter/s)": 0.144547 }, { "epoch": 0.7433118477693184, "grad_norm": 0.7014027833938599, "learning_rate": 7.42822063957809e-05, "loss": 0.9002901077270508, "memory(GiB)": 91.52, "step": 57285, "token_acc": 0.7452195799992487, "train_speed(iter/s)": 0.144545 }, { "epoch": 0.7433767261709741, "grad_norm": 0.7371746897697449, "learning_rate": 7.427751749650457e-05, "loss": 0.8614697456359863, "memory(GiB)": 91.52, "step": 57290, "token_acc": 0.7363138992380761, "train_speed(iter/s)": 0.144543 }, { "epoch": 0.7434416045726298, "grad_norm": 0.7838491797447205, "learning_rate": 7.427282831784416e-05, "loss": 0.88524169921875, "memory(GiB)": 91.52, "step": 57295, "token_acc": 0.7633687021291546, "train_speed(iter/s)": 0.144541 }, { "epoch": 0.7435064829742855, "grad_norm": 0.7526125311851501, "learning_rate": 7.426813885985359e-05, "loss": 0.8838088035583496, "memory(GiB)": 91.52, "step": 57300, "token_acc": 0.7621062529384109, "train_speed(iter/s)": 0.144539 }, { "epoch": 0.7435713613759412, "grad_norm": 0.7256680727005005, "learning_rate": 7.426344912258684e-05, "loss": 0.8136683464050293, "memory(GiB)": 91.52, "step": 57305, "token_acc": 0.7858809801633606, "train_speed(iter/s)": 0.144537 }, { "epoch": 0.7436362397775969, "grad_norm": 0.8271343111991882, "learning_rate": 7.425875910609789e-05, "loss": 0.8879133224487304, "memory(GiB)": 91.52, "step": 57310, "token_acc": 0.7558093082096011, "train_speed(iter/s)": 0.144535 }, { "epoch": 0.7437011181792526, "grad_norm": 0.7478878498077393, "learning_rate": 7.425406881044067e-05, "loss": 0.9000751495361328, "memory(GiB)": 91.52, "step": 57315, "token_acc": 0.7446592478271464, "train_speed(iter/s)": 0.144533 }, { "epoch": 0.7437659965809083, "grad_norm": 0.7322369813919067, "learning_rate": 7.424937823566923e-05, "loss": 0.8965646743774414, "memory(GiB)": 91.52, "step": 57320, "token_acc": 0.7461699927942604, "train_speed(iter/s)": 0.144531 }, { "epoch": 0.743830874982564, "grad_norm": 0.7476310729980469, "learning_rate": 7.424468738183747e-05, "loss": 0.8775758743286133, "memory(GiB)": 91.52, "step": 57325, "token_acc": 0.7701735273519571, "train_speed(iter/s)": 0.144529 }, { "epoch": 0.7438957533842196, "grad_norm": 0.7559404373168945, "learning_rate": 7.423999624899944e-05, "loss": 0.9078590393066406, "memory(GiB)": 91.52, "step": 57330, "token_acc": 0.7484603609697056, "train_speed(iter/s)": 0.144526 }, { "epoch": 0.7439606317858753, "grad_norm": 0.6577892899513245, "learning_rate": 7.423530483720906e-05, "loss": 0.9121435165405274, "memory(GiB)": 91.52, "step": 57335, "token_acc": 0.7660477261188076, "train_speed(iter/s)": 0.144524 }, { "epoch": 0.744025510187531, "grad_norm": 0.6834803819656372, "learning_rate": 7.423061314652037e-05, "loss": 0.9088698387145996, "memory(GiB)": 91.52, "step": 57340, "token_acc": 0.7676540381310858, "train_speed(iter/s)": 0.144522 }, { "epoch": 0.7440903885891867, "grad_norm": 0.7869913578033447, "learning_rate": 7.422592117698734e-05, "loss": 0.9356491088867187, "memory(GiB)": 91.52, "step": 57345, "token_acc": 0.7478303747534517, "train_speed(iter/s)": 0.14452 }, { "epoch": 0.7441552669908424, "grad_norm": 0.7504367232322693, "learning_rate": 7.422122892866398e-05, "loss": 0.8637495040893555, "memory(GiB)": 91.52, "step": 57350, "token_acc": 0.7618936476446104, "train_speed(iter/s)": 0.144518 }, { "epoch": 0.7442201453924981, "grad_norm": 0.6825475096702576, "learning_rate": 7.421653640160427e-05, "loss": 0.8178089141845704, "memory(GiB)": 91.52, "step": 57355, "token_acc": 0.7654255116429742, "train_speed(iter/s)": 0.144515 }, { "epoch": 0.7442850237941538, "grad_norm": 0.7541028261184692, "learning_rate": 7.42118435958622e-05, "loss": 0.8675979614257813, "memory(GiB)": 91.52, "step": 57360, "token_acc": 0.7712369062993256, "train_speed(iter/s)": 0.144513 }, { "epoch": 0.7443499021958095, "grad_norm": 0.6550434231758118, "learning_rate": 7.420715051149182e-05, "loss": 0.876182746887207, "memory(GiB)": 91.52, "step": 57365, "token_acc": 0.750547511862757, "train_speed(iter/s)": 0.14451 }, { "epoch": 0.7444147805974652, "grad_norm": 0.7441068887710571, "learning_rate": 7.420245714854709e-05, "loss": 0.8687402725219726, "memory(GiB)": 91.52, "step": 57370, "token_acc": 0.7429530725384556, "train_speed(iter/s)": 0.144507 }, { "epoch": 0.7444796589991209, "grad_norm": 0.7441849112510681, "learning_rate": 7.419776350708206e-05, "loss": 0.8874114036560059, "memory(GiB)": 91.52, "step": 57375, "token_acc": 0.7438461538461538, "train_speed(iter/s)": 0.144506 }, { "epoch": 0.7445445374007766, "grad_norm": 0.7296062707901001, "learning_rate": 7.419306958715068e-05, "loss": 0.8875502586364746, "memory(GiB)": 91.52, "step": 57380, "token_acc": 0.7718798918457379, "train_speed(iter/s)": 0.144504 }, { "epoch": 0.7446094158024323, "grad_norm": 0.7334626913070679, "learning_rate": 7.418837538880706e-05, "loss": 0.8664716720581055, "memory(GiB)": 91.52, "step": 57385, "token_acc": 0.7424617926476662, "train_speed(iter/s)": 0.144501 }, { "epoch": 0.744674294204088, "grad_norm": 0.6497259736061096, "learning_rate": 7.418368091210516e-05, "loss": 0.9109176635742188, "memory(GiB)": 91.52, "step": 57390, "token_acc": 0.7545535738450296, "train_speed(iter/s)": 0.144498 }, { "epoch": 0.7447391726057437, "grad_norm": 0.7017306089401245, "learning_rate": 7.417898615709901e-05, "loss": 0.8889997482299805, "memory(GiB)": 91.52, "step": 57395, "token_acc": 0.7832998410365616, "train_speed(iter/s)": 0.144496 }, { "epoch": 0.7448040510073994, "grad_norm": 0.7314655184745789, "learning_rate": 7.417429112384264e-05, "loss": 0.912662124633789, "memory(GiB)": 91.52, "step": 57400, "token_acc": 0.755888284962226, "train_speed(iter/s)": 0.144495 }, { "epoch": 0.744868929409055, "grad_norm": 0.8108238577842712, "learning_rate": 7.416959581239011e-05, "loss": 0.9219389915466308, "memory(GiB)": 91.52, "step": 57405, "token_acc": 0.7667698658410733, "train_speed(iter/s)": 0.144492 }, { "epoch": 0.7449338078107107, "grad_norm": 0.6948550939559937, "learning_rate": 7.41649002227954e-05, "loss": 0.8595694541931153, "memory(GiB)": 91.52, "step": 57410, "token_acc": 0.764371988647614, "train_speed(iter/s)": 0.14449 }, { "epoch": 0.7449986862123664, "grad_norm": 0.733791708946228, "learning_rate": 7.416020435511258e-05, "loss": 0.924893569946289, "memory(GiB)": 91.52, "step": 57415, "token_acc": 0.7721133917229391, "train_speed(iter/s)": 0.144488 }, { "epoch": 0.7450635646140221, "grad_norm": 0.6950185298919678, "learning_rate": 7.415550820939568e-05, "loss": 0.8725947380065918, "memory(GiB)": 91.52, "step": 57420, "token_acc": 0.7765653802699649, "train_speed(iter/s)": 0.144485 }, { "epoch": 0.7451284430156778, "grad_norm": 0.7397798299789429, "learning_rate": 7.415081178569875e-05, "loss": 0.9397990226745605, "memory(GiB)": 91.52, "step": 57425, "token_acc": 0.7665520807132277, "train_speed(iter/s)": 0.144483 }, { "epoch": 0.7451933214173335, "grad_norm": 0.7927760481834412, "learning_rate": 7.414611508407583e-05, "loss": 0.9241859436035156, "memory(GiB)": 91.52, "step": 57430, "token_acc": 0.7443286544732738, "train_speed(iter/s)": 0.14448 }, { "epoch": 0.7452581998189892, "grad_norm": 0.7361505627632141, "learning_rate": 7.414141810458097e-05, "loss": 0.8531036376953125, "memory(GiB)": 91.52, "step": 57435, "token_acc": 0.7601688037656225, "train_speed(iter/s)": 0.144477 }, { "epoch": 0.7453230782206449, "grad_norm": 0.7803142070770264, "learning_rate": 7.413672084726824e-05, "loss": 0.8835889816284179, "memory(GiB)": 91.52, "step": 57440, "token_acc": 0.7767123287671233, "train_speed(iter/s)": 0.144475 }, { "epoch": 0.7453879566223006, "grad_norm": 0.8171340823173523, "learning_rate": 7.413202331219167e-05, "loss": 0.881653118133545, "memory(GiB)": 91.52, "step": 57445, "token_acc": 0.7758634100808961, "train_speed(iter/s)": 0.144473 }, { "epoch": 0.7454528350239563, "grad_norm": 0.815544068813324, "learning_rate": 7.41273254994053e-05, "loss": 0.9310131072998047, "memory(GiB)": 91.52, "step": 57450, "token_acc": 0.7680591659649251, "train_speed(iter/s)": 0.144472 }, { "epoch": 0.745517713425612, "grad_norm": 0.6844085454940796, "learning_rate": 7.412262740896328e-05, "loss": 0.9138843536376953, "memory(GiB)": 91.52, "step": 57455, "token_acc": 0.7522550487528759, "train_speed(iter/s)": 0.14447 }, { "epoch": 0.7455825918272677, "grad_norm": 0.7344011664390564, "learning_rate": 7.411792904091956e-05, "loss": 0.9105278968811035, "memory(GiB)": 91.52, "step": 57460, "token_acc": 0.7511420194995568, "train_speed(iter/s)": 0.144468 }, { "epoch": 0.7456474702289234, "grad_norm": 0.7311747074127197, "learning_rate": 7.41132303953283e-05, "loss": 0.8735243797302246, "memory(GiB)": 91.52, "step": 57465, "token_acc": 0.7747603833865815, "train_speed(iter/s)": 0.144465 }, { "epoch": 0.7457123486305791, "grad_norm": 0.6868667006492615, "learning_rate": 7.410853147224352e-05, "loss": 0.8770498275756836, "memory(GiB)": 91.52, "step": 57470, "token_acc": 0.7418506309725968, "train_speed(iter/s)": 0.144463 }, { "epoch": 0.7457772270322348, "grad_norm": 0.8136026263237, "learning_rate": 7.410383227171933e-05, "loss": 0.9051370620727539, "memory(GiB)": 91.52, "step": 57475, "token_acc": 0.7415805268422807, "train_speed(iter/s)": 0.144461 }, { "epoch": 0.7458421054338905, "grad_norm": 0.7847896814346313, "learning_rate": 7.409913279380977e-05, "loss": 0.8980425834655762, "memory(GiB)": 91.52, "step": 57480, "token_acc": 0.7694205636300552, "train_speed(iter/s)": 0.144459 }, { "epoch": 0.7459069838355462, "grad_norm": 0.707533061504364, "learning_rate": 7.409443303856895e-05, "loss": 0.849457836151123, "memory(GiB)": 91.52, "step": 57485, "token_acc": 0.7695230213662353, "train_speed(iter/s)": 0.144457 }, { "epoch": 0.7459718622372019, "grad_norm": 0.7725891470909119, "learning_rate": 7.408973300605095e-05, "loss": 0.9124812126159668, "memory(GiB)": 91.52, "step": 57490, "token_acc": 0.754362594911416, "train_speed(iter/s)": 0.144455 }, { "epoch": 0.7460367406388576, "grad_norm": 0.7338753938674927, "learning_rate": 7.408503269630983e-05, "loss": 0.9046737670898437, "memory(GiB)": 91.52, "step": 57495, "token_acc": 0.7693683666573916, "train_speed(iter/s)": 0.144453 }, { "epoch": 0.7461016190405133, "grad_norm": 0.7514536380767822, "learning_rate": 7.408033210939974e-05, "loss": 0.9214086532592773, "memory(GiB)": 91.52, "step": 57500, "token_acc": 0.7409026465028355, "train_speed(iter/s)": 0.144451 }, { "epoch": 0.746166497442169, "grad_norm": 0.7836441397666931, "learning_rate": 7.40756312453747e-05, "loss": 0.9226548194885253, "memory(GiB)": 91.52, "step": 57505, "token_acc": 0.757375543228633, "train_speed(iter/s)": 0.144449 }, { "epoch": 0.7462313758438247, "grad_norm": 0.7552817463874817, "learning_rate": 7.407093010428888e-05, "loss": 0.8632036209106445, "memory(GiB)": 91.52, "step": 57510, "token_acc": 0.7677310767216794, "train_speed(iter/s)": 0.144447 }, { "epoch": 0.7462962542454804, "grad_norm": 0.7951161861419678, "learning_rate": 7.406622868619631e-05, "loss": 0.8941877365112305, "memory(GiB)": 91.52, "step": 57515, "token_acc": 0.7489340512277862, "train_speed(iter/s)": 0.144445 }, { "epoch": 0.7463611326471361, "grad_norm": 0.7123939990997314, "learning_rate": 7.406152699115114e-05, "loss": 0.8653818130493164, "memory(GiB)": 91.52, "step": 57520, "token_acc": 0.7452476572958501, "train_speed(iter/s)": 0.144442 }, { "epoch": 0.7464260110487918, "grad_norm": 0.6618219017982483, "learning_rate": 7.405682501920748e-05, "loss": 0.8347021102905273, "memory(GiB)": 91.52, "step": 57525, "token_acc": 0.7673799756070517, "train_speed(iter/s)": 0.14444 }, { "epoch": 0.7464908894504475, "grad_norm": 0.7913028001785278, "learning_rate": 7.405212277041942e-05, "loss": 0.8855945587158203, "memory(GiB)": 91.52, "step": 57530, "token_acc": 0.7467166398215834, "train_speed(iter/s)": 0.144438 }, { "epoch": 0.7465557678521032, "grad_norm": 0.7022805213928223, "learning_rate": 7.404742024484109e-05, "loss": 0.8738943099975586, "memory(GiB)": 91.52, "step": 57535, "token_acc": 0.7684075052496105, "train_speed(iter/s)": 0.144435 }, { "epoch": 0.7466206462537589, "grad_norm": 0.7175495624542236, "learning_rate": 7.404271744252658e-05, "loss": 0.8993022918701172, "memory(GiB)": 91.52, "step": 57540, "token_acc": 0.752603697430538, "train_speed(iter/s)": 0.144433 }, { "epoch": 0.7466855246554146, "grad_norm": 0.6633253693580627, "learning_rate": 7.403801436353003e-05, "loss": 0.83560791015625, "memory(GiB)": 91.52, "step": 57545, "token_acc": 0.7769962763756724, "train_speed(iter/s)": 0.144431 }, { "epoch": 0.7467504030570703, "grad_norm": 0.7133544087409973, "learning_rate": 7.403331100790556e-05, "loss": 0.9198470115661621, "memory(GiB)": 91.52, "step": 57550, "token_acc": 0.7514614825832786, "train_speed(iter/s)": 0.144428 }, { "epoch": 0.746815281458726, "grad_norm": 0.8550594449043274, "learning_rate": 7.402860737570727e-05, "loss": 0.8598106384277344, "memory(GiB)": 91.52, "step": 57555, "token_acc": 0.7715196191353759, "train_speed(iter/s)": 0.144427 }, { "epoch": 0.7468801598603817, "grad_norm": 0.7038490772247314, "learning_rate": 7.402390346698934e-05, "loss": 0.8633933067321777, "memory(GiB)": 91.52, "step": 57560, "token_acc": 0.7561242810626578, "train_speed(iter/s)": 0.144424 }, { "epoch": 0.7469450382620374, "grad_norm": 0.7191336750984192, "learning_rate": 7.401919928180587e-05, "loss": 0.8522493362426757, "memory(GiB)": 91.52, "step": 57565, "token_acc": 0.7586552217453505, "train_speed(iter/s)": 0.144422 }, { "epoch": 0.7470099166636931, "grad_norm": 0.7937531471252441, "learning_rate": 7.4014494820211e-05, "loss": 0.8222274780273438, "memory(GiB)": 91.52, "step": 57570, "token_acc": 0.7506909299737985, "train_speed(iter/s)": 0.144419 }, { "epoch": 0.7470747950653488, "grad_norm": 0.7484298944473267, "learning_rate": 7.400979008225887e-05, "loss": 0.8999843597412109, "memory(GiB)": 91.52, "step": 57575, "token_acc": 0.7554199038105809, "train_speed(iter/s)": 0.144417 }, { "epoch": 0.7471396734670045, "grad_norm": 0.7038512825965881, "learning_rate": 7.400508506800362e-05, "loss": 0.8645974159240722, "memory(GiB)": 91.52, "step": 57580, "token_acc": 0.753929648241206, "train_speed(iter/s)": 0.144415 }, { "epoch": 0.7472045518686602, "grad_norm": 0.7778701186180115, "learning_rate": 7.400037977749939e-05, "loss": 0.9253267288208008, "memory(GiB)": 91.52, "step": 57585, "token_acc": 0.7451963857643371, "train_speed(iter/s)": 0.144413 }, { "epoch": 0.7472694302703159, "grad_norm": 0.6795778870582581, "learning_rate": 7.399567421080033e-05, "loss": 0.9088869094848633, "memory(GiB)": 91.52, "step": 57590, "token_acc": 0.7382545022758757, "train_speed(iter/s)": 0.144412 }, { "epoch": 0.7473343086719716, "grad_norm": 0.6390576958656311, "learning_rate": 7.399096836796061e-05, "loss": 0.848844051361084, "memory(GiB)": 91.52, "step": 57595, "token_acc": 0.7942221535609455, "train_speed(iter/s)": 0.144409 }, { "epoch": 0.7473991870736273, "grad_norm": 0.7584673762321472, "learning_rate": 7.398626224903437e-05, "loss": 0.8790766716003418, "memory(GiB)": 91.52, "step": 57600, "token_acc": 0.7745611616579359, "train_speed(iter/s)": 0.144407 }, { "epoch": 0.747464065475283, "grad_norm": 0.8065000772476196, "learning_rate": 7.398155585407576e-05, "loss": 0.885736083984375, "memory(GiB)": 91.52, "step": 57605, "token_acc": 0.7520387661032975, "train_speed(iter/s)": 0.144404 }, { "epoch": 0.7475289438769387, "grad_norm": 0.7350087761878967, "learning_rate": 7.397684918313895e-05, "loss": 0.8518924713134766, "memory(GiB)": 91.52, "step": 57610, "token_acc": 0.7580295986087944, "train_speed(iter/s)": 0.144402 }, { "epoch": 0.7475938222785944, "grad_norm": 0.789607584476471, "learning_rate": 7.39721422362781e-05, "loss": 0.8436960220336914, "memory(GiB)": 91.52, "step": 57615, "token_acc": 0.7683791951646254, "train_speed(iter/s)": 0.144399 }, { "epoch": 0.7476587006802501, "grad_norm": 0.7099424600601196, "learning_rate": 7.396743501354739e-05, "loss": 0.9207797050476074, "memory(GiB)": 91.52, "step": 57620, "token_acc": 0.7531799443289525, "train_speed(iter/s)": 0.144397 }, { "epoch": 0.7477235790819058, "grad_norm": 0.7562662363052368, "learning_rate": 7.396272751500097e-05, "loss": 0.8964755058288574, "memory(GiB)": 91.52, "step": 57625, "token_acc": 0.7476566536275739, "train_speed(iter/s)": 0.144396 }, { "epoch": 0.7477884574835615, "grad_norm": 0.7568556070327759, "learning_rate": 7.395801974069303e-05, "loss": 0.8989065170288086, "memory(GiB)": 91.52, "step": 57630, "token_acc": 0.7482267061497142, "train_speed(iter/s)": 0.144393 }, { "epoch": 0.7478533358852172, "grad_norm": 0.7512944340705872, "learning_rate": 7.395331169067775e-05, "loss": 0.8843728065490722, "memory(GiB)": 91.52, "step": 57635, "token_acc": 0.7343046103709384, "train_speed(iter/s)": 0.144391 }, { "epoch": 0.7479182142868728, "grad_norm": 0.7377315163612366, "learning_rate": 7.394860336500929e-05, "loss": 0.8725921630859375, "memory(GiB)": 91.52, "step": 57640, "token_acc": 0.7678204905200905, "train_speed(iter/s)": 0.144389 }, { "epoch": 0.7479830926885285, "grad_norm": 0.6901469826698303, "learning_rate": 7.394389476374184e-05, "loss": 0.9002530097961425, "memory(GiB)": 91.52, "step": 57645, "token_acc": 0.7570940937144434, "train_speed(iter/s)": 0.144386 }, { "epoch": 0.7480479710901842, "grad_norm": 0.7720953226089478, "learning_rate": 7.393918588692959e-05, "loss": 0.8723234176635742, "memory(GiB)": 91.52, "step": 57650, "token_acc": 0.7656386701662292, "train_speed(iter/s)": 0.144384 }, { "epoch": 0.7481128494918399, "grad_norm": 0.7423030138015747, "learning_rate": 7.393447673462674e-05, "loss": 0.8927700996398926, "memory(GiB)": 91.52, "step": 57655, "token_acc": 0.7942098914354644, "train_speed(iter/s)": 0.144382 }, { "epoch": 0.7481777278934956, "grad_norm": 0.8124757409095764, "learning_rate": 7.392976730688745e-05, "loss": 0.9004560470581054, "memory(GiB)": 91.52, "step": 57660, "token_acc": 0.7588507093583235, "train_speed(iter/s)": 0.14438 }, { "epoch": 0.7482426062951513, "grad_norm": 0.7068032622337341, "learning_rate": 7.392505760376596e-05, "loss": 0.8999530792236328, "memory(GiB)": 91.52, "step": 57665, "token_acc": 0.758638986535783, "train_speed(iter/s)": 0.144377 }, { "epoch": 0.748307484696807, "grad_norm": 0.8276836276054382, "learning_rate": 7.392034762531643e-05, "loss": 0.9161055564880372, "memory(GiB)": 91.52, "step": 57670, "token_acc": 0.749584842995169, "train_speed(iter/s)": 0.144375 }, { "epoch": 0.7483723630984627, "grad_norm": 0.6969913840293884, "learning_rate": 7.39156373715931e-05, "loss": 0.8794990539550781, "memory(GiB)": 91.52, "step": 57675, "token_acc": 0.764714537963508, "train_speed(iter/s)": 0.144372 }, { "epoch": 0.7484372415001184, "grad_norm": 0.862149715423584, "learning_rate": 7.391092684265014e-05, "loss": 0.869974422454834, "memory(GiB)": 91.52, "step": 57680, "token_acc": 0.7687135988742524, "train_speed(iter/s)": 0.14437 }, { "epoch": 0.7485021199017741, "grad_norm": 0.8043318390846252, "learning_rate": 7.390621603854177e-05, "loss": 0.8726449012756348, "memory(GiB)": 91.52, "step": 57685, "token_acc": 0.7451201972467639, "train_speed(iter/s)": 0.144368 }, { "epoch": 0.7485669983034298, "grad_norm": 0.7364556789398193, "learning_rate": 7.390150495932222e-05, "loss": 0.8322968482971191, "memory(GiB)": 91.52, "step": 57690, "token_acc": 0.7662312959675374, "train_speed(iter/s)": 0.144366 }, { "epoch": 0.7486318767050855, "grad_norm": 0.7188898324966431, "learning_rate": 7.389679360504568e-05, "loss": 0.8681042671203614, "memory(GiB)": 91.52, "step": 57695, "token_acc": 0.762532701689882, "train_speed(iter/s)": 0.144364 }, { "epoch": 0.7486967551067412, "grad_norm": 0.7217425107955933, "learning_rate": 7.389208197576637e-05, "loss": 0.905057144165039, "memory(GiB)": 91.52, "step": 57700, "token_acc": 0.7478726078058239, "train_speed(iter/s)": 0.144362 }, { "epoch": 0.7487616335083969, "grad_norm": 0.8159750699996948, "learning_rate": 7.388737007153852e-05, "loss": 0.883027172088623, "memory(GiB)": 91.52, "step": 57705, "token_acc": 0.7569798181700947, "train_speed(iter/s)": 0.144359 }, { "epoch": 0.7488265119100526, "grad_norm": 0.7869623303413391, "learning_rate": 7.388265789241638e-05, "loss": 0.9092319488525391, "memory(GiB)": 91.52, "step": 57710, "token_acc": 0.7448829332940656, "train_speed(iter/s)": 0.144357 }, { "epoch": 0.7488913903117083, "grad_norm": 0.7562255263328552, "learning_rate": 7.387794543845411e-05, "loss": 0.8900760650634766, "memory(GiB)": 91.52, "step": 57715, "token_acc": 0.7564106895503226, "train_speed(iter/s)": 0.144355 }, { "epoch": 0.748956268713364, "grad_norm": 0.6962262392044067, "learning_rate": 7.3873232709706e-05, "loss": 0.872861671447754, "memory(GiB)": 91.52, "step": 57720, "token_acc": 0.765144288099213, "train_speed(iter/s)": 0.144352 }, { "epoch": 0.7490211471150197, "grad_norm": 0.7470207214355469, "learning_rate": 7.386851970622627e-05, "loss": 0.8654681205749511, "memory(GiB)": 91.52, "step": 57725, "token_acc": 0.7572070962178145, "train_speed(iter/s)": 0.14435 }, { "epoch": 0.7490860255166754, "grad_norm": 0.5913388133049011, "learning_rate": 7.386380642806913e-05, "loss": 0.8781402587890625, "memory(GiB)": 91.52, "step": 57730, "token_acc": 0.755894970639243, "train_speed(iter/s)": 0.144348 }, { "epoch": 0.7491509039183311, "grad_norm": 0.705919623374939, "learning_rate": 7.385909287528887e-05, "loss": 0.8601659774780274, "memory(GiB)": 91.52, "step": 57735, "token_acc": 0.7608718001439293, "train_speed(iter/s)": 0.144346 }, { "epoch": 0.7492157823199868, "grad_norm": 0.6978958249092102, "learning_rate": 7.385437904793968e-05, "loss": 0.8469146728515625, "memory(GiB)": 91.52, "step": 57740, "token_acc": 0.7781904118838623, "train_speed(iter/s)": 0.144344 }, { "epoch": 0.7492806607216425, "grad_norm": 0.7386767864227295, "learning_rate": 7.384966494607585e-05, "loss": 0.8604656219482422, "memory(GiB)": 91.52, "step": 57745, "token_acc": 0.7452171151861554, "train_speed(iter/s)": 0.144341 }, { "epoch": 0.7493455391232982, "grad_norm": 0.7710165977478027, "learning_rate": 7.38449505697516e-05, "loss": 0.8839912414550781, "memory(GiB)": 91.52, "step": 57750, "token_acc": 0.7682569452584691, "train_speed(iter/s)": 0.144339 }, { "epoch": 0.7494104175249539, "grad_norm": 0.7969374656677246, "learning_rate": 7.38402359190212e-05, "loss": 0.9214229583740234, "memory(GiB)": 91.52, "step": 57755, "token_acc": 0.7712649016875678, "train_speed(iter/s)": 0.144337 }, { "epoch": 0.7494752959266096, "grad_norm": 0.6949726939201355, "learning_rate": 7.38355209939389e-05, "loss": 0.8646704673767089, "memory(GiB)": 91.52, "step": 57760, "token_acc": 0.7696594427244582, "train_speed(iter/s)": 0.144335 }, { "epoch": 0.7495401743282653, "grad_norm": 0.7323638200759888, "learning_rate": 7.383080579455894e-05, "loss": 0.9180296897888184, "memory(GiB)": 91.52, "step": 57765, "token_acc": 0.738926602407831, "train_speed(iter/s)": 0.144333 }, { "epoch": 0.749605052729921, "grad_norm": 0.72540283203125, "learning_rate": 7.382609032093562e-05, "loss": 0.8969062805175781, "memory(GiB)": 91.52, "step": 57770, "token_acc": 0.7580864365316662, "train_speed(iter/s)": 0.144332 }, { "epoch": 0.7496699311315767, "grad_norm": 0.6440251469612122, "learning_rate": 7.382137457312317e-05, "loss": 0.855802059173584, "memory(GiB)": 91.52, "step": 57775, "token_acc": 0.7441251636227638, "train_speed(iter/s)": 0.14433 }, { "epoch": 0.7497348095332323, "grad_norm": 0.8227203488349915, "learning_rate": 7.38166585511759e-05, "loss": 0.8425006866455078, "memory(GiB)": 91.52, "step": 57780, "token_acc": 0.7735459352280238, "train_speed(iter/s)": 0.144327 }, { "epoch": 0.749799687934888, "grad_norm": 0.7611899375915527, "learning_rate": 7.381194225514804e-05, "loss": 0.8315525054931641, "memory(GiB)": 91.52, "step": 57785, "token_acc": 0.7713591680267203, "train_speed(iter/s)": 0.144325 }, { "epoch": 0.7498645663365437, "grad_norm": 0.6283413767814636, "learning_rate": 7.380722568509388e-05, "loss": 0.8688816070556641, "memory(GiB)": 91.52, "step": 57790, "token_acc": 0.7577502055994565, "train_speed(iter/s)": 0.144323 }, { "epoch": 0.7499294447381994, "grad_norm": 0.688553512096405, "learning_rate": 7.380250884106769e-05, "loss": 0.893309211730957, "memory(GiB)": 91.52, "step": 57795, "token_acc": 0.7646031746031746, "train_speed(iter/s)": 0.144321 }, { "epoch": 0.7499943231398551, "grad_norm": 0.7738565802574158, "learning_rate": 7.379779172312377e-05, "loss": 0.8563531875610352, "memory(GiB)": 91.52, "step": 57800, "token_acc": 0.7775663892191835, "train_speed(iter/s)": 0.144318 }, { "epoch": 0.7500592015415108, "grad_norm": 0.7939453125, "learning_rate": 7.379307433131641e-05, "loss": 0.8708662033081055, "memory(GiB)": 91.52, "step": 57805, "token_acc": 0.7837790748323025, "train_speed(iter/s)": 0.144315 }, { "epoch": 0.7501240799431665, "grad_norm": 0.6534382700920105, "learning_rate": 7.378835666569986e-05, "loss": 0.8711103439331055, "memory(GiB)": 91.52, "step": 57810, "token_acc": 0.7690558849734247, "train_speed(iter/s)": 0.144313 }, { "epoch": 0.7501889583448222, "grad_norm": 0.8776047825813293, "learning_rate": 7.378363872632846e-05, "loss": 0.8992424011230469, "memory(GiB)": 91.52, "step": 57815, "token_acc": 0.7538783898134344, "train_speed(iter/s)": 0.14431 }, { "epoch": 0.7502538367464779, "grad_norm": 0.7519142031669617, "learning_rate": 7.377892051325644e-05, "loss": 0.8540278434753418, "memory(GiB)": 91.52, "step": 57820, "token_acc": 0.7819980302316619, "train_speed(iter/s)": 0.144307 }, { "epoch": 0.7503187151481336, "grad_norm": 0.7115068435668945, "learning_rate": 7.377420202653815e-05, "loss": 0.9183073043823242, "memory(GiB)": 91.52, "step": 57825, "token_acc": 0.7496107338081829, "train_speed(iter/s)": 0.144305 }, { "epoch": 0.7503835935497893, "grad_norm": 0.7926620244979858, "learning_rate": 7.376948326622788e-05, "loss": 0.8413142204284668, "memory(GiB)": 91.52, "step": 57830, "token_acc": 0.7882826162994054, "train_speed(iter/s)": 0.144302 }, { "epoch": 0.750448471951445, "grad_norm": 0.7167834639549255, "learning_rate": 7.376476423237992e-05, "loss": 0.8898849487304688, "memory(GiB)": 91.52, "step": 57835, "token_acc": 0.7545499435665914, "train_speed(iter/s)": 0.1443 }, { "epoch": 0.7505133503531007, "grad_norm": 0.7684977650642395, "learning_rate": 7.376004492504858e-05, "loss": 0.8671869277954102, "memory(GiB)": 91.52, "step": 57840, "token_acc": 0.7478548345158055, "train_speed(iter/s)": 0.144298 }, { "epoch": 0.7505782287547564, "grad_norm": 0.7606369853019714, "learning_rate": 7.375532534428816e-05, "loss": 0.851015853881836, "memory(GiB)": 91.52, "step": 57845, "token_acc": 0.7562900655871345, "train_speed(iter/s)": 0.144296 }, { "epoch": 0.7506431071564121, "grad_norm": 0.7310246229171753, "learning_rate": 7.375060549015301e-05, "loss": 0.9081531524658203, "memory(GiB)": 91.52, "step": 57850, "token_acc": 0.7522829684571519, "train_speed(iter/s)": 0.144294 }, { "epoch": 0.7507079855580678, "grad_norm": 0.7320702075958252, "learning_rate": 7.374588536269741e-05, "loss": 0.8589682579040527, "memory(GiB)": 91.52, "step": 57855, "token_acc": 0.7824576639166305, "train_speed(iter/s)": 0.144293 }, { "epoch": 0.7507728639597235, "grad_norm": 0.7976179122924805, "learning_rate": 7.374116496197569e-05, "loss": 0.9054536819458008, "memory(GiB)": 91.52, "step": 57860, "token_acc": 0.7471213626163411, "train_speed(iter/s)": 0.144291 }, { "epoch": 0.7508377423613792, "grad_norm": 0.7627041935920715, "learning_rate": 7.373644428804217e-05, "loss": 0.9107145309448242, "memory(GiB)": 91.52, "step": 57865, "token_acc": 0.7476740817049222, "train_speed(iter/s)": 0.144289 }, { "epoch": 0.7509026207630349, "grad_norm": 0.6829179525375366, "learning_rate": 7.373172334095117e-05, "loss": 0.8694805145263672, "memory(GiB)": 91.52, "step": 57870, "token_acc": 0.7575947167188043, "train_speed(iter/s)": 0.144287 }, { "epoch": 0.7509674991646906, "grad_norm": 0.7053187489509583, "learning_rate": 7.372700212075706e-05, "loss": 0.8878229141235352, "memory(GiB)": 91.52, "step": 57875, "token_acc": 0.7588569572982627, "train_speed(iter/s)": 0.144285 }, { "epoch": 0.7510323775663462, "grad_norm": 0.7179449796676636, "learning_rate": 7.372228062751409e-05, "loss": 0.9227478981018067, "memory(GiB)": 91.52, "step": 57880, "token_acc": 0.739000768639508, "train_speed(iter/s)": 0.144283 }, { "epoch": 0.7510972559680019, "grad_norm": 0.7083948254585266, "learning_rate": 7.371755886127666e-05, "loss": 0.8686147689819336, "memory(GiB)": 91.52, "step": 57885, "token_acc": 0.7569636917960089, "train_speed(iter/s)": 0.14428 }, { "epoch": 0.7511621343696576, "grad_norm": 0.7725776433944702, "learning_rate": 7.371283682209908e-05, "loss": 0.9318418502807617, "memory(GiB)": 91.52, "step": 57890, "token_acc": 0.7591800981079188, "train_speed(iter/s)": 0.144278 }, { "epoch": 0.7512270127713133, "grad_norm": 0.8218277096748352, "learning_rate": 7.370811451003572e-05, "loss": 0.8818838119506835, "memory(GiB)": 91.52, "step": 57895, "token_acc": 0.7569890315676833, "train_speed(iter/s)": 0.144276 }, { "epoch": 0.751291891172969, "grad_norm": 0.8087921142578125, "learning_rate": 7.37033919251409e-05, "loss": 0.8931867599487304, "memory(GiB)": 91.52, "step": 57900, "token_acc": 0.7589359933499584, "train_speed(iter/s)": 0.144274 }, { "epoch": 0.7513567695746247, "grad_norm": 0.7388021349906921, "learning_rate": 7.369866906746896e-05, "loss": 0.8928738594055176, "memory(GiB)": 91.52, "step": 57905, "token_acc": 0.7481050479635163, "train_speed(iter/s)": 0.144271 }, { "epoch": 0.7514216479762804, "grad_norm": 0.7319725155830383, "learning_rate": 7.369394593707428e-05, "loss": 0.9195606231689453, "memory(GiB)": 91.52, "step": 57910, "token_acc": 0.7569523315053379, "train_speed(iter/s)": 0.144268 }, { "epoch": 0.7514865263779361, "grad_norm": 0.768810510635376, "learning_rate": 7.368922253401116e-05, "loss": 0.877103042602539, "memory(GiB)": 91.52, "step": 57915, "token_acc": 0.7710976873860671, "train_speed(iter/s)": 0.144266 }, { "epoch": 0.7515514047795918, "grad_norm": 0.78094482421875, "learning_rate": 7.368449885833402e-05, "loss": 0.8940681457519531, "memory(GiB)": 91.52, "step": 57920, "token_acc": 0.7548500881834215, "train_speed(iter/s)": 0.144265 }, { "epoch": 0.7516162831812475, "grad_norm": 0.7223144173622131, "learning_rate": 7.367977491009719e-05, "loss": 0.9108673095703125, "memory(GiB)": 91.52, "step": 57925, "token_acc": 0.7731732125865407, "train_speed(iter/s)": 0.144262 }, { "epoch": 0.7516811615829032, "grad_norm": 0.7099413871765137, "learning_rate": 7.367505068935503e-05, "loss": 0.8158475875854492, "memory(GiB)": 91.52, "step": 57930, "token_acc": 0.7627496049418188, "train_speed(iter/s)": 0.144259 }, { "epoch": 0.7517460399845589, "grad_norm": 0.6990278959274292, "learning_rate": 7.367032619616191e-05, "loss": 0.8853898048400879, "memory(GiB)": 91.52, "step": 57935, "token_acc": 0.7767244920529592, "train_speed(iter/s)": 0.144257 }, { "epoch": 0.7518109183862146, "grad_norm": 0.6639370322227478, "learning_rate": 7.36656014305722e-05, "loss": 0.8511490821838379, "memory(GiB)": 91.52, "step": 57940, "token_acc": 0.7646343933562892, "train_speed(iter/s)": 0.144254 }, { "epoch": 0.7518757967878703, "grad_norm": 0.7683162689208984, "learning_rate": 7.366087639264027e-05, "loss": 0.8410362243652344, "memory(GiB)": 91.52, "step": 57945, "token_acc": 0.772002200220022, "train_speed(iter/s)": 0.144252 }, { "epoch": 0.751940675189526, "grad_norm": 0.75978022813797, "learning_rate": 7.365615108242049e-05, "loss": 0.8810418128967286, "memory(GiB)": 91.52, "step": 57950, "token_acc": 0.734968384748036, "train_speed(iter/s)": 0.144249 }, { "epoch": 0.7520055535911817, "grad_norm": 0.7558796405792236, "learning_rate": 7.365142549996725e-05, "loss": 0.8239553451538086, "memory(GiB)": 91.52, "step": 57955, "token_acc": 0.7942047930283225, "train_speed(iter/s)": 0.144246 }, { "epoch": 0.7520704319928374, "grad_norm": 0.7756421566009521, "learning_rate": 7.364669964533493e-05, "loss": 0.9165890693664551, "memory(GiB)": 91.52, "step": 57960, "token_acc": 0.7597976317043663, "train_speed(iter/s)": 0.144244 }, { "epoch": 0.7521353103944931, "grad_norm": 0.7875473499298096, "learning_rate": 7.364197351857792e-05, "loss": 0.8802560806274414, "memory(GiB)": 91.52, "step": 57965, "token_acc": 0.7624013748925865, "train_speed(iter/s)": 0.144242 }, { "epoch": 0.7522001887961488, "grad_norm": 0.7441377639770508, "learning_rate": 7.363724711975059e-05, "loss": 0.9009206771850586, "memory(GiB)": 91.52, "step": 57970, "token_acc": 0.7429687220092437, "train_speed(iter/s)": 0.14424 }, { "epoch": 0.7522650671978045, "grad_norm": 0.6213868856430054, "learning_rate": 7.363252044890735e-05, "loss": 0.8542790412902832, "memory(GiB)": 91.52, "step": 57975, "token_acc": 0.7571844764953423, "train_speed(iter/s)": 0.144238 }, { "epoch": 0.7523299455994602, "grad_norm": 0.7600971460342407, "learning_rate": 7.36277935061026e-05, "loss": 0.8698015213012695, "memory(GiB)": 91.52, "step": 57980, "token_acc": 0.7566184115812206, "train_speed(iter/s)": 0.144236 }, { "epoch": 0.7523948240011159, "grad_norm": 0.8449204564094543, "learning_rate": 7.362306629139068e-05, "loss": 0.8775376319885254, "memory(GiB)": 91.52, "step": 57985, "token_acc": 0.7765506394036792, "train_speed(iter/s)": 0.144233 }, { "epoch": 0.7524597024027716, "grad_norm": 0.8288512229919434, "learning_rate": 7.361833880482607e-05, "loss": 0.8868222236633301, "memory(GiB)": 91.52, "step": 57990, "token_acc": 0.7560191469600629, "train_speed(iter/s)": 0.144231 }, { "epoch": 0.7525245808044273, "grad_norm": 0.7611268162727356, "learning_rate": 7.361361104646312e-05, "loss": 0.8953304290771484, "memory(GiB)": 91.52, "step": 57995, "token_acc": 0.7712142245325929, "train_speed(iter/s)": 0.144228 }, { "epoch": 0.752589459206083, "grad_norm": 0.716072678565979, "learning_rate": 7.360888301635627e-05, "loss": 0.9271149635314941, "memory(GiB)": 91.52, "step": 58000, "token_acc": 0.7378623942294354, "train_speed(iter/s)": 0.144226 }, { "epoch": 0.7526543376077387, "grad_norm": 0.8114452362060547, "learning_rate": 7.36041547145599e-05, "loss": 0.927166748046875, "memory(GiB)": 91.52, "step": 58005, "token_acc": 0.7376612903225807, "train_speed(iter/s)": 0.144224 }, { "epoch": 0.7527192160093944, "grad_norm": 0.8110746145248413, "learning_rate": 7.359942614112842e-05, "loss": 0.9233252525329589, "memory(GiB)": 91.52, "step": 58010, "token_acc": 0.7500426257459506, "train_speed(iter/s)": 0.144222 }, { "epoch": 0.7527840944110501, "grad_norm": 0.7005329728126526, "learning_rate": 7.35946972961163e-05, "loss": 0.8812580108642578, "memory(GiB)": 91.52, "step": 58015, "token_acc": 0.7458483754512636, "train_speed(iter/s)": 0.14422 }, { "epoch": 0.7528489728127058, "grad_norm": 0.7138186097145081, "learning_rate": 7.35899681795779e-05, "loss": 0.8973293304443359, "memory(GiB)": 91.52, "step": 58020, "token_acc": 0.7723509304975313, "train_speed(iter/s)": 0.144217 }, { "epoch": 0.7529138512143615, "grad_norm": 0.7398332953453064, "learning_rate": 7.358523879156766e-05, "loss": 0.8381604194641114, "memory(GiB)": 91.52, "step": 58025, "token_acc": 0.7627645117452921, "train_speed(iter/s)": 0.144215 }, { "epoch": 0.7529787296160172, "grad_norm": 0.8113372325897217, "learning_rate": 7.358050913214004e-05, "loss": 0.8509729385375977, "memory(GiB)": 91.52, "step": 58030, "token_acc": 0.7772644329696782, "train_speed(iter/s)": 0.144212 }, { "epoch": 0.7530436080176729, "grad_norm": 0.8489682078361511, "learning_rate": 7.35757792013494e-05, "loss": 0.8767339706420898, "memory(GiB)": 91.52, "step": 58035, "token_acc": 0.7722249084755128, "train_speed(iter/s)": 0.14421 }, { "epoch": 0.7531084864193286, "grad_norm": 0.7811515927314758, "learning_rate": 7.357104899925023e-05, "loss": 0.9035992622375488, "memory(GiB)": 91.52, "step": 58040, "token_acc": 0.7548282171173003, "train_speed(iter/s)": 0.144208 }, { "epoch": 0.7531733648209843, "grad_norm": 0.7505918145179749, "learning_rate": 7.356631852589694e-05, "loss": 0.894407081604004, "memory(GiB)": 91.52, "step": 58045, "token_acc": 0.751275604525623, "train_speed(iter/s)": 0.144206 }, { "epoch": 0.75323824322264, "grad_norm": 0.7611631155014038, "learning_rate": 7.356158778134399e-05, "loss": 0.909869384765625, "memory(GiB)": 91.52, "step": 58050, "token_acc": 0.7505930673087137, "train_speed(iter/s)": 0.144204 }, { "epoch": 0.7533031216242957, "grad_norm": 0.7733972072601318, "learning_rate": 7.355685676564577e-05, "loss": 0.9209247589111328, "memory(GiB)": 91.52, "step": 58055, "token_acc": 0.7454919029981887, "train_speed(iter/s)": 0.144201 }, { "epoch": 0.7533680000259514, "grad_norm": 0.770232081413269, "learning_rate": 7.355212547885679e-05, "loss": 0.8988517761230469, "memory(GiB)": 91.52, "step": 58060, "token_acc": 0.754729610709882, "train_speed(iter/s)": 0.144199 }, { "epoch": 0.7534328784276071, "grad_norm": 0.70460444688797, "learning_rate": 7.354739392103143e-05, "loss": 0.8906142234802246, "memory(GiB)": 91.52, "step": 58065, "token_acc": 0.7572664817016779, "train_speed(iter/s)": 0.144197 }, { "epoch": 0.7534977568292628, "grad_norm": 0.797966480255127, "learning_rate": 7.354266209222422e-05, "loss": 0.9115671157836914, "memory(GiB)": 91.52, "step": 58070, "token_acc": 0.7434321320309869, "train_speed(iter/s)": 0.144195 }, { "epoch": 0.7535626352309185, "grad_norm": 0.7622003555297852, "learning_rate": 7.353792999248955e-05, "loss": 0.8756815910339355, "memory(GiB)": 91.52, "step": 58075, "token_acc": 0.773581443147618, "train_speed(iter/s)": 0.144193 }, { "epoch": 0.7536275136325742, "grad_norm": 0.6921879053115845, "learning_rate": 7.353319762188189e-05, "loss": 0.8358020782470703, "memory(GiB)": 91.52, "step": 58080, "token_acc": 0.7747052271767101, "train_speed(iter/s)": 0.144191 }, { "epoch": 0.7536923920342299, "grad_norm": 0.671147882938385, "learning_rate": 7.352846498045573e-05, "loss": 0.8248275756835938, "memory(GiB)": 91.52, "step": 58085, "token_acc": 0.7738505266677813, "train_speed(iter/s)": 0.144188 }, { "epoch": 0.7537572704358856, "grad_norm": 0.7153525948524475, "learning_rate": 7.352373206826549e-05, "loss": 0.87021484375, "memory(GiB)": 91.52, "step": 58090, "token_acc": 0.7634334577339759, "train_speed(iter/s)": 0.144186 }, { "epoch": 0.7538221488375413, "grad_norm": 0.6586142182350159, "learning_rate": 7.351899888536565e-05, "loss": 0.8520642280578613, "memory(GiB)": 91.52, "step": 58095, "token_acc": 0.7696719051557761, "train_speed(iter/s)": 0.144184 }, { "epoch": 0.753887027239197, "grad_norm": 0.7968019843101501, "learning_rate": 7.351426543181067e-05, "loss": 0.8853004455566407, "memory(GiB)": 91.52, "step": 58100, "token_acc": 0.754658846239513, "train_speed(iter/s)": 0.144182 }, { "epoch": 0.7539519056408527, "grad_norm": 0.7044984698295593, "learning_rate": 7.350953170765508e-05, "loss": 0.820610523223877, "memory(GiB)": 91.52, "step": 58105, "token_acc": 0.7386561322122355, "train_speed(iter/s)": 0.14418 }, { "epoch": 0.7540167840425084, "grad_norm": 0.785721480846405, "learning_rate": 7.35047977129533e-05, "loss": 0.9278436660766601, "memory(GiB)": 91.52, "step": 58110, "token_acc": 0.7863189064696557, "train_speed(iter/s)": 0.144178 }, { "epoch": 0.7540816624441641, "grad_norm": 0.7513487339019775, "learning_rate": 7.350006344775981e-05, "loss": 0.8254344940185547, "memory(GiB)": 91.52, "step": 58115, "token_acc": 0.7542384515066236, "train_speed(iter/s)": 0.144176 }, { "epoch": 0.7541465408458197, "grad_norm": 0.7674004435539246, "learning_rate": 7.349532891212911e-05, "loss": 0.900666332244873, "memory(GiB)": 91.52, "step": 58120, "token_acc": 0.7450875372409869, "train_speed(iter/s)": 0.144173 }, { "epoch": 0.7542114192474754, "grad_norm": 0.7252922654151917, "learning_rate": 7.349059410611567e-05, "loss": 0.9087367057800293, "memory(GiB)": 91.52, "step": 58125, "token_acc": 0.7563503429562072, "train_speed(iter/s)": 0.144171 }, { "epoch": 0.7542762976491311, "grad_norm": 0.7972896695137024, "learning_rate": 7.348585902977397e-05, "loss": 0.9259725570678711, "memory(GiB)": 91.52, "step": 58130, "token_acc": 0.7552487398641244, "train_speed(iter/s)": 0.144169 }, { "epoch": 0.7543411760507868, "grad_norm": 0.721207857131958, "learning_rate": 7.348112368315853e-05, "loss": 0.8634227752685547, "memory(GiB)": 91.52, "step": 58135, "token_acc": 0.7785511555615624, "train_speed(iter/s)": 0.144167 }, { "epoch": 0.7544060544524425, "grad_norm": 0.6450807452201843, "learning_rate": 7.347638806632384e-05, "loss": 0.8915866851806641, "memory(GiB)": 91.52, "step": 58140, "token_acc": 0.7926944200708459, "train_speed(iter/s)": 0.144164 }, { "epoch": 0.7544709328540982, "grad_norm": 0.6607292890548706, "learning_rate": 7.347165217932437e-05, "loss": 0.8608848571777343, "memory(GiB)": 91.52, "step": 58145, "token_acc": 0.7628993550322484, "train_speed(iter/s)": 0.144162 }, { "epoch": 0.7545358112557539, "grad_norm": 0.7103414535522461, "learning_rate": 7.346691602221466e-05, "loss": 0.8919027328491211, "memory(GiB)": 91.52, "step": 58150, "token_acc": 0.7445496366424428, "train_speed(iter/s)": 0.14416 }, { "epoch": 0.7546006896574096, "grad_norm": 0.8010864853858948, "learning_rate": 7.346217959504917e-05, "loss": 0.9030206680297852, "memory(GiB)": 91.52, "step": 58155, "token_acc": 0.7486057796769755, "train_speed(iter/s)": 0.144159 }, { "epoch": 0.7546655680590653, "grad_norm": 0.6791475415229797, "learning_rate": 7.345744289788243e-05, "loss": 0.874827766418457, "memory(GiB)": 91.52, "step": 58160, "token_acc": 0.7750280831943357, "train_speed(iter/s)": 0.144157 }, { "epoch": 0.754730446460721, "grad_norm": 0.8912416100502014, "learning_rate": 7.345270593076896e-05, "loss": 0.8555791854858399, "memory(GiB)": 91.52, "step": 58165, "token_acc": 0.7680979806080047, "train_speed(iter/s)": 0.144154 }, { "epoch": 0.7547953248623767, "grad_norm": 0.7319112420082092, "learning_rate": 7.344796869376324e-05, "loss": 0.8735936164855957, "memory(GiB)": 91.52, "step": 58170, "token_acc": 0.7749168914257041, "train_speed(iter/s)": 0.144152 }, { "epoch": 0.7548602032640324, "grad_norm": 0.7841919660568237, "learning_rate": 7.344323118691982e-05, "loss": 0.9006289482116699, "memory(GiB)": 91.52, "step": 58175, "token_acc": 0.7818690648386893, "train_speed(iter/s)": 0.144149 }, { "epoch": 0.7549250816656881, "grad_norm": 0.7824514508247375, "learning_rate": 7.34384934102932e-05, "loss": 0.9065006256103516, "memory(GiB)": 91.52, "step": 58180, "token_acc": 0.7432518986183548, "train_speed(iter/s)": 0.144147 }, { "epoch": 0.7549899600673438, "grad_norm": 0.8623208403587341, "learning_rate": 7.34337553639379e-05, "loss": 0.8462865829467774, "memory(GiB)": 91.52, "step": 58185, "token_acc": 0.772428674178854, "train_speed(iter/s)": 0.144146 }, { "epoch": 0.7550548384689995, "grad_norm": 0.7234722971916199, "learning_rate": 7.342901704790847e-05, "loss": 0.8472972869873047, "memory(GiB)": 91.52, "step": 58190, "token_acc": 0.7667378426780548, "train_speed(iter/s)": 0.144143 }, { "epoch": 0.7551197168706552, "grad_norm": 0.8114949464797974, "learning_rate": 7.342427846225942e-05, "loss": 0.8849357604980469, "memory(GiB)": 91.52, "step": 58195, "token_acc": 0.7548202966846979, "train_speed(iter/s)": 0.14414 }, { "epoch": 0.7551845952723109, "grad_norm": 0.7351827621459961, "learning_rate": 7.341953960704525e-05, "loss": 0.8612015724182129, "memory(GiB)": 91.52, "step": 58200, "token_acc": 0.7561803386169925, "train_speed(iter/s)": 0.144138 }, { "epoch": 0.7552494736739666, "grad_norm": 0.6782708168029785, "learning_rate": 7.341480048232055e-05, "loss": 0.8671784400939941, "memory(GiB)": 91.52, "step": 58205, "token_acc": 0.7791163548948634, "train_speed(iter/s)": 0.144135 }, { "epoch": 0.7553143520756223, "grad_norm": 0.7690924406051636, "learning_rate": 7.341006108813983e-05, "loss": 0.8678854942321778, "memory(GiB)": 91.52, "step": 58210, "token_acc": 0.7687782805429865, "train_speed(iter/s)": 0.144133 }, { "epoch": 0.755379230477278, "grad_norm": 0.7084833383560181, "learning_rate": 7.340532142455763e-05, "loss": 0.8547816276550293, "memory(GiB)": 91.52, "step": 58215, "token_acc": 0.7653476521084852, "train_speed(iter/s)": 0.14413 }, { "epoch": 0.7554441088789337, "grad_norm": 0.7345613241195679, "learning_rate": 7.34005814916285e-05, "loss": 0.8950442314147949, "memory(GiB)": 91.52, "step": 58220, "token_acc": 0.7643496374016355, "train_speed(iter/s)": 0.144128 }, { "epoch": 0.7555089872805894, "grad_norm": 0.7392287254333496, "learning_rate": 7.339584128940698e-05, "loss": 0.8991791725158691, "memory(GiB)": 91.52, "step": 58225, "token_acc": 0.7580046403712297, "train_speed(iter/s)": 0.144126 }, { "epoch": 0.755573865682245, "grad_norm": 0.721487283706665, "learning_rate": 7.339110081794762e-05, "loss": 0.8779314041137696, "memory(GiB)": 91.52, "step": 58230, "token_acc": 0.7721706625418923, "train_speed(iter/s)": 0.144124 }, { "epoch": 0.7556387440839007, "grad_norm": 0.8160021305084229, "learning_rate": 7.3386360077305e-05, "loss": 0.8849027633666993, "memory(GiB)": 91.52, "step": 58235, "token_acc": 0.7571353386502571, "train_speed(iter/s)": 0.144122 }, { "epoch": 0.7557036224855564, "grad_norm": 0.8075251579284668, "learning_rate": 7.338161906753361e-05, "loss": 0.8578786849975586, "memory(GiB)": 91.52, "step": 58240, "token_acc": 0.7658570569049051, "train_speed(iter/s)": 0.144119 }, { "epoch": 0.7557685008872121, "grad_norm": 0.6911323666572571, "learning_rate": 7.337687778868809e-05, "loss": 0.8356324195861816, "memory(GiB)": 91.52, "step": 58245, "token_acc": 0.773259550160657, "train_speed(iter/s)": 0.144116 }, { "epoch": 0.7558333792888678, "grad_norm": 0.7439384460449219, "learning_rate": 7.337213624082295e-05, "loss": 0.9275023460388183, "memory(GiB)": 91.52, "step": 58250, "token_acc": 0.7509738343497935, "train_speed(iter/s)": 0.144114 }, { "epoch": 0.7558982576905235, "grad_norm": 0.6470598578453064, "learning_rate": 7.336739442399278e-05, "loss": 0.8440473556518555, "memory(GiB)": 91.52, "step": 58255, "token_acc": 0.7504745634016705, "train_speed(iter/s)": 0.144112 }, { "epoch": 0.7559631360921792, "grad_norm": 0.7804509997367859, "learning_rate": 7.336265233825213e-05, "loss": 0.8664228439331054, "memory(GiB)": 91.52, "step": 58260, "token_acc": 0.7622550778726764, "train_speed(iter/s)": 0.14411 }, { "epoch": 0.7560280144938349, "grad_norm": 0.7311892509460449, "learning_rate": 7.335790998365559e-05, "loss": 0.867701530456543, "memory(GiB)": 91.52, "step": 58265, "token_acc": 0.7610608848707897, "train_speed(iter/s)": 0.144109 }, { "epoch": 0.7560928928954906, "grad_norm": 0.7474988102912903, "learning_rate": 7.335316736025772e-05, "loss": 0.9469767570495605, "memory(GiB)": 91.52, "step": 58270, "token_acc": 0.7546838874272502, "train_speed(iter/s)": 0.144107 }, { "epoch": 0.7561577712971463, "grad_norm": 0.7113270163536072, "learning_rate": 7.33484244681131e-05, "loss": 0.8735896110534668, "memory(GiB)": 91.52, "step": 58275, "token_acc": 0.7563808012030577, "train_speed(iter/s)": 0.144105 }, { "epoch": 0.756222649698802, "grad_norm": 0.6481325030326843, "learning_rate": 7.334368130727631e-05, "loss": 0.8540291786193848, "memory(GiB)": 91.52, "step": 58280, "token_acc": 0.7579236977256053, "train_speed(iter/s)": 0.144103 }, { "epoch": 0.7562875281004577, "grad_norm": 0.7443452477455139, "learning_rate": 7.333893787780195e-05, "loss": 0.8860090255737305, "memory(GiB)": 91.52, "step": 58285, "token_acc": 0.7539534159071434, "train_speed(iter/s)": 0.1441 }, { "epoch": 0.7563524065021134, "grad_norm": 0.6612696051597595, "learning_rate": 7.33341941797446e-05, "loss": 0.8792035102844238, "memory(GiB)": 91.52, "step": 58290, "token_acc": 0.7650499036946244, "train_speed(iter/s)": 0.144099 }, { "epoch": 0.7564172849037691, "grad_norm": 0.7186434864997864, "learning_rate": 7.332945021315883e-05, "loss": 0.8870418548583985, "memory(GiB)": 91.52, "step": 58295, "token_acc": 0.7422610227864322, "train_speed(iter/s)": 0.144097 }, { "epoch": 0.7564821633054248, "grad_norm": 0.8985108733177185, "learning_rate": 7.332470597809925e-05, "loss": 0.9034111022949218, "memory(GiB)": 91.52, "step": 58300, "token_acc": 0.7507562216416884, "train_speed(iter/s)": 0.144096 }, { "epoch": 0.7565470417070805, "grad_norm": 0.7533365488052368, "learning_rate": 7.331996147462045e-05, "loss": 0.8845135688781738, "memory(GiB)": 91.52, "step": 58305, "token_acc": 0.7565144558878273, "train_speed(iter/s)": 0.144093 }, { "epoch": 0.7566119201087362, "grad_norm": 0.8498138785362244, "learning_rate": 7.331521670277705e-05, "loss": 0.9157296180725097, "memory(GiB)": 91.52, "step": 58310, "token_acc": 0.7611681364753505, "train_speed(iter/s)": 0.144092 }, { "epoch": 0.7566767985103919, "grad_norm": 0.773030161857605, "learning_rate": 7.331047166262363e-05, "loss": 0.8209712982177735, "memory(GiB)": 91.52, "step": 58315, "token_acc": 0.7889580602883355, "train_speed(iter/s)": 0.14409 }, { "epoch": 0.7567416769120476, "grad_norm": 0.7214429378509521, "learning_rate": 7.33057263542148e-05, "loss": 0.8979534149169922, "memory(GiB)": 91.52, "step": 58320, "token_acc": 0.7555968325349497, "train_speed(iter/s)": 0.144087 }, { "epoch": 0.7568065553137033, "grad_norm": 0.697867751121521, "learning_rate": 7.330098077760517e-05, "loss": 0.8512944221496582, "memory(GiB)": 91.52, "step": 58325, "token_acc": 0.788958820356051, "train_speed(iter/s)": 0.144084 }, { "epoch": 0.756871433715359, "grad_norm": 0.8071195483207703, "learning_rate": 7.329623493284936e-05, "loss": 0.8783447265625, "memory(GiB)": 91.52, "step": 58330, "token_acc": 0.7777424145979206, "train_speed(iter/s)": 0.144083 }, { "epoch": 0.7569363121170147, "grad_norm": 0.626002848148346, "learning_rate": 7.329148882000196e-05, "loss": 0.8180742263793945, "memory(GiB)": 91.52, "step": 58335, "token_acc": 0.7851762173796072, "train_speed(iter/s)": 0.14408 }, { "epoch": 0.7570011905186704, "grad_norm": 0.7412747740745544, "learning_rate": 7.328674243911763e-05, "loss": 0.827542495727539, "memory(GiB)": 91.52, "step": 58340, "token_acc": 0.771728515625, "train_speed(iter/s)": 0.144078 }, { "epoch": 0.7570660689203261, "grad_norm": 0.7118691802024841, "learning_rate": 7.328199579025095e-05, "loss": 0.905515480041504, "memory(GiB)": 91.52, "step": 58345, "token_acc": 0.7489950714810025, "train_speed(iter/s)": 0.144076 }, { "epoch": 0.7571309473219818, "grad_norm": 0.9087251424789429, "learning_rate": 7.327724887345658e-05, "loss": 0.8573759078979493, "memory(GiB)": 91.52, "step": 58350, "token_acc": 0.7584081516317515, "train_speed(iter/s)": 0.144074 }, { "epoch": 0.7571958257236375, "grad_norm": 0.7209140658378601, "learning_rate": 7.32725016887891e-05, "loss": 0.8791472434997558, "memory(GiB)": 91.52, "step": 58355, "token_acc": 0.7636685827894714, "train_speed(iter/s)": 0.144072 }, { "epoch": 0.7572607041252931, "grad_norm": 0.7584697008132935, "learning_rate": 7.326775423630319e-05, "loss": 0.832004165649414, "memory(GiB)": 91.52, "step": 58360, "token_acc": 0.756191337781296, "train_speed(iter/s)": 0.14407 }, { "epoch": 0.7573255825269488, "grad_norm": 0.792465329170227, "learning_rate": 7.326300651605345e-05, "loss": 0.8643032073974609, "memory(GiB)": 91.52, "step": 58365, "token_acc": 0.7667682050692568, "train_speed(iter/s)": 0.144068 }, { "epoch": 0.7573904609286045, "grad_norm": 0.6866785287857056, "learning_rate": 7.325825852809453e-05, "loss": 0.8870410919189453, "memory(GiB)": 91.52, "step": 58370, "token_acc": 0.7531686186427081, "train_speed(iter/s)": 0.144065 }, { "epoch": 0.7574553393302602, "grad_norm": 0.7830979824066162, "learning_rate": 7.325351027248109e-05, "loss": 0.9029713630676269, "memory(GiB)": 91.52, "step": 58375, "token_acc": 0.7578260510448501, "train_speed(iter/s)": 0.144063 }, { "epoch": 0.7575202177319159, "grad_norm": 0.7585271000862122, "learning_rate": 7.324876174926771e-05, "loss": 0.8802938461303711, "memory(GiB)": 91.52, "step": 58380, "token_acc": 0.7720179939551557, "train_speed(iter/s)": 0.144061 }, { "epoch": 0.7575850961335716, "grad_norm": 0.7009251117706299, "learning_rate": 7.32440129585091e-05, "loss": 0.8782936096191406, "memory(GiB)": 91.52, "step": 58385, "token_acc": 0.7561085246427998, "train_speed(iter/s)": 0.144058 }, { "epoch": 0.7576499745352273, "grad_norm": 0.7612216472625732, "learning_rate": 7.323926390025987e-05, "loss": 0.876866340637207, "memory(GiB)": 91.52, "step": 58390, "token_acc": 0.77549476868776, "train_speed(iter/s)": 0.144055 }, { "epoch": 0.757714852936883, "grad_norm": 0.6420809030532837, "learning_rate": 7.32345145745747e-05, "loss": 0.9091571807861328, "memory(GiB)": 91.52, "step": 58395, "token_acc": 0.7453130527060489, "train_speed(iter/s)": 0.144053 }, { "epoch": 0.7577797313385387, "grad_norm": 0.7992388606071472, "learning_rate": 7.322976498150825e-05, "loss": 0.8974098205566406, "memory(GiB)": 91.52, "step": 58400, "token_acc": 0.7477114610032954, "train_speed(iter/s)": 0.144051 }, { "epoch": 0.7578446097401944, "grad_norm": 0.7932109832763672, "learning_rate": 7.322501512111513e-05, "loss": 0.8839509963989258, "memory(GiB)": 91.52, "step": 58405, "token_acc": 0.7598412293798478, "train_speed(iter/s)": 0.144049 }, { "epoch": 0.7579094881418501, "grad_norm": 0.7236371636390686, "learning_rate": 7.322026499345006e-05, "loss": 0.9011058807373047, "memory(GiB)": 91.52, "step": 58410, "token_acc": 0.7579591413460541, "train_speed(iter/s)": 0.144046 }, { "epoch": 0.7579743665435058, "grad_norm": 0.6740753650665283, "learning_rate": 7.321551459856764e-05, "loss": 0.907862663269043, "memory(GiB)": 91.52, "step": 58415, "token_acc": 0.7618793201322336, "train_speed(iter/s)": 0.144043 }, { "epoch": 0.7580392449451615, "grad_norm": 0.6349754929542542, "learning_rate": 7.321076393652259e-05, "loss": 0.8919300079345703, "memory(GiB)": 91.52, "step": 58420, "token_acc": 0.7430106519226565, "train_speed(iter/s)": 0.14404 }, { "epoch": 0.7581041233468172, "grad_norm": 0.6696142554283142, "learning_rate": 7.320601300736956e-05, "loss": 0.8778651237487793, "memory(GiB)": 91.52, "step": 58425, "token_acc": 0.7657040602246081, "train_speed(iter/s)": 0.144038 }, { "epoch": 0.7581690017484729, "grad_norm": 0.6883639693260193, "learning_rate": 7.320126181116322e-05, "loss": 0.8661160469055176, "memory(GiB)": 91.52, "step": 58430, "token_acc": 0.7574063140346612, "train_speed(iter/s)": 0.144036 }, { "epoch": 0.7582338801501286, "grad_norm": 0.7298831343650818, "learning_rate": 7.319651034795825e-05, "loss": 0.9256791114807129, "memory(GiB)": 91.52, "step": 58435, "token_acc": 0.7598565022421525, "train_speed(iter/s)": 0.144034 }, { "epoch": 0.7582987585517843, "grad_norm": 0.7529110312461853, "learning_rate": 7.319175861780934e-05, "loss": 0.8535943984985351, "memory(GiB)": 91.52, "step": 58440, "token_acc": 0.7639439411098528, "train_speed(iter/s)": 0.144032 }, { "epoch": 0.75836363695344, "grad_norm": 0.8038285374641418, "learning_rate": 7.318700662077118e-05, "loss": 0.8993740081787109, "memory(GiB)": 91.52, "step": 58445, "token_acc": 0.7497684266243219, "train_speed(iter/s)": 0.14403 }, { "epoch": 0.7584285153550957, "grad_norm": 0.7462354302406311, "learning_rate": 7.318225435689842e-05, "loss": 0.8870643615722656, "memory(GiB)": 91.52, "step": 58450, "token_acc": 0.7515779658585569, "train_speed(iter/s)": 0.144028 }, { "epoch": 0.7584933937567514, "grad_norm": 0.7568628191947937, "learning_rate": 7.317750182624577e-05, "loss": 0.8271286010742187, "memory(GiB)": 91.52, "step": 58455, "token_acc": 0.7781041464666628, "train_speed(iter/s)": 0.144025 }, { "epoch": 0.7585582721584071, "grad_norm": 0.7027568221092224, "learning_rate": 7.317274902886793e-05, "loss": 0.8808921813964844, "memory(GiB)": 91.52, "step": 58460, "token_acc": 0.7769064157143925, "train_speed(iter/s)": 0.144023 }, { "epoch": 0.7586231505600628, "grad_norm": 0.7878215312957764, "learning_rate": 7.316799596481958e-05, "loss": 0.912359619140625, "memory(GiB)": 91.52, "step": 58465, "token_acc": 0.7433740629242666, "train_speed(iter/s)": 0.144021 }, { "epoch": 0.7586880289617185, "grad_norm": 0.7096385359764099, "learning_rate": 7.316324263415542e-05, "loss": 0.8524150848388672, "memory(GiB)": 91.52, "step": 58470, "token_acc": 0.759447004608295, "train_speed(iter/s)": 0.144018 }, { "epoch": 0.7587529073633742, "grad_norm": 0.7368124127388, "learning_rate": 7.315848903693014e-05, "loss": 0.8691922187805176, "memory(GiB)": 91.52, "step": 58475, "token_acc": 0.7490967482938579, "train_speed(iter/s)": 0.144016 }, { "epoch": 0.7588177857650299, "grad_norm": 0.7567583322525024, "learning_rate": 7.31537351731985e-05, "loss": 0.8580215454101563, "memory(GiB)": 91.52, "step": 58480, "token_acc": 0.7617389574214086, "train_speed(iter/s)": 0.144014 }, { "epoch": 0.7588826641666856, "grad_norm": 0.7555962800979614, "learning_rate": 7.314898104301513e-05, "loss": 0.8558988571166992, "memory(GiB)": 91.52, "step": 58485, "token_acc": 0.7754841001954166, "train_speed(iter/s)": 0.144011 }, { "epoch": 0.7589475425683413, "grad_norm": 0.7665497660636902, "learning_rate": 7.314422664643479e-05, "loss": 0.9095767974853516, "memory(GiB)": 91.52, "step": 58490, "token_acc": 0.7546650103966946, "train_speed(iter/s)": 0.144009 }, { "epoch": 0.759012420969997, "grad_norm": 0.7006574273109436, "learning_rate": 7.313947198351216e-05, "loss": 0.8495929718017579, "memory(GiB)": 91.52, "step": 58495, "token_acc": 0.7602584447970981, "train_speed(iter/s)": 0.144007 }, { "epoch": 0.7590772993716527, "grad_norm": 0.7185079455375671, "learning_rate": 7.313471705430201e-05, "loss": 0.8915096282958984, "memory(GiB)": 91.52, "step": 58500, "token_acc": 0.7633030373650276, "train_speed(iter/s)": 0.144004 }, { "epoch": 0.7591421777733084, "grad_norm": 0.7301084399223328, "learning_rate": 7.312996185885901e-05, "loss": 0.8752029418945313, "memory(GiB)": 91.52, "step": 58505, "token_acc": 0.751957079227966, "train_speed(iter/s)": 0.144002 }, { "epoch": 0.7592070561749641, "grad_norm": 0.8254951238632202, "learning_rate": 7.312520639723789e-05, "loss": 0.8790287017822266, "memory(GiB)": 91.52, "step": 58510, "token_acc": 0.7690259285213735, "train_speed(iter/s)": 0.144 }, { "epoch": 0.7592719345766198, "grad_norm": 0.783390998840332, "learning_rate": 7.312045066949339e-05, "loss": 0.9248311996459961, "memory(GiB)": 91.52, "step": 58515, "token_acc": 0.7655915651869432, "train_speed(iter/s)": 0.143998 }, { "epoch": 0.7593368129782755, "grad_norm": 0.6844435930252075, "learning_rate": 7.311569467568023e-05, "loss": 0.9107696533203125, "memory(GiB)": 91.52, "step": 58520, "token_acc": 0.7593080496429314, "train_speed(iter/s)": 0.143996 }, { "epoch": 0.7594016913799312, "grad_norm": 0.788556694984436, "learning_rate": 7.311093841585315e-05, "loss": 0.8938995361328125, "memory(GiB)": 91.52, "step": 58525, "token_acc": 0.7588863359913501, "train_speed(iter/s)": 0.143994 }, { "epoch": 0.7594665697815869, "grad_norm": 0.8012846112251282, "learning_rate": 7.310618189006687e-05, "loss": 0.8886018753051758, "memory(GiB)": 91.52, "step": 58530, "token_acc": 0.7721652286160745, "train_speed(iter/s)": 0.143992 }, { "epoch": 0.7595314481832426, "grad_norm": 0.6586824059486389, "learning_rate": 7.310142509837614e-05, "loss": 0.8759535789489746, "memory(GiB)": 91.52, "step": 58535, "token_acc": 0.7631373256373256, "train_speed(iter/s)": 0.143989 }, { "epoch": 0.7595963265848983, "grad_norm": 0.8384619951248169, "learning_rate": 7.30966680408357e-05, "loss": 0.8884042739868164, "memory(GiB)": 91.52, "step": 58540, "token_acc": 0.7571777519721299, "train_speed(iter/s)": 0.143987 }, { "epoch": 0.759661204986554, "grad_norm": 0.7701781392097473, "learning_rate": 7.309191071750027e-05, "loss": 0.8500173568725586, "memory(GiB)": 91.52, "step": 58545, "token_acc": 0.7627660297817366, "train_speed(iter/s)": 0.143985 }, { "epoch": 0.7597260833882097, "grad_norm": 0.6864162087440491, "learning_rate": 7.308715312842464e-05, "loss": 0.8295351028442383, "memory(GiB)": 91.52, "step": 58550, "token_acc": 0.7952736139998565, "train_speed(iter/s)": 0.143982 }, { "epoch": 0.7597909617898654, "grad_norm": 0.7290418148040771, "learning_rate": 7.308239527366353e-05, "loss": 0.8750616073608398, "memory(GiB)": 91.52, "step": 58555, "token_acc": 0.7658935308089981, "train_speed(iter/s)": 0.143979 }, { "epoch": 0.7598558401915211, "grad_norm": 0.6701486706733704, "learning_rate": 7.307763715327169e-05, "loss": 0.8661262512207031, "memory(GiB)": 91.52, "step": 58560, "token_acc": 0.7664233576642335, "train_speed(iter/s)": 0.143977 }, { "epoch": 0.7599207185931768, "grad_norm": 0.6967025995254517, "learning_rate": 7.30728787673039e-05, "loss": 0.8435604095458984, "memory(GiB)": 91.52, "step": 58565, "token_acc": 0.7613652113652114, "train_speed(iter/s)": 0.143974 }, { "epoch": 0.7599855969948325, "grad_norm": 0.6993100047111511, "learning_rate": 7.306812011581492e-05, "loss": 0.920071029663086, "memory(GiB)": 91.52, "step": 58570, "token_acc": 0.7482034196976789, "train_speed(iter/s)": 0.143972 }, { "epoch": 0.7600504753964882, "grad_norm": 0.6799155473709106, "learning_rate": 7.306336119885947e-05, "loss": 0.9031379699707032, "memory(GiB)": 91.52, "step": 58575, "token_acc": 0.7576937201615865, "train_speed(iter/s)": 0.14397 }, { "epoch": 0.7601153537981439, "grad_norm": 0.6945332288742065, "learning_rate": 7.305860201649235e-05, "loss": 0.8576518058776855, "memory(GiB)": 91.52, "step": 58580, "token_acc": 0.7531613725555966, "train_speed(iter/s)": 0.143967 }, { "epoch": 0.7601802321997996, "grad_norm": 0.6858463883399963, "learning_rate": 7.305384256876834e-05, "loss": 0.8815547943115234, "memory(GiB)": 91.52, "step": 58585, "token_acc": 0.7573870668316832, "train_speed(iter/s)": 0.143964 }, { "epoch": 0.7602451106014553, "grad_norm": 0.8602241277694702, "learning_rate": 7.304908285574218e-05, "loss": 0.9283523559570312, "memory(GiB)": 91.52, "step": 58590, "token_acc": 0.7359071096296839, "train_speed(iter/s)": 0.143962 }, { "epoch": 0.7603099890031109, "grad_norm": 0.6500288844108582, "learning_rate": 7.304432287746866e-05, "loss": 0.8760208129882813, "memory(GiB)": 91.52, "step": 58595, "token_acc": 0.7340788938592924, "train_speed(iter/s)": 0.14396 }, { "epoch": 0.7603748674047666, "grad_norm": 0.7325539588928223, "learning_rate": 7.303956263400254e-05, "loss": 0.899447250366211, "memory(GiB)": 91.52, "step": 58600, "token_acc": 0.7515800085301074, "train_speed(iter/s)": 0.143958 }, { "epoch": 0.7604397458064223, "grad_norm": 0.6947696208953857, "learning_rate": 7.303480212539863e-05, "loss": 0.881163215637207, "memory(GiB)": 91.52, "step": 58605, "token_acc": 0.7437642518856341, "train_speed(iter/s)": 0.143956 }, { "epoch": 0.760504624208078, "grad_norm": 0.730739176273346, "learning_rate": 7.303004135171171e-05, "loss": 0.8870035171508789, "memory(GiB)": 91.52, "step": 58610, "token_acc": 0.7557033172917967, "train_speed(iter/s)": 0.143954 }, { "epoch": 0.7605695026097337, "grad_norm": 0.7497032880783081, "learning_rate": 7.302528031299654e-05, "loss": 0.8618930816650391, "memory(GiB)": 91.52, "step": 58615, "token_acc": 0.7635681480043988, "train_speed(iter/s)": 0.143953 }, { "epoch": 0.7606343810113894, "grad_norm": 0.7896799445152283, "learning_rate": 7.302051900930795e-05, "loss": 0.9214017868041993, "memory(GiB)": 91.52, "step": 58620, "token_acc": 0.735212589022986, "train_speed(iter/s)": 0.143951 }, { "epoch": 0.7606992594130451, "grad_norm": 0.6726945042610168, "learning_rate": 7.301575744070069e-05, "loss": 0.806285285949707, "memory(GiB)": 91.52, "step": 58625, "token_acc": 0.7726015741331631, "train_speed(iter/s)": 0.143949 }, { "epoch": 0.7607641378147008, "grad_norm": 0.7858715653419495, "learning_rate": 7.301099560722957e-05, "loss": 0.8929367065429688, "memory(GiB)": 91.52, "step": 58630, "token_acc": 0.7681403453812116, "train_speed(iter/s)": 0.143947 }, { "epoch": 0.7608290162163565, "grad_norm": 0.8559355139732361, "learning_rate": 7.30062335089494e-05, "loss": 0.8998039245605469, "memory(GiB)": 91.52, "step": 58635, "token_acc": 0.7696676253230915, "train_speed(iter/s)": 0.143944 }, { "epoch": 0.7608938946180122, "grad_norm": 0.754265308380127, "learning_rate": 7.300147114591498e-05, "loss": 0.9016529083251953, "memory(GiB)": 91.52, "step": 58640, "token_acc": 0.7583662914511713, "train_speed(iter/s)": 0.143942 }, { "epoch": 0.7609587730196679, "grad_norm": 0.7232889533042908, "learning_rate": 7.29967085181811e-05, "loss": 0.8415700912475585, "memory(GiB)": 91.52, "step": 58645, "token_acc": 0.762841776432321, "train_speed(iter/s)": 0.14394 }, { "epoch": 0.7610236514213236, "grad_norm": 0.7064405679702759, "learning_rate": 7.299194562580258e-05, "loss": 0.8771333694458008, "memory(GiB)": 91.52, "step": 58650, "token_acc": 0.7579396936514563, "train_speed(iter/s)": 0.143938 }, { "epoch": 0.7610885298229793, "grad_norm": 0.7586607933044434, "learning_rate": 7.298718246883424e-05, "loss": 0.873564338684082, "memory(GiB)": 91.52, "step": 58655, "token_acc": 0.7825639808551876, "train_speed(iter/s)": 0.143936 }, { "epoch": 0.761153408224635, "grad_norm": 0.9642313718795776, "learning_rate": 7.298241904733088e-05, "loss": 0.8525259971618653, "memory(GiB)": 91.52, "step": 58660, "token_acc": 0.7660033167495854, "train_speed(iter/s)": 0.143933 }, { "epoch": 0.7612182866262907, "grad_norm": 0.8229027986526489, "learning_rate": 7.297765536134732e-05, "loss": 0.9154132843017578, "memory(GiB)": 91.52, "step": 58665, "token_acc": 0.7747629944426283, "train_speed(iter/s)": 0.14393 }, { "epoch": 0.7612831650279464, "grad_norm": 0.6906251311302185, "learning_rate": 7.297289141093837e-05, "loss": 0.8839654922485352, "memory(GiB)": 91.52, "step": 58670, "token_acc": 0.7519374528495988, "train_speed(iter/s)": 0.143927 }, { "epoch": 0.761348043429602, "grad_norm": 0.8357915878295898, "learning_rate": 7.296812719615888e-05, "loss": 0.9107891082763672, "memory(GiB)": 91.52, "step": 58675, "token_acc": 0.7532292198788586, "train_speed(iter/s)": 0.143925 }, { "epoch": 0.7614129218312577, "grad_norm": 0.7553039789199829, "learning_rate": 7.296336271706366e-05, "loss": 0.919521713256836, "memory(GiB)": 91.52, "step": 58680, "token_acc": 0.7350201334214796, "train_speed(iter/s)": 0.143923 }, { "epoch": 0.7614778002329134, "grad_norm": 0.7597222328186035, "learning_rate": 7.295859797370753e-05, "loss": 0.9217668533325195, "memory(GiB)": 91.52, "step": 58685, "token_acc": 0.7629965566473503, "train_speed(iter/s)": 0.143921 }, { "epoch": 0.7615426786345691, "grad_norm": 0.8689200282096863, "learning_rate": 7.295383296614534e-05, "loss": 0.8634450912475586, "memory(GiB)": 91.52, "step": 58690, "token_acc": 0.755917213382145, "train_speed(iter/s)": 0.143918 }, { "epoch": 0.7616075570362248, "grad_norm": 0.741280198097229, "learning_rate": 7.294906769443191e-05, "loss": 0.8564120292663574, "memory(GiB)": 91.52, "step": 58695, "token_acc": 0.7757200426691953, "train_speed(iter/s)": 0.143916 }, { "epoch": 0.7616724354378805, "grad_norm": 0.7934805750846863, "learning_rate": 7.29443021586221e-05, "loss": 0.8789039611816406, "memory(GiB)": 91.52, "step": 58700, "token_acc": 0.7599136580440762, "train_speed(iter/s)": 0.143915 }, { "epoch": 0.7617373138395362, "grad_norm": 0.7669203877449036, "learning_rate": 7.293953635877071e-05, "loss": 0.8587818145751953, "memory(GiB)": 91.52, "step": 58705, "token_acc": 0.7483959670027498, "train_speed(iter/s)": 0.143912 }, { "epoch": 0.7618021922411919, "grad_norm": 0.723761260509491, "learning_rate": 7.293477029493263e-05, "loss": 0.8536478042602539, "memory(GiB)": 91.52, "step": 58710, "token_acc": 0.7558058791759895, "train_speed(iter/s)": 0.14391 }, { "epoch": 0.7618670706428476, "grad_norm": 0.6412605047225952, "learning_rate": 7.293000396716268e-05, "loss": 0.7960672378540039, "memory(GiB)": 91.52, "step": 58715, "token_acc": 0.7666799107552187, "train_speed(iter/s)": 0.143907 }, { "epoch": 0.7619319490445033, "grad_norm": 0.720649003982544, "learning_rate": 7.292523737551574e-05, "loss": 0.8429579734802246, "memory(GiB)": 91.52, "step": 58720, "token_acc": 0.7494853410946263, "train_speed(iter/s)": 0.143905 }, { "epoch": 0.761996827446159, "grad_norm": 0.6950243711471558, "learning_rate": 7.292047052004663e-05, "loss": 0.8627993583679199, "memory(GiB)": 91.52, "step": 58725, "token_acc": 0.7801294106397221, "train_speed(iter/s)": 0.143902 }, { "epoch": 0.7620617058478147, "grad_norm": 0.7079561948776245, "learning_rate": 7.291570340081022e-05, "loss": 0.934224796295166, "memory(GiB)": 91.52, "step": 58730, "token_acc": 0.7431776542059079, "train_speed(iter/s)": 0.1439 }, { "epoch": 0.7621265842494704, "grad_norm": 0.7266875505447388, "learning_rate": 7.291093601786138e-05, "loss": 0.8836400985717774, "memory(GiB)": 91.52, "step": 58735, "token_acc": 0.7632211945658434, "train_speed(iter/s)": 0.143897 }, { "epoch": 0.7621914626511261, "grad_norm": 0.6877546906471252, "learning_rate": 7.290616837125495e-05, "loss": 0.8706271171569824, "memory(GiB)": 91.52, "step": 58740, "token_acc": 0.7567950386647271, "train_speed(iter/s)": 0.143895 }, { "epoch": 0.7622563410527818, "grad_norm": 0.6650570631027222, "learning_rate": 7.29014004610458e-05, "loss": 0.8552200317382812, "memory(GiB)": 91.52, "step": 58745, "token_acc": 0.7436183790682833, "train_speed(iter/s)": 0.143893 }, { "epoch": 0.7623212194544375, "grad_norm": 0.7659575343132019, "learning_rate": 7.289663228728884e-05, "loss": 0.8612251281738281, "memory(GiB)": 91.52, "step": 58750, "token_acc": 0.7649482962567253, "train_speed(iter/s)": 0.143891 }, { "epoch": 0.7623860978560932, "grad_norm": 0.6885077953338623, "learning_rate": 7.289186385003888e-05, "loss": 0.8995941162109375, "memory(GiB)": 91.52, "step": 58755, "token_acc": 0.7584573041145525, "train_speed(iter/s)": 0.143889 }, { "epoch": 0.7624509762577489, "grad_norm": 0.7479057312011719, "learning_rate": 7.288709514935085e-05, "loss": 0.8736603736877442, "memory(GiB)": 91.52, "step": 58760, "token_acc": 0.7965937552178994, "train_speed(iter/s)": 0.143886 }, { "epoch": 0.7625158546594046, "grad_norm": 0.6961325407028198, "learning_rate": 7.288232618527958e-05, "loss": 0.8552021026611328, "memory(GiB)": 91.52, "step": 58765, "token_acc": 0.7546330355578911, "train_speed(iter/s)": 0.143885 }, { "epoch": 0.7625807330610603, "grad_norm": 0.7954278588294983, "learning_rate": 7.287755695787999e-05, "loss": 0.9462360382080078, "memory(GiB)": 91.52, "step": 58770, "token_acc": 0.7579005006257822, "train_speed(iter/s)": 0.143883 }, { "epoch": 0.762645611462716, "grad_norm": 0.7356359362602234, "learning_rate": 7.287278746720691e-05, "loss": 0.8881311416625977, "memory(GiB)": 91.52, "step": 58775, "token_acc": 0.7680075946362881, "train_speed(iter/s)": 0.14388 }, { "epoch": 0.7627104898643717, "grad_norm": 0.7364934682846069, "learning_rate": 7.28680177133153e-05, "loss": 0.8709061622619629, "memory(GiB)": 91.52, "step": 58780, "token_acc": 0.76309963099631, "train_speed(iter/s)": 0.143878 }, { "epoch": 0.7627753682660274, "grad_norm": 0.8827677965164185, "learning_rate": 7.286324769626001e-05, "loss": 0.928767204284668, "memory(GiB)": 91.52, "step": 58785, "token_acc": 0.7360308932169242, "train_speed(iter/s)": 0.143876 }, { "epoch": 0.7628402466676831, "grad_norm": 0.6536890864372253, "learning_rate": 7.285847741609593e-05, "loss": 0.8345511436462403, "memory(GiB)": 91.52, "step": 58790, "token_acc": 0.7640400376192396, "train_speed(iter/s)": 0.143874 }, { "epoch": 0.7629051250693388, "grad_norm": 0.7329316139221191, "learning_rate": 7.285370687287797e-05, "loss": 0.9005674362182617, "memory(GiB)": 91.52, "step": 58795, "token_acc": 0.7467561858780929, "train_speed(iter/s)": 0.143872 }, { "epoch": 0.7629700034709945, "grad_norm": 0.7066285014152527, "learning_rate": 7.284893606666101e-05, "loss": 0.863064956665039, "memory(GiB)": 91.52, "step": 58800, "token_acc": 0.7510607381575124, "train_speed(iter/s)": 0.14387 }, { "epoch": 0.7630348818726502, "grad_norm": 0.7740391492843628, "learning_rate": 7.284416499749998e-05, "loss": 0.826497745513916, "memory(GiB)": 91.52, "step": 58805, "token_acc": 0.7753676312044504, "train_speed(iter/s)": 0.143868 }, { "epoch": 0.7630997602743059, "grad_norm": 0.8123400211334229, "learning_rate": 7.283939366544975e-05, "loss": 0.8628913879394531, "memory(GiB)": 91.52, "step": 58810, "token_acc": 0.7507982686440077, "train_speed(iter/s)": 0.143865 }, { "epoch": 0.7631646386759616, "grad_norm": 0.686692476272583, "learning_rate": 7.283462207056527e-05, "loss": 0.8675798416137696, "memory(GiB)": 91.52, "step": 58815, "token_acc": 0.7671134514121589, "train_speed(iter/s)": 0.143863 }, { "epoch": 0.7632295170776173, "grad_norm": 0.7684152126312256, "learning_rate": 7.28298502129014e-05, "loss": 0.9751566886901856, "memory(GiB)": 91.52, "step": 58820, "token_acc": 0.7450394545315253, "train_speed(iter/s)": 0.143861 }, { "epoch": 0.763294395479273, "grad_norm": 0.7842044830322266, "learning_rate": 7.282507809251311e-05, "loss": 0.8798357009887695, "memory(GiB)": 91.52, "step": 58825, "token_acc": 0.7613300201159626, "train_speed(iter/s)": 0.143859 }, { "epoch": 0.7633592738809287, "grad_norm": 0.7553620934486389, "learning_rate": 7.282030570945528e-05, "loss": 0.9034556388854981, "memory(GiB)": 91.52, "step": 58830, "token_acc": 0.7427618087657711, "train_speed(iter/s)": 0.143857 }, { "epoch": 0.7634241522825843, "grad_norm": 0.7343460321426392, "learning_rate": 7.281553306378284e-05, "loss": 0.8775614738464356, "memory(GiB)": 91.52, "step": 58835, "token_acc": 0.7754797347736354, "train_speed(iter/s)": 0.143855 }, { "epoch": 0.76348903068424, "grad_norm": 0.8019881248474121, "learning_rate": 7.28107601555507e-05, "loss": 0.8709402084350586, "memory(GiB)": 91.52, "step": 58840, "token_acc": 0.78032648466085, "train_speed(iter/s)": 0.143853 }, { "epoch": 0.7635539090858957, "grad_norm": 0.6712687611579895, "learning_rate": 7.280598698481383e-05, "loss": 0.9142583847045899, "memory(GiB)": 91.52, "step": 58845, "token_acc": 0.7312370483827372, "train_speed(iter/s)": 0.143851 }, { "epoch": 0.7636187874875514, "grad_norm": 0.7068748474121094, "learning_rate": 7.280121355162712e-05, "loss": 0.8827383041381835, "memory(GiB)": 91.52, "step": 58850, "token_acc": 0.7618284440108704, "train_speed(iter/s)": 0.143848 }, { "epoch": 0.7636836658892071, "grad_norm": 0.7477800250053406, "learning_rate": 7.279643985604549e-05, "loss": 0.9073486328125, "memory(GiB)": 91.52, "step": 58855, "token_acc": 0.7398722065453602, "train_speed(iter/s)": 0.143846 }, { "epoch": 0.7637485442908628, "grad_norm": 0.6634904146194458, "learning_rate": 7.279166589812393e-05, "loss": 0.8340479850769043, "memory(GiB)": 91.52, "step": 58860, "token_acc": 0.7690055457958733, "train_speed(iter/s)": 0.143843 }, { "epoch": 0.7638134226925185, "grad_norm": 0.7851501107215881, "learning_rate": 7.278689167791733e-05, "loss": 0.8766674041748047, "memory(GiB)": 91.52, "step": 58865, "token_acc": 0.757106390343097, "train_speed(iter/s)": 0.143841 }, { "epoch": 0.7638783010941742, "grad_norm": 0.7514155507087708, "learning_rate": 7.278211719548064e-05, "loss": 0.9278560638427734, "memory(GiB)": 91.52, "step": 58870, "token_acc": 0.7342035980693287, "train_speed(iter/s)": 0.143839 }, { "epoch": 0.7639431794958299, "grad_norm": 0.6808501482009888, "learning_rate": 7.277734245086881e-05, "loss": 0.8552881240844726, "memory(GiB)": 91.52, "step": 58875, "token_acc": 0.7656668651384341, "train_speed(iter/s)": 0.143837 }, { "epoch": 0.7640080578974856, "grad_norm": 0.8008802533149719, "learning_rate": 7.27725674441368e-05, "loss": 0.8861549377441407, "memory(GiB)": 91.52, "step": 58880, "token_acc": 0.7660861917326297, "train_speed(iter/s)": 0.143835 }, { "epoch": 0.7640729362991413, "grad_norm": 0.8103654384613037, "learning_rate": 7.276779217533956e-05, "loss": 0.8579084396362304, "memory(GiB)": 91.52, "step": 58885, "token_acc": 0.7436229537686634, "train_speed(iter/s)": 0.143834 }, { "epoch": 0.764137814700797, "grad_norm": 0.7135912775993347, "learning_rate": 7.2763016644532e-05, "loss": 0.8725849151611328, "memory(GiB)": 91.52, "step": 58890, "token_acc": 0.7418389734725085, "train_speed(iter/s)": 0.143831 }, { "epoch": 0.7642026931024527, "grad_norm": 0.6928343772888184, "learning_rate": 7.275824085176915e-05, "loss": 0.830014705657959, "memory(GiB)": 91.52, "step": 58895, "token_acc": 0.7635704828931782, "train_speed(iter/s)": 0.143829 }, { "epoch": 0.7642675715041084, "grad_norm": 0.7415639758110046, "learning_rate": 7.27534647971059e-05, "loss": 0.8556819915771484, "memory(GiB)": 91.52, "step": 58900, "token_acc": 0.7717908685884011, "train_speed(iter/s)": 0.143827 }, { "epoch": 0.7643324499057641, "grad_norm": 0.8569409251213074, "learning_rate": 7.274868848059724e-05, "loss": 0.9026741027832031, "memory(GiB)": 91.52, "step": 58905, "token_acc": 0.7464061683220073, "train_speed(iter/s)": 0.143825 }, { "epoch": 0.7643973283074198, "grad_norm": 0.7765989899635315, "learning_rate": 7.274391190229816e-05, "loss": 0.9080381393432617, "memory(GiB)": 91.52, "step": 58910, "token_acc": 0.752607527819444, "train_speed(iter/s)": 0.143822 }, { "epoch": 0.7644622067090755, "grad_norm": 0.8053651452064514, "learning_rate": 7.273913506226359e-05, "loss": 0.8577372550964355, "memory(GiB)": 91.52, "step": 58915, "token_acc": 0.7582436069986541, "train_speed(iter/s)": 0.14382 }, { "epoch": 0.7645270851107312, "grad_norm": 0.787084698677063, "learning_rate": 7.273435796054852e-05, "loss": 0.9037141799926758, "memory(GiB)": 91.52, "step": 58920, "token_acc": 0.7543490512868684, "train_speed(iter/s)": 0.143818 }, { "epoch": 0.7645919635123869, "grad_norm": 0.7666274905204773, "learning_rate": 7.272958059720791e-05, "loss": 0.9128000259399414, "memory(GiB)": 91.52, "step": 58925, "token_acc": 0.7619154340355595, "train_speed(iter/s)": 0.143817 }, { "epoch": 0.7646568419140426, "grad_norm": 0.711787223815918, "learning_rate": 7.272480297229676e-05, "loss": 0.8896699905395508, "memory(GiB)": 91.52, "step": 58930, "token_acc": 0.746907156896751, "train_speed(iter/s)": 0.143815 }, { "epoch": 0.7647217203156983, "grad_norm": 0.7580145001411438, "learning_rate": 7.272002508587006e-05, "loss": 0.8851603507995606, "memory(GiB)": 91.52, "step": 58935, "token_acc": 0.7623927619828419, "train_speed(iter/s)": 0.143813 }, { "epoch": 0.764786598717354, "grad_norm": 0.7426559329032898, "learning_rate": 7.271524693798274e-05, "loss": 0.9033119201660156, "memory(GiB)": 91.52, "step": 58940, "token_acc": 0.7462027855791826, "train_speed(iter/s)": 0.14381 }, { "epoch": 0.7648514771190097, "grad_norm": 0.6640188097953796, "learning_rate": 7.271046852868983e-05, "loss": 0.8461673736572266, "memory(GiB)": 91.52, "step": 58945, "token_acc": 0.7569677440786633, "train_speed(iter/s)": 0.143807 }, { "epoch": 0.7649163555206654, "grad_norm": 0.7515891194343567, "learning_rate": 7.270568985804633e-05, "loss": 0.8304339408874511, "memory(GiB)": 91.52, "step": 58950, "token_acc": 0.769577510961767, "train_speed(iter/s)": 0.143806 }, { "epoch": 0.7649812339223211, "grad_norm": 0.7309873104095459, "learning_rate": 7.270091092610719e-05, "loss": 0.8697887420654297, "memory(GiB)": 91.52, "step": 58955, "token_acc": 0.7484362251835736, "train_speed(iter/s)": 0.143804 }, { "epoch": 0.7650461123239768, "grad_norm": 0.7568092942237854, "learning_rate": 7.269613173292742e-05, "loss": 0.8971138000488281, "memory(GiB)": 91.52, "step": 58960, "token_acc": 0.7626530893078299, "train_speed(iter/s)": 0.1438 }, { "epoch": 0.7651109907256325, "grad_norm": 0.705643355846405, "learning_rate": 7.269135227856205e-05, "loss": 0.8790236473083496, "memory(GiB)": 91.52, "step": 58965, "token_acc": 0.752293943987872, "train_speed(iter/s)": 0.143798 }, { "epoch": 0.7651758691272882, "grad_norm": 0.7159092426300049, "learning_rate": 7.268657256306605e-05, "loss": 0.860820198059082, "memory(GiB)": 91.52, "step": 58970, "token_acc": 0.779171766625867, "train_speed(iter/s)": 0.143795 }, { "epoch": 0.7652407475289439, "grad_norm": 0.7842512726783752, "learning_rate": 7.268179258649443e-05, "loss": 0.8967286109924316, "memory(GiB)": 91.52, "step": 58975, "token_acc": 0.7403561336011713, "train_speed(iter/s)": 0.143792 }, { "epoch": 0.7653056259305996, "grad_norm": 0.7163425087928772, "learning_rate": 7.267701234890221e-05, "loss": 0.8882581710815429, "memory(GiB)": 91.52, "step": 58980, "token_acc": 0.7739000691437098, "train_speed(iter/s)": 0.14379 }, { "epoch": 0.7653705043322553, "grad_norm": 0.742731511592865, "learning_rate": 7.267223185034438e-05, "loss": 0.8942626953125, "memory(GiB)": 91.52, "step": 58985, "token_acc": 0.744432882414152, "train_speed(iter/s)": 0.143788 }, { "epoch": 0.765435382733911, "grad_norm": 0.6722486615180969, "learning_rate": 7.266745109087598e-05, "loss": 0.8618797302246094, "memory(GiB)": 91.52, "step": 58990, "token_acc": 0.7736325385694249, "train_speed(iter/s)": 0.143786 }, { "epoch": 0.7655002611355667, "grad_norm": 0.6914911866188049, "learning_rate": 7.266267007055199e-05, "loss": 0.8987990379333496, "memory(GiB)": 91.52, "step": 58995, "token_acc": 0.770998521704072, "train_speed(iter/s)": 0.143784 }, { "epoch": 0.7655651395372224, "grad_norm": 0.7329782843589783, "learning_rate": 7.265788878942746e-05, "loss": 0.8716008186340332, "memory(GiB)": 91.52, "step": 59000, "token_acc": 0.7545019920318725, "train_speed(iter/s)": 0.143781 }, { "epoch": 0.7656300179388781, "grad_norm": 0.7247846722602844, "learning_rate": 7.26531072475574e-05, "loss": 0.8771734237670898, "memory(GiB)": 91.52, "step": 59005, "token_acc": 0.7678645616950561, "train_speed(iter/s)": 0.143779 }, { "epoch": 0.7656948963405338, "grad_norm": 0.7282172441482544, "learning_rate": 7.264832544499687e-05, "loss": 0.9018068313598633, "memory(GiB)": 91.52, "step": 59010, "token_acc": 0.7548331040474984, "train_speed(iter/s)": 0.143777 }, { "epoch": 0.7657597747421895, "grad_norm": 0.6833682656288147, "learning_rate": 7.264354338180083e-05, "loss": 0.8784211158752442, "memory(GiB)": 91.52, "step": 59015, "token_acc": 0.7541392821031345, "train_speed(iter/s)": 0.143775 }, { "epoch": 0.7658246531438452, "grad_norm": 0.7332867383956909, "learning_rate": 7.263876105802436e-05, "loss": 0.884449577331543, "memory(GiB)": 91.52, "step": 59020, "token_acc": 0.7506868131868132, "train_speed(iter/s)": 0.143773 }, { "epoch": 0.7658895315455009, "grad_norm": 0.7584024667739868, "learning_rate": 7.26339784737225e-05, "loss": 0.8668937683105469, "memory(GiB)": 91.52, "step": 59025, "token_acc": 0.7497692228794147, "train_speed(iter/s)": 0.14377 }, { "epoch": 0.7659544099471566, "grad_norm": 0.7059548497200012, "learning_rate": 7.262919562895025e-05, "loss": 0.9006650924682618, "memory(GiB)": 91.52, "step": 59030, "token_acc": 0.7643645197206383, "train_speed(iter/s)": 0.143768 }, { "epoch": 0.7660192883488123, "grad_norm": 0.7822686433792114, "learning_rate": 7.26244125237627e-05, "loss": 0.8698400497436524, "memory(GiB)": 91.52, "step": 59035, "token_acc": 0.7621828626136907, "train_speed(iter/s)": 0.143766 }, { "epoch": 0.766084166750468, "grad_norm": 0.7657418251037598, "learning_rate": 7.261962915821486e-05, "loss": 0.8796032905578614, "memory(GiB)": 91.52, "step": 59040, "token_acc": 0.7594279970167644, "train_speed(iter/s)": 0.143764 }, { "epoch": 0.7661490451521237, "grad_norm": 0.7551046013832092, "learning_rate": 7.261484553236177e-05, "loss": 0.8675605773925781, "memory(GiB)": 91.52, "step": 59045, "token_acc": 0.7552800901523138, "train_speed(iter/s)": 0.143762 }, { "epoch": 0.7662139235537794, "grad_norm": 0.7165749073028564, "learning_rate": 7.26100616462585e-05, "loss": 0.8972913742065429, "memory(GiB)": 91.52, "step": 59050, "token_acc": 0.7630080925566941, "train_speed(iter/s)": 0.143761 }, { "epoch": 0.7662788019554351, "grad_norm": 0.7963652610778809, "learning_rate": 7.260527749996009e-05, "loss": 0.9194169998168945, "memory(GiB)": 91.52, "step": 59055, "token_acc": 0.7407047545018784, "train_speed(iter/s)": 0.143758 }, { "epoch": 0.7663436803570908, "grad_norm": 0.734219491481781, "learning_rate": 7.260049309352161e-05, "loss": 0.851133918762207, "memory(GiB)": 91.52, "step": 59060, "token_acc": 0.7629969947148434, "train_speed(iter/s)": 0.143756 }, { "epoch": 0.7664085587587465, "grad_norm": 0.7741819620132446, "learning_rate": 7.25957084269981e-05, "loss": 0.9078925132751465, "memory(GiB)": 91.52, "step": 59065, "token_acc": 0.7458512340999074, "train_speed(iter/s)": 0.143754 }, { "epoch": 0.7664734371604022, "grad_norm": 0.6632319092750549, "learning_rate": 7.259092350044464e-05, "loss": 0.9248880386352539, "memory(GiB)": 91.52, "step": 59070, "token_acc": 0.7319663952103518, "train_speed(iter/s)": 0.143752 }, { "epoch": 0.7665383155620578, "grad_norm": 0.6402105093002319, "learning_rate": 7.258613831391628e-05, "loss": 0.8716675758361816, "memory(GiB)": 91.52, "step": 59075, "token_acc": 0.7456154569326029, "train_speed(iter/s)": 0.14375 }, { "epoch": 0.7666031939637135, "grad_norm": 0.7558024525642395, "learning_rate": 7.25813528674681e-05, "loss": 0.8886917114257813, "memory(GiB)": 91.52, "step": 59080, "token_acc": 0.776706293124948, "train_speed(iter/s)": 0.143748 }, { "epoch": 0.7666680723653692, "grad_norm": 0.794067919254303, "learning_rate": 7.257656716115518e-05, "loss": 0.8661538124084472, "memory(GiB)": 91.52, "step": 59085, "token_acc": 0.7481198745323306, "train_speed(iter/s)": 0.143745 }, { "epoch": 0.7667329507670249, "grad_norm": 0.6780319213867188, "learning_rate": 7.257178119503254e-05, "loss": 0.9237881660461426, "memory(GiB)": 91.52, "step": 59090, "token_acc": 0.7630682230116725, "train_speed(iter/s)": 0.143744 }, { "epoch": 0.7667978291686806, "grad_norm": 0.7147473692893982, "learning_rate": 7.256699496915533e-05, "loss": 0.9197657585144043, "memory(GiB)": 91.52, "step": 59095, "token_acc": 0.7664010303469372, "train_speed(iter/s)": 0.143742 }, { "epoch": 0.7668627075703363, "grad_norm": 0.7477144002914429, "learning_rate": 7.256220848357856e-05, "loss": 0.857125186920166, "memory(GiB)": 91.52, "step": 59100, "token_acc": 0.7535843705687727, "train_speed(iter/s)": 0.14374 }, { "epoch": 0.766927585971992, "grad_norm": 0.7663062810897827, "learning_rate": 7.255742173835738e-05, "loss": 0.8331643104553222, "memory(GiB)": 91.52, "step": 59105, "token_acc": 0.7636329854767288, "train_speed(iter/s)": 0.143738 }, { "epoch": 0.7669924643736477, "grad_norm": 0.722871720790863, "learning_rate": 7.255263473354683e-05, "loss": 0.8351519584655762, "memory(GiB)": 91.52, "step": 59110, "token_acc": 0.7566910796854434, "train_speed(iter/s)": 0.143736 }, { "epoch": 0.7670573427753034, "grad_norm": 0.6707016229629517, "learning_rate": 7.2547847469202e-05, "loss": 0.8200799942016601, "memory(GiB)": 91.52, "step": 59115, "token_acc": 0.7608861992209237, "train_speed(iter/s)": 0.143734 }, { "epoch": 0.767122221176959, "grad_norm": 0.7585172057151794, "learning_rate": 7.254305994537801e-05, "loss": 0.8444695472717285, "memory(GiB)": 91.52, "step": 59120, "token_acc": 0.7850976828714221, "train_speed(iter/s)": 0.143732 }, { "epoch": 0.7671870995786148, "grad_norm": 0.6637758612632751, "learning_rate": 7.253827216212991e-05, "loss": 0.8690332412719727, "memory(GiB)": 91.52, "step": 59125, "token_acc": 0.7580591658876391, "train_speed(iter/s)": 0.143729 }, { "epoch": 0.7672519779802704, "grad_norm": 0.7096869349479675, "learning_rate": 7.253348411951286e-05, "loss": 0.8327922821044922, "memory(GiB)": 91.52, "step": 59130, "token_acc": 0.7766199934874634, "train_speed(iter/s)": 0.143726 }, { "epoch": 0.7673168563819261, "grad_norm": 0.8285040855407715, "learning_rate": 7.25286958175819e-05, "loss": 0.9090295791625976, "memory(GiB)": 91.52, "step": 59135, "token_acc": 0.7617339201834523, "train_speed(iter/s)": 0.143725 }, { "epoch": 0.7673817347835818, "grad_norm": 0.6888059973716736, "learning_rate": 7.252390725639217e-05, "loss": 0.8488517761230469, "memory(GiB)": 91.52, "step": 59140, "token_acc": 0.779457181266577, "train_speed(iter/s)": 0.143723 }, { "epoch": 0.7674466131852375, "grad_norm": 0.6960793137550354, "learning_rate": 7.251911843599876e-05, "loss": 0.843897533416748, "memory(GiB)": 91.52, "step": 59145, "token_acc": 0.7617964155698684, "train_speed(iter/s)": 0.143721 }, { "epoch": 0.7675114915868932, "grad_norm": 0.6922509670257568, "learning_rate": 7.25143293564568e-05, "loss": 0.8370449066162109, "memory(GiB)": 91.52, "step": 59150, "token_acc": 0.7683485482648387, "train_speed(iter/s)": 0.143719 }, { "epoch": 0.767576369988549, "grad_norm": 0.7012591361999512, "learning_rate": 7.250954001782137e-05, "loss": 0.8835386276245117, "memory(GiB)": 91.52, "step": 59155, "token_acc": 0.761848213321818, "train_speed(iter/s)": 0.143716 }, { "epoch": 0.7676412483902046, "grad_norm": 0.6980513334274292, "learning_rate": 7.25047504201476e-05, "loss": 0.8425398826599121, "memory(GiB)": 91.52, "step": 59160, "token_acc": 0.7431970082749841, "train_speed(iter/s)": 0.143713 }, { "epoch": 0.7677061267918603, "grad_norm": 0.6998262405395508, "learning_rate": 7.249996056349062e-05, "loss": 0.8788604736328125, "memory(GiB)": 91.52, "step": 59165, "token_acc": 0.7518687329079308, "train_speed(iter/s)": 0.143711 }, { "epoch": 0.767771005193516, "grad_norm": 0.7748426198959351, "learning_rate": 7.249517044790554e-05, "loss": 0.8317755699157715, "memory(GiB)": 91.52, "step": 59170, "token_acc": 0.7755970993280553, "train_speed(iter/s)": 0.143708 }, { "epoch": 0.7678358835951717, "grad_norm": 0.681020975112915, "learning_rate": 7.249038007344748e-05, "loss": 0.8186084747314453, "memory(GiB)": 91.52, "step": 59175, "token_acc": 0.7890416189385262, "train_speed(iter/s)": 0.143706 }, { "epoch": 0.7679007619968274, "grad_norm": 1.0861843824386597, "learning_rate": 7.248558944017158e-05, "loss": 0.8563125610351563, "memory(GiB)": 91.52, "step": 59180, "token_acc": 0.7532877882152007, "train_speed(iter/s)": 0.143703 }, { "epoch": 0.7679656403984831, "grad_norm": 0.6585047245025635, "learning_rate": 7.248079854813297e-05, "loss": 0.8654443740844726, "memory(GiB)": 91.52, "step": 59185, "token_acc": 0.7655891741221957, "train_speed(iter/s)": 0.143701 }, { "epoch": 0.7680305188001388, "grad_norm": 0.7290235757827759, "learning_rate": 7.247600739738677e-05, "loss": 0.8651285171508789, "memory(GiB)": 91.52, "step": 59190, "token_acc": 0.7711258278145695, "train_speed(iter/s)": 0.143699 }, { "epoch": 0.7680953972017945, "grad_norm": 0.7743969559669495, "learning_rate": 7.247121598798812e-05, "loss": 0.8918405532836914, "memory(GiB)": 91.52, "step": 59195, "token_acc": 0.743491912086165, "train_speed(iter/s)": 0.143697 }, { "epoch": 0.7681602756034502, "grad_norm": 0.768901526927948, "learning_rate": 7.246642431999215e-05, "loss": 0.8999360084533692, "memory(GiB)": 91.52, "step": 59200, "token_acc": 0.7308775222900047, "train_speed(iter/s)": 0.143696 }, { "epoch": 0.7682251540051059, "grad_norm": 0.7923042178153992, "learning_rate": 7.246163239345403e-05, "loss": 0.8913782119750977, "memory(GiB)": 91.52, "step": 59205, "token_acc": 0.7594732401599354, "train_speed(iter/s)": 0.143694 }, { "epoch": 0.7682900324067616, "grad_norm": 0.83741694688797, "learning_rate": 7.24568402084289e-05, "loss": 0.884007453918457, "memory(GiB)": 91.52, "step": 59210, "token_acc": 0.7557459108855048, "train_speed(iter/s)": 0.143691 }, { "epoch": 0.7683549108084173, "grad_norm": 0.8086025714874268, "learning_rate": 7.245204776497189e-05, "loss": 0.8836262702941895, "memory(GiB)": 91.52, "step": 59215, "token_acc": 0.755219412274482, "train_speed(iter/s)": 0.143689 }, { "epoch": 0.768419789210073, "grad_norm": 0.7854841947555542, "learning_rate": 7.244725506313816e-05, "loss": 0.888063621520996, "memory(GiB)": 91.52, "step": 59220, "token_acc": 0.7533123587539328, "train_speed(iter/s)": 0.143686 }, { "epoch": 0.7684846676117287, "grad_norm": 0.7581737637519836, "learning_rate": 7.244246210298287e-05, "loss": 0.8992964744567871, "memory(GiB)": 91.52, "step": 59225, "token_acc": 0.744750656167979, "train_speed(iter/s)": 0.143684 }, { "epoch": 0.7685495460133844, "grad_norm": 0.7742145657539368, "learning_rate": 7.243766888456114e-05, "loss": 0.8587448120117187, "memory(GiB)": 91.52, "step": 59230, "token_acc": 0.7668239384156941, "train_speed(iter/s)": 0.143682 }, { "epoch": 0.7686144244150401, "grad_norm": 0.7424253225326538, "learning_rate": 7.243287540792818e-05, "loss": 0.8858048439025878, "memory(GiB)": 91.52, "step": 59235, "token_acc": 0.745693529059317, "train_speed(iter/s)": 0.14368 }, { "epoch": 0.7686793028166958, "grad_norm": 0.766539990901947, "learning_rate": 7.242808167313915e-05, "loss": 0.888249683380127, "memory(GiB)": 91.52, "step": 59240, "token_acc": 0.7691179705256984, "train_speed(iter/s)": 0.143678 }, { "epoch": 0.7687441812183515, "grad_norm": 0.7404181361198425, "learning_rate": 7.242328768024918e-05, "loss": 0.8582648277282715, "memory(GiB)": 91.52, "step": 59245, "token_acc": 0.7524710068529257, "train_speed(iter/s)": 0.143676 }, { "epoch": 0.7688090596200072, "grad_norm": 0.7587968111038208, "learning_rate": 7.241849342931346e-05, "loss": 0.8605768203735351, "memory(GiB)": 91.52, "step": 59250, "token_acc": 0.7701962056105075, "train_speed(iter/s)": 0.143673 }, { "epoch": 0.7688739380216629, "grad_norm": 0.6541410684585571, "learning_rate": 7.241369892038717e-05, "loss": 0.8219085693359375, "memory(GiB)": 91.52, "step": 59255, "token_acc": 0.7511030506455891, "train_speed(iter/s)": 0.143671 }, { "epoch": 0.7689388164233186, "grad_norm": 0.6625531911849976, "learning_rate": 7.240890415352548e-05, "loss": 0.8864217758178711, "memory(GiB)": 91.52, "step": 59260, "token_acc": 0.7638257201239207, "train_speed(iter/s)": 0.143669 }, { "epoch": 0.7690036948249743, "grad_norm": 0.7909151911735535, "learning_rate": 7.240410912878353e-05, "loss": 0.9292981147766113, "memory(GiB)": 91.52, "step": 59265, "token_acc": 0.762911464245176, "train_speed(iter/s)": 0.143667 }, { "epoch": 0.76906857322663, "grad_norm": 0.6704897880554199, "learning_rate": 7.239931384621656e-05, "loss": 0.8292366981506347, "memory(GiB)": 91.52, "step": 59270, "token_acc": 0.7783968094498737, "train_speed(iter/s)": 0.143665 }, { "epoch": 0.7691334516282857, "grad_norm": 0.7150152921676636, "learning_rate": 7.239451830587973e-05, "loss": 0.8938621520996094, "memory(GiB)": 91.52, "step": 59275, "token_acc": 0.7703153123125125, "train_speed(iter/s)": 0.143663 }, { "epoch": 0.7691983300299414, "grad_norm": 0.8212878108024597, "learning_rate": 7.238972250782821e-05, "loss": 0.9091384887695313, "memory(GiB)": 91.52, "step": 59280, "token_acc": 0.7390583439842482, "train_speed(iter/s)": 0.143661 }, { "epoch": 0.7692632084315971, "grad_norm": 0.7857820987701416, "learning_rate": 7.238492645211723e-05, "loss": 0.8753874778747559, "memory(GiB)": 91.52, "step": 59285, "token_acc": 0.7587156279037753, "train_speed(iter/s)": 0.143659 }, { "epoch": 0.7693280868332528, "grad_norm": 0.7674109935760498, "learning_rate": 7.238013013880192e-05, "loss": 0.8764989852905274, "memory(GiB)": 91.52, "step": 59290, "token_acc": 0.7780738214501404, "train_speed(iter/s)": 0.143657 }, { "epoch": 0.7693929652349085, "grad_norm": 0.7084083557128906, "learning_rate": 7.237533356793755e-05, "loss": 0.8498571395874024, "memory(GiB)": 91.52, "step": 59295, "token_acc": 0.763030219018575, "train_speed(iter/s)": 0.143655 }, { "epoch": 0.7694578436365642, "grad_norm": 0.8276823163032532, "learning_rate": 7.237053673957926e-05, "loss": 0.8988974571228028, "memory(GiB)": 91.52, "step": 59300, "token_acc": 0.7629209083790133, "train_speed(iter/s)": 0.143653 }, { "epoch": 0.7695227220382199, "grad_norm": 0.7365827560424805, "learning_rate": 7.236573965378229e-05, "loss": 0.9111964225769043, "memory(GiB)": 91.52, "step": 59305, "token_acc": 0.7341914521369658, "train_speed(iter/s)": 0.143651 }, { "epoch": 0.7695876004398756, "grad_norm": 0.7352044582366943, "learning_rate": 7.23609423106018e-05, "loss": 0.9066415786743164, "memory(GiB)": 91.52, "step": 59310, "token_acc": 0.7563256115711942, "train_speed(iter/s)": 0.143649 }, { "epoch": 0.7696524788415312, "grad_norm": 0.7024168968200684, "learning_rate": 7.235614471009304e-05, "loss": 0.8902002334594726, "memory(GiB)": 91.52, "step": 59315, "token_acc": 0.7465330221062749, "train_speed(iter/s)": 0.143647 }, { "epoch": 0.7697173572431869, "grad_norm": 0.7791479825973511, "learning_rate": 7.23513468523112e-05, "loss": 0.9062461853027344, "memory(GiB)": 91.52, "step": 59320, "token_acc": 0.7620930455023756, "train_speed(iter/s)": 0.143645 }, { "epoch": 0.7697822356448426, "grad_norm": 0.7059034705162048, "learning_rate": 7.23465487373115e-05, "loss": 0.9178660392761231, "memory(GiB)": 91.52, "step": 59325, "token_acc": 0.753419984307304, "train_speed(iter/s)": 0.143643 }, { "epoch": 0.7698471140464983, "grad_norm": 0.7459490299224854, "learning_rate": 7.234175036514918e-05, "loss": 0.8935237884521484, "memory(GiB)": 91.52, "step": 59330, "token_acc": 0.7615240895586388, "train_speed(iter/s)": 0.143641 }, { "epoch": 0.769911992448154, "grad_norm": 0.7179960608482361, "learning_rate": 7.23369517358794e-05, "loss": 0.8834095001220703, "memory(GiB)": 91.52, "step": 59335, "token_acc": 0.7512688803277686, "train_speed(iter/s)": 0.143638 }, { "epoch": 0.7699768708498097, "grad_norm": 0.740086019039154, "learning_rate": 7.233215284955744e-05, "loss": 0.8750171661376953, "memory(GiB)": 91.52, "step": 59340, "token_acc": 0.7497221193034457, "train_speed(iter/s)": 0.143636 }, { "epoch": 0.7700417492514654, "grad_norm": 0.7279273867607117, "learning_rate": 7.23273537062385e-05, "loss": 0.8638898849487304, "memory(GiB)": 91.52, "step": 59345, "token_acc": 0.764095363106322, "train_speed(iter/s)": 0.143634 }, { "epoch": 0.7701066276531211, "grad_norm": 0.6637063026428223, "learning_rate": 7.23225543059778e-05, "loss": 0.8876767158508301, "memory(GiB)": 91.52, "step": 59350, "token_acc": 0.7518718569566027, "train_speed(iter/s)": 0.143632 }, { "epoch": 0.7701715060547768, "grad_norm": 0.6688801646232605, "learning_rate": 7.231775464883059e-05, "loss": 0.8518636703491211, "memory(GiB)": 91.52, "step": 59355, "token_acc": 0.7609870171760342, "train_speed(iter/s)": 0.14363 }, { "epoch": 0.7702363844564325, "grad_norm": 0.661000669002533, "learning_rate": 7.231295473485208e-05, "loss": 0.9041874885559082, "memory(GiB)": 91.52, "step": 59360, "token_acc": 0.7377200127492297, "train_speed(iter/s)": 0.143628 }, { "epoch": 0.7703012628580882, "grad_norm": 0.6755324602127075, "learning_rate": 7.230815456409755e-05, "loss": 0.8280040740966796, "memory(GiB)": 91.52, "step": 59365, "token_acc": 0.7854568249490675, "train_speed(iter/s)": 0.143625 }, { "epoch": 0.7703661412597439, "grad_norm": 0.6634871959686279, "learning_rate": 7.23033541366222e-05, "loss": 0.8624700546264649, "memory(GiB)": 91.52, "step": 59370, "token_acc": 0.7728487064116986, "train_speed(iter/s)": 0.143623 }, { "epoch": 0.7704310196613996, "grad_norm": 0.7051756381988525, "learning_rate": 7.229855345248128e-05, "loss": 0.8784540176391602, "memory(GiB)": 91.52, "step": 59375, "token_acc": 0.7787546582727306, "train_speed(iter/s)": 0.143621 }, { "epoch": 0.7704958980630553, "grad_norm": 0.7159348726272583, "learning_rate": 7.229375251173003e-05, "loss": 0.8985603332519532, "memory(GiB)": 91.52, "step": 59380, "token_acc": 0.738949921005254, "train_speed(iter/s)": 0.143619 }, { "epoch": 0.770560776464711, "grad_norm": 0.7809571027755737, "learning_rate": 7.228895131442374e-05, "loss": 0.8593652725219727, "memory(GiB)": 91.52, "step": 59385, "token_acc": 0.7594824530308402, "train_speed(iter/s)": 0.143617 }, { "epoch": 0.7706256548663667, "grad_norm": 0.7639727592468262, "learning_rate": 7.228414986061761e-05, "loss": 0.9344654083251953, "memory(GiB)": 91.52, "step": 59390, "token_acc": 0.7513264826381892, "train_speed(iter/s)": 0.143615 }, { "epoch": 0.7706905332680224, "grad_norm": 0.7366949319839478, "learning_rate": 7.227934815036694e-05, "loss": 0.8649377822875977, "memory(GiB)": 91.52, "step": 59395, "token_acc": 0.7914286646864902, "train_speed(iter/s)": 0.143613 }, { "epoch": 0.7707554116696781, "grad_norm": 0.6395632028579712, "learning_rate": 7.227454618372694e-05, "loss": 0.8817549705505371, "memory(GiB)": 91.52, "step": 59400, "token_acc": 0.7579286187636088, "train_speed(iter/s)": 0.14361 }, { "epoch": 0.7708202900713338, "grad_norm": 0.725332498550415, "learning_rate": 7.226974396075293e-05, "loss": 0.8408315658569336, "memory(GiB)": 91.52, "step": 59405, "token_acc": 0.7707050974899562, "train_speed(iter/s)": 0.143607 }, { "epoch": 0.7708851684729895, "grad_norm": 0.7605773210525513, "learning_rate": 7.22649414815001e-05, "loss": 0.8840738296508789, "memory(GiB)": 91.52, "step": 59410, "token_acc": 0.7592389315770216, "train_speed(iter/s)": 0.143605 }, { "epoch": 0.7709500468746452, "grad_norm": 0.7707629799842834, "learning_rate": 7.226013874602377e-05, "loss": 0.8927349090576172, "memory(GiB)": 91.52, "step": 59415, "token_acc": 0.7558950135232944, "train_speed(iter/s)": 0.143604 }, { "epoch": 0.7710149252763009, "grad_norm": 0.7349158525466919, "learning_rate": 7.225533575437922e-05, "loss": 0.8979676246643067, "memory(GiB)": 91.52, "step": 59420, "token_acc": 0.7547078851426677, "train_speed(iter/s)": 0.143602 }, { "epoch": 0.7710798036779566, "grad_norm": 0.7077314853668213, "learning_rate": 7.225053250662166e-05, "loss": 0.8955909729003906, "memory(GiB)": 91.52, "step": 59425, "token_acc": 0.7675598935226264, "train_speed(iter/s)": 0.1436 }, { "epoch": 0.7711446820796123, "grad_norm": 0.7707861661911011, "learning_rate": 7.224572900280643e-05, "loss": 0.9276586532592773, "memory(GiB)": 91.52, "step": 59430, "token_acc": 0.7449401603045054, "train_speed(iter/s)": 0.143599 }, { "epoch": 0.771209560481268, "grad_norm": 0.7177035808563232, "learning_rate": 7.224092524298878e-05, "loss": 0.8778731346130371, "memory(GiB)": 91.52, "step": 59435, "token_acc": 0.7662167396680671, "train_speed(iter/s)": 0.143596 }, { "epoch": 0.7712744388829237, "grad_norm": 0.6831574440002441, "learning_rate": 7.223612122722398e-05, "loss": 0.863680076599121, "memory(GiB)": 91.52, "step": 59440, "token_acc": 0.7635237287510601, "train_speed(iter/s)": 0.143594 }, { "epoch": 0.7713393172845794, "grad_norm": 0.7153286337852478, "learning_rate": 7.223131695556735e-05, "loss": 0.8724400520324707, "memory(GiB)": 91.52, "step": 59445, "token_acc": 0.7571761299257208, "train_speed(iter/s)": 0.143593 }, { "epoch": 0.7714041956862351, "grad_norm": 0.7391344308853149, "learning_rate": 7.222651242807413e-05, "loss": 0.8946628570556641, "memory(GiB)": 91.52, "step": 59450, "token_acc": 0.7461992363762583, "train_speed(iter/s)": 0.143591 }, { "epoch": 0.7714690740878908, "grad_norm": 0.6896578073501587, "learning_rate": 7.222170764479966e-05, "loss": 0.8478519439697265, "memory(GiB)": 91.52, "step": 59455, "token_acc": 0.7730001677008217, "train_speed(iter/s)": 0.143588 }, { "epoch": 0.7715339524895465, "grad_norm": 0.6823228001594543, "learning_rate": 7.221690260579919e-05, "loss": 0.8443768501281739, "memory(GiB)": 91.52, "step": 59460, "token_acc": 0.7603388658367911, "train_speed(iter/s)": 0.143585 }, { "epoch": 0.7715988308912022, "grad_norm": 0.7681478261947632, "learning_rate": 7.221209731112805e-05, "loss": 0.860858154296875, "memory(GiB)": 91.52, "step": 59465, "token_acc": 0.7573696782352115, "train_speed(iter/s)": 0.143583 }, { "epoch": 0.7716637092928579, "grad_norm": 0.813465416431427, "learning_rate": 7.220729176084151e-05, "loss": 0.898292064666748, "memory(GiB)": 91.52, "step": 59470, "token_acc": 0.7483333947184265, "train_speed(iter/s)": 0.143581 }, { "epoch": 0.7717285876945136, "grad_norm": 0.6805701851844788, "learning_rate": 7.22024859549949e-05, "loss": 0.8404376983642579, "memory(GiB)": 91.52, "step": 59475, "token_acc": 0.7737961262808742, "train_speed(iter/s)": 0.143579 }, { "epoch": 0.7717934660961693, "grad_norm": 0.6803757548332214, "learning_rate": 7.21976798936435e-05, "loss": 0.8704686164855957, "memory(GiB)": 91.52, "step": 59480, "token_acc": 0.758078524153215, "train_speed(iter/s)": 0.143576 }, { "epoch": 0.771858344497825, "grad_norm": 0.7939257621765137, "learning_rate": 7.219287357684261e-05, "loss": 0.882408618927002, "memory(GiB)": 91.52, "step": 59485, "token_acc": 0.7595992700458083, "train_speed(iter/s)": 0.143573 }, { "epoch": 0.7719232228994807, "grad_norm": 0.7183137536048889, "learning_rate": 7.218806700464759e-05, "loss": 0.8935237884521484, "memory(GiB)": 91.52, "step": 59490, "token_acc": 0.7509391854193789, "train_speed(iter/s)": 0.14357 }, { "epoch": 0.7719881013011364, "grad_norm": 0.8341670632362366, "learning_rate": 7.218326017711371e-05, "loss": 0.8847696304321289, "memory(GiB)": 91.52, "step": 59495, "token_acc": 0.7580925861468848, "train_speed(iter/s)": 0.143568 }, { "epoch": 0.7720529797027921, "grad_norm": 0.7043835520744324, "learning_rate": 7.217845309429631e-05, "loss": 0.8096505165100097, "memory(GiB)": 91.52, "step": 59500, "token_acc": 0.7760821863155795, "train_speed(iter/s)": 0.143565 }, { "epoch": 0.7721178581044478, "grad_norm": 0.6426109075546265, "learning_rate": 7.217364575625068e-05, "loss": 0.871275520324707, "memory(GiB)": 91.52, "step": 59505, "token_acc": 0.75773476243801, "train_speed(iter/s)": 0.143563 }, { "epoch": 0.7721827365061035, "grad_norm": 0.7634002566337585, "learning_rate": 7.216883816303219e-05, "loss": 0.8597244262695313, "memory(GiB)": 91.52, "step": 59510, "token_acc": 0.7410058211661418, "train_speed(iter/s)": 0.143561 }, { "epoch": 0.7722476149077592, "grad_norm": 0.7487270832061768, "learning_rate": 7.216403031469612e-05, "loss": 0.9236057281494141, "memory(GiB)": 91.52, "step": 59515, "token_acc": 0.7625188124230401, "train_speed(iter/s)": 0.143559 }, { "epoch": 0.7723124933094149, "grad_norm": 0.6820129156112671, "learning_rate": 7.215922221129781e-05, "loss": 0.8840991973876953, "memory(GiB)": 91.52, "step": 59520, "token_acc": 0.7665607597963855, "train_speed(iter/s)": 0.143556 }, { "epoch": 0.7723773717110706, "grad_norm": 0.72442227602005, "learning_rate": 7.21544138528926e-05, "loss": 0.8922192573547363, "memory(GiB)": 91.52, "step": 59525, "token_acc": 0.755175114691731, "train_speed(iter/s)": 0.143554 }, { "epoch": 0.7724422501127263, "grad_norm": 0.7868696451187134, "learning_rate": 7.214960523953582e-05, "loss": 0.8539778709411621, "memory(GiB)": 91.52, "step": 59530, "token_acc": 0.7723157007221673, "train_speed(iter/s)": 0.143552 }, { "epoch": 0.772507128514382, "grad_norm": 0.7943094372749329, "learning_rate": 7.214479637128281e-05, "loss": 0.8913946151733398, "memory(GiB)": 91.52, "step": 59535, "token_acc": 0.7633352721798338, "train_speed(iter/s)": 0.14355 }, { "epoch": 0.7725720069160377, "grad_norm": 0.6125766634941101, "learning_rate": 7.213998724818892e-05, "loss": 0.9084775924682618, "memory(GiB)": 91.52, "step": 59540, "token_acc": 0.7501793543244321, "train_speed(iter/s)": 0.143548 }, { "epoch": 0.7726368853176934, "grad_norm": 0.7472289800643921, "learning_rate": 7.213517787030946e-05, "loss": 0.8770654678344727, "memory(GiB)": 91.52, "step": 59545, "token_acc": 0.7535729753139887, "train_speed(iter/s)": 0.143545 }, { "epoch": 0.772701763719349, "grad_norm": 0.6860514283180237, "learning_rate": 7.213036823769982e-05, "loss": 0.8451112747192383, "memory(GiB)": 91.52, "step": 59550, "token_acc": 0.771046681739593, "train_speed(iter/s)": 0.143543 }, { "epoch": 0.7727666421210047, "grad_norm": 0.6864181160926819, "learning_rate": 7.21255583504153e-05, "loss": 0.8707266807556152, "memory(GiB)": 91.52, "step": 59555, "token_acc": 0.7542082789286092, "train_speed(iter/s)": 0.143541 }, { "epoch": 0.7728315205226604, "grad_norm": 0.7701334953308105, "learning_rate": 7.21207482085113e-05, "loss": 0.8901234626770019, "memory(GiB)": 91.52, "step": 59560, "token_acc": 0.7644046992810801, "train_speed(iter/s)": 0.143538 }, { "epoch": 0.772896398924316, "grad_norm": 0.7327852845191956, "learning_rate": 7.211593781204316e-05, "loss": 0.876491641998291, "memory(GiB)": 91.52, "step": 59565, "token_acc": 0.7750881933498461, "train_speed(iter/s)": 0.143536 }, { "epoch": 0.7729612773259718, "grad_norm": 0.6482493281364441, "learning_rate": 7.211112716106621e-05, "loss": 0.8329750061035156, "memory(GiB)": 91.52, "step": 59570, "token_acc": 0.7560097094980816, "train_speed(iter/s)": 0.143534 }, { "epoch": 0.7730261557276275, "grad_norm": 0.7407301664352417, "learning_rate": 7.210631625563585e-05, "loss": 0.8329599380493165, "memory(GiB)": 91.52, "step": 59575, "token_acc": 0.7624128312412831, "train_speed(iter/s)": 0.143532 }, { "epoch": 0.7730910341292832, "grad_norm": 0.7324344515800476, "learning_rate": 7.210150509580742e-05, "loss": 0.8827426910400391, "memory(GiB)": 91.52, "step": 59580, "token_acc": 0.7772434221989893, "train_speed(iter/s)": 0.14353 }, { "epoch": 0.7731559125309388, "grad_norm": 0.7219126224517822, "learning_rate": 7.209669368163628e-05, "loss": 0.816743278503418, "memory(GiB)": 91.52, "step": 59585, "token_acc": 0.7583280955373979, "train_speed(iter/s)": 0.143527 }, { "epoch": 0.7732207909325945, "grad_norm": 0.6962913274765015, "learning_rate": 7.209188201317781e-05, "loss": 0.8229263305664063, "memory(GiB)": 91.52, "step": 59590, "token_acc": 0.7404733987981826, "train_speed(iter/s)": 0.143525 }, { "epoch": 0.7732856693342502, "grad_norm": 0.7524318099021912, "learning_rate": 7.20870700904874e-05, "loss": 0.855408763885498, "memory(GiB)": 91.52, "step": 59595, "token_acc": 0.7637065358534122, "train_speed(iter/s)": 0.143522 }, { "epoch": 0.773350547735906, "grad_norm": 0.687450110912323, "learning_rate": 7.20822579136204e-05, "loss": 0.9030306816101075, "memory(GiB)": 91.52, "step": 59600, "token_acc": 0.7750983923687546, "train_speed(iter/s)": 0.143519 }, { "epoch": 0.7734154261375616, "grad_norm": 0.7377513647079468, "learning_rate": 7.20774454826322e-05, "loss": 0.9187492370605469, "memory(GiB)": 91.52, "step": 59605, "token_acc": 0.7493999430454416, "train_speed(iter/s)": 0.143517 }, { "epoch": 0.7734803045392173, "grad_norm": 0.7612864375114441, "learning_rate": 7.207263279757818e-05, "loss": 0.8459649085998535, "memory(GiB)": 91.52, "step": 59610, "token_acc": 0.7669927826784282, "train_speed(iter/s)": 0.143515 }, { "epoch": 0.773545182940873, "grad_norm": 0.6903777718544006, "learning_rate": 7.206781985851372e-05, "loss": 0.8602468490600585, "memory(GiB)": 91.52, "step": 59615, "token_acc": 0.7698465643762509, "train_speed(iter/s)": 0.143513 }, { "epoch": 0.7736100613425287, "grad_norm": 0.7590996623039246, "learning_rate": 7.206300666549421e-05, "loss": 0.8865789413452149, "memory(GiB)": 91.52, "step": 59620, "token_acc": 0.7678707569431839, "train_speed(iter/s)": 0.143512 }, { "epoch": 0.7736749397441844, "grad_norm": 0.6689212918281555, "learning_rate": 7.205819321857503e-05, "loss": 0.8871008872985839, "memory(GiB)": 91.52, "step": 59625, "token_acc": 0.7750449236298292, "train_speed(iter/s)": 0.143509 }, { "epoch": 0.7737398181458401, "grad_norm": 0.7597545981407166, "learning_rate": 7.20533795178116e-05, "loss": 0.8604721069335938, "memory(GiB)": 91.52, "step": 59630, "token_acc": 0.7713612704684473, "train_speed(iter/s)": 0.143507 }, { "epoch": 0.7738046965474958, "grad_norm": 0.7271091341972351, "learning_rate": 7.204856556325928e-05, "loss": 0.9234145164489747, "memory(GiB)": 91.52, "step": 59635, "token_acc": 0.7516498316498317, "train_speed(iter/s)": 0.143505 }, { "epoch": 0.7738695749491515, "grad_norm": 0.7716441750526428, "learning_rate": 7.204375135497349e-05, "loss": 0.8639185905456543, "memory(GiB)": 91.52, "step": 59640, "token_acc": 0.7551174180585206, "train_speed(iter/s)": 0.143502 }, { "epoch": 0.7739344533508072, "grad_norm": 0.8259319067001343, "learning_rate": 7.203893689300964e-05, "loss": 0.8577230453491211, "memory(GiB)": 91.52, "step": 59645, "token_acc": 0.7689830339652711, "train_speed(iter/s)": 0.1435 }, { "epoch": 0.7739993317524629, "grad_norm": 0.7314212918281555, "learning_rate": 7.203412217742311e-05, "loss": 0.8952140808105469, "memory(GiB)": 91.52, "step": 59650, "token_acc": 0.7457262402869097, "train_speed(iter/s)": 0.143497 }, { "epoch": 0.7740642101541186, "grad_norm": 0.7437614798545837, "learning_rate": 7.202930720826933e-05, "loss": 0.859316062927246, "memory(GiB)": 91.52, "step": 59655, "token_acc": 0.7706885404881203, "train_speed(iter/s)": 0.143495 }, { "epoch": 0.7741290885557743, "grad_norm": 0.7174954414367676, "learning_rate": 7.202449198560369e-05, "loss": 0.8679106712341309, "memory(GiB)": 91.52, "step": 59660, "token_acc": 0.7556587837837838, "train_speed(iter/s)": 0.143493 }, { "epoch": 0.77419396695743, "grad_norm": 0.7140592932701111, "learning_rate": 7.201967650948163e-05, "loss": 0.8903855323791504, "memory(GiB)": 91.52, "step": 59665, "token_acc": 0.7563101065861195, "train_speed(iter/s)": 0.14349 }, { "epoch": 0.7742588453590857, "grad_norm": 0.6820383071899414, "learning_rate": 7.201486077995853e-05, "loss": 0.8597846031188965, "memory(GiB)": 91.52, "step": 59670, "token_acc": 0.7611894121691303, "train_speed(iter/s)": 0.143489 }, { "epoch": 0.7743237237607414, "grad_norm": 0.7026088833808899, "learning_rate": 7.201004479708985e-05, "loss": 0.927113151550293, "memory(GiB)": 91.52, "step": 59675, "token_acc": 0.7723667034842856, "train_speed(iter/s)": 0.143487 }, { "epoch": 0.7743886021623971, "grad_norm": 0.7693305611610413, "learning_rate": 7.200522856093098e-05, "loss": 0.8508969306945801, "memory(GiB)": 91.52, "step": 59680, "token_acc": 0.77475532527346, "train_speed(iter/s)": 0.143485 }, { "epoch": 0.7744534805640528, "grad_norm": 0.6215924620628357, "learning_rate": 7.200041207153735e-05, "loss": 0.8616985321044922, "memory(GiB)": 91.52, "step": 59685, "token_acc": 0.7698646048858397, "train_speed(iter/s)": 0.143482 }, { "epoch": 0.7745183589657085, "grad_norm": 0.7539535164833069, "learning_rate": 7.199559532896441e-05, "loss": 0.9156242370605469, "memory(GiB)": 91.52, "step": 59690, "token_acc": 0.7550108387527097, "train_speed(iter/s)": 0.14348 }, { "epoch": 0.7745832373673642, "grad_norm": 0.775020956993103, "learning_rate": 7.199077833326755e-05, "loss": 0.830272102355957, "memory(GiB)": 91.52, "step": 59695, "token_acc": 0.7643258721234766, "train_speed(iter/s)": 0.143478 }, { "epoch": 0.7746481157690199, "grad_norm": 0.8738880753517151, "learning_rate": 7.198596108450223e-05, "loss": 0.8771775245666504, "memory(GiB)": 91.52, "step": 59700, "token_acc": 0.7647769930648343, "train_speed(iter/s)": 0.143476 }, { "epoch": 0.7747129941706756, "grad_norm": 0.7344014644622803, "learning_rate": 7.198114358272392e-05, "loss": 0.8677565574645996, "memory(GiB)": 91.52, "step": 59705, "token_acc": 0.7682559909383738, "train_speed(iter/s)": 0.143474 }, { "epoch": 0.7747778725723313, "grad_norm": 0.7019392251968384, "learning_rate": 7.197632582798801e-05, "loss": 0.8304986953735352, "memory(GiB)": 91.52, "step": 59710, "token_acc": 0.7857759262014581, "train_speed(iter/s)": 0.143472 }, { "epoch": 0.774842750973987, "grad_norm": 0.683811604976654, "learning_rate": 7.197150782034993e-05, "loss": 0.8512087821960449, "memory(GiB)": 91.52, "step": 59715, "token_acc": 0.7624468085106383, "train_speed(iter/s)": 0.143469 }, { "epoch": 0.7749076293756427, "grad_norm": 0.7237494587898254, "learning_rate": 7.196668955986516e-05, "loss": 0.8681766510009765, "memory(GiB)": 91.52, "step": 59720, "token_acc": 0.7517815216048112, "train_speed(iter/s)": 0.143466 }, { "epoch": 0.7749725077772984, "grad_norm": 0.7189520597457886, "learning_rate": 7.196187104658917e-05, "loss": 0.8538002014160156, "memory(GiB)": 91.52, "step": 59725, "token_acc": 0.7507916959887403, "train_speed(iter/s)": 0.143463 }, { "epoch": 0.7750373861789541, "grad_norm": 0.7455815076828003, "learning_rate": 7.195705228057736e-05, "loss": 0.8876858711242676, "memory(GiB)": 91.52, "step": 59730, "token_acc": 0.762435371724015, "train_speed(iter/s)": 0.143461 }, { "epoch": 0.7751022645806098, "grad_norm": 0.829179584980011, "learning_rate": 7.195223326188521e-05, "loss": 0.8755258560180664, "memory(GiB)": 91.52, "step": 59735, "token_acc": 0.7779028381954796, "train_speed(iter/s)": 0.143459 }, { "epoch": 0.7751671429822655, "grad_norm": 0.7081544995307922, "learning_rate": 7.194741399056815e-05, "loss": 0.8601900100708008, "memory(GiB)": 91.52, "step": 59740, "token_acc": 0.7537350597609562, "train_speed(iter/s)": 0.143457 }, { "epoch": 0.7752320213839212, "grad_norm": 0.7020719647407532, "learning_rate": 7.19425944666817e-05, "loss": 0.8761505126953125, "memory(GiB)": 91.52, "step": 59745, "token_acc": 0.7652376204719176, "train_speed(iter/s)": 0.143456 }, { "epoch": 0.7752968997855769, "grad_norm": 0.6120491623878479, "learning_rate": 7.193777469028126e-05, "loss": 0.8671509742736816, "memory(GiB)": 91.52, "step": 59750, "token_acc": 0.7648344974187671, "train_speed(iter/s)": 0.143453 }, { "epoch": 0.7753617781872326, "grad_norm": 0.6794931292533875, "learning_rate": 7.193295466142233e-05, "loss": 0.8828590393066407, "memory(GiB)": 91.52, "step": 59755, "token_acc": 0.7455991352686844, "train_speed(iter/s)": 0.143451 }, { "epoch": 0.7754266565888883, "grad_norm": 0.7941203117370605, "learning_rate": 7.192813438016037e-05, "loss": 0.8917105674743653, "memory(GiB)": 91.52, "step": 59760, "token_acc": 0.7428006077523762, "train_speed(iter/s)": 0.143449 }, { "epoch": 0.775491534990544, "grad_norm": 0.741287112236023, "learning_rate": 7.192331384655085e-05, "loss": 0.8270814895629883, "memory(GiB)": 91.52, "step": 59765, "token_acc": 0.7863427039659083, "train_speed(iter/s)": 0.143447 }, { "epoch": 0.7755564133921997, "grad_norm": 0.8217157125473022, "learning_rate": 7.191849306064923e-05, "loss": 0.8612918853759766, "memory(GiB)": 91.52, "step": 59770, "token_acc": 0.7673378961110892, "train_speed(iter/s)": 0.143445 }, { "epoch": 0.7756212917938554, "grad_norm": 0.7160407900810242, "learning_rate": 7.191367202251103e-05, "loss": 0.8052448272705078, "memory(GiB)": 91.52, "step": 59775, "token_acc": 0.770252977655281, "train_speed(iter/s)": 0.143443 }, { "epoch": 0.7756861701955111, "grad_norm": 0.6787497401237488, "learning_rate": 7.190885073219169e-05, "loss": 0.8245456695556641, "memory(GiB)": 91.52, "step": 59780, "token_acc": 0.7822492484285324, "train_speed(iter/s)": 0.143441 }, { "epoch": 0.7757510485971668, "grad_norm": 0.738559901714325, "learning_rate": 7.19040291897467e-05, "loss": 0.862310791015625, "memory(GiB)": 91.52, "step": 59785, "token_acc": 0.7615357014711159, "train_speed(iter/s)": 0.143439 }, { "epoch": 0.7758159269988224, "grad_norm": 0.6885237097740173, "learning_rate": 7.189920739523155e-05, "loss": 0.8689418792724609, "memory(GiB)": 91.52, "step": 59790, "token_acc": 0.7594654011236347, "train_speed(iter/s)": 0.143437 }, { "epoch": 0.7758808054004781, "grad_norm": 0.8050944805145264, "learning_rate": 7.189438534870176e-05, "loss": 0.9586105346679688, "memory(GiB)": 91.52, "step": 59795, "token_acc": 0.7481176421648155, "train_speed(iter/s)": 0.143435 }, { "epoch": 0.7759456838021338, "grad_norm": 0.833050549030304, "learning_rate": 7.188956305021275e-05, "loss": 0.8985721588134765, "memory(GiB)": 91.52, "step": 59800, "token_acc": 0.7547375070185289, "train_speed(iter/s)": 0.143433 }, { "epoch": 0.7760105622037895, "grad_norm": 0.7357670068740845, "learning_rate": 7.188474049982008e-05, "loss": 0.8396766662597657, "memory(GiB)": 91.52, "step": 59805, "token_acc": 0.7774213693652776, "train_speed(iter/s)": 0.143431 }, { "epoch": 0.7760754406054452, "grad_norm": 0.7885337471961975, "learning_rate": 7.187991769757922e-05, "loss": 0.8620054244995117, "memory(GiB)": 91.52, "step": 59810, "token_acc": 0.7653808931287156, "train_speed(iter/s)": 0.143429 }, { "epoch": 0.7761403190071009, "grad_norm": 0.6752029061317444, "learning_rate": 7.187509464354568e-05, "loss": 0.892822265625, "memory(GiB)": 91.52, "step": 59815, "token_acc": 0.7404823965637505, "train_speed(iter/s)": 0.143427 }, { "epoch": 0.7762051974087566, "grad_norm": 0.6677578091621399, "learning_rate": 7.187027133777495e-05, "loss": 0.9238943099975586, "memory(GiB)": 91.52, "step": 59820, "token_acc": 0.7462644132048649, "train_speed(iter/s)": 0.143426 }, { "epoch": 0.7762700758104123, "grad_norm": 0.6920408606529236, "learning_rate": 7.186544778032256e-05, "loss": 0.8816713333129883, "memory(GiB)": 91.52, "step": 59825, "token_acc": 0.7420592906299629, "train_speed(iter/s)": 0.143424 }, { "epoch": 0.776334954212068, "grad_norm": 0.7692393660545349, "learning_rate": 7.186062397124398e-05, "loss": 0.9041917800903321, "memory(GiB)": 91.52, "step": 59830, "token_acc": 0.7503605942593394, "train_speed(iter/s)": 0.143422 }, { "epoch": 0.7763998326137237, "grad_norm": 0.7306474447250366, "learning_rate": 7.185579991059476e-05, "loss": 0.9120872497558594, "memory(GiB)": 91.52, "step": 59835, "token_acc": 0.7432258538916872, "train_speed(iter/s)": 0.14342 }, { "epoch": 0.7764647110153794, "grad_norm": 0.7372528314590454, "learning_rate": 7.185097559843041e-05, "loss": 0.9215763092041016, "memory(GiB)": 91.52, "step": 59840, "token_acc": 0.7405468501291076, "train_speed(iter/s)": 0.143418 }, { "epoch": 0.7765295894170351, "grad_norm": 0.6875253319740295, "learning_rate": 7.184615103480644e-05, "loss": 0.8612865447998047, "memory(GiB)": 91.52, "step": 59845, "token_acc": 0.7454365360124522, "train_speed(iter/s)": 0.143415 }, { "epoch": 0.7765944678186908, "grad_norm": 0.6803435683250427, "learning_rate": 7.184132621977837e-05, "loss": 0.8779908180236816, "memory(GiB)": 91.52, "step": 59850, "token_acc": 0.7607593332065665, "train_speed(iter/s)": 0.143413 }, { "epoch": 0.7766593462203465, "grad_norm": 0.6625084280967712, "learning_rate": 7.18365011534017e-05, "loss": 0.8736486434936523, "memory(GiB)": 91.52, "step": 59855, "token_acc": 0.7838732394366197, "train_speed(iter/s)": 0.143411 }, { "epoch": 0.7767242246220022, "grad_norm": 0.6943414807319641, "learning_rate": 7.183167583573201e-05, "loss": 0.8871312141418457, "memory(GiB)": 91.52, "step": 59860, "token_acc": 0.7493466898954704, "train_speed(iter/s)": 0.143409 }, { "epoch": 0.7767891030236579, "grad_norm": 0.6974992156028748, "learning_rate": 7.182685026682478e-05, "loss": 0.8923778533935547, "memory(GiB)": 91.52, "step": 59865, "token_acc": 0.7397211997807189, "train_speed(iter/s)": 0.143408 }, { "epoch": 0.7768539814253136, "grad_norm": 0.7377483248710632, "learning_rate": 7.182202444673556e-05, "loss": 0.8604053497314453, "memory(GiB)": 91.52, "step": 59870, "token_acc": 0.7570300043484564, "train_speed(iter/s)": 0.143405 }, { "epoch": 0.7769188598269693, "grad_norm": 0.7483387589454651, "learning_rate": 7.18171983755199e-05, "loss": 0.8704647064208985, "memory(GiB)": 91.52, "step": 59875, "token_acc": 0.7404652175490801, "train_speed(iter/s)": 0.143404 }, { "epoch": 0.776983738228625, "grad_norm": 0.711887001991272, "learning_rate": 7.181237205323331e-05, "loss": 0.8775766372680665, "memory(GiB)": 91.52, "step": 59880, "token_acc": 0.7613800578034682, "train_speed(iter/s)": 0.143402 }, { "epoch": 0.7770486166302807, "grad_norm": 0.7211553454399109, "learning_rate": 7.180754547993136e-05, "loss": 0.8567415237426758, "memory(GiB)": 91.52, "step": 59885, "token_acc": 0.7519144187478355, "train_speed(iter/s)": 0.143399 }, { "epoch": 0.7771134950319364, "grad_norm": 0.6999432444572449, "learning_rate": 7.180271865566958e-05, "loss": 0.8690887451171875, "memory(GiB)": 91.52, "step": 59890, "token_acc": 0.7528169728961527, "train_speed(iter/s)": 0.143397 }, { "epoch": 0.7771783734335921, "grad_norm": 0.7787150740623474, "learning_rate": 7.179789158050351e-05, "loss": 0.8584877014160156, "memory(GiB)": 91.52, "step": 59895, "token_acc": 0.7555863983344899, "train_speed(iter/s)": 0.143395 }, { "epoch": 0.7772432518352478, "grad_norm": 0.7033435106277466, "learning_rate": 7.17930642544887e-05, "loss": 0.8636966705322265, "memory(GiB)": 91.52, "step": 59900, "token_acc": 0.7678109172763083, "train_speed(iter/s)": 0.143393 }, { "epoch": 0.7773081302369035, "grad_norm": 0.7228794693946838, "learning_rate": 7.178823667768073e-05, "loss": 0.8709588050842285, "memory(GiB)": 91.52, "step": 59905, "token_acc": 0.7642652965227774, "train_speed(iter/s)": 0.14339 }, { "epoch": 0.7773730086385592, "grad_norm": 0.7542239427566528, "learning_rate": 7.178340885013513e-05, "loss": 0.8756532669067383, "memory(GiB)": 91.52, "step": 59910, "token_acc": 0.7745676100628931, "train_speed(iter/s)": 0.143388 }, { "epoch": 0.7774378870402149, "grad_norm": 0.7885653972625732, "learning_rate": 7.177858077190744e-05, "loss": 0.896533203125, "memory(GiB)": 91.52, "step": 59915, "token_acc": 0.74032964461235, "train_speed(iter/s)": 0.143386 }, { "epoch": 0.7775027654418706, "grad_norm": 0.6661241054534912, "learning_rate": 7.177375244305325e-05, "loss": 0.8852245330810546, "memory(GiB)": 91.52, "step": 59920, "token_acc": 0.7695634934938543, "train_speed(iter/s)": 0.143384 }, { "epoch": 0.7775676438435263, "grad_norm": 0.7099456787109375, "learning_rate": 7.176892386362813e-05, "loss": 0.8734951972961426, "memory(GiB)": 91.52, "step": 59925, "token_acc": 0.7621038183179436, "train_speed(iter/s)": 0.143382 }, { "epoch": 0.777632522245182, "grad_norm": 0.7007039785385132, "learning_rate": 7.176409503368764e-05, "loss": 0.9288137435913086, "memory(GiB)": 91.52, "step": 59930, "token_acc": 0.7471223150571951, "train_speed(iter/s)": 0.14338 }, { "epoch": 0.7776974006468377, "grad_norm": 0.8234415054321289, "learning_rate": 7.175926595328735e-05, "loss": 0.8853754043579102, "memory(GiB)": 91.52, "step": 59935, "token_acc": 0.7684628099173554, "train_speed(iter/s)": 0.143378 }, { "epoch": 0.7777622790484934, "grad_norm": 0.8032433986663818, "learning_rate": 7.175443662248282e-05, "loss": 0.9240967750549316, "memory(GiB)": 91.52, "step": 59940, "token_acc": 0.7543141504133558, "train_speed(iter/s)": 0.143376 }, { "epoch": 0.7778271574501491, "grad_norm": 0.7463171482086182, "learning_rate": 7.174960704132963e-05, "loss": 0.9000760078430176, "memory(GiB)": 91.52, "step": 59945, "token_acc": 0.7518468886369103, "train_speed(iter/s)": 0.143373 }, { "epoch": 0.7778920358518048, "grad_norm": 0.7746599316596985, "learning_rate": 7.174477720988333e-05, "loss": 0.8810449600219726, "memory(GiB)": 91.52, "step": 59950, "token_acc": 0.7556818181818182, "train_speed(iter/s)": 0.143371 }, { "epoch": 0.7779569142534605, "grad_norm": 0.7457259893417358, "learning_rate": 7.173994712819958e-05, "loss": 0.89219970703125, "memory(GiB)": 91.52, "step": 59955, "token_acc": 0.7723689943359607, "train_speed(iter/s)": 0.143369 }, { "epoch": 0.7780217926551162, "grad_norm": 0.7076367735862732, "learning_rate": 7.17351167963339e-05, "loss": 0.9037944793701171, "memory(GiB)": 91.52, "step": 59960, "token_acc": 0.7597519055447616, "train_speed(iter/s)": 0.143367 }, { "epoch": 0.7780866710567719, "grad_norm": 0.9052849411964417, "learning_rate": 7.17302862143419e-05, "loss": 0.9348892211914063, "memory(GiB)": 91.52, "step": 59965, "token_acc": 0.7276734223101005, "train_speed(iter/s)": 0.143365 }, { "epoch": 0.7781515494584276, "grad_norm": 0.7095549702644348, "learning_rate": 7.172545538227918e-05, "loss": 0.8442790031433105, "memory(GiB)": 91.52, "step": 59970, "token_acc": 0.7859809335760343, "train_speed(iter/s)": 0.143363 }, { "epoch": 0.7782164278600833, "grad_norm": 0.75374436378479, "learning_rate": 7.17206243002013e-05, "loss": 0.8271690368652344, "memory(GiB)": 91.52, "step": 59975, "token_acc": 0.7834430856067733, "train_speed(iter/s)": 0.143361 }, { "epoch": 0.778281306261739, "grad_norm": 0.7004666328430176, "learning_rate": 7.171579296816387e-05, "loss": 0.8609548568725586, "memory(GiB)": 91.52, "step": 59980, "token_acc": 0.7614560018847921, "train_speed(iter/s)": 0.143359 }, { "epoch": 0.7783461846633947, "grad_norm": 0.7012982368469238, "learning_rate": 7.171096138622249e-05, "loss": 0.8394855499267578, "memory(GiB)": 91.52, "step": 59985, "token_acc": 0.7636917562724014, "train_speed(iter/s)": 0.143357 }, { "epoch": 0.7784110630650504, "grad_norm": 0.6823481917381287, "learning_rate": 7.170612955443277e-05, "loss": 0.8581760406494141, "memory(GiB)": 91.52, "step": 59990, "token_acc": 0.762315657358017, "train_speed(iter/s)": 0.143355 }, { "epoch": 0.7784759414667061, "grad_norm": 0.7149227857589722, "learning_rate": 7.17012974728503e-05, "loss": 0.8607474327087402, "memory(GiB)": 91.52, "step": 59995, "token_acc": 0.7780493393416329, "train_speed(iter/s)": 0.143353 }, { "epoch": 0.7785408198683618, "grad_norm": 0.7780655026435852, "learning_rate": 7.16964651415307e-05, "loss": 0.8779836654663086, "memory(GiB)": 91.52, "step": 60000, "token_acc": 0.7380074779242131, "train_speed(iter/s)": 0.143351 }, { "epoch": 0.7785408198683618, "eval_loss": 0.8723955750465393, "eval_runtime": 2240.8111, "eval_samples_per_second": 22.233, "eval_steps_per_second": 1.39, "eval_token_acc": 0.7613350356529667, "step": 60000 }, { "epoch": 0.7786056982700175, "grad_norm": 0.7378100156784058, "learning_rate": 7.169163256052958e-05, "loss": 0.8716659545898438, "memory(GiB)": 91.52, "step": 60005, "token_acc": 0.7628587672047297, "train_speed(iter/s)": 0.142541 }, { "epoch": 0.7786705766716732, "grad_norm": 0.7480389475822449, "learning_rate": 7.168679972990254e-05, "loss": 0.8799702644348144, "memory(GiB)": 91.52, "step": 60010, "token_acc": 0.7642553191489362, "train_speed(iter/s)": 0.142539 }, { "epoch": 0.7787354550733289, "grad_norm": 0.7342261672019958, "learning_rate": 7.168196664970522e-05, "loss": 0.8749494552612305, "memory(GiB)": 91.52, "step": 60015, "token_acc": 0.759906194404011, "train_speed(iter/s)": 0.142537 }, { "epoch": 0.7788003334749846, "grad_norm": 0.7907134890556335, "learning_rate": 7.16771333199932e-05, "loss": 0.8848964691162109, "memory(GiB)": 91.52, "step": 60020, "token_acc": 0.7418189881838629, "train_speed(iter/s)": 0.142536 }, { "epoch": 0.7788652118766403, "grad_norm": 0.7254445552825928, "learning_rate": 7.167229974082214e-05, "loss": 0.8745678901672364, "memory(GiB)": 91.52, "step": 60025, "token_acc": 0.7692680730195018, "train_speed(iter/s)": 0.142534 }, { "epoch": 0.7789300902782959, "grad_norm": 0.7586114406585693, "learning_rate": 7.166746591224767e-05, "loss": 0.856892204284668, "memory(GiB)": 91.52, "step": 60030, "token_acc": 0.7741248817407758, "train_speed(iter/s)": 0.142531 }, { "epoch": 0.7789949686799515, "grad_norm": 0.7506599426269531, "learning_rate": 7.166263183432537e-05, "loss": 0.8553543090820312, "memory(GiB)": 91.52, "step": 60035, "token_acc": 0.7664824247678913, "train_speed(iter/s)": 0.142529 }, { "epoch": 0.7790598470816072, "grad_norm": 0.8140226006507874, "learning_rate": 7.16577975071109e-05, "loss": 0.8837789535522461, "memory(GiB)": 91.52, "step": 60040, "token_acc": 0.7679640718562875, "train_speed(iter/s)": 0.142527 }, { "epoch": 0.779124725483263, "grad_norm": 0.6918233633041382, "learning_rate": 7.165296293065989e-05, "loss": 0.8743043899536133, "memory(GiB)": 91.52, "step": 60045, "token_acc": 0.7692823158882263, "train_speed(iter/s)": 0.142525 }, { "epoch": 0.7791896038849186, "grad_norm": 0.7263677716255188, "learning_rate": 7.164812810502799e-05, "loss": 0.8534074783325195, "memory(GiB)": 91.52, "step": 60050, "token_acc": 0.7657858324052512, "train_speed(iter/s)": 0.142523 }, { "epoch": 0.7792544822865743, "grad_norm": 0.7746709585189819, "learning_rate": 7.16432930302708e-05, "loss": 0.8939960479736329, "memory(GiB)": 91.52, "step": 60055, "token_acc": 0.7505076578902082, "train_speed(iter/s)": 0.142521 }, { "epoch": 0.77931936068823, "grad_norm": 0.7343437075614929, "learning_rate": 7.1638457706444e-05, "loss": 0.9584928512573242, "memory(GiB)": 91.52, "step": 60060, "token_acc": 0.7427712972781901, "train_speed(iter/s)": 0.142519 }, { "epoch": 0.7793842390898857, "grad_norm": 0.6827079057693481, "learning_rate": 7.163362213360323e-05, "loss": 0.8349332809448242, "memory(GiB)": 91.52, "step": 60065, "token_acc": 0.7856230803740898, "train_speed(iter/s)": 0.142517 }, { "epoch": 0.7794491174915414, "grad_norm": 0.8158147931098938, "learning_rate": 7.162878631180412e-05, "loss": 0.8693296432495117, "memory(GiB)": 91.52, "step": 60070, "token_acc": 0.76591360097695, "train_speed(iter/s)": 0.142514 }, { "epoch": 0.7795139958931971, "grad_norm": 0.7445773482322693, "learning_rate": 7.162395024110233e-05, "loss": 0.9049135208129883, "memory(GiB)": 91.52, "step": 60075, "token_acc": 0.7720870678617158, "train_speed(iter/s)": 0.142513 }, { "epoch": 0.7795788742948528, "grad_norm": 0.7428447604179382, "learning_rate": 7.16191139215535e-05, "loss": 0.929156494140625, "memory(GiB)": 91.52, "step": 60080, "token_acc": 0.7468375499334221, "train_speed(iter/s)": 0.142512 }, { "epoch": 0.7796437526965085, "grad_norm": 0.8478802442550659, "learning_rate": 7.161427735321332e-05, "loss": 0.8988024711608886, "memory(GiB)": 91.52, "step": 60085, "token_acc": 0.7518333915362393, "train_speed(iter/s)": 0.14251 }, { "epoch": 0.7797086310981642, "grad_norm": 0.6315020322799683, "learning_rate": 7.160944053613742e-05, "loss": 0.8586160659790039, "memory(GiB)": 91.52, "step": 60090, "token_acc": 0.7693974076614796, "train_speed(iter/s)": 0.142508 }, { "epoch": 0.7797735094998199, "grad_norm": 0.7321567535400391, "learning_rate": 7.160460347038146e-05, "loss": 0.8971477508544922, "memory(GiB)": 91.52, "step": 60095, "token_acc": 0.7506231185058104, "train_speed(iter/s)": 0.142505 }, { "epoch": 0.7798383879014756, "grad_norm": 0.6758376359939575, "learning_rate": 7.159976615600111e-05, "loss": 0.9010860443115234, "memory(GiB)": 91.52, "step": 60100, "token_acc": 0.7538114931177088, "train_speed(iter/s)": 0.142503 }, { "epoch": 0.7799032663031313, "grad_norm": 0.7167044281959534, "learning_rate": 7.159492859305207e-05, "loss": 0.8434871673583985, "memory(GiB)": 91.52, "step": 60105, "token_acc": 0.7553832306307554, "train_speed(iter/s)": 0.142501 }, { "epoch": 0.779968144704787, "grad_norm": 0.7867063879966736, "learning_rate": 7.159009078158995e-05, "loss": 0.9045055389404297, "memory(GiB)": 91.52, "step": 60110, "token_acc": 0.7494798890429958, "train_speed(iter/s)": 0.142499 }, { "epoch": 0.7800330231064427, "grad_norm": 0.7331200838088989, "learning_rate": 7.158525272167047e-05, "loss": 0.9195388793945313, "memory(GiB)": 91.52, "step": 60115, "token_acc": 0.7322999279019466, "train_speed(iter/s)": 0.142498 }, { "epoch": 0.7800979015080984, "grad_norm": 0.7388796806335449, "learning_rate": 7.15804144133493e-05, "loss": 0.8713703155517578, "memory(GiB)": 91.52, "step": 60120, "token_acc": 0.7513477088948787, "train_speed(iter/s)": 0.142496 }, { "epoch": 0.7801627799097541, "grad_norm": 0.6931727528572083, "learning_rate": 7.15755758566821e-05, "loss": 0.8687627792358399, "memory(GiB)": 91.52, "step": 60125, "token_acc": 0.7398742767340525, "train_speed(iter/s)": 0.142493 }, { "epoch": 0.7802276583114098, "grad_norm": 0.6379516124725342, "learning_rate": 7.157073705172455e-05, "loss": 0.8827921867370605, "memory(GiB)": 91.52, "step": 60130, "token_acc": 0.7706496799903393, "train_speed(iter/s)": 0.14249 }, { "epoch": 0.7802925367130655, "grad_norm": 0.7011033892631531, "learning_rate": 7.156589799853236e-05, "loss": 0.8782220840454101, "memory(GiB)": 91.52, "step": 60135, "token_acc": 0.7523730224812656, "train_speed(iter/s)": 0.142488 }, { "epoch": 0.7803574151147212, "grad_norm": 0.8185045123100281, "learning_rate": 7.156105869716121e-05, "loss": 0.8962251663208007, "memory(GiB)": 91.52, "step": 60140, "token_acc": 0.7944609015311039, "train_speed(iter/s)": 0.142486 }, { "epoch": 0.7804222935163769, "grad_norm": 0.6990076899528503, "learning_rate": 7.155621914766677e-05, "loss": 0.8859297752380371, "memory(GiB)": 91.52, "step": 60145, "token_acc": 0.7743562716458364, "train_speed(iter/s)": 0.142484 }, { "epoch": 0.7804871719180326, "grad_norm": 0.7628082036972046, "learning_rate": 7.155137935010475e-05, "loss": 0.8421461105346679, "memory(GiB)": 91.52, "step": 60150, "token_acc": 0.763967803030303, "train_speed(iter/s)": 0.142482 }, { "epoch": 0.7805520503196883, "grad_norm": 0.7803966999053955, "learning_rate": 7.154653930453085e-05, "loss": 0.8981976509094238, "memory(GiB)": 91.52, "step": 60155, "token_acc": 0.7385890516576715, "train_speed(iter/s)": 0.14248 }, { "epoch": 0.780616928721344, "grad_norm": 0.7478301525115967, "learning_rate": 7.154169901100075e-05, "loss": 0.8734625816345215, "memory(GiB)": 91.52, "step": 60160, "token_acc": 0.745945945945946, "train_speed(iter/s)": 0.142477 }, { "epoch": 0.7806818071229997, "grad_norm": 0.6416007280349731, "learning_rate": 7.153685846957018e-05, "loss": 0.839205551147461, "memory(GiB)": 91.52, "step": 60165, "token_acc": 0.7589506269347425, "train_speed(iter/s)": 0.142475 }, { "epoch": 0.7807466855246554, "grad_norm": 0.7734076380729675, "learning_rate": 7.15320176802948e-05, "loss": 0.8652503013610839, "memory(GiB)": 91.52, "step": 60170, "token_acc": 0.7367973057991932, "train_speed(iter/s)": 0.142473 }, { "epoch": 0.7808115639263111, "grad_norm": 0.7707876563072205, "learning_rate": 7.152717664323039e-05, "loss": 0.8809768676757812, "memory(GiB)": 91.52, "step": 60175, "token_acc": 0.7612860171154903, "train_speed(iter/s)": 0.142472 }, { "epoch": 0.7808764423279668, "grad_norm": 0.6827330589294434, "learning_rate": 7.152233535843258e-05, "loss": 0.9134775161743164, "memory(GiB)": 91.52, "step": 60180, "token_acc": 0.7524958099540917, "train_speed(iter/s)": 0.142469 }, { "epoch": 0.7809413207296225, "grad_norm": 0.7305602431297302, "learning_rate": 7.151749382595714e-05, "loss": 0.8585583686828613, "memory(GiB)": 91.52, "step": 60185, "token_acc": 0.7637028014616322, "train_speed(iter/s)": 0.142467 }, { "epoch": 0.7810061991312782, "grad_norm": 0.7911826372146606, "learning_rate": 7.151265204585976e-05, "loss": 0.9118006706237793, "memory(GiB)": 91.52, "step": 60190, "token_acc": 0.7402819307988039, "train_speed(iter/s)": 0.142465 }, { "epoch": 0.7810710775329339, "grad_norm": 0.7268118858337402, "learning_rate": 7.150781001819617e-05, "loss": 0.9136842727661133, "memory(GiB)": 91.52, "step": 60195, "token_acc": 0.7499628097944125, "train_speed(iter/s)": 0.142463 }, { "epoch": 0.7811359559345896, "grad_norm": 0.6886349320411682, "learning_rate": 7.150296774302208e-05, "loss": 0.8822576522827148, "memory(GiB)": 91.52, "step": 60200, "token_acc": 0.7596404069079725, "train_speed(iter/s)": 0.142462 }, { "epoch": 0.7812008343362453, "grad_norm": 0.7091140151023865, "learning_rate": 7.149812522039322e-05, "loss": 0.8933764457702636, "memory(GiB)": 91.52, "step": 60205, "token_acc": 0.7672188605627749, "train_speed(iter/s)": 0.14246 }, { "epoch": 0.781265712737901, "grad_norm": 0.6963940262794495, "learning_rate": 7.149328245036533e-05, "loss": 0.8696667671203613, "memory(GiB)": 91.52, "step": 60210, "token_acc": 0.764864766215279, "train_speed(iter/s)": 0.142458 }, { "epoch": 0.7813305911395567, "grad_norm": 0.7787685394287109, "learning_rate": 7.148843943299412e-05, "loss": 0.8631765365600585, "memory(GiB)": 91.52, "step": 60215, "token_acc": 0.7527220630372493, "train_speed(iter/s)": 0.142456 }, { "epoch": 0.7813954695412124, "grad_norm": 0.6614634394645691, "learning_rate": 7.148359616833533e-05, "loss": 0.8690803527832032, "memory(GiB)": 91.52, "step": 60220, "token_acc": 0.7617119503519262, "train_speed(iter/s)": 0.142455 }, { "epoch": 0.7814603479428681, "grad_norm": 0.7580856680870056, "learning_rate": 7.147875265644469e-05, "loss": 0.8664871215820312, "memory(GiB)": 91.52, "step": 60225, "token_acc": 0.7668774568449838, "train_speed(iter/s)": 0.142453 }, { "epoch": 0.7815252263445238, "grad_norm": 0.8108768463134766, "learning_rate": 7.147390889737796e-05, "loss": 0.919209098815918, "memory(GiB)": 91.52, "step": 60230, "token_acc": 0.7548306055344146, "train_speed(iter/s)": 0.142452 }, { "epoch": 0.7815901047461795, "grad_norm": 0.7794677019119263, "learning_rate": 7.146906489119087e-05, "loss": 0.933905029296875, "memory(GiB)": 91.52, "step": 60235, "token_acc": 0.7681603003591251, "train_speed(iter/s)": 0.14245 }, { "epoch": 0.7816549831478352, "grad_norm": 0.733528196811676, "learning_rate": 7.146422063793914e-05, "loss": 0.8591501235961914, "memory(GiB)": 91.52, "step": 60240, "token_acc": 0.7689349545854087, "train_speed(iter/s)": 0.142448 }, { "epoch": 0.7817198615494909, "grad_norm": 0.7845256924629211, "learning_rate": 7.145937613767857e-05, "loss": 0.8811416625976562, "memory(GiB)": 91.52, "step": 60245, "token_acc": 0.7580047410770926, "train_speed(iter/s)": 0.142447 }, { "epoch": 0.7817847399511466, "grad_norm": 0.7402587532997131, "learning_rate": 7.145453139046487e-05, "loss": 0.8366140365600586, "memory(GiB)": 91.52, "step": 60250, "token_acc": 0.7798567954606863, "train_speed(iter/s)": 0.142445 }, { "epoch": 0.7818496183528023, "grad_norm": 0.8452176451683044, "learning_rate": 7.14496863963538e-05, "loss": 0.9420273780822754, "memory(GiB)": 91.52, "step": 60255, "token_acc": 0.7359362365715606, "train_speed(iter/s)": 0.142443 }, { "epoch": 0.781914496754458, "grad_norm": 0.7534557580947876, "learning_rate": 7.144484115540113e-05, "loss": 0.8902545928955078, "memory(GiB)": 91.52, "step": 60260, "token_acc": 0.7497968843830584, "train_speed(iter/s)": 0.142442 }, { "epoch": 0.7819793751561136, "grad_norm": 0.6631163358688354, "learning_rate": 7.14399956676626e-05, "loss": 0.8717972755432128, "memory(GiB)": 91.52, "step": 60265, "token_acc": 0.7610037021801728, "train_speed(iter/s)": 0.14244 }, { "epoch": 0.7820442535577693, "grad_norm": 0.7806488275527954, "learning_rate": 7.143514993319398e-05, "loss": 0.9069635391235351, "memory(GiB)": 91.52, "step": 60270, "token_acc": 0.784227898966705, "train_speed(iter/s)": 0.142438 }, { "epoch": 0.782109131959425, "grad_norm": 0.691150426864624, "learning_rate": 7.143030395205103e-05, "loss": 0.877171516418457, "memory(GiB)": 91.52, "step": 60275, "token_acc": 0.7615869369509204, "train_speed(iter/s)": 0.142436 }, { "epoch": 0.7821740103610807, "grad_norm": 0.7331872582435608, "learning_rate": 7.142545772428953e-05, "loss": 0.8716361999511719, "memory(GiB)": 91.52, "step": 60280, "token_acc": 0.756396937165724, "train_speed(iter/s)": 0.142434 }, { "epoch": 0.7822388887627364, "grad_norm": 0.6949188709259033, "learning_rate": 7.142061124996523e-05, "loss": 0.8343445777893066, "memory(GiB)": 91.52, "step": 60285, "token_acc": 0.7713969127311631, "train_speed(iter/s)": 0.142432 }, { "epoch": 0.7823037671643921, "grad_norm": 0.7465184330940247, "learning_rate": 7.141576452913392e-05, "loss": 0.8760356903076172, "memory(GiB)": 91.52, "step": 60290, "token_acc": 0.7611639619473979, "train_speed(iter/s)": 0.142431 }, { "epoch": 0.7823686455660478, "grad_norm": 0.6906752586364746, "learning_rate": 7.141091756185138e-05, "loss": 0.869818115234375, "memory(GiB)": 91.52, "step": 60295, "token_acc": 0.7746217482935422, "train_speed(iter/s)": 0.142429 }, { "epoch": 0.7824335239677035, "grad_norm": 0.7118738889694214, "learning_rate": 7.140607034817337e-05, "loss": 0.8646178245544434, "memory(GiB)": 91.52, "step": 60300, "token_acc": 0.7508778904127741, "train_speed(iter/s)": 0.142427 }, { "epoch": 0.7824984023693592, "grad_norm": 0.6995629668235779, "learning_rate": 7.140122288815568e-05, "loss": 0.8532064437866211, "memory(GiB)": 91.52, "step": 60305, "token_acc": 0.7482926829268293, "train_speed(iter/s)": 0.142425 }, { "epoch": 0.7825632807710149, "grad_norm": 0.7406359314918518, "learning_rate": 7.13963751818541e-05, "loss": 0.8812080383300781, "memory(GiB)": 91.52, "step": 60310, "token_acc": 0.761232651140908, "train_speed(iter/s)": 0.142424 }, { "epoch": 0.7826281591726706, "grad_norm": 0.8168978691101074, "learning_rate": 7.139152722932439e-05, "loss": 0.8674308776855468, "memory(GiB)": 91.52, "step": 60315, "token_acc": 0.7744539229562756, "train_speed(iter/s)": 0.142422 }, { "epoch": 0.7826930375743263, "grad_norm": 0.6676910519599915, "learning_rate": 7.138667903062238e-05, "loss": 0.918577003479004, "memory(GiB)": 91.52, "step": 60320, "token_acc": 0.7440508863130025, "train_speed(iter/s)": 0.14242 }, { "epoch": 0.782757915975982, "grad_norm": 0.7253937125205994, "learning_rate": 7.138183058580384e-05, "loss": 0.8907694816589355, "memory(GiB)": 91.52, "step": 60325, "token_acc": 0.7581228811945302, "train_speed(iter/s)": 0.142418 }, { "epoch": 0.7828227943776377, "grad_norm": 0.7388659119606018, "learning_rate": 7.137698189492458e-05, "loss": 0.8794258117675782, "memory(GiB)": 91.52, "step": 60330, "token_acc": 0.7579069318401236, "train_speed(iter/s)": 0.142416 }, { "epoch": 0.7828876727792934, "grad_norm": 0.7275609374046326, "learning_rate": 7.137213295804038e-05, "loss": 0.8795231819152832, "memory(GiB)": 91.52, "step": 60335, "token_acc": 0.7669922028825706, "train_speed(iter/s)": 0.142414 }, { "epoch": 0.7829525511809491, "grad_norm": 0.7372375726699829, "learning_rate": 7.136728377520705e-05, "loss": 0.9184945106506348, "memory(GiB)": 91.52, "step": 60340, "token_acc": 0.7540695425939599, "train_speed(iter/s)": 0.142412 }, { "epoch": 0.7830174295826048, "grad_norm": 0.6704896688461304, "learning_rate": 7.136243434648037e-05, "loss": 0.8837193489074707, "memory(GiB)": 91.52, "step": 60345, "token_acc": 0.7628507381927804, "train_speed(iter/s)": 0.142411 }, { "epoch": 0.7830823079842605, "grad_norm": 0.6979457139968872, "learning_rate": 7.13575846719162e-05, "loss": 0.903993797302246, "memory(GiB)": 91.52, "step": 60350, "token_acc": 0.7479498142566763, "train_speed(iter/s)": 0.142409 }, { "epoch": 0.7831471863859162, "grad_norm": 0.7374027967453003, "learning_rate": 7.135273475157031e-05, "loss": 0.8490462303161621, "memory(GiB)": 91.52, "step": 60355, "token_acc": 0.7710745785084299, "train_speed(iter/s)": 0.142407 }, { "epoch": 0.7832120647875719, "grad_norm": 0.7263529300689697, "learning_rate": 7.134788458549854e-05, "loss": 0.8505170822143555, "memory(GiB)": 91.52, "step": 60360, "token_acc": 0.7659326528972112, "train_speed(iter/s)": 0.142405 }, { "epoch": 0.7832769431892276, "grad_norm": 0.7559145092964172, "learning_rate": 7.134303417375668e-05, "loss": 0.8867265701293945, "memory(GiB)": 91.52, "step": 60365, "token_acc": 0.7625545130170477, "train_speed(iter/s)": 0.142403 }, { "epoch": 0.7833418215908833, "grad_norm": 0.8092820644378662, "learning_rate": 7.133818351640054e-05, "loss": 0.9116082191467285, "memory(GiB)": 91.52, "step": 60370, "token_acc": 0.7343742446947358, "train_speed(iter/s)": 0.142401 }, { "epoch": 0.783406699992539, "grad_norm": 0.8417507410049438, "learning_rate": 7.133333261348596e-05, "loss": 0.910163402557373, "memory(GiB)": 91.52, "step": 60375, "token_acc": 0.7580246913580246, "train_speed(iter/s)": 0.1424 }, { "epoch": 0.7834715783941947, "grad_norm": 0.7250189185142517, "learning_rate": 7.132848146506877e-05, "loss": 0.8779302597045898, "memory(GiB)": 91.52, "step": 60380, "token_acc": 0.7781993228083058, "train_speed(iter/s)": 0.142398 }, { "epoch": 0.7835364567958504, "grad_norm": 0.8259847164154053, "learning_rate": 7.13236300712048e-05, "loss": 0.8934228897094727, "memory(GiB)": 91.52, "step": 60385, "token_acc": 0.7714661266830083, "train_speed(iter/s)": 0.142396 }, { "epoch": 0.7836013351975061, "grad_norm": 0.7002555131912231, "learning_rate": 7.131877843194985e-05, "loss": 0.9214874267578125, "memory(GiB)": 91.52, "step": 60390, "token_acc": 0.739286794608947, "train_speed(iter/s)": 0.142394 }, { "epoch": 0.7836662135991618, "grad_norm": 0.6582884788513184, "learning_rate": 7.131392654735978e-05, "loss": 0.8793724060058594, "memory(GiB)": 91.52, "step": 60395, "token_acc": 0.777567140600316, "train_speed(iter/s)": 0.142392 }, { "epoch": 0.7837310920008175, "grad_norm": 0.6548983454704285, "learning_rate": 7.13090744174904e-05, "loss": 0.8696775436401367, "memory(GiB)": 91.52, "step": 60400, "token_acc": 0.7571360446896928, "train_speed(iter/s)": 0.14239 }, { "epoch": 0.7837959704024732, "grad_norm": 0.7066346406936646, "learning_rate": 7.130422204239758e-05, "loss": 0.860289478302002, "memory(GiB)": 91.52, "step": 60405, "token_acc": 0.7774859656610091, "train_speed(iter/s)": 0.142388 }, { "epoch": 0.7838608488041289, "grad_norm": 0.7384089827537537, "learning_rate": 7.129936942213712e-05, "loss": 0.8617359161376953, "memory(GiB)": 91.52, "step": 60410, "token_acc": 0.7600772113091859, "train_speed(iter/s)": 0.142385 }, { "epoch": 0.7839257272057846, "grad_norm": 0.8007893562316895, "learning_rate": 7.129451655676492e-05, "loss": 0.9041523933410645, "memory(GiB)": 91.52, "step": 60415, "token_acc": 0.7690996772906922, "train_speed(iter/s)": 0.142384 }, { "epoch": 0.7839906056074403, "grad_norm": 0.7350031733512878, "learning_rate": 7.128966344633677e-05, "loss": 0.8723518371582031, "memory(GiB)": 91.52, "step": 60420, "token_acc": 0.7576081153230112, "train_speed(iter/s)": 0.142381 }, { "epoch": 0.784055484009096, "grad_norm": 0.7657032012939453, "learning_rate": 7.128481009090854e-05, "loss": 0.8689451217651367, "memory(GiB)": 91.52, "step": 60425, "token_acc": 0.7617971807681293, "train_speed(iter/s)": 0.142379 }, { "epoch": 0.7841203624107517, "grad_norm": 0.8179176449775696, "learning_rate": 7.127995649053611e-05, "loss": 0.8930313110351562, "memory(GiB)": 91.52, "step": 60430, "token_acc": 0.7266915290356952, "train_speed(iter/s)": 0.142378 }, { "epoch": 0.7841852408124074, "grad_norm": 0.7648863196372986, "learning_rate": 7.127510264527529e-05, "loss": 0.8919109344482422, "memory(GiB)": 91.52, "step": 60435, "token_acc": 0.7337848480075603, "train_speed(iter/s)": 0.142376 }, { "epoch": 0.7842501192140631, "grad_norm": 0.7169881463050842, "learning_rate": 7.127024855518196e-05, "loss": 0.9107383728027344, "memory(GiB)": 91.52, "step": 60440, "token_acc": 0.7588763033011189, "train_speed(iter/s)": 0.142373 }, { "epoch": 0.7843149976157188, "grad_norm": 0.8305321931838989, "learning_rate": 7.1265394220312e-05, "loss": 0.8800210952758789, "memory(GiB)": 91.52, "step": 60445, "token_acc": 0.7436469919623285, "train_speed(iter/s)": 0.142372 }, { "epoch": 0.7843798760173745, "grad_norm": 0.7517331838607788, "learning_rate": 7.126053964072122e-05, "loss": 0.8774157524108886, "memory(GiB)": 91.52, "step": 60450, "token_acc": 0.7728035092246162, "train_speed(iter/s)": 0.14237 }, { "epoch": 0.7844447544190302, "grad_norm": 0.7684587240219116, "learning_rate": 7.125568481646555e-05, "loss": 0.9049581527709961, "memory(GiB)": 91.52, "step": 60455, "token_acc": 0.7480638915779284, "train_speed(iter/s)": 0.142368 }, { "epoch": 0.7845096328206859, "grad_norm": 0.7966737151145935, "learning_rate": 7.12508297476008e-05, "loss": 0.8877830505371094, "memory(GiB)": 91.52, "step": 60460, "token_acc": 0.7542141230068337, "train_speed(iter/s)": 0.142366 }, { "epoch": 0.7845745112223416, "grad_norm": 0.7899027466773987, "learning_rate": 7.124597443418288e-05, "loss": 0.8867260932922363, "memory(GiB)": 91.52, "step": 60465, "token_acc": 0.7491201772679875, "train_speed(iter/s)": 0.142364 }, { "epoch": 0.7846393896239973, "grad_norm": 0.817398726940155, "learning_rate": 7.124111887626765e-05, "loss": 0.8668371200561523, "memory(GiB)": 91.52, "step": 60470, "token_acc": 0.7478068531153933, "train_speed(iter/s)": 0.142363 }, { "epoch": 0.784704268025653, "grad_norm": 0.7320149540901184, "learning_rate": 7.1236263073911e-05, "loss": 0.8980817794799805, "memory(GiB)": 91.52, "step": 60475, "token_acc": 0.7499350817969359, "train_speed(iter/s)": 0.142361 }, { "epoch": 0.7847691464273087, "grad_norm": 0.7110080718994141, "learning_rate": 7.12314070271688e-05, "loss": 0.9018686294555665, "memory(GiB)": 91.52, "step": 60480, "token_acc": 0.7494144367946625, "train_speed(iter/s)": 0.14236 }, { "epoch": 0.7848340248289644, "grad_norm": 0.8011425733566284, "learning_rate": 7.122655073609692e-05, "loss": 0.8794230461120606, "memory(GiB)": 91.52, "step": 60485, "token_acc": 0.7614860787863115, "train_speed(iter/s)": 0.142358 }, { "epoch": 0.7848989032306201, "grad_norm": 0.7565678954124451, "learning_rate": 7.122169420075126e-05, "loss": 0.8800686836242676, "memory(GiB)": 91.52, "step": 60490, "token_acc": 0.7584302030026774, "train_speed(iter/s)": 0.142356 }, { "epoch": 0.7849637816322758, "grad_norm": 0.702613353729248, "learning_rate": 7.121683742118772e-05, "loss": 0.8725666046142578, "memory(GiB)": 91.52, "step": 60495, "token_acc": 0.7618273157685497, "train_speed(iter/s)": 0.142355 }, { "epoch": 0.7850286600339315, "grad_norm": 0.760217010974884, "learning_rate": 7.121198039746218e-05, "loss": 0.8770558357238769, "memory(GiB)": 91.52, "step": 60500, "token_acc": 0.7577667219458264, "train_speed(iter/s)": 0.142353 }, { "epoch": 0.785093538435587, "grad_norm": 0.7623838782310486, "learning_rate": 7.120712312963055e-05, "loss": 0.8687602043151855, "memory(GiB)": 91.52, "step": 60505, "token_acc": 0.7474727374034864, "train_speed(iter/s)": 0.142351 }, { "epoch": 0.7851584168372427, "grad_norm": 0.7281264662742615, "learning_rate": 7.120226561774868e-05, "loss": 0.8544708251953125, "memory(GiB)": 91.52, "step": 60510, "token_acc": 0.7587850310454775, "train_speed(iter/s)": 0.14235 }, { "epoch": 0.7852232952388984, "grad_norm": 0.7144094109535217, "learning_rate": 7.119740786187252e-05, "loss": 0.8219938278198242, "memory(GiB)": 91.52, "step": 60515, "token_acc": 0.7724576981068856, "train_speed(iter/s)": 0.142348 }, { "epoch": 0.7852881736405541, "grad_norm": 0.7649825215339661, "learning_rate": 7.119254986205795e-05, "loss": 0.8721267700195312, "memory(GiB)": 91.52, "step": 60520, "token_acc": 0.7731499269960809, "train_speed(iter/s)": 0.142346 }, { "epoch": 0.7853530520422098, "grad_norm": 0.7774695158004761, "learning_rate": 7.118769161836088e-05, "loss": 0.87014741897583, "memory(GiB)": 91.52, "step": 60525, "token_acc": 0.7651581139935618, "train_speed(iter/s)": 0.142344 }, { "epoch": 0.7854179304438655, "grad_norm": 0.8003538250923157, "learning_rate": 7.118283313083722e-05, "loss": 0.906119441986084, "memory(GiB)": 91.52, "step": 60530, "token_acc": 0.7780322612309001, "train_speed(iter/s)": 0.142342 }, { "epoch": 0.7854828088455212, "grad_norm": 0.647639274597168, "learning_rate": 7.117797439954289e-05, "loss": 0.8682080268859863, "memory(GiB)": 91.52, "step": 60535, "token_acc": 0.7587887225896276, "train_speed(iter/s)": 0.14234 }, { "epoch": 0.7855476872471769, "grad_norm": 0.8139553666114807, "learning_rate": 7.117311542453379e-05, "loss": 0.8892231941223144, "memory(GiB)": 91.52, "step": 60540, "token_acc": 0.7493284936479129, "train_speed(iter/s)": 0.142338 }, { "epoch": 0.7856125656488326, "grad_norm": 0.7861557006835938, "learning_rate": 7.116825620586583e-05, "loss": 0.8563432693481445, "memory(GiB)": 91.52, "step": 60545, "token_acc": 0.778475204639724, "train_speed(iter/s)": 0.142337 }, { "epoch": 0.7856774440504883, "grad_norm": 0.7507283091545105, "learning_rate": 7.116339674359495e-05, "loss": 0.9063703536987304, "memory(GiB)": 91.52, "step": 60550, "token_acc": 0.765125128998968, "train_speed(iter/s)": 0.142334 }, { "epoch": 0.785742322452144, "grad_norm": 0.7482241988182068, "learning_rate": 7.115853703777705e-05, "loss": 0.9016183853149414, "memory(GiB)": 91.52, "step": 60555, "token_acc": 0.7497928220819787, "train_speed(iter/s)": 0.142333 }, { "epoch": 0.7858072008537997, "grad_norm": 0.7383410930633545, "learning_rate": 7.115367708846807e-05, "loss": 0.8887968063354492, "memory(GiB)": 91.52, "step": 60560, "token_acc": 0.7672101449275363, "train_speed(iter/s)": 0.14233 }, { "epoch": 0.7858720792554554, "grad_norm": 0.6910561323165894, "learning_rate": 7.114881689572393e-05, "loss": 0.8937124252319336, "memory(GiB)": 91.52, "step": 60565, "token_acc": 0.7468777981997267, "train_speed(iter/s)": 0.142329 }, { "epoch": 0.7859369576571111, "grad_norm": 0.6694542169570923, "learning_rate": 7.114395645960059e-05, "loss": 0.8662248611450195, "memory(GiB)": 91.52, "step": 60570, "token_acc": 0.7562653894154552, "train_speed(iter/s)": 0.142327 }, { "epoch": 0.7860018360587668, "grad_norm": 0.801231324672699, "learning_rate": 7.113909578015395e-05, "loss": 0.8991491317749023, "memory(GiB)": 91.52, "step": 60575, "token_acc": 0.7620538043073254, "train_speed(iter/s)": 0.142325 }, { "epoch": 0.7860667144604225, "grad_norm": 0.6772239804267883, "learning_rate": 7.113423485743995e-05, "loss": 0.851678466796875, "memory(GiB)": 91.52, "step": 60580, "token_acc": 0.7727491396218037, "train_speed(iter/s)": 0.142323 }, { "epoch": 0.7861315928620782, "grad_norm": 0.6753326654434204, "learning_rate": 7.112937369151453e-05, "loss": 0.8335196495056152, "memory(GiB)": 91.52, "step": 60585, "token_acc": 0.7709270034382438, "train_speed(iter/s)": 0.142321 }, { "epoch": 0.7861964712637339, "grad_norm": 0.6743677258491516, "learning_rate": 7.112451228243366e-05, "loss": 0.8792852401733399, "memory(GiB)": 91.52, "step": 60590, "token_acc": 0.7600649687251616, "train_speed(iter/s)": 0.142318 }, { "epoch": 0.7862613496653896, "grad_norm": 0.7275748252868652, "learning_rate": 7.111965063025324e-05, "loss": 0.8787426948547363, "memory(GiB)": 91.52, "step": 60595, "token_acc": 0.7513020422335154, "train_speed(iter/s)": 0.142317 }, { "epoch": 0.7863262280670453, "grad_norm": 0.8241020441055298, "learning_rate": 7.111478873502923e-05, "loss": 0.9306562423706055, "memory(GiB)": 91.52, "step": 60600, "token_acc": 0.7520831190206769, "train_speed(iter/s)": 0.142315 }, { "epoch": 0.786391106468701, "grad_norm": 0.715445339679718, "learning_rate": 7.110992659681763e-05, "loss": 0.8547077178955078, "memory(GiB)": 91.52, "step": 60605, "token_acc": 0.7694723137014629, "train_speed(iter/s)": 0.142313 }, { "epoch": 0.7864559848703567, "grad_norm": 0.8037269711494446, "learning_rate": 7.110506421567433e-05, "loss": 0.875767707824707, "memory(GiB)": 91.52, "step": 60610, "token_acc": 0.762426714249299, "train_speed(iter/s)": 0.142311 }, { "epoch": 0.7865208632720124, "grad_norm": 0.7288603782653809, "learning_rate": 7.11002015916553e-05, "loss": 0.8604846954345703, "memory(GiB)": 91.52, "step": 60615, "token_acc": 0.762471300354832, "train_speed(iter/s)": 0.14231 }, { "epoch": 0.7865857416736681, "grad_norm": 0.744269609451294, "learning_rate": 7.109533872481652e-05, "loss": 0.889798355102539, "memory(GiB)": 91.52, "step": 60620, "token_acc": 0.7534285210646924, "train_speed(iter/s)": 0.142307 }, { "epoch": 0.7866506200753238, "grad_norm": 0.7427534461021423, "learning_rate": 7.109047561521393e-05, "loss": 0.8537298202514648, "memory(GiB)": 91.52, "step": 60625, "token_acc": 0.7771751412429379, "train_speed(iter/s)": 0.142305 }, { "epoch": 0.7867154984769795, "grad_norm": 0.6811725497245789, "learning_rate": 7.108561226290352e-05, "loss": 0.7926095962524414, "memory(GiB)": 91.52, "step": 60630, "token_acc": 0.7746978851963746, "train_speed(iter/s)": 0.142303 }, { "epoch": 0.7867803768786352, "grad_norm": 0.729866087436676, "learning_rate": 7.108074866794123e-05, "loss": 0.8943414688110352, "memory(GiB)": 91.52, "step": 60635, "token_acc": 0.7454834513658043, "train_speed(iter/s)": 0.142301 }, { "epoch": 0.7868452552802909, "grad_norm": 0.7163184285163879, "learning_rate": 7.107588483038304e-05, "loss": 0.8572944641113281, "memory(GiB)": 91.52, "step": 60640, "token_acc": 0.7704878717281074, "train_speed(iter/s)": 0.142299 }, { "epoch": 0.7869101336819466, "grad_norm": 0.7003957033157349, "learning_rate": 7.107102075028494e-05, "loss": 0.8675338745117187, "memory(GiB)": 91.52, "step": 60645, "token_acc": 0.7628184963825102, "train_speed(iter/s)": 0.142297 }, { "epoch": 0.7869750120836023, "grad_norm": 0.7847926616668701, "learning_rate": 7.106615642770286e-05, "loss": 0.8888601303100586, "memory(GiB)": 91.52, "step": 60650, "token_acc": 0.7345724632325771, "train_speed(iter/s)": 0.142295 }, { "epoch": 0.787039890485258, "grad_norm": 0.76468425989151, "learning_rate": 7.106129186269283e-05, "loss": 0.8856521606445312, "memory(GiB)": 91.52, "step": 60655, "token_acc": 0.7578454014400218, "train_speed(iter/s)": 0.142293 }, { "epoch": 0.7871047688869137, "grad_norm": 0.7476505041122437, "learning_rate": 7.105642705531081e-05, "loss": 0.8891185760498047, "memory(GiB)": 91.52, "step": 60660, "token_acc": 0.7610973256134547, "train_speed(iter/s)": 0.142291 }, { "epoch": 0.7871696472885694, "grad_norm": 0.767639696598053, "learning_rate": 7.105156200561278e-05, "loss": 0.8509523391723632, "memory(GiB)": 91.52, "step": 60665, "token_acc": 0.7752661568451042, "train_speed(iter/s)": 0.142289 }, { "epoch": 0.7872345256902251, "grad_norm": 0.7061169147491455, "learning_rate": 7.104669671365472e-05, "loss": 0.864459228515625, "memory(GiB)": 91.52, "step": 60670, "token_acc": 0.7570067000094366, "train_speed(iter/s)": 0.142286 }, { "epoch": 0.7872994040918808, "grad_norm": 0.6362441182136536, "learning_rate": 7.104183117949264e-05, "loss": 0.8590502738952637, "memory(GiB)": 91.52, "step": 60675, "token_acc": 0.7455090807211839, "train_speed(iter/s)": 0.142284 }, { "epoch": 0.7873642824935365, "grad_norm": 0.6632702350616455, "learning_rate": 7.103696540318251e-05, "loss": 0.8641029357910156, "memory(GiB)": 91.52, "step": 60680, "token_acc": 0.7805371659863479, "train_speed(iter/s)": 0.142282 }, { "epoch": 0.7874291608951922, "grad_norm": 0.8025270104408264, "learning_rate": 7.103209938478035e-05, "loss": 0.851832389831543, "memory(GiB)": 91.52, "step": 60685, "token_acc": 0.7804010127968247, "train_speed(iter/s)": 0.14228 }, { "epoch": 0.7874940392968479, "grad_norm": 0.8001232743263245, "learning_rate": 7.102723312434213e-05, "loss": 0.889559555053711, "memory(GiB)": 91.52, "step": 60690, "token_acc": 0.7842626573536956, "train_speed(iter/s)": 0.142279 }, { "epoch": 0.7875589176985036, "grad_norm": 0.7352542877197266, "learning_rate": 7.102236662192386e-05, "loss": 0.8809991836547851, "memory(GiB)": 91.52, "step": 60695, "token_acc": 0.7700245117419151, "train_speed(iter/s)": 0.142278 }, { "epoch": 0.7876237961001593, "grad_norm": 0.7232165932655334, "learning_rate": 7.101749987758158e-05, "loss": 0.8828760147094726, "memory(GiB)": 91.52, "step": 60700, "token_acc": 0.7756012468186108, "train_speed(iter/s)": 0.142276 }, { "epoch": 0.787688674501815, "grad_norm": 0.702211320400238, "learning_rate": 7.101263289137122e-05, "loss": 0.8889533996582031, "memory(GiB)": 91.52, "step": 60705, "token_acc": 0.7706434545144223, "train_speed(iter/s)": 0.142273 }, { "epoch": 0.7877535529034707, "grad_norm": 0.7373394966125488, "learning_rate": 7.100776566334886e-05, "loss": 0.8813188552856446, "memory(GiB)": 91.52, "step": 60710, "token_acc": 0.7684580782629282, "train_speed(iter/s)": 0.142272 }, { "epoch": 0.7878184313051264, "grad_norm": 0.8099038004875183, "learning_rate": 7.100289819357047e-05, "loss": 0.8861814498901367, "memory(GiB)": 91.52, "step": 60715, "token_acc": 0.755982545045045, "train_speed(iter/s)": 0.14227 }, { "epoch": 0.7878833097067821, "grad_norm": 0.836763322353363, "learning_rate": 7.09980304820921e-05, "loss": 0.8601107597351074, "memory(GiB)": 91.52, "step": 60720, "token_acc": 0.7810412926391382, "train_speed(iter/s)": 0.142269 }, { "epoch": 0.7879481881084378, "grad_norm": 0.6836109161376953, "learning_rate": 7.099316252896974e-05, "loss": 0.8470465660095214, "memory(GiB)": 91.52, "step": 60725, "token_acc": 0.7668975847219622, "train_speed(iter/s)": 0.142267 }, { "epoch": 0.7880130665100935, "grad_norm": 0.7291046380996704, "learning_rate": 7.09882943342594e-05, "loss": 0.9352166175842285, "memory(GiB)": 91.52, "step": 60730, "token_acc": 0.7378264301156446, "train_speed(iter/s)": 0.142265 }, { "epoch": 0.7880779449117492, "grad_norm": 1.2273731231689453, "learning_rate": 7.098342589801714e-05, "loss": 0.9113829612731934, "memory(GiB)": 91.52, "step": 60735, "token_acc": 0.7539716902581183, "train_speed(iter/s)": 0.142263 }, { "epoch": 0.7881428233134049, "grad_norm": 0.6499683856964111, "learning_rate": 7.097855722029895e-05, "loss": 0.9150894165039063, "memory(GiB)": 91.52, "step": 60740, "token_acc": 0.7505200250932743, "train_speed(iter/s)": 0.142261 }, { "epoch": 0.7882077017150605, "grad_norm": 0.673329770565033, "learning_rate": 7.097368830116089e-05, "loss": 0.8396305084228516, "memory(GiB)": 91.52, "step": 60745, "token_acc": 0.7665503973533748, "train_speed(iter/s)": 0.142259 }, { "epoch": 0.7882725801167162, "grad_norm": 0.7457724809646606, "learning_rate": 7.096881914065896e-05, "loss": 0.8688385009765625, "memory(GiB)": 91.52, "step": 60750, "token_acc": 0.7593317583530206, "train_speed(iter/s)": 0.142256 }, { "epoch": 0.7883374585183719, "grad_norm": 0.7289028763771057, "learning_rate": 7.096394973884922e-05, "loss": 0.870570182800293, "memory(GiB)": 91.52, "step": 60755, "token_acc": 0.7729424033370769, "train_speed(iter/s)": 0.142254 }, { "epoch": 0.7884023369200276, "grad_norm": 0.8773207664489746, "learning_rate": 7.095908009578769e-05, "loss": 0.8574764251708984, "memory(GiB)": 91.52, "step": 60760, "token_acc": 0.7660973921070852, "train_speed(iter/s)": 0.142253 }, { "epoch": 0.7884672153216833, "grad_norm": 0.6968557834625244, "learning_rate": 7.09542102115304e-05, "loss": 0.9036296844482422, "memory(GiB)": 91.52, "step": 60765, "token_acc": 0.7588490691000074, "train_speed(iter/s)": 0.142251 }, { "epoch": 0.788532093723339, "grad_norm": 0.7891998291015625, "learning_rate": 7.094934008613344e-05, "loss": 0.8388651847839356, "memory(GiB)": 91.52, "step": 60770, "token_acc": 0.7746166049709149, "train_speed(iter/s)": 0.14225 }, { "epoch": 0.7885969721249947, "grad_norm": 0.6998284459114075, "learning_rate": 7.09444697196528e-05, "loss": 0.9023102760314942, "memory(GiB)": 91.52, "step": 60775, "token_acc": 0.7646682653876898, "train_speed(iter/s)": 0.142248 }, { "epoch": 0.7886618505266504, "grad_norm": 0.7657215595245361, "learning_rate": 7.093959911214456e-05, "loss": 0.8476003646850586, "memory(GiB)": 91.52, "step": 60780, "token_acc": 0.7603615896298823, "train_speed(iter/s)": 0.142246 }, { "epoch": 0.7887267289283061, "grad_norm": 0.6589831709861755, "learning_rate": 7.093472826366475e-05, "loss": 0.8987354278564453, "memory(GiB)": 91.52, "step": 60785, "token_acc": 0.7518408962175089, "train_speed(iter/s)": 0.142244 }, { "epoch": 0.7887916073299618, "grad_norm": 0.7146768569946289, "learning_rate": 7.092985717426944e-05, "loss": 0.8903056144714355, "memory(GiB)": 91.52, "step": 60790, "token_acc": 0.7545566324316505, "train_speed(iter/s)": 0.142241 }, { "epoch": 0.7888564857316175, "grad_norm": 0.733491837978363, "learning_rate": 7.092498584401469e-05, "loss": 0.8839939117431641, "memory(GiB)": 91.52, "step": 60795, "token_acc": 0.7805395574416489, "train_speed(iter/s)": 0.142239 }, { "epoch": 0.7889213641332732, "grad_norm": 0.7485752701759338, "learning_rate": 7.092011427295653e-05, "loss": 0.8781404495239258, "memory(GiB)": 91.52, "step": 60800, "token_acc": 0.7687775143921436, "train_speed(iter/s)": 0.142238 }, { "epoch": 0.7889862425349289, "grad_norm": 0.732429027557373, "learning_rate": 7.091524246115107e-05, "loss": 0.8833244323730469, "memory(GiB)": 91.52, "step": 60805, "token_acc": 0.7699049863740148, "train_speed(iter/s)": 0.142236 }, { "epoch": 0.7890511209365846, "grad_norm": 0.6889779567718506, "learning_rate": 7.091037040865432e-05, "loss": 0.8560120582580566, "memory(GiB)": 91.52, "step": 60810, "token_acc": 0.7498971616618676, "train_speed(iter/s)": 0.142234 }, { "epoch": 0.7891159993382403, "grad_norm": 0.8191732168197632, "learning_rate": 7.090549811552238e-05, "loss": 0.8647579193115235, "memory(GiB)": 91.52, "step": 60815, "token_acc": 0.7682945617008166, "train_speed(iter/s)": 0.142232 }, { "epoch": 0.789180877739896, "grad_norm": 0.7668706178665161, "learning_rate": 7.090062558181132e-05, "loss": 0.9088240623474121, "memory(GiB)": 91.52, "step": 60820, "token_acc": 0.7575237234523271, "train_speed(iter/s)": 0.14223 }, { "epoch": 0.7892457561415517, "grad_norm": 0.7865003943443298, "learning_rate": 7.08957528075772e-05, "loss": 0.8794270515441894, "memory(GiB)": 91.52, "step": 60825, "token_acc": 0.7631785082247817, "train_speed(iter/s)": 0.142228 }, { "epoch": 0.7893106345432074, "grad_norm": 0.7347919344902039, "learning_rate": 7.08908797928761e-05, "loss": 0.8728174209594727, "memory(GiB)": 91.52, "step": 60830, "token_acc": 0.7605163033815738, "train_speed(iter/s)": 0.142226 }, { "epoch": 0.7893755129448631, "grad_norm": 0.718539834022522, "learning_rate": 7.088600653776409e-05, "loss": 0.8163364410400391, "memory(GiB)": 91.52, "step": 60835, "token_acc": 0.7884827206388545, "train_speed(iter/s)": 0.142224 }, { "epoch": 0.7894403913465188, "grad_norm": 0.7657061815261841, "learning_rate": 7.088113304229729e-05, "loss": 0.8892882347106934, "memory(GiB)": 91.52, "step": 60840, "token_acc": 0.7454545454545455, "train_speed(iter/s)": 0.142223 }, { "epoch": 0.7895052697481745, "grad_norm": 0.7978814244270325, "learning_rate": 7.087625930653174e-05, "loss": 0.8568492889404297, "memory(GiB)": 91.52, "step": 60845, "token_acc": 0.7589789587771424, "train_speed(iter/s)": 0.14222 }, { "epoch": 0.7895701481498302, "grad_norm": 0.7318153381347656, "learning_rate": 7.087138533052353e-05, "loss": 0.8042127609252929, "memory(GiB)": 91.52, "step": 60850, "token_acc": 0.7671502837144507, "train_speed(iter/s)": 0.142219 }, { "epoch": 0.7896350265514859, "grad_norm": 0.9986652731895447, "learning_rate": 7.086651111432878e-05, "loss": 0.8760220527648925, "memory(GiB)": 91.52, "step": 60855, "token_acc": 0.7868103288129296, "train_speed(iter/s)": 0.142217 }, { "epoch": 0.7896999049531416, "grad_norm": 0.7812591791152954, "learning_rate": 7.086163665800355e-05, "loss": 0.9131110191345215, "memory(GiB)": 91.52, "step": 60860, "token_acc": 0.7639891549341783, "train_speed(iter/s)": 0.142215 }, { "epoch": 0.7897647833547973, "grad_norm": 0.7078964114189148, "learning_rate": 7.085676196160396e-05, "loss": 0.902766227722168, "memory(GiB)": 91.52, "step": 60865, "token_acc": 0.7604119863132064, "train_speed(iter/s)": 0.142214 }, { "epoch": 0.789829661756453, "grad_norm": 0.7559180855751038, "learning_rate": 7.085188702518608e-05, "loss": 0.8040565490722656, "memory(GiB)": 91.52, "step": 60870, "token_acc": 0.7946551988941791, "train_speed(iter/s)": 0.142211 }, { "epoch": 0.7898945401581087, "grad_norm": 0.7439594864845276, "learning_rate": 7.084701184880603e-05, "loss": 0.8625625610351563, "memory(GiB)": 91.52, "step": 60875, "token_acc": 0.767135073128764, "train_speed(iter/s)": 0.14221 }, { "epoch": 0.7899594185597644, "grad_norm": 0.7808445692062378, "learning_rate": 7.084213643251991e-05, "loss": 0.8963953018188476, "memory(GiB)": 91.52, "step": 60880, "token_acc": 0.7710935821491053, "train_speed(iter/s)": 0.142208 }, { "epoch": 0.7900242969614201, "grad_norm": 0.6717284321784973, "learning_rate": 7.083726077638384e-05, "loss": 0.8503005981445313, "memory(GiB)": 91.52, "step": 60885, "token_acc": 0.7654536135315223, "train_speed(iter/s)": 0.142206 }, { "epoch": 0.7900891753630758, "grad_norm": 0.7397410869598389, "learning_rate": 7.08323848804539e-05, "loss": 0.9004793167114258, "memory(GiB)": 91.52, "step": 60890, "token_acc": 0.7659282856719514, "train_speed(iter/s)": 0.142204 }, { "epoch": 0.7901540537647315, "grad_norm": 0.8041763305664062, "learning_rate": 7.082750874478623e-05, "loss": 0.9012888908386231, "memory(GiB)": 91.52, "step": 60895, "token_acc": 0.7528401918707397, "train_speed(iter/s)": 0.142203 }, { "epoch": 0.7902189321663872, "grad_norm": 0.713982105255127, "learning_rate": 7.08226323694369e-05, "loss": 0.8612808227539063, "memory(GiB)": 91.52, "step": 60900, "token_acc": 0.7592586485506051, "train_speed(iter/s)": 0.1422 }, { "epoch": 0.7902838105680429, "grad_norm": 0.7589812278747559, "learning_rate": 7.081775575446207e-05, "loss": 0.9191793441772461, "memory(GiB)": 91.52, "step": 60905, "token_acc": 0.7310665663268796, "train_speed(iter/s)": 0.142198 }, { "epoch": 0.7903486889696986, "grad_norm": 0.7277398109436035, "learning_rate": 7.081287889991785e-05, "loss": 0.8925729751586914, "memory(GiB)": 91.52, "step": 60910, "token_acc": 0.762049229037, "train_speed(iter/s)": 0.142196 }, { "epoch": 0.7904135673713543, "grad_norm": 0.8222147226333618, "learning_rate": 7.080800180586037e-05, "loss": 0.8837088584899903, "memory(GiB)": 91.52, "step": 60915, "token_acc": 0.7775064226585049, "train_speed(iter/s)": 0.142194 }, { "epoch": 0.79047844577301, "grad_norm": 0.687095582485199, "learning_rate": 7.080312447234574e-05, "loss": 0.8486359596252442, "memory(GiB)": 91.52, "step": 60920, "token_acc": 0.7807294544880398, "train_speed(iter/s)": 0.142192 }, { "epoch": 0.7905433241746657, "grad_norm": 0.6661247611045837, "learning_rate": 7.079824689943007e-05, "loss": 0.8120286941528321, "memory(GiB)": 91.52, "step": 60925, "token_acc": 0.7725009022013714, "train_speed(iter/s)": 0.14219 }, { "epoch": 0.7906082025763214, "grad_norm": 0.7516220808029175, "learning_rate": 7.079336908716953e-05, "loss": 0.8768719673156739, "memory(GiB)": 91.52, "step": 60930, "token_acc": 0.757851482242442, "train_speed(iter/s)": 0.142188 }, { "epoch": 0.7906730809779771, "grad_norm": 0.7670546770095825, "learning_rate": 7.078849103562025e-05, "loss": 0.872402286529541, "memory(GiB)": 91.52, "step": 60935, "token_acc": 0.77, "train_speed(iter/s)": 0.142187 }, { "epoch": 0.7907379593796328, "grad_norm": 0.69588303565979, "learning_rate": 7.078361274483834e-05, "loss": 0.8814411163330078, "memory(GiB)": 91.52, "step": 60940, "token_acc": 0.7525504420766266, "train_speed(iter/s)": 0.142185 }, { "epoch": 0.7908028377812885, "grad_norm": 0.6723767518997192, "learning_rate": 7.077873421487995e-05, "loss": 0.8637392044067382, "memory(GiB)": 91.52, "step": 60945, "token_acc": 0.765551885494082, "train_speed(iter/s)": 0.142183 }, { "epoch": 0.7908677161829442, "grad_norm": 0.795242428779602, "learning_rate": 7.077385544580121e-05, "loss": 0.9070966720581055, "memory(GiB)": 91.52, "step": 60950, "token_acc": 0.7423651359786001, "train_speed(iter/s)": 0.142182 }, { "epoch": 0.7909325945845999, "grad_norm": 0.717934250831604, "learning_rate": 7.07689764376583e-05, "loss": 0.8554146766662598, "memory(GiB)": 91.52, "step": 60955, "token_acc": 0.7509915763230461, "train_speed(iter/s)": 0.14218 }, { "epoch": 0.7909974729862556, "grad_norm": 0.7527301907539368, "learning_rate": 7.076409719050734e-05, "loss": 0.9215951919555664, "memory(GiB)": 91.52, "step": 60960, "token_acc": 0.7533240884248308, "train_speed(iter/s)": 0.142178 }, { "epoch": 0.7910623513879113, "grad_norm": 0.7602246403694153, "learning_rate": 7.075921770440449e-05, "loss": 0.8731767654418945, "memory(GiB)": 91.52, "step": 60965, "token_acc": 0.764218059355925, "train_speed(iter/s)": 0.142176 }, { "epoch": 0.791127229789567, "grad_norm": 0.730434000492096, "learning_rate": 7.07543379794059e-05, "loss": 0.900901985168457, "memory(GiB)": 91.52, "step": 60970, "token_acc": 0.7518741936608668, "train_speed(iter/s)": 0.142174 }, { "epoch": 0.7911921081912227, "grad_norm": 0.7728828191757202, "learning_rate": 7.074945801556772e-05, "loss": 0.9078045845031738, "memory(GiB)": 91.52, "step": 60975, "token_acc": 0.757035260613782, "train_speed(iter/s)": 0.142172 }, { "epoch": 0.7912569865928784, "grad_norm": 0.7938775420188904, "learning_rate": 7.074457781294612e-05, "loss": 0.8530945777893066, "memory(GiB)": 91.52, "step": 60980, "token_acc": 0.7753954714640199, "train_speed(iter/s)": 0.142171 }, { "epoch": 0.7913218649945339, "grad_norm": 0.7219678163528442, "learning_rate": 7.073969737159725e-05, "loss": 0.8910165786743164, "memory(GiB)": 91.52, "step": 60985, "token_acc": 0.7507602124931306, "train_speed(iter/s)": 0.142168 }, { "epoch": 0.7913867433961896, "grad_norm": 0.7480489611625671, "learning_rate": 7.073481669157727e-05, "loss": 0.8896595001220703, "memory(GiB)": 91.52, "step": 60990, "token_acc": 0.7631210146450844, "train_speed(iter/s)": 0.142167 }, { "epoch": 0.7914516217978453, "grad_norm": 0.7380498647689819, "learning_rate": 7.072993577294235e-05, "loss": 0.8599263191223144, "memory(GiB)": 91.52, "step": 60995, "token_acc": 0.7668361906324338, "train_speed(iter/s)": 0.142164 }, { "epoch": 0.791516500199501, "grad_norm": 0.7870045900344849, "learning_rate": 7.072505461574866e-05, "loss": 0.8894344329833984, "memory(GiB)": 91.52, "step": 61000, "token_acc": 0.7530172078174882, "train_speed(iter/s)": 0.142162 }, { "epoch": 0.7915813786011567, "grad_norm": 0.8553252220153809, "learning_rate": 7.07201732200524e-05, "loss": 0.8840145111083985, "memory(GiB)": 91.52, "step": 61005, "token_acc": 0.753662656416974, "train_speed(iter/s)": 0.14216 }, { "epoch": 0.7916462570028124, "grad_norm": 0.8146055340766907, "learning_rate": 7.07152915859097e-05, "loss": 0.8813119888305664, "memory(GiB)": 91.52, "step": 61010, "token_acc": 0.7493746469781328, "train_speed(iter/s)": 0.142158 }, { "epoch": 0.7917111354044681, "grad_norm": 0.8011586666107178, "learning_rate": 7.071040971337677e-05, "loss": 0.849891185760498, "memory(GiB)": 91.52, "step": 61015, "token_acc": 0.7500937945524124, "train_speed(iter/s)": 0.142156 }, { "epoch": 0.7917760138061238, "grad_norm": 0.7791101336479187, "learning_rate": 7.070552760250977e-05, "loss": 0.8840036392211914, "memory(GiB)": 91.52, "step": 61020, "token_acc": 0.7624170819444903, "train_speed(iter/s)": 0.142155 }, { "epoch": 0.7918408922077795, "grad_norm": 0.7701247930526733, "learning_rate": 7.07006452533649e-05, "loss": 0.8731069564819336, "memory(GiB)": 91.52, "step": 61025, "token_acc": 0.7459772506588986, "train_speed(iter/s)": 0.142152 }, { "epoch": 0.7919057706094352, "grad_norm": 0.8231725096702576, "learning_rate": 7.069576266599831e-05, "loss": 0.8701629638671875, "memory(GiB)": 91.52, "step": 61030, "token_acc": 0.7717097355769231, "train_speed(iter/s)": 0.142151 }, { "epoch": 0.7919706490110909, "grad_norm": 0.8176720142364502, "learning_rate": 7.069087984046623e-05, "loss": 0.8526170730590821, "memory(GiB)": 91.52, "step": 61035, "token_acc": 0.7796901345719467, "train_speed(iter/s)": 0.142149 }, { "epoch": 0.7920355274127466, "grad_norm": 0.7229297161102295, "learning_rate": 7.068599677682485e-05, "loss": 0.8616021156311036, "memory(GiB)": 91.52, "step": 61040, "token_acc": 0.7666678259659861, "train_speed(iter/s)": 0.142147 }, { "epoch": 0.7921004058144023, "grad_norm": 0.6845755577087402, "learning_rate": 7.068111347513034e-05, "loss": 0.8565913200378418, "memory(GiB)": 91.52, "step": 61045, "token_acc": 0.7571466560297833, "train_speed(iter/s)": 0.142145 }, { "epoch": 0.792165284216058, "grad_norm": 0.7031458020210266, "learning_rate": 7.067622993543892e-05, "loss": 0.8638079643249512, "memory(GiB)": 91.52, "step": 61050, "token_acc": 0.759483260802522, "train_speed(iter/s)": 0.142143 }, { "epoch": 0.7922301626177137, "grad_norm": 0.6230024099349976, "learning_rate": 7.067134615780675e-05, "loss": 0.8373661041259766, "memory(GiB)": 91.52, "step": 61055, "token_acc": 0.7678270949407322, "train_speed(iter/s)": 0.142141 }, { "epoch": 0.7922950410193694, "grad_norm": 0.7616822719573975, "learning_rate": 7.066646214229008e-05, "loss": 0.8646190643310547, "memory(GiB)": 91.52, "step": 61060, "token_acc": 0.7595829383886256, "train_speed(iter/s)": 0.142139 }, { "epoch": 0.7923599194210251, "grad_norm": 0.735575258731842, "learning_rate": 7.066157788894509e-05, "loss": 0.8922313690185547, "memory(GiB)": 91.52, "step": 61065, "token_acc": 0.7388450131627179, "train_speed(iter/s)": 0.142136 }, { "epoch": 0.7924247978226808, "grad_norm": 0.7300105094909668, "learning_rate": 7.065669339782799e-05, "loss": 0.8841178894042969, "memory(GiB)": 91.52, "step": 61070, "token_acc": 0.7550705240399465, "train_speed(iter/s)": 0.142133 }, { "epoch": 0.7924896762243365, "grad_norm": 0.7207289934158325, "learning_rate": 7.065180866899499e-05, "loss": 0.8744850158691406, "memory(GiB)": 91.52, "step": 61075, "token_acc": 0.7586103123119107, "train_speed(iter/s)": 0.142131 }, { "epoch": 0.7925545546259922, "grad_norm": 0.734548807144165, "learning_rate": 7.06469237025023e-05, "loss": 0.8748425483703614, "memory(GiB)": 91.52, "step": 61080, "token_acc": 0.754057244024786, "train_speed(iter/s)": 0.142129 }, { "epoch": 0.7926194330276479, "grad_norm": 0.7335665225982666, "learning_rate": 7.064203849840616e-05, "loss": 0.8780125617980957, "memory(GiB)": 91.52, "step": 61085, "token_acc": 0.7593420580268267, "train_speed(iter/s)": 0.142127 }, { "epoch": 0.7926843114293036, "grad_norm": 0.7994884848594666, "learning_rate": 7.063715305676273e-05, "loss": 0.8856733322143555, "memory(GiB)": 91.52, "step": 61090, "token_acc": 0.7525539555803493, "train_speed(iter/s)": 0.142125 }, { "epoch": 0.7927491898309593, "grad_norm": 0.6943832039833069, "learning_rate": 7.063226737762831e-05, "loss": 0.8634326934814454, "memory(GiB)": 91.52, "step": 61095, "token_acc": 0.7470617447442477, "train_speed(iter/s)": 0.142124 }, { "epoch": 0.792814068232615, "grad_norm": 0.6797778606414795, "learning_rate": 7.062738146105907e-05, "loss": 0.8769955635070801, "memory(GiB)": 91.52, "step": 61100, "token_acc": 0.7571628112969714, "train_speed(iter/s)": 0.142122 }, { "epoch": 0.7928789466342707, "grad_norm": 0.7881066799163818, "learning_rate": 7.062249530711126e-05, "loss": 0.86175537109375, "memory(GiB)": 91.52, "step": 61105, "token_acc": 0.7603746978243352, "train_speed(iter/s)": 0.142119 }, { "epoch": 0.7929438250359264, "grad_norm": 0.7114166617393494, "learning_rate": 7.06176089158411e-05, "loss": 0.8838604927062989, "memory(GiB)": 91.52, "step": 61110, "token_acc": 0.7431054764083126, "train_speed(iter/s)": 0.142117 }, { "epoch": 0.7930087034375821, "grad_norm": 0.7313322424888611, "learning_rate": 7.061272228730482e-05, "loss": 0.8537013053894043, "memory(GiB)": 91.52, "step": 61115, "token_acc": 0.7572808586762075, "train_speed(iter/s)": 0.142115 }, { "epoch": 0.7930735818392378, "grad_norm": 0.7500284314155579, "learning_rate": 7.060783542155866e-05, "loss": 0.8495059967041015, "memory(GiB)": 91.52, "step": 61120, "token_acc": 0.777357322233307, "train_speed(iter/s)": 0.142113 }, { "epoch": 0.7931384602408935, "grad_norm": 0.7630585432052612, "learning_rate": 7.060294831865883e-05, "loss": 0.8739410400390625, "memory(GiB)": 91.52, "step": 61125, "token_acc": 0.7624029292238361, "train_speed(iter/s)": 0.142111 }, { "epoch": 0.7932033386425492, "grad_norm": 0.7156805992126465, "learning_rate": 7.059806097866165e-05, "loss": 0.8709989547729492, "memory(GiB)": 91.52, "step": 61130, "token_acc": 0.7603097773475315, "train_speed(iter/s)": 0.14211 }, { "epoch": 0.7932682170442049, "grad_norm": 0.7797923684120178, "learning_rate": 7.059317340162325e-05, "loss": 0.8508872985839844, "memory(GiB)": 91.52, "step": 61135, "token_acc": 0.7646511627906977, "train_speed(iter/s)": 0.142107 }, { "epoch": 0.7933330954458606, "grad_norm": 0.8195279836654663, "learning_rate": 7.058828558759998e-05, "loss": 0.8951778411865234, "memory(GiB)": 91.52, "step": 61140, "token_acc": 0.7639228042536431, "train_speed(iter/s)": 0.142105 }, { "epoch": 0.7933979738475163, "grad_norm": 0.7097960114479065, "learning_rate": 7.058339753664803e-05, "loss": 0.8710126876831055, "memory(GiB)": 91.52, "step": 61145, "token_acc": 0.780238309352518, "train_speed(iter/s)": 0.142103 }, { "epoch": 0.793462852249172, "grad_norm": 0.7840011119842529, "learning_rate": 7.057850924882367e-05, "loss": 0.9024086952209472, "memory(GiB)": 91.52, "step": 61150, "token_acc": 0.765246293460806, "train_speed(iter/s)": 0.1421 }, { "epoch": 0.7935277306508277, "grad_norm": 0.6908212900161743, "learning_rate": 7.057362072418314e-05, "loss": 0.8574884414672852, "memory(GiB)": 91.52, "step": 61155, "token_acc": 0.7729583786247765, "train_speed(iter/s)": 0.142098 }, { "epoch": 0.7935926090524834, "grad_norm": 0.7715744376182556, "learning_rate": 7.056873196278271e-05, "loss": 0.8561311721801758, "memory(GiB)": 91.52, "step": 61160, "token_acc": 0.7626520432867602, "train_speed(iter/s)": 0.142096 }, { "epoch": 0.7936574874541391, "grad_norm": 0.7106292843818665, "learning_rate": 7.056384296467865e-05, "loss": 0.9046649932861328, "memory(GiB)": 91.52, "step": 61165, "token_acc": 0.7664722402246705, "train_speed(iter/s)": 0.142094 }, { "epoch": 0.7937223658557948, "grad_norm": 0.7805710434913635, "learning_rate": 7.055895372992718e-05, "loss": 0.8502622604370117, "memory(GiB)": 91.52, "step": 61170, "token_acc": 0.7576821088753705, "train_speed(iter/s)": 0.142093 }, { "epoch": 0.7937872442574505, "grad_norm": 0.7094125151634216, "learning_rate": 7.055406425858462e-05, "loss": 0.8728699684143066, "memory(GiB)": 91.52, "step": 61175, "token_acc": 0.765787860208461, "train_speed(iter/s)": 0.142091 }, { "epoch": 0.7938521226591062, "grad_norm": 0.7526530623435974, "learning_rate": 7.054917455070721e-05, "loss": 0.8717365264892578, "memory(GiB)": 91.52, "step": 61180, "token_acc": 0.7570847434451168, "train_speed(iter/s)": 0.142088 }, { "epoch": 0.7939170010607619, "grad_norm": 0.6919623613357544, "learning_rate": 7.054428460635122e-05, "loss": 0.8365158081054688, "memory(GiB)": 91.52, "step": 61185, "token_acc": 0.779631928722705, "train_speed(iter/s)": 0.142087 }, { "epoch": 0.7939818794624176, "grad_norm": 0.7610474824905396, "learning_rate": 7.053939442557291e-05, "loss": 0.853282356262207, "memory(GiB)": 91.52, "step": 61190, "token_acc": 0.7554270489569772, "train_speed(iter/s)": 0.142085 }, { "epoch": 0.7940467578640733, "grad_norm": 0.771689772605896, "learning_rate": 7.053450400842858e-05, "loss": 0.9082195281982421, "memory(GiB)": 91.52, "step": 61195, "token_acc": 0.7420017328241825, "train_speed(iter/s)": 0.142082 }, { "epoch": 0.794111636265729, "grad_norm": 0.7509384751319885, "learning_rate": 7.05296133549745e-05, "loss": 0.8992241859436035, "memory(GiB)": 91.52, "step": 61200, "token_acc": 0.7710606804365064, "train_speed(iter/s)": 0.142081 }, { "epoch": 0.7941765146673847, "grad_norm": 0.7424944043159485, "learning_rate": 7.052472246526695e-05, "loss": 0.8592128753662109, "memory(GiB)": 91.52, "step": 61205, "token_acc": 0.7863963622584312, "train_speed(iter/s)": 0.142079 }, { "epoch": 0.7942413930690404, "grad_norm": 0.758836567401886, "learning_rate": 7.051983133936221e-05, "loss": 0.8938475608825683, "memory(GiB)": 91.52, "step": 61210, "token_acc": 0.7547948854555141, "train_speed(iter/s)": 0.142078 }, { "epoch": 0.7943062714706961, "grad_norm": 0.7346078157424927, "learning_rate": 7.051493997731659e-05, "loss": 0.8747241973876954, "memory(GiB)": 91.52, "step": 61215, "token_acc": 0.7673503793200337, "train_speed(iter/s)": 0.142076 }, { "epoch": 0.7943711498723517, "grad_norm": 0.7198134064674377, "learning_rate": 7.051004837918635e-05, "loss": 0.8921625137329101, "memory(GiB)": 91.52, "step": 61220, "token_acc": 0.7616768700083906, "train_speed(iter/s)": 0.142074 }, { "epoch": 0.7944360282740074, "grad_norm": 0.6336640119552612, "learning_rate": 7.05051565450278e-05, "loss": 0.828897762298584, "memory(GiB)": 91.52, "step": 61225, "token_acc": 0.754687315750167, "train_speed(iter/s)": 0.142072 }, { "epoch": 0.7945009066756631, "grad_norm": 0.7261055111885071, "learning_rate": 7.05002644748972e-05, "loss": 0.8635967254638672, "memory(GiB)": 91.52, "step": 61230, "token_acc": 0.7631536669612017, "train_speed(iter/s)": 0.142071 }, { "epoch": 0.7945657850773188, "grad_norm": 0.6667258739471436, "learning_rate": 7.049537216885091e-05, "loss": 0.9097370147705078, "memory(GiB)": 91.52, "step": 61235, "token_acc": 0.7452771369786146, "train_speed(iter/s)": 0.142069 }, { "epoch": 0.7946306634789745, "grad_norm": 0.7406108975410461, "learning_rate": 7.049047962694517e-05, "loss": 0.8570903778076172, "memory(GiB)": 91.52, "step": 61240, "token_acc": 0.75426944971537, "train_speed(iter/s)": 0.142068 }, { "epoch": 0.7946955418806302, "grad_norm": 0.7082820534706116, "learning_rate": 7.048558684923633e-05, "loss": 0.8693209648132324, "memory(GiB)": 91.52, "step": 61245, "token_acc": 0.7802723592197276, "train_speed(iter/s)": 0.142066 }, { "epoch": 0.7947604202822859, "grad_norm": 0.8443446755409241, "learning_rate": 7.048069383578066e-05, "loss": 0.8755380630493164, "memory(GiB)": 91.52, "step": 61250, "token_acc": 0.7793785403228447, "train_speed(iter/s)": 0.142064 }, { "epoch": 0.7948252986839416, "grad_norm": 0.7542039155960083, "learning_rate": 7.04758005866345e-05, "loss": 0.8426176071166992, "memory(GiB)": 91.52, "step": 61255, "token_acc": 0.7677800580284481, "train_speed(iter/s)": 0.142062 }, { "epoch": 0.7948901770855973, "grad_norm": 0.7899487614631653, "learning_rate": 7.047090710185413e-05, "loss": 0.9153218269348145, "memory(GiB)": 91.52, "step": 61260, "token_acc": 0.7379049992669696, "train_speed(iter/s)": 0.142061 }, { "epoch": 0.794955055487253, "grad_norm": 0.719614565372467, "learning_rate": 7.046601338149588e-05, "loss": 0.8837067604064941, "memory(GiB)": 91.52, "step": 61265, "token_acc": 0.7531685456200546, "train_speed(iter/s)": 0.142059 }, { "epoch": 0.7950199338889087, "grad_norm": 0.7222021222114563, "learning_rate": 7.046111942561606e-05, "loss": 0.8802865982055664, "memory(GiB)": 91.52, "step": 61270, "token_acc": 0.7449670515688289, "train_speed(iter/s)": 0.142057 }, { "epoch": 0.7950848122905644, "grad_norm": 0.6772992014884949, "learning_rate": 7.0456225234271e-05, "loss": 0.8817981719970703, "memory(GiB)": 91.52, "step": 61275, "token_acc": 0.7805210162324865, "train_speed(iter/s)": 0.142054 }, { "epoch": 0.7951496906922201, "grad_norm": 0.7645671367645264, "learning_rate": 7.045133080751702e-05, "loss": 0.9027104377746582, "memory(GiB)": 91.52, "step": 61280, "token_acc": 0.7561820872892675, "train_speed(iter/s)": 0.142052 }, { "epoch": 0.7952145690938758, "grad_norm": 0.8102926015853882, "learning_rate": 7.044643614541045e-05, "loss": 0.917322826385498, "memory(GiB)": 91.52, "step": 61285, "token_acc": 0.7365828670502009, "train_speed(iter/s)": 0.142051 }, { "epoch": 0.7952794474955315, "grad_norm": 0.8284065127372742, "learning_rate": 7.04415412480076e-05, "loss": 0.8916170120239257, "memory(GiB)": 91.52, "step": 61290, "token_acc": 0.7455174333081541, "train_speed(iter/s)": 0.142049 }, { "epoch": 0.7953443258971872, "grad_norm": 0.6249005198478699, "learning_rate": 7.043664611536481e-05, "loss": 0.8269298553466797, "memory(GiB)": 91.52, "step": 61295, "token_acc": 0.7556008850781358, "train_speed(iter/s)": 0.142048 }, { "epoch": 0.7954092042988429, "grad_norm": 0.7011808753013611, "learning_rate": 7.04317507475384e-05, "loss": 0.9023218154907227, "memory(GiB)": 91.52, "step": 61300, "token_acc": 0.7511700769736588, "train_speed(iter/s)": 0.142046 }, { "epoch": 0.7954740827004986, "grad_norm": 0.715394914150238, "learning_rate": 7.042685514458473e-05, "loss": 0.8835842132568359, "memory(GiB)": 91.52, "step": 61305, "token_acc": 0.7458443426201545, "train_speed(iter/s)": 0.142043 }, { "epoch": 0.7955389611021543, "grad_norm": 0.6765944361686707, "learning_rate": 7.042195930656013e-05, "loss": 0.8823403358459473, "memory(GiB)": 91.52, "step": 61310, "token_acc": 0.7709143815036215, "train_speed(iter/s)": 0.142042 }, { "epoch": 0.79560383950381, "grad_norm": 0.7539440393447876, "learning_rate": 7.041706323352093e-05, "loss": 0.8991510391235351, "memory(GiB)": 91.52, "step": 61315, "token_acc": 0.757516315213701, "train_speed(iter/s)": 0.14204 }, { "epoch": 0.7956687179054657, "grad_norm": 0.7066920399665833, "learning_rate": 7.041216692552346e-05, "loss": 0.8571660041809082, "memory(GiB)": 91.52, "step": 61320, "token_acc": 0.7608734371103908, "train_speed(iter/s)": 0.142038 }, { "epoch": 0.7957335963071214, "grad_norm": 0.6151299476623535, "learning_rate": 7.040727038262412e-05, "loss": 0.8506004333496093, "memory(GiB)": 91.52, "step": 61325, "token_acc": 0.7581688450253401, "train_speed(iter/s)": 0.142035 }, { "epoch": 0.7957984747087771, "grad_norm": 0.7313352823257446, "learning_rate": 7.040237360487922e-05, "loss": 0.8866672515869141, "memory(GiB)": 91.52, "step": 61330, "token_acc": 0.7443326721450836, "train_speed(iter/s)": 0.142034 }, { "epoch": 0.7958633531104328, "grad_norm": 0.7866383194923401, "learning_rate": 7.039747659234511e-05, "loss": 0.9307531356811524, "memory(GiB)": 91.52, "step": 61335, "token_acc": 0.7702468530325323, "train_speed(iter/s)": 0.142031 }, { "epoch": 0.7959282315120885, "grad_norm": 0.7589394450187683, "learning_rate": 7.039257934507815e-05, "loss": 0.9104766845703125, "memory(GiB)": 91.52, "step": 61340, "token_acc": 0.7595108695652174, "train_speed(iter/s)": 0.14203 }, { "epoch": 0.7959931099137442, "grad_norm": 0.7440109252929688, "learning_rate": 7.03876818631347e-05, "loss": 0.8699148178100586, "memory(GiB)": 91.52, "step": 61345, "token_acc": 0.7742770167427702, "train_speed(iter/s)": 0.142028 }, { "epoch": 0.7960579883153999, "grad_norm": 0.7874877452850342, "learning_rate": 7.038278414657111e-05, "loss": 0.845771598815918, "memory(GiB)": 91.52, "step": 61350, "token_acc": 0.7820975283901136, "train_speed(iter/s)": 0.142026 }, { "epoch": 0.7961228667170556, "grad_norm": 0.7532999515533447, "learning_rate": 7.037788619544376e-05, "loss": 0.8659385681152344, "memory(GiB)": 91.52, "step": 61355, "token_acc": 0.7703664294739643, "train_speed(iter/s)": 0.142025 }, { "epoch": 0.7961877451187113, "grad_norm": 0.7809948921203613, "learning_rate": 7.037298800980901e-05, "loss": 0.8945344924926758, "memory(GiB)": 91.52, "step": 61360, "token_acc": 0.7369811320754717, "train_speed(iter/s)": 0.142023 }, { "epoch": 0.796252623520367, "grad_norm": 0.7394985556602478, "learning_rate": 7.036808958972324e-05, "loss": 0.8758001327514648, "memory(GiB)": 91.52, "step": 61365, "token_acc": 0.7596850970340535, "train_speed(iter/s)": 0.142021 }, { "epoch": 0.7963175019220227, "grad_norm": 0.6572214365005493, "learning_rate": 7.036319093524279e-05, "loss": 0.8093677520751953, "memory(GiB)": 91.52, "step": 61370, "token_acc": 0.779856330439988, "train_speed(iter/s)": 0.142018 }, { "epoch": 0.7963823803236784, "grad_norm": 0.7270228266716003, "learning_rate": 7.035829204642405e-05, "loss": 0.8667797088623047, "memory(GiB)": 91.52, "step": 61375, "token_acc": 0.7853944977939268, "train_speed(iter/s)": 0.142017 }, { "epoch": 0.7964472587253341, "grad_norm": 0.7458961009979248, "learning_rate": 7.035339292332339e-05, "loss": 0.9380460739135742, "memory(GiB)": 91.52, "step": 61380, "token_acc": 0.7597188934476174, "train_speed(iter/s)": 0.142015 }, { "epoch": 0.7965121371269898, "grad_norm": 0.8441410660743713, "learning_rate": 7.03484935659972e-05, "loss": 0.8948629379272461, "memory(GiB)": 91.52, "step": 61385, "token_acc": 0.7609622931240403, "train_speed(iter/s)": 0.142014 }, { "epoch": 0.7965770155286455, "grad_norm": 0.7560097575187683, "learning_rate": 7.034359397450185e-05, "loss": 0.8658367156982422, "memory(GiB)": 91.52, "step": 61390, "token_acc": 0.7447620369085611, "train_speed(iter/s)": 0.142012 }, { "epoch": 0.7966418939303012, "grad_norm": 0.7209262251853943, "learning_rate": 7.033869414889374e-05, "loss": 0.8927169799804687, "memory(GiB)": 91.52, "step": 61395, "token_acc": 0.7698529120247394, "train_speed(iter/s)": 0.14201 }, { "epoch": 0.7967067723319569, "grad_norm": 0.7030488848686218, "learning_rate": 7.033379408922925e-05, "loss": 0.8349742889404297, "memory(GiB)": 91.52, "step": 61400, "token_acc": 0.7555337407770987, "train_speed(iter/s)": 0.142009 }, { "epoch": 0.7967716507336126, "grad_norm": 0.679821252822876, "learning_rate": 7.032889379556475e-05, "loss": 0.8848737716674805, "memory(GiB)": 91.52, "step": 61405, "token_acc": 0.773317050831934, "train_speed(iter/s)": 0.142007 }, { "epoch": 0.7968365291352683, "grad_norm": 0.7373642921447754, "learning_rate": 7.032399326795665e-05, "loss": 0.8854955673217774, "memory(GiB)": 91.52, "step": 61410, "token_acc": 0.767196160857111, "train_speed(iter/s)": 0.142005 }, { "epoch": 0.796901407536924, "grad_norm": 0.7258526086807251, "learning_rate": 7.031909250646134e-05, "loss": 0.9212334632873536, "memory(GiB)": 91.52, "step": 61415, "token_acc": 0.7340238978834468, "train_speed(iter/s)": 0.142003 }, { "epoch": 0.7969662859385797, "grad_norm": 0.6763957142829895, "learning_rate": 7.031419151113526e-05, "loss": 0.8949481964111328, "memory(GiB)": 91.52, "step": 61420, "token_acc": 0.7444488146744377, "train_speed(iter/s)": 0.142001 }, { "epoch": 0.7970311643402354, "grad_norm": 0.7285334467887878, "learning_rate": 7.030929028203472e-05, "loss": 0.8810737609863282, "memory(GiB)": 91.52, "step": 61425, "token_acc": 0.7580231930960086, "train_speed(iter/s)": 0.142 }, { "epoch": 0.797096042741891, "grad_norm": 0.7312619090080261, "learning_rate": 7.03043888192162e-05, "loss": 0.8791699409484863, "memory(GiB)": 91.52, "step": 61430, "token_acc": 0.7563590255897948, "train_speed(iter/s)": 0.141998 }, { "epoch": 0.7971609211435468, "grad_norm": 0.7037487030029297, "learning_rate": 7.029948712273608e-05, "loss": 0.8665031433105469, "memory(GiB)": 91.52, "step": 61435, "token_acc": 0.7606949252171641, "train_speed(iter/s)": 0.141995 }, { "epoch": 0.7972257995452025, "grad_norm": 0.7066187262535095, "learning_rate": 7.029458519265078e-05, "loss": 0.8661298751831055, "memory(GiB)": 91.52, "step": 61440, "token_acc": 0.7669128156264888, "train_speed(iter/s)": 0.141993 }, { "epoch": 0.7972906779468582, "grad_norm": 0.6932363510131836, "learning_rate": 7.028968302901668e-05, "loss": 0.8719045639038085, "memory(GiB)": 91.52, "step": 61445, "token_acc": 0.7636266855926189, "train_speed(iter/s)": 0.141991 }, { "epoch": 0.7973555563485138, "grad_norm": 1.015641212463379, "learning_rate": 7.028478063189022e-05, "loss": 0.8947357177734375, "memory(GiB)": 91.52, "step": 61450, "token_acc": 0.7690226214958329, "train_speed(iter/s)": 0.141989 }, { "epoch": 0.7974204347501695, "grad_norm": 0.6886834502220154, "learning_rate": 7.027987800132782e-05, "loss": 0.8416786193847656, "memory(GiB)": 91.52, "step": 61455, "token_acc": 0.7803088533296066, "train_speed(iter/s)": 0.141988 }, { "epoch": 0.7974853131518251, "grad_norm": 0.7319281101226807, "learning_rate": 7.027497513738586e-05, "loss": 0.8825223922729493, "memory(GiB)": 91.52, "step": 61460, "token_acc": 0.7650258269763623, "train_speed(iter/s)": 0.141985 }, { "epoch": 0.7975501915534808, "grad_norm": 0.690250813961029, "learning_rate": 7.027007204012082e-05, "loss": 0.8059673309326172, "memory(GiB)": 91.52, "step": 61465, "token_acc": 0.7881332325201957, "train_speed(iter/s)": 0.141983 }, { "epoch": 0.7976150699551365, "grad_norm": 0.7106049060821533, "learning_rate": 7.026516870958908e-05, "loss": 0.9178711891174316, "memory(GiB)": 91.52, "step": 61470, "token_acc": 0.7349514913261442, "train_speed(iter/s)": 0.141981 }, { "epoch": 0.7976799483567922, "grad_norm": 0.8112105131149292, "learning_rate": 7.02602651458471e-05, "loss": 0.8905981063842774, "memory(GiB)": 91.52, "step": 61475, "token_acc": 0.7553565393714092, "train_speed(iter/s)": 0.14198 }, { "epoch": 0.7977448267584479, "grad_norm": 0.6810737252235413, "learning_rate": 7.025536134895127e-05, "loss": 0.8387799263000488, "memory(GiB)": 91.52, "step": 61480, "token_acc": 0.7657977344069878, "train_speed(iter/s)": 0.141978 }, { "epoch": 0.7978097051601036, "grad_norm": 0.7949782609939575, "learning_rate": 7.025045731895805e-05, "loss": 0.852077865600586, "memory(GiB)": 91.52, "step": 61485, "token_acc": 0.758120795712076, "train_speed(iter/s)": 0.141976 }, { "epoch": 0.7978745835617593, "grad_norm": 0.7003561854362488, "learning_rate": 7.024555305592388e-05, "loss": 0.8440478324890137, "memory(GiB)": 91.52, "step": 61490, "token_acc": 0.7650232091331075, "train_speed(iter/s)": 0.141974 }, { "epoch": 0.797939461963415, "grad_norm": 0.7307329773902893, "learning_rate": 7.024064855990518e-05, "loss": 0.8496776580810547, "memory(GiB)": 91.52, "step": 61495, "token_acc": 0.7675096186171604, "train_speed(iter/s)": 0.141972 }, { "epoch": 0.7980043403650707, "grad_norm": 0.6888997554779053, "learning_rate": 7.023574383095841e-05, "loss": 0.8156685829162598, "memory(GiB)": 91.52, "step": 61500, "token_acc": 0.7744063324538258, "train_speed(iter/s)": 0.14197 }, { "epoch": 0.7980692187667264, "grad_norm": 0.744217038154602, "learning_rate": 7.023083886913999e-05, "loss": 0.9067434310913086, "memory(GiB)": 91.52, "step": 61505, "token_acc": 0.7547267547267548, "train_speed(iter/s)": 0.141968 }, { "epoch": 0.7981340971683821, "grad_norm": 0.6879167556762695, "learning_rate": 7.022593367450639e-05, "loss": 0.8901126861572266, "memory(GiB)": 91.52, "step": 61510, "token_acc": 0.7553176096078715, "train_speed(iter/s)": 0.141967 }, { "epoch": 0.7981989755700378, "grad_norm": 0.7392178177833557, "learning_rate": 7.022102824711406e-05, "loss": 0.8618049621582031, "memory(GiB)": 91.52, "step": 61515, "token_acc": 0.7641489585524932, "train_speed(iter/s)": 0.141966 }, { "epoch": 0.7982638539716935, "grad_norm": 0.6679858565330505, "learning_rate": 7.02161225870194e-05, "loss": 0.8992963790893554, "memory(GiB)": 91.52, "step": 61520, "token_acc": 0.7682148272284733, "train_speed(iter/s)": 0.141964 }, { "epoch": 0.7983287323733492, "grad_norm": 0.7871823906898499, "learning_rate": 7.021121669427893e-05, "loss": 0.8496484756469727, "memory(GiB)": 91.52, "step": 61525, "token_acc": 0.7598398932621748, "train_speed(iter/s)": 0.141961 }, { "epoch": 0.7983936107750049, "grad_norm": 0.7184416055679321, "learning_rate": 7.020631056894907e-05, "loss": 0.8768645286560058, "memory(GiB)": 91.52, "step": 61530, "token_acc": 0.7507829438689473, "train_speed(iter/s)": 0.141959 }, { "epoch": 0.7984584891766606, "grad_norm": 0.7068644762039185, "learning_rate": 7.020140421108629e-05, "loss": 0.8607444763183594, "memory(GiB)": 91.52, "step": 61535, "token_acc": 0.7534402221941674, "train_speed(iter/s)": 0.141958 }, { "epoch": 0.7985233675783163, "grad_norm": 0.6592010259628296, "learning_rate": 7.019649762074705e-05, "loss": 0.8483346939086914, "memory(GiB)": 91.52, "step": 61540, "token_acc": 0.7560852657352729, "train_speed(iter/s)": 0.141955 }, { "epoch": 0.798588245979972, "grad_norm": 0.7564353942871094, "learning_rate": 7.019159079798782e-05, "loss": 0.8702224731445313, "memory(GiB)": 91.52, "step": 61545, "token_acc": 0.7482820680628273, "train_speed(iter/s)": 0.141954 }, { "epoch": 0.7986531243816277, "grad_norm": 0.7780430912971497, "learning_rate": 7.018668374286506e-05, "loss": 0.9006019592285156, "memory(GiB)": 91.52, "step": 61550, "token_acc": 0.7772018429913167, "train_speed(iter/s)": 0.141952 }, { "epoch": 0.7987180027832834, "grad_norm": 0.7590966820716858, "learning_rate": 7.018177645543522e-05, "loss": 0.8655112266540528, "memory(GiB)": 91.52, "step": 61555, "token_acc": 0.7667429692609549, "train_speed(iter/s)": 0.14195 }, { "epoch": 0.7987828811849391, "grad_norm": 0.6980844736099243, "learning_rate": 7.017686893575484e-05, "loss": 0.8936306953430175, "memory(GiB)": 91.52, "step": 61560, "token_acc": 0.7562216641116801, "train_speed(iter/s)": 0.141948 }, { "epoch": 0.7988477595865948, "grad_norm": 0.6987038850784302, "learning_rate": 7.017196118388032e-05, "loss": 0.871584129333496, "memory(GiB)": 91.52, "step": 61565, "token_acc": 0.7628651664903898, "train_speed(iter/s)": 0.141945 }, { "epoch": 0.7989126379882505, "grad_norm": 0.7156906127929688, "learning_rate": 7.016705319986817e-05, "loss": 0.8290925979614258, "memory(GiB)": 91.52, "step": 61570, "token_acc": 0.7892047978676144, "train_speed(iter/s)": 0.141944 }, { "epoch": 0.7989775163899062, "grad_norm": 0.7179670333862305, "learning_rate": 7.016214498377487e-05, "loss": 0.8331001281738282, "memory(GiB)": 91.52, "step": 61575, "token_acc": 0.7632765531062125, "train_speed(iter/s)": 0.141941 }, { "epoch": 0.7990423947915619, "grad_norm": 0.7448573112487793, "learning_rate": 7.015723653565692e-05, "loss": 0.8739606857299804, "memory(GiB)": 91.52, "step": 61580, "token_acc": 0.7596939802697805, "train_speed(iter/s)": 0.14194 }, { "epoch": 0.7991072731932176, "grad_norm": 0.754815399646759, "learning_rate": 7.015232785557078e-05, "loss": 0.8809078216552735, "memory(GiB)": 91.52, "step": 61585, "token_acc": 0.7506782329102053, "train_speed(iter/s)": 0.141938 }, { "epoch": 0.7991721515948733, "grad_norm": 0.701677680015564, "learning_rate": 7.014741894357294e-05, "loss": 0.9017167091369629, "memory(GiB)": 91.52, "step": 61590, "token_acc": 0.7586547321311043, "train_speed(iter/s)": 0.141936 }, { "epoch": 0.799237029996529, "grad_norm": 0.7316059470176697, "learning_rate": 7.01425097997199e-05, "loss": 0.8285257339477539, "memory(GiB)": 91.52, "step": 61595, "token_acc": 0.7733538543897216, "train_speed(iter/s)": 0.141934 }, { "epoch": 0.7993019083981847, "grad_norm": 0.7656704187393188, "learning_rate": 7.013760042406817e-05, "loss": 0.8596476554870606, "memory(GiB)": 91.52, "step": 61600, "token_acc": 0.7737713951165754, "train_speed(iter/s)": 0.141932 }, { "epoch": 0.7993667867998404, "grad_norm": 0.7269066572189331, "learning_rate": 7.013269081667422e-05, "loss": 0.8534675598144531, "memory(GiB)": 91.52, "step": 61605, "token_acc": 0.7704886788137532, "train_speed(iter/s)": 0.14193 }, { "epoch": 0.7994316652014961, "grad_norm": 0.6785447001457214, "learning_rate": 7.012778097759455e-05, "loss": 0.8613781929016113, "memory(GiB)": 91.52, "step": 61610, "token_acc": 0.7462447229767321, "train_speed(iter/s)": 0.141928 }, { "epoch": 0.7994965436031518, "grad_norm": 0.729333221912384, "learning_rate": 7.01228709068857e-05, "loss": 0.9077569961547851, "memory(GiB)": 91.52, "step": 61615, "token_acc": 0.7488271041279637, "train_speed(iter/s)": 0.141927 }, { "epoch": 0.7995614220048075, "grad_norm": 0.6991079449653625, "learning_rate": 7.011796060460411e-05, "loss": 0.8686674118041993, "memory(GiB)": 91.52, "step": 61620, "token_acc": 0.7493043051235881, "train_speed(iter/s)": 0.141925 }, { "epoch": 0.7996263004064632, "grad_norm": 0.7228294610977173, "learning_rate": 7.011305007080634e-05, "loss": 0.8951101303100586, "memory(GiB)": 91.52, "step": 61625, "token_acc": 0.7542238969247599, "train_speed(iter/s)": 0.141923 }, { "epoch": 0.7996911788081189, "grad_norm": 0.7099853754043579, "learning_rate": 7.01081393055489e-05, "loss": 0.917347526550293, "memory(GiB)": 91.52, "step": 61630, "token_acc": 0.7562997456983388, "train_speed(iter/s)": 0.141922 }, { "epoch": 0.7997560572097746, "grad_norm": 0.7456815838813782, "learning_rate": 7.010322830888826e-05, "loss": 0.8978329658508301, "memory(GiB)": 91.52, "step": 61635, "token_acc": 0.754287225202085, "train_speed(iter/s)": 0.14192 }, { "epoch": 0.7998209356114303, "grad_norm": 0.6288992762565613, "learning_rate": 7.009831708088098e-05, "loss": 0.8194948196411133, "memory(GiB)": 91.52, "step": 61640, "token_acc": 0.7616655651024454, "train_speed(iter/s)": 0.141918 }, { "epoch": 0.799885814013086, "grad_norm": 0.7601751685142517, "learning_rate": 7.009340562158356e-05, "loss": 0.9119644165039062, "memory(GiB)": 91.52, "step": 61645, "token_acc": 0.7620891669149041, "train_speed(iter/s)": 0.141916 }, { "epoch": 0.7999506924147417, "grad_norm": 0.7404528260231018, "learning_rate": 7.008849393105252e-05, "loss": 0.9067597389221191, "memory(GiB)": 91.52, "step": 61650, "token_acc": 0.7530390481411021, "train_speed(iter/s)": 0.141914 }, { "epoch": 0.8000155708163974, "grad_norm": 0.8227148652076721, "learning_rate": 7.008358200934438e-05, "loss": 0.8566880226135254, "memory(GiB)": 91.52, "step": 61655, "token_acc": 0.7873005801305294, "train_speed(iter/s)": 0.141913 }, { "epoch": 0.8000804492180531, "grad_norm": 0.753551185131073, "learning_rate": 7.007866985651566e-05, "loss": 0.8847272872924805, "memory(GiB)": 91.52, "step": 61660, "token_acc": 0.759224119755429, "train_speed(iter/s)": 0.141911 }, { "epoch": 0.8001453276197088, "grad_norm": 0.666601300239563, "learning_rate": 7.007375747262292e-05, "loss": 0.8595552444458008, "memory(GiB)": 91.52, "step": 61665, "token_acc": 0.7554945054945055, "train_speed(iter/s)": 0.141909 }, { "epoch": 0.8002102060213645, "grad_norm": 0.7202698588371277, "learning_rate": 7.006884485772268e-05, "loss": 0.874810791015625, "memory(GiB)": 91.52, "step": 61670, "token_acc": 0.767271185334986, "train_speed(iter/s)": 0.141907 }, { "epoch": 0.8002750844230202, "grad_norm": 0.7134739756584167, "learning_rate": 7.006393201187144e-05, "loss": 0.9118034362792968, "memory(GiB)": 91.52, "step": 61675, "token_acc": 0.7591970829271815, "train_speed(iter/s)": 0.141906 }, { "epoch": 0.8003399628246759, "grad_norm": 0.7230015993118286, "learning_rate": 7.005901893512578e-05, "loss": 0.8776373863220215, "memory(GiB)": 91.52, "step": 61680, "token_acc": 0.7516842764054044, "train_speed(iter/s)": 0.141904 }, { "epoch": 0.8004048412263316, "grad_norm": 0.814081072807312, "learning_rate": 7.00541056275422e-05, "loss": 0.9277376174926758, "memory(GiB)": 91.52, "step": 61685, "token_acc": 0.7713945982673688, "train_speed(iter/s)": 0.141902 }, { "epoch": 0.8004697196279873, "grad_norm": 0.7495077848434448, "learning_rate": 7.004919208917728e-05, "loss": 0.9152313232421875, "memory(GiB)": 91.52, "step": 61690, "token_acc": 0.7353045041574369, "train_speed(iter/s)": 0.1419 }, { "epoch": 0.800534598029643, "grad_norm": 0.8432332277297974, "learning_rate": 7.004427832008753e-05, "loss": 0.8564149856567382, "memory(GiB)": 91.52, "step": 61695, "token_acc": 0.7733292150882205, "train_speed(iter/s)": 0.141898 }, { "epoch": 0.8005994764312986, "grad_norm": 0.7381132245063782, "learning_rate": 7.003936432032953e-05, "loss": 0.884471321105957, "memory(GiB)": 91.52, "step": 61700, "token_acc": 0.7472018941024537, "train_speed(iter/s)": 0.141896 }, { "epoch": 0.8006643548329543, "grad_norm": 0.785586953163147, "learning_rate": 7.003445008995981e-05, "loss": 0.8735631942749024, "memory(GiB)": 91.52, "step": 61705, "token_acc": 0.7653756890049318, "train_speed(iter/s)": 0.141893 }, { "epoch": 0.80072923323461, "grad_norm": 0.7694663405418396, "learning_rate": 7.002953562903493e-05, "loss": 0.8506805419921875, "memory(GiB)": 91.52, "step": 61710, "token_acc": 0.7533308722354599, "train_speed(iter/s)": 0.141892 }, { "epoch": 0.8007941116362657, "grad_norm": 0.6728582978248596, "learning_rate": 7.002462093761146e-05, "loss": 0.867765998840332, "memory(GiB)": 91.52, "step": 61715, "token_acc": 0.7664833202371598, "train_speed(iter/s)": 0.14189 }, { "epoch": 0.8008589900379214, "grad_norm": 0.6950896978378296, "learning_rate": 7.001970601574592e-05, "loss": 0.8611869812011719, "memory(GiB)": 91.52, "step": 61720, "token_acc": 0.737945089594709, "train_speed(iter/s)": 0.141888 }, { "epoch": 0.8009238684395771, "grad_norm": 0.7925435304641724, "learning_rate": 7.001479086349489e-05, "loss": 0.9184434890747071, "memory(GiB)": 91.52, "step": 61725, "token_acc": 0.7622709958280428, "train_speed(iter/s)": 0.141887 }, { "epoch": 0.8009887468412328, "grad_norm": 0.7462198734283447, "learning_rate": 7.000987548091495e-05, "loss": 0.8999393463134766, "memory(GiB)": 91.52, "step": 61730, "token_acc": 0.7763010108573568, "train_speed(iter/s)": 0.141885 }, { "epoch": 0.8010536252428885, "grad_norm": 0.7982413172721863, "learning_rate": 7.000495986806264e-05, "loss": 0.8862119674682617, "memory(GiB)": 91.52, "step": 61735, "token_acc": 0.7453795017897582, "train_speed(iter/s)": 0.141882 }, { "epoch": 0.8011185036445442, "grad_norm": 0.7385017275810242, "learning_rate": 7.000004402499454e-05, "loss": 0.9193744659423828, "memory(GiB)": 91.52, "step": 61740, "token_acc": 0.7416514781749096, "train_speed(iter/s)": 0.141881 }, { "epoch": 0.8011833820461999, "grad_norm": 0.7184102535247803, "learning_rate": 6.999512795176722e-05, "loss": 0.8624784469604492, "memory(GiB)": 91.52, "step": 61745, "token_acc": 0.7681227493345859, "train_speed(iter/s)": 0.141879 }, { "epoch": 0.8012482604478556, "grad_norm": 0.8324900269508362, "learning_rate": 6.999021164843726e-05, "loss": 0.878448486328125, "memory(GiB)": 91.52, "step": 61750, "token_acc": 0.7605397547247177, "train_speed(iter/s)": 0.141877 }, { "epoch": 0.8013131388495113, "grad_norm": 0.6963118314743042, "learning_rate": 6.998529511506123e-05, "loss": 0.8863615036010742, "memory(GiB)": 91.52, "step": 61755, "token_acc": 0.7648171730034564, "train_speed(iter/s)": 0.141876 }, { "epoch": 0.801378017251167, "grad_norm": 0.7660284042358398, "learning_rate": 6.998037835169571e-05, "loss": 0.893730354309082, "memory(GiB)": 91.52, "step": 61760, "token_acc": 0.7556015037593985, "train_speed(iter/s)": 0.141873 }, { "epoch": 0.8014428956528227, "grad_norm": 0.690914511680603, "learning_rate": 6.997546135839728e-05, "loss": 0.8353754997253418, "memory(GiB)": 91.52, "step": 61765, "token_acc": 0.780336899249561, "train_speed(iter/s)": 0.141872 }, { "epoch": 0.8015077740544784, "grad_norm": 0.760063648223877, "learning_rate": 6.997054413522251e-05, "loss": 0.8543387413024902, "memory(GiB)": 91.52, "step": 61770, "token_acc": 0.7588618007777446, "train_speed(iter/s)": 0.14187 }, { "epoch": 0.8015726524561341, "grad_norm": 0.6960294842720032, "learning_rate": 6.996562668222801e-05, "loss": 0.8980299949645996, "memory(GiB)": 91.52, "step": 61775, "token_acc": 0.7580457396071951, "train_speed(iter/s)": 0.141868 }, { "epoch": 0.8016375308577898, "grad_norm": 0.7157390713691711, "learning_rate": 6.99607089994704e-05, "loss": 0.8530317306518554, "memory(GiB)": 91.52, "step": 61780, "token_acc": 0.76563749800032, "train_speed(iter/s)": 0.141867 }, { "epoch": 0.8017024092594455, "grad_norm": 0.8471119403839111, "learning_rate": 6.995579108700618e-05, "loss": 0.8996952056884766, "memory(GiB)": 91.52, "step": 61785, "token_acc": 0.7708193832599118, "train_speed(iter/s)": 0.141865 }, { "epoch": 0.8017672876611012, "grad_norm": 0.7928172945976257, "learning_rate": 6.995087294489204e-05, "loss": 0.8218725204467774, "memory(GiB)": 91.52, "step": 61790, "token_acc": 0.7777571911621659, "train_speed(iter/s)": 0.141863 }, { "epoch": 0.8018321660627569, "grad_norm": 0.7816073298454285, "learning_rate": 6.994595457318453e-05, "loss": 0.8569458961486817, "memory(GiB)": 91.52, "step": 61795, "token_acc": 0.7656913967258795, "train_speed(iter/s)": 0.141862 }, { "epoch": 0.8018970444644126, "grad_norm": 0.7426360845565796, "learning_rate": 6.994103597194024e-05, "loss": 0.9402273178100586, "memory(GiB)": 91.52, "step": 61800, "token_acc": 0.7351177367440285, "train_speed(iter/s)": 0.14186 }, { "epoch": 0.8019619228660683, "grad_norm": 0.7416625618934631, "learning_rate": 6.993611714121582e-05, "loss": 0.8532954216003418, "memory(GiB)": 91.52, "step": 61805, "token_acc": 0.7840345604808415, "train_speed(iter/s)": 0.141859 }, { "epoch": 0.802026801267724, "grad_norm": 0.7942329049110413, "learning_rate": 6.993119808106783e-05, "loss": 0.9082953453063964, "memory(GiB)": 91.52, "step": 61810, "token_acc": 0.7427898983861327, "train_speed(iter/s)": 0.141857 }, { "epoch": 0.8020916796693797, "grad_norm": 0.6775735020637512, "learning_rate": 6.992627879155291e-05, "loss": 0.8713732719421386, "memory(GiB)": 91.52, "step": 61815, "token_acc": 0.7602878759918804, "train_speed(iter/s)": 0.141855 }, { "epoch": 0.8021565580710354, "grad_norm": 0.6786603927612305, "learning_rate": 6.992135927272763e-05, "loss": 0.870503044128418, "memory(GiB)": 91.52, "step": 61820, "token_acc": 0.7581318317323618, "train_speed(iter/s)": 0.141852 }, { "epoch": 0.8022214364726911, "grad_norm": 0.7258481979370117, "learning_rate": 6.991643952464865e-05, "loss": 0.8748570442199707, "memory(GiB)": 91.52, "step": 61825, "token_acc": 0.7565951326596269, "train_speed(iter/s)": 0.14185 }, { "epoch": 0.8022863148743468, "grad_norm": 0.7244582772254944, "learning_rate": 6.991151954737256e-05, "loss": 0.8749958038330078, "memory(GiB)": 91.52, "step": 61830, "token_acc": 0.7425800662617339, "train_speed(iter/s)": 0.141849 }, { "epoch": 0.8023511932760025, "grad_norm": 0.8044571876525879, "learning_rate": 6.990659934095598e-05, "loss": 0.8746488571166993, "memory(GiB)": 91.52, "step": 61835, "token_acc": 0.7664487517681692, "train_speed(iter/s)": 0.141848 }, { "epoch": 0.8024160716776582, "grad_norm": 0.7232952117919922, "learning_rate": 6.990167890545554e-05, "loss": 0.8742324829101562, "memory(GiB)": 91.52, "step": 61840, "token_acc": 0.7595100374276965, "train_speed(iter/s)": 0.141846 }, { "epoch": 0.8024809500793139, "grad_norm": 0.7530960440635681, "learning_rate": 6.989675824092785e-05, "loss": 0.9131799697875976, "memory(GiB)": 91.52, "step": 61845, "token_acc": 0.7484381810257156, "train_speed(iter/s)": 0.141845 }, { "epoch": 0.8025458284809696, "grad_norm": 0.8057612776756287, "learning_rate": 6.989183734742958e-05, "loss": 0.8759355545043945, "memory(GiB)": 91.52, "step": 61850, "token_acc": 0.7713747501153314, "train_speed(iter/s)": 0.141843 }, { "epoch": 0.8026107068826253, "grad_norm": 0.7008236050605774, "learning_rate": 6.988691622501728e-05, "loss": 0.860008716583252, "memory(GiB)": 91.52, "step": 61855, "token_acc": 0.7609585561896793, "train_speed(iter/s)": 0.141841 }, { "epoch": 0.802675585284281, "grad_norm": 0.7300757169723511, "learning_rate": 6.988199487374766e-05, "loss": 0.811520767211914, "memory(GiB)": 91.52, "step": 61860, "token_acc": 0.7713014903574866, "train_speed(iter/s)": 0.141839 }, { "epoch": 0.8027404636859367, "grad_norm": 0.7728543877601624, "learning_rate": 6.987707329367732e-05, "loss": 0.8938663482666016, "memory(GiB)": 91.52, "step": 61865, "token_acc": 0.7490121557626761, "train_speed(iter/s)": 0.141837 }, { "epoch": 0.8028053420875924, "grad_norm": 0.8411154747009277, "learning_rate": 6.987215148486289e-05, "loss": 0.8828589439392089, "memory(GiB)": 91.52, "step": 61870, "token_acc": 0.7422058092465024, "train_speed(iter/s)": 0.141836 }, { "epoch": 0.802870220489248, "grad_norm": 0.6561788320541382, "learning_rate": 6.986722944736103e-05, "loss": 0.8178600311279297, "memory(GiB)": 91.52, "step": 61875, "token_acc": 0.7635899182561308, "train_speed(iter/s)": 0.141833 }, { "epoch": 0.8029350988909038, "grad_norm": 0.7112467885017395, "learning_rate": 6.986230718122835e-05, "loss": 0.9111274719238281, "memory(GiB)": 91.52, "step": 61880, "token_acc": 0.7586859064163631, "train_speed(iter/s)": 0.141831 }, { "epoch": 0.8029999772925595, "grad_norm": 0.6988643407821655, "learning_rate": 6.985738468652155e-05, "loss": 0.8196638107299805, "memory(GiB)": 91.52, "step": 61885, "token_acc": 0.7701831750339213, "train_speed(iter/s)": 0.141829 }, { "epoch": 0.8030648556942152, "grad_norm": 0.856045126914978, "learning_rate": 6.985246196329723e-05, "loss": 0.8214364051818848, "memory(GiB)": 91.52, "step": 61890, "token_acc": 0.765933811617681, "train_speed(iter/s)": 0.141827 }, { "epoch": 0.8031297340958709, "grad_norm": 0.7191996574401855, "learning_rate": 6.984753901161203e-05, "loss": 0.9063461303710938, "memory(GiB)": 91.52, "step": 61895, "token_acc": 0.7660942083923922, "train_speed(iter/s)": 0.141825 }, { "epoch": 0.8031946124975265, "grad_norm": 0.7889426946640015, "learning_rate": 6.984261583152266e-05, "loss": 0.8745317459106445, "memory(GiB)": 91.52, "step": 61900, "token_acc": 0.7594537438617872, "train_speed(iter/s)": 0.141823 }, { "epoch": 0.8032594908991822, "grad_norm": 0.9085841178894043, "learning_rate": 6.983769242308575e-05, "loss": 0.925656509399414, "memory(GiB)": 91.52, "step": 61905, "token_acc": 0.7573736475470579, "train_speed(iter/s)": 0.141822 }, { "epoch": 0.803324369300838, "grad_norm": 0.7413962483406067, "learning_rate": 6.983276878635794e-05, "loss": 0.8759180068969726, "memory(GiB)": 91.52, "step": 61910, "token_acc": 0.7693053871470639, "train_speed(iter/s)": 0.14182 }, { "epoch": 0.8033892477024936, "grad_norm": 0.6841990351676941, "learning_rate": 6.982784492139589e-05, "loss": 0.8903374671936035, "memory(GiB)": 91.52, "step": 61915, "token_acc": 0.7583566387002264, "train_speed(iter/s)": 0.141818 }, { "epoch": 0.8034541261041493, "grad_norm": 0.8065025806427002, "learning_rate": 6.982292082825628e-05, "loss": 0.8354867935180664, "memory(GiB)": 91.52, "step": 61920, "token_acc": 0.7724873237480031, "train_speed(iter/s)": 0.141817 }, { "epoch": 0.803519004505805, "grad_norm": 0.8028720021247864, "learning_rate": 6.981799650699579e-05, "loss": 0.8725029945373535, "memory(GiB)": 91.52, "step": 61925, "token_acc": 0.7644077932705927, "train_speed(iter/s)": 0.141814 }, { "epoch": 0.8035838829074607, "grad_norm": 0.7187129855155945, "learning_rate": 6.981307195767107e-05, "loss": 0.8913514137268066, "memory(GiB)": 91.52, "step": 61930, "token_acc": 0.7546300648679138, "train_speed(iter/s)": 0.141812 }, { "epoch": 0.8036487613091164, "grad_norm": 0.7774643898010254, "learning_rate": 6.980814718033878e-05, "loss": 0.8538768768310547, "memory(GiB)": 91.52, "step": 61935, "token_acc": 0.7627074592507334, "train_speed(iter/s)": 0.14181 }, { "epoch": 0.803713639710772, "grad_norm": 0.7969041466712952, "learning_rate": 6.980322217505561e-05, "loss": 0.9013763427734375, "memory(GiB)": 91.52, "step": 61940, "token_acc": 0.751840891886312, "train_speed(iter/s)": 0.141808 }, { "epoch": 0.8037785181124277, "grad_norm": 0.7296900153160095, "learning_rate": 6.979829694187824e-05, "loss": 0.8372295379638672, "memory(GiB)": 91.52, "step": 61945, "token_acc": 0.7480286738351255, "train_speed(iter/s)": 0.141806 }, { "epoch": 0.8038433965140834, "grad_norm": 0.7414132356643677, "learning_rate": 6.979337148086334e-05, "loss": 0.8592161178588867, "memory(GiB)": 91.52, "step": 61950, "token_acc": 0.770040668775418, "train_speed(iter/s)": 0.141804 }, { "epoch": 0.8039082749157391, "grad_norm": 0.718352735042572, "learning_rate": 6.978844579206758e-05, "loss": 0.9091773986816406, "memory(GiB)": 91.52, "step": 61955, "token_acc": 0.7662985505427621, "train_speed(iter/s)": 0.141802 }, { "epoch": 0.8039731533173948, "grad_norm": 0.7042512893676758, "learning_rate": 6.978351987554767e-05, "loss": 0.8774314880371094, "memory(GiB)": 91.52, "step": 61960, "token_acc": 0.7632334290100794, "train_speed(iter/s)": 0.1418 }, { "epoch": 0.8040380317190505, "grad_norm": 0.7351537942886353, "learning_rate": 6.977859373136031e-05, "loss": 0.8844050407409668, "memory(GiB)": 91.52, "step": 61965, "token_acc": 0.7685880077369439, "train_speed(iter/s)": 0.141798 }, { "epoch": 0.8041029101207062, "grad_norm": 0.7341248393058777, "learning_rate": 6.977366735956214e-05, "loss": 0.8385589599609375, "memory(GiB)": 91.52, "step": 61970, "token_acc": 0.77265404031363, "train_speed(iter/s)": 0.141796 }, { "epoch": 0.8041677885223619, "grad_norm": 0.7349317073822021, "learning_rate": 6.97687407602099e-05, "loss": 0.8805183410644531, "memory(GiB)": 91.52, "step": 61975, "token_acc": 0.7792287214907547, "train_speed(iter/s)": 0.141794 }, { "epoch": 0.8042326669240176, "grad_norm": 0.775684654712677, "learning_rate": 6.976381393336025e-05, "loss": 0.8931596755981446, "memory(GiB)": 91.52, "step": 61980, "token_acc": 0.7616824147946616, "train_speed(iter/s)": 0.141793 }, { "epoch": 0.8042975453256733, "grad_norm": 0.7897065281867981, "learning_rate": 6.975888687906989e-05, "loss": 0.8862795829772949, "memory(GiB)": 91.52, "step": 61985, "token_acc": 0.7687130021935755, "train_speed(iter/s)": 0.141791 }, { "epoch": 0.804362423727329, "grad_norm": 0.6535444855690002, "learning_rate": 6.975395959739553e-05, "loss": 0.868288230895996, "memory(GiB)": 91.52, "step": 61990, "token_acc": 0.768004924423774, "train_speed(iter/s)": 0.141789 }, { "epoch": 0.8044273021289847, "grad_norm": 0.7570752501487732, "learning_rate": 6.97490320883939e-05, "loss": 0.8885475158691406, "memory(GiB)": 91.52, "step": 61995, "token_acc": 0.7328315040047464, "train_speed(iter/s)": 0.141787 }, { "epoch": 0.8044921805306404, "grad_norm": 0.8223073482513428, "learning_rate": 6.974410435212166e-05, "loss": 0.855747127532959, "memory(GiB)": 91.52, "step": 62000, "token_acc": 0.7787073833925957, "train_speed(iter/s)": 0.141786 }, { "epoch": 0.8045570589322961, "grad_norm": 0.7263962626457214, "learning_rate": 6.973917638863554e-05, "loss": 0.9129171371459961, "memory(GiB)": 91.52, "step": 62005, "token_acc": 0.7410092807424594, "train_speed(iter/s)": 0.141784 }, { "epoch": 0.8046219373339518, "grad_norm": 0.7411984205245972, "learning_rate": 6.973424819799225e-05, "loss": 0.9036417007446289, "memory(GiB)": 91.52, "step": 62010, "token_acc": 0.7422788281137249, "train_speed(iter/s)": 0.141782 }, { "epoch": 0.8046868157356075, "grad_norm": 0.8085788488388062, "learning_rate": 6.972931978024851e-05, "loss": 0.8504363059997558, "memory(GiB)": 91.52, "step": 62015, "token_acc": 0.7527577428935087, "train_speed(iter/s)": 0.14178 }, { "epoch": 0.8047516941372632, "grad_norm": 0.7066795229911804, "learning_rate": 6.972439113546101e-05, "loss": 0.8316523551940918, "memory(GiB)": 91.52, "step": 62020, "token_acc": 0.75212900887842, "train_speed(iter/s)": 0.141779 }, { "epoch": 0.8048165725389189, "grad_norm": 0.6730638742446899, "learning_rate": 6.97194622636865e-05, "loss": 0.8518951416015625, "memory(GiB)": 91.52, "step": 62025, "token_acc": 0.7403605737597052, "train_speed(iter/s)": 0.141777 }, { "epoch": 0.8048814509405746, "grad_norm": 0.7077286243438721, "learning_rate": 6.971453316498169e-05, "loss": 0.8905633926391602, "memory(GiB)": 91.52, "step": 62030, "token_acc": 0.7615441296872442, "train_speed(iter/s)": 0.141775 }, { "epoch": 0.8049463293422303, "grad_norm": 0.6745194792747498, "learning_rate": 6.97096038394033e-05, "loss": 0.8708763122558594, "memory(GiB)": 91.52, "step": 62035, "token_acc": 0.7533801884711148, "train_speed(iter/s)": 0.141773 }, { "epoch": 0.805011207743886, "grad_norm": 0.6835513114929199, "learning_rate": 6.970467428700806e-05, "loss": 0.8793468475341797, "memory(GiB)": 91.52, "step": 62040, "token_acc": 0.7710852478839177, "train_speed(iter/s)": 0.141771 }, { "epoch": 0.8050760861455417, "grad_norm": 0.7698925137519836, "learning_rate": 6.969974450785268e-05, "loss": 0.8654752731323242, "memory(GiB)": 91.52, "step": 62045, "token_acc": 0.7568822929647672, "train_speed(iter/s)": 0.141769 }, { "epoch": 0.8051409645471974, "grad_norm": 0.6764900088310242, "learning_rate": 6.969481450199393e-05, "loss": 0.848022174835205, "memory(GiB)": 91.52, "step": 62050, "token_acc": 0.7593842510361161, "train_speed(iter/s)": 0.141768 }, { "epoch": 0.8052058429488531, "grad_norm": 0.8053933382034302, "learning_rate": 6.968988426948849e-05, "loss": 0.8846628189086914, "memory(GiB)": 91.52, "step": 62055, "token_acc": 0.7473882387685038, "train_speed(iter/s)": 0.141766 }, { "epoch": 0.8052707213505088, "grad_norm": 0.695758044719696, "learning_rate": 6.968495381039317e-05, "loss": 0.9070513725280762, "memory(GiB)": 91.52, "step": 62060, "token_acc": 0.7620701395982294, "train_speed(iter/s)": 0.141764 }, { "epoch": 0.8053355997521645, "grad_norm": 0.7560153603553772, "learning_rate": 6.968002312476463e-05, "loss": 0.8159830093383789, "memory(GiB)": 91.52, "step": 62065, "token_acc": 0.7554021608643458, "train_speed(iter/s)": 0.141762 }, { "epoch": 0.8054004781538202, "grad_norm": 0.7669605612754822, "learning_rate": 6.967509221265968e-05, "loss": 0.8655241966247559, "memory(GiB)": 91.52, "step": 62070, "token_acc": 0.7756481101392529, "train_speed(iter/s)": 0.14176 }, { "epoch": 0.8054653565554759, "grad_norm": 0.7344626188278198, "learning_rate": 6.967016107413502e-05, "loss": 0.8801407814025879, "memory(GiB)": 91.52, "step": 62075, "token_acc": 0.7571490724993079, "train_speed(iter/s)": 0.141758 }, { "epoch": 0.8055302349571316, "grad_norm": 0.7098442912101746, "learning_rate": 6.966522970924741e-05, "loss": 0.838764762878418, "memory(GiB)": 91.52, "step": 62080, "token_acc": 0.7728950538135758, "train_speed(iter/s)": 0.141756 }, { "epoch": 0.8055951133587873, "grad_norm": 0.8011995553970337, "learning_rate": 6.966029811805362e-05, "loss": 0.8602470397949219, "memory(GiB)": 91.52, "step": 62085, "token_acc": 0.7691691462577249, "train_speed(iter/s)": 0.141755 }, { "epoch": 0.805659991760443, "grad_norm": 0.6314993500709534, "learning_rate": 6.965536630061036e-05, "loss": 0.9208621978759766, "memory(GiB)": 91.52, "step": 62090, "token_acc": 0.7405491169977925, "train_speed(iter/s)": 0.141753 }, { "epoch": 0.8057248701620987, "grad_norm": 0.705730676651001, "learning_rate": 6.965043425697443e-05, "loss": 0.8606634140014648, "memory(GiB)": 91.52, "step": 62095, "token_acc": 0.7488846138103229, "train_speed(iter/s)": 0.141751 }, { "epoch": 0.8057897485637544, "grad_norm": 0.7643436789512634, "learning_rate": 6.964550198720254e-05, "loss": 0.9148075103759765, "memory(GiB)": 91.52, "step": 62100, "token_acc": 0.7491689332124509, "train_speed(iter/s)": 0.141749 }, { "epoch": 0.8058546269654101, "grad_norm": 0.7057351469993591, "learning_rate": 6.964056949135151e-05, "loss": 0.8958610534667969, "memory(GiB)": 91.52, "step": 62105, "token_acc": 0.7649557746568176, "train_speed(iter/s)": 0.141748 }, { "epoch": 0.8059195053670658, "grad_norm": 0.7115770578384399, "learning_rate": 6.963563676947807e-05, "loss": 0.8642277717590332, "memory(GiB)": 91.52, "step": 62110, "token_acc": 0.7592133663762092, "train_speed(iter/s)": 0.141747 }, { "epoch": 0.8059843837687215, "grad_norm": 0.6433905959129333, "learning_rate": 6.963070382163896e-05, "loss": 0.8519370079040527, "memory(GiB)": 91.52, "step": 62115, "token_acc": 0.7561535701471116, "train_speed(iter/s)": 0.141745 }, { "epoch": 0.8060492621703772, "grad_norm": 0.8262462615966797, "learning_rate": 6.9625770647891e-05, "loss": 0.92364501953125, "memory(GiB)": 91.52, "step": 62120, "token_acc": 0.7654000669568128, "train_speed(iter/s)": 0.141744 }, { "epoch": 0.8061141405720329, "grad_norm": 0.8046914935112, "learning_rate": 6.962083724829092e-05, "loss": 0.8827935218811035, "memory(GiB)": 91.52, "step": 62125, "token_acc": 0.7662850774997467, "train_speed(iter/s)": 0.141742 }, { "epoch": 0.8061790189736886, "grad_norm": 0.7965083718299866, "learning_rate": 6.961590362289551e-05, "loss": 0.8562726974487305, "memory(GiB)": 91.52, "step": 62130, "token_acc": 0.7630716073758065, "train_speed(iter/s)": 0.141739 }, { "epoch": 0.8062438973753443, "grad_norm": 0.640824556350708, "learning_rate": 6.961096977176153e-05, "loss": 0.8685049057006836, "memory(GiB)": 91.52, "step": 62135, "token_acc": 0.7603418803418803, "train_speed(iter/s)": 0.141737 }, { "epoch": 0.806308775777, "grad_norm": 0.7060651779174805, "learning_rate": 6.960603569494579e-05, "loss": 0.8861618041992188, "memory(GiB)": 91.52, "step": 62140, "token_acc": 0.7472286514800829, "train_speed(iter/s)": 0.141736 }, { "epoch": 0.8063736541786557, "grad_norm": 0.7425511479377747, "learning_rate": 6.960110139250506e-05, "loss": 0.8345376968383789, "memory(GiB)": 91.52, "step": 62145, "token_acc": 0.7702407002188184, "train_speed(iter/s)": 0.141734 }, { "epoch": 0.8064385325803114, "grad_norm": 0.748913049697876, "learning_rate": 6.959616686449609e-05, "loss": 0.8799257278442383, "memory(GiB)": 91.52, "step": 62150, "token_acc": 0.7532688616031005, "train_speed(iter/s)": 0.141732 }, { "epoch": 0.8065034109819671, "grad_norm": 0.8269160389900208, "learning_rate": 6.959123211097572e-05, "loss": 0.8714203834533691, "memory(GiB)": 91.52, "step": 62155, "token_acc": 0.7720233747260774, "train_speed(iter/s)": 0.14173 }, { "epoch": 0.8065682893836228, "grad_norm": 0.7157098650932312, "learning_rate": 6.958629713200069e-05, "loss": 0.8963688850402832, "memory(GiB)": 91.52, "step": 62160, "token_acc": 0.73311622092832, "train_speed(iter/s)": 0.141728 }, { "epoch": 0.8066331677852785, "grad_norm": 0.7852112054824829, "learning_rate": 6.958136192762782e-05, "loss": 0.8575255393981933, "memory(GiB)": 91.52, "step": 62165, "token_acc": 0.7697993210433587, "train_speed(iter/s)": 0.141726 }, { "epoch": 0.8066980461869342, "grad_norm": 0.7812413573265076, "learning_rate": 6.957642649791389e-05, "loss": 0.8679441452026367, "memory(GiB)": 91.52, "step": 62170, "token_acc": 0.7658446733328321, "train_speed(iter/s)": 0.141725 }, { "epoch": 0.8067629245885898, "grad_norm": 0.7518567442893982, "learning_rate": 6.957149084291573e-05, "loss": 0.8636575698852539, "memory(GiB)": 91.52, "step": 62175, "token_acc": 0.7660971945638123, "train_speed(iter/s)": 0.141723 }, { "epoch": 0.8068278029902455, "grad_norm": 0.7440592646598816, "learning_rate": 6.956655496269009e-05, "loss": 0.8658096313476562, "memory(GiB)": 91.52, "step": 62180, "token_acc": 0.7734968964630697, "train_speed(iter/s)": 0.14172 }, { "epoch": 0.8068926813919012, "grad_norm": 0.7577657699584961, "learning_rate": 6.95616188572938e-05, "loss": 0.8713233947753907, "memory(GiB)": 91.52, "step": 62185, "token_acc": 0.7679819452533488, "train_speed(iter/s)": 0.141719 }, { "epoch": 0.8069575597935569, "grad_norm": 0.8214028477668762, "learning_rate": 6.955668252678366e-05, "loss": 0.8578727722167969, "memory(GiB)": 91.52, "step": 62190, "token_acc": 0.7470463270098021, "train_speed(iter/s)": 0.141717 }, { "epoch": 0.8070224381952126, "grad_norm": 0.7481951713562012, "learning_rate": 6.955174597121649e-05, "loss": 0.8511733055114746, "memory(GiB)": 91.52, "step": 62195, "token_acc": 0.7648865200663584, "train_speed(iter/s)": 0.141716 }, { "epoch": 0.8070873165968683, "grad_norm": 0.7247920632362366, "learning_rate": 6.954680919064906e-05, "loss": 0.8817636489868164, "memory(GiB)": 91.52, "step": 62200, "token_acc": 0.7557765824283637, "train_speed(iter/s)": 0.141714 }, { "epoch": 0.807152194998524, "grad_norm": 0.7458894848823547, "learning_rate": 6.954187218513822e-05, "loss": 0.8338044166564942, "memory(GiB)": 91.52, "step": 62205, "token_acc": 0.7661986635633306, "train_speed(iter/s)": 0.141713 }, { "epoch": 0.8072170734001797, "grad_norm": 0.7088225483894348, "learning_rate": 6.953693495474079e-05, "loss": 0.8714475631713867, "memory(GiB)": 91.52, "step": 62210, "token_acc": 0.753574657603054, "train_speed(iter/s)": 0.141712 }, { "epoch": 0.8072819518018354, "grad_norm": 0.6556052565574646, "learning_rate": 6.953199749951354e-05, "loss": 0.8405550003051758, "memory(GiB)": 91.52, "step": 62215, "token_acc": 0.7811741060212712, "train_speed(iter/s)": 0.141709 }, { "epoch": 0.8073468302034911, "grad_norm": 0.7572475671768188, "learning_rate": 6.952705981951333e-05, "loss": 0.9172316551208496, "memory(GiB)": 91.52, "step": 62220, "token_acc": 0.7325208913649025, "train_speed(iter/s)": 0.141708 }, { "epoch": 0.8074117086051468, "grad_norm": 0.7138364911079407, "learning_rate": 6.952212191479697e-05, "loss": 0.8393678665161133, "memory(GiB)": 91.52, "step": 62225, "token_acc": 0.777462792345854, "train_speed(iter/s)": 0.141706 }, { "epoch": 0.8074765870068025, "grad_norm": 0.6713956594467163, "learning_rate": 6.951718378542131e-05, "loss": 0.8650238037109375, "memory(GiB)": 91.52, "step": 62230, "token_acc": 0.7662965863546322, "train_speed(iter/s)": 0.141704 }, { "epoch": 0.8075414654084582, "grad_norm": 0.8094018697738647, "learning_rate": 6.951224543144312e-05, "loss": 0.8513669013977051, "memory(GiB)": 91.52, "step": 62235, "token_acc": 0.7466139033942559, "train_speed(iter/s)": 0.141702 }, { "epoch": 0.8076063438101139, "grad_norm": 0.7406708598136902, "learning_rate": 6.950730685291929e-05, "loss": 0.9376626014709473, "memory(GiB)": 91.52, "step": 62240, "token_acc": 0.7449603781454192, "train_speed(iter/s)": 0.141701 }, { "epoch": 0.8076712222117696, "grad_norm": 0.7698479294776917, "learning_rate": 6.950236804990663e-05, "loss": 0.8685244560241699, "memory(GiB)": 91.52, "step": 62245, "token_acc": 0.7626684866063812, "train_speed(iter/s)": 0.141699 }, { "epoch": 0.8077361006134253, "grad_norm": 0.7421929836273193, "learning_rate": 6.949742902246194e-05, "loss": 0.9014117240905761, "memory(GiB)": 91.52, "step": 62250, "token_acc": 0.7484654606865643, "train_speed(iter/s)": 0.141697 }, { "epoch": 0.807800979015081, "grad_norm": 0.7086955904960632, "learning_rate": 6.949248977064214e-05, "loss": 0.8746886253356934, "memory(GiB)": 91.52, "step": 62255, "token_acc": 0.764341531924088, "train_speed(iter/s)": 0.141695 }, { "epoch": 0.8078658574167367, "grad_norm": 0.665067732334137, "learning_rate": 6.948755029450398e-05, "loss": 0.8958310127258301, "memory(GiB)": 91.52, "step": 62260, "token_acc": 0.7573811684232568, "train_speed(iter/s)": 0.141694 }, { "epoch": 0.8079307358183924, "grad_norm": 0.7303066849708557, "learning_rate": 6.948261059410438e-05, "loss": 0.8558457374572754, "memory(GiB)": 91.52, "step": 62265, "token_acc": 0.7558108548248601, "train_speed(iter/s)": 0.141692 }, { "epoch": 0.8079956142200481, "grad_norm": 0.7290465235710144, "learning_rate": 6.947767066950013e-05, "loss": 0.8966619491577148, "memory(GiB)": 91.52, "step": 62270, "token_acc": 0.7718044454815939, "train_speed(iter/s)": 0.14169 }, { "epoch": 0.8080604926217038, "grad_norm": 0.7701146602630615, "learning_rate": 6.94727305207481e-05, "loss": 0.8387811660766602, "memory(GiB)": 91.52, "step": 62275, "token_acc": 0.7594176010325079, "train_speed(iter/s)": 0.141688 }, { "epoch": 0.8081253710233595, "grad_norm": 0.7416370511054993, "learning_rate": 6.946779014790514e-05, "loss": 0.911845588684082, "memory(GiB)": 91.52, "step": 62280, "token_acc": 0.7708090134069122, "train_speed(iter/s)": 0.141686 }, { "epoch": 0.8081902494250152, "grad_norm": 0.8742815852165222, "learning_rate": 6.946284955102812e-05, "loss": 0.8700864791870118, "memory(GiB)": 91.52, "step": 62285, "token_acc": 0.7652233017472312, "train_speed(iter/s)": 0.141685 }, { "epoch": 0.8082551278266709, "grad_norm": 0.6934120655059814, "learning_rate": 6.945790873017387e-05, "loss": 0.8807103157043457, "memory(GiB)": 91.52, "step": 62290, "token_acc": 0.7557881664153286, "train_speed(iter/s)": 0.141683 }, { "epoch": 0.8083200062283266, "grad_norm": 0.7100827693939209, "learning_rate": 6.945296768539926e-05, "loss": 0.8744396209716797, "memory(GiB)": 91.52, "step": 62295, "token_acc": 0.7780594936256829, "train_speed(iter/s)": 0.141682 }, { "epoch": 0.8083848846299823, "grad_norm": 0.8254320621490479, "learning_rate": 6.944802641676114e-05, "loss": 0.9264440536499023, "memory(GiB)": 91.52, "step": 62300, "token_acc": 0.7316925356048618, "train_speed(iter/s)": 0.14168 }, { "epoch": 0.808449763031638, "grad_norm": 0.8120222091674805, "learning_rate": 6.944308492431639e-05, "loss": 0.8834100723266601, "memory(GiB)": 91.52, "step": 62305, "token_acc": 0.7602605380999963, "train_speed(iter/s)": 0.141679 }, { "epoch": 0.8085146414332937, "grad_norm": 0.6817638278007507, "learning_rate": 6.943814320812187e-05, "loss": 0.8440057754516601, "memory(GiB)": 91.52, "step": 62310, "token_acc": 0.7819664563779043, "train_speed(iter/s)": 0.141677 }, { "epoch": 0.8085795198349494, "grad_norm": 0.6835953593254089, "learning_rate": 6.943320126823444e-05, "loss": 0.8723501205444336, "memory(GiB)": 91.52, "step": 62315, "token_acc": 0.7668089833038286, "train_speed(iter/s)": 0.141675 }, { "epoch": 0.808644398236605, "grad_norm": 0.7416682839393616, "learning_rate": 6.942825910471101e-05, "loss": 0.8775416374206543, "memory(GiB)": 91.52, "step": 62320, "token_acc": 0.7503311021226331, "train_speed(iter/s)": 0.141673 }, { "epoch": 0.8087092766382608, "grad_norm": 0.7491672039031982, "learning_rate": 6.94233167176084e-05, "loss": 0.9064596176147461, "memory(GiB)": 91.52, "step": 62325, "token_acc": 0.7629092998021318, "train_speed(iter/s)": 0.141672 }, { "epoch": 0.8087741550399165, "grad_norm": 0.7573305368423462, "learning_rate": 6.941837410698352e-05, "loss": 0.940461254119873, "memory(GiB)": 91.52, "step": 62330, "token_acc": 0.7421917808219178, "train_speed(iter/s)": 0.141669 }, { "epoch": 0.8088390334415722, "grad_norm": 0.7197703123092651, "learning_rate": 6.941343127289323e-05, "loss": 0.8438785552978516, "memory(GiB)": 91.52, "step": 62335, "token_acc": 0.7813235762010817, "train_speed(iter/s)": 0.141667 }, { "epoch": 0.8089039118432279, "grad_norm": 0.7479493021965027, "learning_rate": 6.940848821539445e-05, "loss": 0.8913652420043945, "memory(GiB)": 91.52, "step": 62340, "token_acc": 0.7672319571272432, "train_speed(iter/s)": 0.141666 }, { "epoch": 0.8089687902448836, "grad_norm": 0.66375333070755, "learning_rate": 6.9403544934544e-05, "loss": 0.8349483489990235, "memory(GiB)": 91.52, "step": 62345, "token_acc": 0.7653150915274964, "train_speed(iter/s)": 0.141663 }, { "epoch": 0.8090336686465393, "grad_norm": 0.6516003012657166, "learning_rate": 6.939860143039883e-05, "loss": 0.8625465393066406, "memory(GiB)": 91.52, "step": 62350, "token_acc": 0.7572545952287837, "train_speed(iter/s)": 0.141662 }, { "epoch": 0.809098547048195, "grad_norm": 0.7759862542152405, "learning_rate": 6.939365770301578e-05, "loss": 0.8628219604492188, "memory(GiB)": 91.52, "step": 62355, "token_acc": 0.751608910891089, "train_speed(iter/s)": 0.14166 }, { "epoch": 0.8091634254498506, "grad_norm": 0.7101909518241882, "learning_rate": 6.938871375245179e-05, "loss": 0.8345380783081054, "memory(GiB)": 91.52, "step": 62360, "token_acc": 0.7489070768826217, "train_speed(iter/s)": 0.141658 }, { "epoch": 0.8092283038515063, "grad_norm": 0.6716459393501282, "learning_rate": 6.938376957876371e-05, "loss": 0.821710205078125, "memory(GiB)": 91.52, "step": 62365, "token_acc": 0.7869811598469237, "train_speed(iter/s)": 0.141656 }, { "epoch": 0.809293182253162, "grad_norm": 0.8112543225288391, "learning_rate": 6.937882518200847e-05, "loss": 0.8937650680541992, "memory(GiB)": 91.52, "step": 62370, "token_acc": 0.7588232825548701, "train_speed(iter/s)": 0.141654 }, { "epoch": 0.8093580606548177, "grad_norm": 0.7043179869651794, "learning_rate": 6.937388056224296e-05, "loss": 0.8901554107666015, "memory(GiB)": 91.52, "step": 62375, "token_acc": 0.7546975370135001, "train_speed(iter/s)": 0.141653 }, { "epoch": 0.8094229390564734, "grad_norm": 0.6606401801109314, "learning_rate": 6.936893571952406e-05, "loss": 0.8737874031066895, "memory(GiB)": 91.52, "step": 62380, "token_acc": 0.7696952976518914, "train_speed(iter/s)": 0.14165 }, { "epoch": 0.8094878174581291, "grad_norm": 0.7289639711380005, "learning_rate": 6.936399065390871e-05, "loss": 0.8914049148559571, "memory(GiB)": 91.52, "step": 62385, "token_acc": 0.7531067020514765, "train_speed(iter/s)": 0.141648 }, { "epoch": 0.8095526958597848, "grad_norm": 0.7112277150154114, "learning_rate": 6.93590453654538e-05, "loss": 0.8558786392211915, "memory(GiB)": 91.52, "step": 62390, "token_acc": 0.7813509712243889, "train_speed(iter/s)": 0.141646 }, { "epoch": 0.8096175742614405, "grad_norm": 0.6903024911880493, "learning_rate": 6.935409985421623e-05, "loss": 0.8659744262695312, "memory(GiB)": 91.52, "step": 62395, "token_acc": 0.764736138257381, "train_speed(iter/s)": 0.141644 }, { "epoch": 0.8096824526630962, "grad_norm": 0.6934438943862915, "learning_rate": 6.934915412025294e-05, "loss": 0.8688648223876954, "memory(GiB)": 91.52, "step": 62400, "token_acc": 0.7545110102412125, "train_speed(iter/s)": 0.141643 }, { "epoch": 0.8097473310647519, "grad_norm": 0.7963253259658813, "learning_rate": 6.934420816362081e-05, "loss": 0.9052012443542481, "memory(GiB)": 91.52, "step": 62405, "token_acc": 0.7533791109504879, "train_speed(iter/s)": 0.141642 }, { "epoch": 0.8098122094664076, "grad_norm": 0.6917551159858704, "learning_rate": 6.933926198437681e-05, "loss": 0.8539617538452149, "memory(GiB)": 91.52, "step": 62410, "token_acc": 0.7384133771092617, "train_speed(iter/s)": 0.14164 }, { "epoch": 0.8098770878680632, "grad_norm": 0.7661629319190979, "learning_rate": 6.933431558257781e-05, "loss": 0.8378589630126954, "memory(GiB)": 91.52, "step": 62415, "token_acc": 0.7568602183141298, "train_speed(iter/s)": 0.141637 }, { "epoch": 0.8099419662697189, "grad_norm": 0.843943178653717, "learning_rate": 6.932936895828074e-05, "loss": 0.8870948791503906, "memory(GiB)": 91.52, "step": 62420, "token_acc": 0.7533632286995515, "train_speed(iter/s)": 0.141636 }, { "epoch": 0.8100068446713746, "grad_norm": 0.7023230791091919, "learning_rate": 6.932442211154255e-05, "loss": 0.847871208190918, "memory(GiB)": 91.52, "step": 62425, "token_acc": 0.7738338048555826, "train_speed(iter/s)": 0.141634 }, { "epoch": 0.8100717230730303, "grad_norm": 0.6983484625816345, "learning_rate": 6.931947504242016e-05, "loss": 0.9178659439086914, "memory(GiB)": 91.52, "step": 62430, "token_acc": 0.754884366640707, "train_speed(iter/s)": 0.141632 }, { "epoch": 0.810136601474686, "grad_norm": 0.7336323261260986, "learning_rate": 6.931452775097048e-05, "loss": 0.8517253875732422, "memory(GiB)": 91.52, "step": 62435, "token_acc": 0.7768579234972678, "train_speed(iter/s)": 0.14163 }, { "epoch": 0.8102014798763417, "grad_norm": 0.7574945092201233, "learning_rate": 6.930958023725044e-05, "loss": 0.8880655288696289, "memory(GiB)": 91.52, "step": 62440, "token_acc": 0.7684836790480785, "train_speed(iter/s)": 0.141628 }, { "epoch": 0.8102663582779974, "grad_norm": 0.7104069590568542, "learning_rate": 6.930463250131704e-05, "loss": 0.8366224288940429, "memory(GiB)": 91.52, "step": 62445, "token_acc": 0.7917632976733747, "train_speed(iter/s)": 0.141627 }, { "epoch": 0.8103312366796531, "grad_norm": 0.7012327313423157, "learning_rate": 6.929968454322714e-05, "loss": 0.8378607749938964, "memory(GiB)": 91.52, "step": 62450, "token_acc": 0.7437155582396014, "train_speed(iter/s)": 0.141625 }, { "epoch": 0.8103961150813088, "grad_norm": 0.7037849426269531, "learning_rate": 6.929473636303773e-05, "loss": 0.8774934768676758, "memory(GiB)": 91.52, "step": 62455, "token_acc": 0.7550218730470494, "train_speed(iter/s)": 0.141623 }, { "epoch": 0.8104609934829645, "grad_norm": 0.7510741949081421, "learning_rate": 6.928978796080571e-05, "loss": 0.8833806037902832, "memory(GiB)": 91.52, "step": 62460, "token_acc": 0.7533608711978971, "train_speed(iter/s)": 0.141621 }, { "epoch": 0.8105258718846202, "grad_norm": 0.7028417587280273, "learning_rate": 6.928483933658808e-05, "loss": 0.9046521186828613, "memory(GiB)": 91.52, "step": 62465, "token_acc": 0.7503000377190275, "train_speed(iter/s)": 0.14162 }, { "epoch": 0.8105907502862759, "grad_norm": 0.756715714931488, "learning_rate": 6.927989049044175e-05, "loss": 0.8855784416198731, "memory(GiB)": 91.52, "step": 62470, "token_acc": 0.7630012475494564, "train_speed(iter/s)": 0.141618 }, { "epoch": 0.8106556286879316, "grad_norm": 0.758073091506958, "learning_rate": 6.927494142242366e-05, "loss": 0.8737140655517578, "memory(GiB)": 91.52, "step": 62475, "token_acc": 0.7671929020702295, "train_speed(iter/s)": 0.141617 }, { "epoch": 0.8107205070895873, "grad_norm": 0.7088079452514648, "learning_rate": 6.926999213259082e-05, "loss": 0.8585500717163086, "memory(GiB)": 91.52, "step": 62480, "token_acc": 0.765947242206235, "train_speed(iter/s)": 0.141615 }, { "epoch": 0.810785385491243, "grad_norm": 0.7714797258377075, "learning_rate": 6.926504262100014e-05, "loss": 0.8855520248413086, "memory(GiB)": 91.52, "step": 62485, "token_acc": 0.7698806072508495, "train_speed(iter/s)": 0.141613 }, { "epoch": 0.8108502638928987, "grad_norm": 0.7147926092147827, "learning_rate": 6.926009288770858e-05, "loss": 0.8719705581665039, "memory(GiB)": 91.52, "step": 62490, "token_acc": 0.7521466195252182, "train_speed(iter/s)": 0.141611 }, { "epoch": 0.8109151422945544, "grad_norm": 0.7424294948577881, "learning_rate": 6.92551429327731e-05, "loss": 0.8656938552856446, "memory(GiB)": 91.52, "step": 62495, "token_acc": 0.7733686630060018, "train_speed(iter/s)": 0.141609 }, { "epoch": 0.8109800206962101, "grad_norm": 0.7435498833656311, "learning_rate": 6.92501927562507e-05, "loss": 0.9021201133728027, "memory(GiB)": 91.52, "step": 62500, "token_acc": 0.7395918101624296, "train_speed(iter/s)": 0.141607 }, { "epoch": 0.8110448990978658, "grad_norm": 0.7453989386558533, "learning_rate": 6.924524235819831e-05, "loss": 0.8854511260986329, "memory(GiB)": 91.52, "step": 62505, "token_acc": 0.7668540228039185, "train_speed(iter/s)": 0.141605 }, { "epoch": 0.8111097774995215, "grad_norm": 0.659982442855835, "learning_rate": 6.92402917386729e-05, "loss": 0.814147663116455, "memory(GiB)": 91.52, "step": 62510, "token_acc": 0.7801268498942917, "train_speed(iter/s)": 0.141603 }, { "epoch": 0.8111746559011772, "grad_norm": 0.7471276521682739, "learning_rate": 6.923534089773147e-05, "loss": 0.855528736114502, "memory(GiB)": 91.52, "step": 62515, "token_acc": 0.7818523358379614, "train_speed(iter/s)": 0.141601 }, { "epoch": 0.8112395343028329, "grad_norm": 0.7405923008918762, "learning_rate": 6.923038983543096e-05, "loss": 0.8623614311218262, "memory(GiB)": 91.52, "step": 62520, "token_acc": 0.7633742732308115, "train_speed(iter/s)": 0.141599 }, { "epoch": 0.8113044127044886, "grad_norm": 0.8130446672439575, "learning_rate": 6.922543855182836e-05, "loss": 0.8880339622497558, "memory(GiB)": 91.52, "step": 62525, "token_acc": 0.766445736958384, "train_speed(iter/s)": 0.141598 }, { "epoch": 0.8113692911061443, "grad_norm": 0.7827983498573303, "learning_rate": 6.922048704698066e-05, "loss": 0.8626123428344726, "memory(GiB)": 91.52, "step": 62530, "token_acc": 0.7455173075079107, "train_speed(iter/s)": 0.141595 }, { "epoch": 0.8114341695078, "grad_norm": 0.8413935899734497, "learning_rate": 6.921553532094482e-05, "loss": 0.8501018524169922, "memory(GiB)": 91.52, "step": 62535, "token_acc": 0.75253227408143, "train_speed(iter/s)": 0.141594 }, { "epoch": 0.8114990479094557, "grad_norm": 0.7537428736686707, "learning_rate": 6.921058337377786e-05, "loss": 0.8316681861877442, "memory(GiB)": 91.52, "step": 62540, "token_acc": 0.770000390823465, "train_speed(iter/s)": 0.141591 }, { "epoch": 0.8115639263111114, "grad_norm": 0.7584043741226196, "learning_rate": 6.920563120553672e-05, "loss": 0.8859971046447754, "memory(GiB)": 91.52, "step": 62545, "token_acc": 0.7647839340147032, "train_speed(iter/s)": 0.141589 }, { "epoch": 0.8116288047127671, "grad_norm": 0.7912386059761047, "learning_rate": 6.920067881627842e-05, "loss": 0.8868207931518555, "memory(GiB)": 91.52, "step": 62550, "token_acc": 0.7553335229697056, "train_speed(iter/s)": 0.141587 }, { "epoch": 0.8116936831144228, "grad_norm": 0.7078951001167297, "learning_rate": 6.919572620605994e-05, "loss": 0.8933442115783692, "memory(GiB)": 91.52, "step": 62555, "token_acc": 0.7678482936337507, "train_speed(iter/s)": 0.141585 }, { "epoch": 0.8117585615160785, "grad_norm": 0.7560224533081055, "learning_rate": 6.919077337493829e-05, "loss": 0.8713127136230469, "memory(GiB)": 91.52, "step": 62560, "token_acc": 0.7528673835125448, "train_speed(iter/s)": 0.141583 }, { "epoch": 0.8118234399177342, "grad_norm": 0.7351862192153931, "learning_rate": 6.918582032297042e-05, "loss": 0.9005796432495117, "memory(GiB)": 91.52, "step": 62565, "token_acc": 0.7561463001089984, "train_speed(iter/s)": 0.14158 }, { "epoch": 0.8118883183193899, "grad_norm": 0.8121449947357178, "learning_rate": 6.918086705021342e-05, "loss": 0.8534530639648438, "memory(GiB)": 91.52, "step": 62570, "token_acc": 0.7611033040295434, "train_speed(iter/s)": 0.141579 }, { "epoch": 0.8119531967210456, "grad_norm": 0.8151111602783203, "learning_rate": 6.91759135567242e-05, "loss": 0.8711453437805176, "memory(GiB)": 91.52, "step": 62575, "token_acc": 0.7786503781268179, "train_speed(iter/s)": 0.141576 }, { "epoch": 0.8120180751227013, "grad_norm": 0.7868974804878235, "learning_rate": 6.917095984255981e-05, "loss": 0.9029420852661133, "memory(GiB)": 91.52, "step": 62580, "token_acc": 0.752537501122788, "train_speed(iter/s)": 0.141574 }, { "epoch": 0.812082953524357, "grad_norm": 0.6803812384605408, "learning_rate": 6.916600590777725e-05, "loss": 0.8860179901123046, "memory(GiB)": 91.52, "step": 62585, "token_acc": 0.7455242551255807, "train_speed(iter/s)": 0.141572 }, { "epoch": 0.8121478319260127, "grad_norm": 0.7622539401054382, "learning_rate": 6.916105175243353e-05, "loss": 0.9085341453552246, "memory(GiB)": 91.52, "step": 62590, "token_acc": 0.7476568624109607, "train_speed(iter/s)": 0.14157 }, { "epoch": 0.8122127103276684, "grad_norm": 0.7024504542350769, "learning_rate": 6.915609737658564e-05, "loss": 0.8775604248046875, "memory(GiB)": 91.52, "step": 62595, "token_acc": 0.7620882363150404, "train_speed(iter/s)": 0.141569 }, { "epoch": 0.8122775887293241, "grad_norm": 0.7427080273628235, "learning_rate": 6.915114278029062e-05, "loss": 0.8999160766601563, "memory(GiB)": 91.52, "step": 62600, "token_acc": 0.7525697185236587, "train_speed(iter/s)": 0.141567 }, { "epoch": 0.8123424671309798, "grad_norm": 0.7367706894874573, "learning_rate": 6.91461879636055e-05, "loss": 0.8648758888244629, "memory(GiB)": 91.52, "step": 62605, "token_acc": 0.7541443156255407, "train_speed(iter/s)": 0.141565 }, { "epoch": 0.8124073455326355, "grad_norm": 0.8409032225608826, "learning_rate": 6.914123292658727e-05, "loss": 0.8968042373657227, "memory(GiB)": 91.52, "step": 62610, "token_acc": 0.7875358017875013, "train_speed(iter/s)": 0.141563 }, { "epoch": 0.8124722239342912, "grad_norm": 0.7100554704666138, "learning_rate": 6.913627766929296e-05, "loss": 0.8811384201049804, "memory(GiB)": 91.52, "step": 62615, "token_acc": 0.7512241817311587, "train_speed(iter/s)": 0.141561 }, { "epoch": 0.8125371023359469, "grad_norm": 0.7076144814491272, "learning_rate": 6.91313221917796e-05, "loss": 0.8938724517822265, "memory(GiB)": 91.52, "step": 62620, "token_acc": 0.7538071065989848, "train_speed(iter/s)": 0.141559 }, { "epoch": 0.8126019807376026, "grad_norm": 0.6933206915855408, "learning_rate": 6.912636649410422e-05, "loss": 0.8619705200195312, "memory(GiB)": 91.52, "step": 62625, "token_acc": 0.7612363859487652, "train_speed(iter/s)": 0.141556 }, { "epoch": 0.8126668591392583, "grad_norm": 0.6484941244125366, "learning_rate": 6.912141057632385e-05, "loss": 0.8063735961914062, "memory(GiB)": 91.52, "step": 62630, "token_acc": 0.786328219134067, "train_speed(iter/s)": 0.141555 }, { "epoch": 0.812731737540914, "grad_norm": 0.7884681820869446, "learning_rate": 6.911645443849551e-05, "loss": 0.8528437614440918, "memory(GiB)": 91.52, "step": 62635, "token_acc": 0.7704989591028137, "train_speed(iter/s)": 0.141553 }, { "epoch": 0.8127966159425697, "grad_norm": 0.7664347887039185, "learning_rate": 6.911149808067624e-05, "loss": 0.8980987548828125, "memory(GiB)": 91.52, "step": 62640, "token_acc": 0.7506730699265081, "train_speed(iter/s)": 0.141551 }, { "epoch": 0.8128614943442254, "grad_norm": 0.7050440907478333, "learning_rate": 6.910654150292308e-05, "loss": 0.8908124923706054, "memory(GiB)": 91.52, "step": 62645, "token_acc": 0.7419059740672977, "train_speed(iter/s)": 0.141549 }, { "epoch": 0.8129263727458811, "grad_norm": 0.84517502784729, "learning_rate": 6.910158470529308e-05, "loss": 0.9270530700683594, "memory(GiB)": 91.52, "step": 62650, "token_acc": 0.7555249162891471, "train_speed(iter/s)": 0.141548 }, { "epoch": 0.8129912511475367, "grad_norm": 0.6750907897949219, "learning_rate": 6.909662768784326e-05, "loss": 0.8416090965270996, "memory(GiB)": 91.52, "step": 62655, "token_acc": 0.7573185923388353, "train_speed(iter/s)": 0.141547 }, { "epoch": 0.8130561295491924, "grad_norm": 0.7562460899353027, "learning_rate": 6.90916704506307e-05, "loss": 0.8958706855773926, "memory(GiB)": 91.52, "step": 62660, "token_acc": 0.7405511218190032, "train_speed(iter/s)": 0.141546 }, { "epoch": 0.8131210079508481, "grad_norm": 0.6933611631393433, "learning_rate": 6.90867129937124e-05, "loss": 0.8193326950073242, "memory(GiB)": 91.52, "step": 62665, "token_acc": 0.7573171483147452, "train_speed(iter/s)": 0.141544 }, { "epoch": 0.8131858863525038, "grad_norm": 0.7459157109260559, "learning_rate": 6.908175531714544e-05, "loss": 0.8858781814575195, "memory(GiB)": 91.52, "step": 62670, "token_acc": 0.7565867973814054, "train_speed(iter/s)": 0.141542 }, { "epoch": 0.8132507647541595, "grad_norm": 0.8207932710647583, "learning_rate": 6.907679742098687e-05, "loss": 0.8371162414550781, "memory(GiB)": 91.52, "step": 62675, "token_acc": 0.77344020964199, "train_speed(iter/s)": 0.141541 }, { "epoch": 0.8133156431558152, "grad_norm": 0.7110556364059448, "learning_rate": 6.907183930529375e-05, "loss": 0.8835214614868164, "memory(GiB)": 91.52, "step": 62680, "token_acc": 0.7545708397048069, "train_speed(iter/s)": 0.141539 }, { "epoch": 0.8133805215574709, "grad_norm": 0.677212119102478, "learning_rate": 6.906688097012312e-05, "loss": 0.8351839065551758, "memory(GiB)": 91.52, "step": 62685, "token_acc": 0.7722550908090259, "train_speed(iter/s)": 0.141537 }, { "epoch": 0.8134453999591266, "grad_norm": 0.7194343209266663, "learning_rate": 6.906192241553207e-05, "loss": 0.8464951515197754, "memory(GiB)": 91.52, "step": 62690, "token_acc": 0.7597524970161442, "train_speed(iter/s)": 0.141535 }, { "epoch": 0.8135102783607823, "grad_norm": 0.7716785669326782, "learning_rate": 6.905696364157764e-05, "loss": 0.8825254440307617, "memory(GiB)": 91.52, "step": 62695, "token_acc": 0.7645705792844757, "train_speed(iter/s)": 0.141534 }, { "epoch": 0.813575156762438, "grad_norm": 0.7071310877799988, "learning_rate": 6.905200464831688e-05, "loss": 0.892997932434082, "memory(GiB)": 91.52, "step": 62700, "token_acc": 0.7480801028844387, "train_speed(iter/s)": 0.141532 }, { "epoch": 0.8136400351640937, "grad_norm": 0.7230697274208069, "learning_rate": 6.904704543580688e-05, "loss": 0.8725481033325195, "memory(GiB)": 91.52, "step": 62705, "token_acc": 0.7378563520619983, "train_speed(iter/s)": 0.14153 }, { "epoch": 0.8137049135657494, "grad_norm": 0.8228330016136169, "learning_rate": 6.904208600410471e-05, "loss": 0.8671573638916016, "memory(GiB)": 91.52, "step": 62710, "token_acc": 0.7517542740495025, "train_speed(iter/s)": 0.141529 }, { "epoch": 0.8137697919674051, "grad_norm": 0.7296364903450012, "learning_rate": 6.903712635326745e-05, "loss": 0.8145676612854004, "memory(GiB)": 91.52, "step": 62715, "token_acc": 0.7747279575892857, "train_speed(iter/s)": 0.141527 }, { "epoch": 0.8138346703690608, "grad_norm": 0.7535614371299744, "learning_rate": 6.903216648335215e-05, "loss": 0.8901612281799316, "memory(GiB)": 91.52, "step": 62720, "token_acc": 0.7739120789736385, "train_speed(iter/s)": 0.141525 }, { "epoch": 0.8138995487707165, "grad_norm": 0.8213402032852173, "learning_rate": 6.902720639441592e-05, "loss": 0.8834827423095704, "memory(GiB)": 91.52, "step": 62725, "token_acc": 0.7748407983208225, "train_speed(iter/s)": 0.141522 }, { "epoch": 0.8139644271723722, "grad_norm": 0.7472107410430908, "learning_rate": 6.90222460865158e-05, "loss": 0.8638360977172852, "memory(GiB)": 91.52, "step": 62730, "token_acc": 0.7737348234507853, "train_speed(iter/s)": 0.141521 }, { "epoch": 0.8140293055740279, "grad_norm": 0.8122210502624512, "learning_rate": 6.901728555970893e-05, "loss": 0.8504816055297851, "memory(GiB)": 91.52, "step": 62735, "token_acc": 0.7601656119563754, "train_speed(iter/s)": 0.141519 }, { "epoch": 0.8140941839756836, "grad_norm": 0.6590017676353455, "learning_rate": 6.901232481405233e-05, "loss": 0.8961739540100098, "memory(GiB)": 91.52, "step": 62740, "token_acc": 0.7687763421543564, "train_speed(iter/s)": 0.141517 }, { "epoch": 0.8141590623773393, "grad_norm": 0.7273315191268921, "learning_rate": 6.900736384960314e-05, "loss": 0.8694774627685546, "memory(GiB)": 91.52, "step": 62745, "token_acc": 0.7694806328804443, "train_speed(iter/s)": 0.141515 }, { "epoch": 0.814223940778995, "grad_norm": 0.795453667640686, "learning_rate": 6.900240266641843e-05, "loss": 0.8277403831481933, "memory(GiB)": 91.52, "step": 62750, "token_acc": 0.7670213830603463, "train_speed(iter/s)": 0.141513 }, { "epoch": 0.8142888191806507, "grad_norm": 0.7237322926521301, "learning_rate": 6.899744126455527e-05, "loss": 0.8950710296630859, "memory(GiB)": 91.52, "step": 62755, "token_acc": 0.7653140039213356, "train_speed(iter/s)": 0.14151 }, { "epoch": 0.8143536975823064, "grad_norm": 0.7106825113296509, "learning_rate": 6.89924796440708e-05, "loss": 0.8825118064880371, "memory(GiB)": 91.52, "step": 62760, "token_acc": 0.7627551020408163, "train_speed(iter/s)": 0.141508 }, { "epoch": 0.8144185759839621, "grad_norm": 0.7119072079658508, "learning_rate": 6.898751780502208e-05, "loss": 0.8699033737182618, "memory(GiB)": 91.52, "step": 62765, "token_acc": 0.747757642321069, "train_speed(iter/s)": 0.141506 }, { "epoch": 0.8144834543856178, "grad_norm": 0.7182720303535461, "learning_rate": 6.898255574746626e-05, "loss": 0.8513349533081055, "memory(GiB)": 91.52, "step": 62770, "token_acc": 0.7801715846574078, "train_speed(iter/s)": 0.141504 }, { "epoch": 0.8145483327872735, "grad_norm": 0.6923858523368835, "learning_rate": 6.897759347146038e-05, "loss": 0.9168817520141601, "memory(GiB)": 91.52, "step": 62775, "token_acc": 0.7529618956131925, "train_speed(iter/s)": 0.141502 }, { "epoch": 0.8146132111889292, "grad_norm": 0.7801922559738159, "learning_rate": 6.897263097706158e-05, "loss": 0.8718112945556641, "memory(GiB)": 91.52, "step": 62780, "token_acc": 0.7703291027154664, "train_speed(iter/s)": 0.1415 }, { "epoch": 0.8146780895905849, "grad_norm": 0.7511959671974182, "learning_rate": 6.896766826432696e-05, "loss": 0.8796172142028809, "memory(GiB)": 91.52, "step": 62785, "token_acc": 0.7407574704656011, "train_speed(iter/s)": 0.141498 }, { "epoch": 0.8147429679922406, "grad_norm": 0.7265440821647644, "learning_rate": 6.896270533331365e-05, "loss": 0.8416122436523438, "memory(GiB)": 91.52, "step": 62790, "token_acc": 0.7906159199339957, "train_speed(iter/s)": 0.141496 }, { "epoch": 0.8148078463938963, "grad_norm": 0.6944963335990906, "learning_rate": 6.895774218407875e-05, "loss": 0.8646556854248046, "memory(GiB)": 91.52, "step": 62795, "token_acc": 0.7708062033858174, "train_speed(iter/s)": 0.141494 }, { "epoch": 0.814872724795552, "grad_norm": 0.7067464590072632, "learning_rate": 6.895277881667935e-05, "loss": 0.8854238510131835, "memory(GiB)": 91.52, "step": 62800, "token_acc": 0.7795858504731641, "train_speed(iter/s)": 0.141492 }, { "epoch": 0.8149376031972076, "grad_norm": 0.7567527890205383, "learning_rate": 6.894781523117262e-05, "loss": 0.8574514389038086, "memory(GiB)": 91.52, "step": 62805, "token_acc": 0.7753694581280788, "train_speed(iter/s)": 0.14149 }, { "epoch": 0.8150024815988633, "grad_norm": 0.7314332127571106, "learning_rate": 6.894285142761562e-05, "loss": 0.840229606628418, "memory(GiB)": 91.52, "step": 62810, "token_acc": 0.7736376664685938, "train_speed(iter/s)": 0.141488 }, { "epoch": 0.815067360000519, "grad_norm": 0.6857496500015259, "learning_rate": 6.893788740606552e-05, "loss": 0.8393596649169922, "memory(GiB)": 91.52, "step": 62815, "token_acc": 0.7812143757124858, "train_speed(iter/s)": 0.141487 }, { "epoch": 0.8151322384021747, "grad_norm": 0.7556763887405396, "learning_rate": 6.893292316657943e-05, "loss": 0.8955924987792969, "memory(GiB)": 91.52, "step": 62820, "token_acc": 0.7529172270893162, "train_speed(iter/s)": 0.141485 }, { "epoch": 0.8151971168038304, "grad_norm": 0.7770604491233826, "learning_rate": 6.892795870921449e-05, "loss": 0.8751734733581543, "memory(GiB)": 91.52, "step": 62825, "token_acc": 0.750720516466203, "train_speed(iter/s)": 0.141484 }, { "epoch": 0.8152619952054861, "grad_norm": 0.7358041405677795, "learning_rate": 6.892299403402781e-05, "loss": 0.87066650390625, "memory(GiB)": 91.52, "step": 62830, "token_acc": 0.7617756970966285, "train_speed(iter/s)": 0.141482 }, { "epoch": 0.8153268736071418, "grad_norm": 0.6035923361778259, "learning_rate": 6.891802914107652e-05, "loss": 0.809971809387207, "memory(GiB)": 91.52, "step": 62835, "token_acc": 0.7834012099165711, "train_speed(iter/s)": 0.141481 }, { "epoch": 0.8153917520087975, "grad_norm": 0.7103416919708252, "learning_rate": 6.891306403041779e-05, "loss": 0.8773799896240234, "memory(GiB)": 91.52, "step": 62840, "token_acc": 0.7487964052637209, "train_speed(iter/s)": 0.141479 }, { "epoch": 0.8154566304104532, "grad_norm": 0.7495936751365662, "learning_rate": 6.890809870210871e-05, "loss": 0.8342798233032227, "memory(GiB)": 91.52, "step": 62845, "token_acc": 0.7596739130434783, "train_speed(iter/s)": 0.141478 }, { "epoch": 0.8155215088121089, "grad_norm": 0.7493079304695129, "learning_rate": 6.890313315620647e-05, "loss": 0.850159740447998, "memory(GiB)": 91.52, "step": 62850, "token_acc": 0.7587234324306429, "train_speed(iter/s)": 0.141476 }, { "epoch": 0.8155863872137646, "grad_norm": 0.760601282119751, "learning_rate": 6.889816739276819e-05, "loss": 0.8315155029296875, "memory(GiB)": 91.52, "step": 62855, "token_acc": 0.7752101464341937, "train_speed(iter/s)": 0.141474 }, { "epoch": 0.8156512656154203, "grad_norm": 0.8686649799346924, "learning_rate": 6.8893201411851e-05, "loss": 0.8576841354370117, "memory(GiB)": 91.52, "step": 62860, "token_acc": 0.7698696027958363, "train_speed(iter/s)": 0.141472 }, { "epoch": 0.815716144017076, "grad_norm": 0.8508127927780151, "learning_rate": 6.888823521351209e-05, "loss": 0.8479537010192871, "memory(GiB)": 91.52, "step": 62865, "token_acc": 0.7665590964098427, "train_speed(iter/s)": 0.141471 }, { "epoch": 0.8157810224187317, "grad_norm": 0.703635573387146, "learning_rate": 6.888326879780855e-05, "loss": 0.8457667350769043, "memory(GiB)": 91.52, "step": 62870, "token_acc": 0.7738499536894103, "train_speed(iter/s)": 0.141469 }, { "epoch": 0.8158459008203874, "grad_norm": 0.7478069067001343, "learning_rate": 6.88783021647976e-05, "loss": 0.8672339439392089, "memory(GiB)": 91.52, "step": 62875, "token_acc": 0.7693283555040052, "train_speed(iter/s)": 0.141468 }, { "epoch": 0.8159107792220431, "grad_norm": 0.6454420685768127, "learning_rate": 6.887333531453635e-05, "loss": 0.8650972366333007, "memory(GiB)": 91.52, "step": 62880, "token_acc": 0.7663951056590816, "train_speed(iter/s)": 0.141466 }, { "epoch": 0.8159756576236988, "grad_norm": 0.7451222538948059, "learning_rate": 6.886836824708196e-05, "loss": 0.8976272583007813, "memory(GiB)": 91.52, "step": 62885, "token_acc": 0.7491645122643295, "train_speed(iter/s)": 0.141464 }, { "epoch": 0.8160405360253545, "grad_norm": 0.6232609748840332, "learning_rate": 6.886340096249162e-05, "loss": 0.8496181488037109, "memory(GiB)": 91.52, "step": 62890, "token_acc": 0.7598569594718504, "train_speed(iter/s)": 0.141461 }, { "epoch": 0.8161054144270101, "grad_norm": 0.7580679655075073, "learning_rate": 6.885843346082246e-05, "loss": 0.8703644752502442, "memory(GiB)": 91.52, "step": 62895, "token_acc": 0.7548010500138159, "train_speed(iter/s)": 0.14146 }, { "epoch": 0.8161702928286658, "grad_norm": 0.8223159909248352, "learning_rate": 6.885346574213168e-05, "loss": 0.9064067840576172, "memory(GiB)": 91.52, "step": 62900, "token_acc": 0.7335158150851582, "train_speed(iter/s)": 0.141458 }, { "epoch": 0.8162351712303215, "grad_norm": 0.692791759967804, "learning_rate": 6.884849780647642e-05, "loss": 0.8649833679199219, "memory(GiB)": 91.52, "step": 62905, "token_acc": 0.7676415725196213, "train_speed(iter/s)": 0.141456 }, { "epoch": 0.8163000496319772, "grad_norm": 0.6746039390563965, "learning_rate": 6.884352965391386e-05, "loss": 0.8689968109130859, "memory(GiB)": 91.52, "step": 62910, "token_acc": 0.7549634273772204, "train_speed(iter/s)": 0.141455 }, { "epoch": 0.8163649280336329, "grad_norm": 0.6548125147819519, "learning_rate": 6.883856128450118e-05, "loss": 0.8266536712646484, "memory(GiB)": 91.52, "step": 62915, "token_acc": 0.7846164767685373, "train_speed(iter/s)": 0.141452 }, { "epoch": 0.8164298064352886, "grad_norm": 0.7373316287994385, "learning_rate": 6.883359269829555e-05, "loss": 0.9133365631103516, "memory(GiB)": 91.52, "step": 62920, "token_acc": 0.7603311633172941, "train_speed(iter/s)": 0.14145 }, { "epoch": 0.8164946848369443, "grad_norm": 0.7173032164573669, "learning_rate": 6.882862389535413e-05, "loss": 0.880890941619873, "memory(GiB)": 91.52, "step": 62925, "token_acc": 0.7525440313111545, "train_speed(iter/s)": 0.141448 }, { "epoch": 0.8165595632386, "grad_norm": 0.6572384238243103, "learning_rate": 6.882365487573414e-05, "loss": 0.8289592742919922, "memory(GiB)": 91.52, "step": 62930, "token_acc": 0.7668263914485809, "train_speed(iter/s)": 0.141446 }, { "epoch": 0.8166244416402557, "grad_norm": 0.7788558602333069, "learning_rate": 6.881868563949274e-05, "loss": 0.8702030181884766, "memory(GiB)": 91.52, "step": 62935, "token_acc": 0.742870252717775, "train_speed(iter/s)": 0.141445 }, { "epoch": 0.8166893200419114, "grad_norm": 0.7407411336898804, "learning_rate": 6.881371618668708e-05, "loss": 0.8714354515075684, "memory(GiB)": 91.52, "step": 62940, "token_acc": 0.7530603115214121, "train_speed(iter/s)": 0.141444 }, { "epoch": 0.8167541984435671, "grad_norm": 0.6847151517868042, "learning_rate": 6.880874651737443e-05, "loss": 0.8176249504089356, "memory(GiB)": 91.52, "step": 62945, "token_acc": 0.7757957201942097, "train_speed(iter/s)": 0.141442 }, { "epoch": 0.8168190768452228, "grad_norm": 0.7213864922523499, "learning_rate": 6.880377663161191e-05, "loss": 0.8754203796386719, "memory(GiB)": 91.52, "step": 62950, "token_acc": 0.7466260201327364, "train_speed(iter/s)": 0.14144 }, { "epoch": 0.8168839552468785, "grad_norm": 0.7250216007232666, "learning_rate": 6.879880652945675e-05, "loss": 0.8685367584228516, "memory(GiB)": 91.52, "step": 62955, "token_acc": 0.7549611734253667, "train_speed(iter/s)": 0.141438 }, { "epoch": 0.8169488336485342, "grad_norm": 0.7441167235374451, "learning_rate": 6.879383621096613e-05, "loss": 0.8506729125976562, "memory(GiB)": 91.52, "step": 62960, "token_acc": 0.7479050058501724, "train_speed(iter/s)": 0.141436 }, { "epoch": 0.8170137120501899, "grad_norm": 0.7368524670600891, "learning_rate": 6.878886567619725e-05, "loss": 0.8231077194213867, "memory(GiB)": 91.52, "step": 62965, "token_acc": 0.7741534208707671, "train_speed(iter/s)": 0.141434 }, { "epoch": 0.8170785904518456, "grad_norm": 0.7334762215614319, "learning_rate": 6.878389492520733e-05, "loss": 0.8952709197998047, "memory(GiB)": 91.52, "step": 62970, "token_acc": 0.7576936409505836, "train_speed(iter/s)": 0.141432 }, { "epoch": 0.8171434688535013, "grad_norm": 0.7900433540344238, "learning_rate": 6.877892395805354e-05, "loss": 0.921209716796875, "memory(GiB)": 91.52, "step": 62975, "token_acc": 0.7487993547677531, "train_speed(iter/s)": 0.14143 }, { "epoch": 0.817208347255157, "grad_norm": 0.7460567355155945, "learning_rate": 6.877395277479309e-05, "loss": 0.8768748283386231, "memory(GiB)": 91.52, "step": 62980, "token_acc": 0.7571232191952012, "train_speed(iter/s)": 0.141429 }, { "epoch": 0.8172732256568127, "grad_norm": 0.7429292798042297, "learning_rate": 6.876898137548323e-05, "loss": 0.8814388275146484, "memory(GiB)": 91.52, "step": 62985, "token_acc": 0.7499497823903583, "train_speed(iter/s)": 0.141427 }, { "epoch": 0.8173381040584684, "grad_norm": 0.8297730088233948, "learning_rate": 6.876400976018112e-05, "loss": 0.8497484207153321, "memory(GiB)": 91.52, "step": 62990, "token_acc": 0.7804671512243314, "train_speed(iter/s)": 0.141425 }, { "epoch": 0.8174029824601241, "grad_norm": 0.6825725436210632, "learning_rate": 6.875903792894399e-05, "loss": 0.8267494201660156, "memory(GiB)": 91.52, "step": 62995, "token_acc": 0.756982277084339, "train_speed(iter/s)": 0.141423 }, { "epoch": 0.8174678608617798, "grad_norm": 0.6804088354110718, "learning_rate": 6.875406588182907e-05, "loss": 0.8605184555053711, "memory(GiB)": 91.52, "step": 63000, "token_acc": 0.7596994580936867, "train_speed(iter/s)": 0.141421 }, { "epoch": 0.8175327392634355, "grad_norm": 0.7560026049613953, "learning_rate": 6.874909361889356e-05, "loss": 0.8697870254516602, "memory(GiB)": 91.52, "step": 63005, "token_acc": 0.7697229085774797, "train_speed(iter/s)": 0.14142 }, { "epoch": 0.8175976176650912, "grad_norm": 0.7474086284637451, "learning_rate": 6.87441211401947e-05, "loss": 0.8962968826293946, "memory(GiB)": 91.52, "step": 63010, "token_acc": 0.7582158834807764, "train_speed(iter/s)": 0.141419 }, { "epoch": 0.8176624960667469, "grad_norm": 0.696018397808075, "learning_rate": 6.873914844578968e-05, "loss": 0.8405458450317382, "memory(GiB)": 91.52, "step": 63015, "token_acc": 0.7639847692164814, "train_speed(iter/s)": 0.141417 }, { "epoch": 0.8177273744684026, "grad_norm": 0.7386336922645569, "learning_rate": 6.873417553573576e-05, "loss": 0.9186161994934082, "memory(GiB)": 91.52, "step": 63020, "token_acc": 0.756879136189481, "train_speed(iter/s)": 0.141415 }, { "epoch": 0.8177922528700583, "grad_norm": 0.7474821209907532, "learning_rate": 6.872920241009015e-05, "loss": 0.8444128036499023, "memory(GiB)": 91.52, "step": 63025, "token_acc": 0.7744641192917054, "train_speed(iter/s)": 0.141412 }, { "epoch": 0.817857131271714, "grad_norm": 0.7288300395011902, "learning_rate": 6.872422906891008e-05, "loss": 0.8694347381591797, "memory(GiB)": 91.52, "step": 63030, "token_acc": 0.7615013332433254, "train_speed(iter/s)": 0.14141 }, { "epoch": 0.8179220096733697, "grad_norm": 0.6919645667076111, "learning_rate": 6.87192555122528e-05, "loss": 0.894778060913086, "memory(GiB)": 91.52, "step": 63035, "token_acc": 0.7697667973235143, "train_speed(iter/s)": 0.141409 }, { "epoch": 0.8179868880750254, "grad_norm": 0.7478907108306885, "learning_rate": 6.871428174017552e-05, "loss": 0.9065585136413574, "memory(GiB)": 91.52, "step": 63040, "token_acc": 0.7631961259079904, "train_speed(iter/s)": 0.141407 }, { "epoch": 0.8180517664766811, "grad_norm": 0.7392551898956299, "learning_rate": 6.87093077527355e-05, "loss": 0.8690229415893554, "memory(GiB)": 91.52, "step": 63045, "token_acc": 0.7521669341894061, "train_speed(iter/s)": 0.141405 }, { "epoch": 0.8181166448783368, "grad_norm": 0.7367845177650452, "learning_rate": 6.870433354998997e-05, "loss": 0.8810037612915039, "memory(GiB)": 91.52, "step": 63050, "token_acc": 0.7660616153205662, "train_speed(iter/s)": 0.141403 }, { "epoch": 0.8181815232799925, "grad_norm": 0.8062857985496521, "learning_rate": 6.869935913199617e-05, "loss": 0.8676549911499023, "memory(GiB)": 91.52, "step": 63055, "token_acc": 0.7633713207019507, "train_speed(iter/s)": 0.141402 }, { "epoch": 0.8182464016816482, "grad_norm": 0.7680665850639343, "learning_rate": 6.869438449881133e-05, "loss": 0.8764928817749024, "memory(GiB)": 91.52, "step": 63060, "token_acc": 0.7559119965488843, "train_speed(iter/s)": 0.1414 }, { "epoch": 0.8183112800833039, "grad_norm": 0.64902663230896, "learning_rate": 6.868940965049273e-05, "loss": 0.8570738792419433, "memory(GiB)": 91.52, "step": 63065, "token_acc": 0.7763404825737266, "train_speed(iter/s)": 0.141398 }, { "epoch": 0.8183761584849596, "grad_norm": 0.7134215235710144, "learning_rate": 6.868443458709761e-05, "loss": 0.8834707260131835, "memory(GiB)": 91.52, "step": 63070, "token_acc": 0.772808647187975, "train_speed(iter/s)": 0.141397 }, { "epoch": 0.8184410368866153, "grad_norm": 0.6309480667114258, "learning_rate": 6.867945930868322e-05, "loss": 0.9114082336425782, "memory(GiB)": 91.52, "step": 63075, "token_acc": 0.7559724091520862, "train_speed(iter/s)": 0.141395 }, { "epoch": 0.818505915288271, "grad_norm": 0.6796623468399048, "learning_rate": 6.867448381530683e-05, "loss": 0.8397520065307618, "memory(GiB)": 91.52, "step": 63080, "token_acc": 0.7788971021129064, "train_speed(iter/s)": 0.141393 }, { "epoch": 0.8185707936899267, "grad_norm": 0.742695689201355, "learning_rate": 6.866950810702565e-05, "loss": 0.8404293060302734, "memory(GiB)": 91.52, "step": 63085, "token_acc": 0.7716339869281046, "train_speed(iter/s)": 0.141392 }, { "epoch": 0.8186356720915824, "grad_norm": 0.7240009307861328, "learning_rate": 6.866453218389699e-05, "loss": 0.8848926544189453, "memory(GiB)": 91.52, "step": 63090, "token_acc": 0.7529015916222449, "train_speed(iter/s)": 0.14139 }, { "epoch": 0.8187005504932381, "grad_norm": 0.6795881986618042, "learning_rate": 6.86595560459781e-05, "loss": 0.8590204238891601, "memory(GiB)": 91.52, "step": 63095, "token_acc": 0.7383888906148063, "train_speed(iter/s)": 0.141388 }, { "epoch": 0.8187654288948938, "grad_norm": 0.70344078540802, "learning_rate": 6.865457969332623e-05, "loss": 0.854025936126709, "memory(GiB)": 91.52, "step": 63100, "token_acc": 0.7486853714487383, "train_speed(iter/s)": 0.141386 }, { "epoch": 0.8188303072965495, "grad_norm": 0.728887677192688, "learning_rate": 6.864960312599865e-05, "loss": 0.8736330986022949, "memory(GiB)": 91.52, "step": 63105, "token_acc": 0.7634979465090654, "train_speed(iter/s)": 0.141384 }, { "epoch": 0.8188951856982052, "grad_norm": 0.6989810466766357, "learning_rate": 6.864462634405265e-05, "loss": 0.8592537879943848, "memory(GiB)": 91.52, "step": 63110, "token_acc": 0.7566202988150438, "train_speed(iter/s)": 0.141382 }, { "epoch": 0.8189600640998609, "grad_norm": 0.8189826607704163, "learning_rate": 6.863964934754549e-05, "loss": 0.9294820785522461, "memory(GiB)": 91.52, "step": 63115, "token_acc": 0.7351635184663096, "train_speed(iter/s)": 0.14138 }, { "epoch": 0.8190249425015166, "grad_norm": 0.7835361361503601, "learning_rate": 6.863467213653445e-05, "loss": 0.9085002899169922, "memory(GiB)": 91.52, "step": 63120, "token_acc": 0.7629665666639639, "train_speed(iter/s)": 0.141378 }, { "epoch": 0.8190898209031723, "grad_norm": 0.7033512592315674, "learning_rate": 6.862969471107679e-05, "loss": 0.8318805694580078, "memory(GiB)": 91.52, "step": 63125, "token_acc": 0.7609027474923681, "train_speed(iter/s)": 0.141376 }, { "epoch": 0.8191546993048279, "grad_norm": 0.7828177809715271, "learning_rate": 6.862471707122981e-05, "loss": 0.9178816795349121, "memory(GiB)": 91.52, "step": 63130, "token_acc": 0.7456168313675486, "train_speed(iter/s)": 0.141375 }, { "epoch": 0.8192195777064836, "grad_norm": 0.6783789992332458, "learning_rate": 6.861973921705077e-05, "loss": 0.8791028976440429, "memory(GiB)": 91.52, "step": 63135, "token_acc": 0.7779841091053, "train_speed(iter/s)": 0.141373 }, { "epoch": 0.8192844561081393, "grad_norm": 0.7216525673866272, "learning_rate": 6.861476114859698e-05, "loss": 0.8597659111022949, "memory(GiB)": 91.52, "step": 63140, "token_acc": 0.7711811179795709, "train_speed(iter/s)": 0.141371 }, { "epoch": 0.819349334509795, "grad_norm": 0.6673295497894287, "learning_rate": 6.860978286592572e-05, "loss": 0.8170493125915528, "memory(GiB)": 91.52, "step": 63145, "token_acc": 0.7809996184662342, "train_speed(iter/s)": 0.141369 }, { "epoch": 0.8194142129114507, "grad_norm": 0.7738572359085083, "learning_rate": 6.860480436909428e-05, "loss": 0.8857154846191406, "memory(GiB)": 91.52, "step": 63150, "token_acc": 0.7687188019966722, "train_speed(iter/s)": 0.141367 }, { "epoch": 0.8194790913131064, "grad_norm": 0.7336784601211548, "learning_rate": 6.859982565815995e-05, "loss": 0.8651262283325195, "memory(GiB)": 91.52, "step": 63155, "token_acc": 0.7709519637903344, "train_speed(iter/s)": 0.141366 }, { "epoch": 0.8195439697147621, "grad_norm": 0.8388716578483582, "learning_rate": 6.859484673318e-05, "loss": 0.874870777130127, "memory(GiB)": 91.52, "step": 63160, "token_acc": 0.7593527369061683, "train_speed(iter/s)": 0.141364 }, { "epoch": 0.8196088481164178, "grad_norm": 0.711017370223999, "learning_rate": 6.858986759421178e-05, "loss": 0.8702470779418945, "memory(GiB)": 91.52, "step": 63165, "token_acc": 0.7745959466393022, "train_speed(iter/s)": 0.141362 }, { "epoch": 0.8196737265180735, "grad_norm": 0.7538272738456726, "learning_rate": 6.858488824131254e-05, "loss": 0.8844432830810547, "memory(GiB)": 91.52, "step": 63170, "token_acc": 0.7683443852592673, "train_speed(iter/s)": 0.14136 }, { "epoch": 0.8197386049197292, "grad_norm": 0.6629922389984131, "learning_rate": 6.85799086745396e-05, "loss": 0.8900501251220703, "memory(GiB)": 91.52, "step": 63175, "token_acc": 0.7580923389142568, "train_speed(iter/s)": 0.141358 }, { "epoch": 0.8198034833213849, "grad_norm": 0.7504304647445679, "learning_rate": 6.857492889395028e-05, "loss": 0.924467658996582, "memory(GiB)": 91.52, "step": 63180, "token_acc": 0.7353581537121707, "train_speed(iter/s)": 0.141356 }, { "epoch": 0.8198683617230406, "grad_norm": 0.7739506363868713, "learning_rate": 6.856994889960188e-05, "loss": 0.8933803558349609, "memory(GiB)": 91.52, "step": 63185, "token_acc": 0.7557319579320627, "train_speed(iter/s)": 0.141355 }, { "epoch": 0.8199332401246963, "grad_norm": 0.7942336201667786, "learning_rate": 6.856496869155169e-05, "loss": 0.8648542404174805, "memory(GiB)": 91.52, "step": 63190, "token_acc": 0.749493873201918, "train_speed(iter/s)": 0.141353 }, { "epoch": 0.819998118526352, "grad_norm": 0.7393240332603455, "learning_rate": 6.855998826985703e-05, "loss": 0.8727535247802735, "memory(GiB)": 91.52, "step": 63195, "token_acc": 0.7722968845448992, "train_speed(iter/s)": 0.141351 }, { "epoch": 0.8200629969280077, "grad_norm": 0.7367021441459656, "learning_rate": 6.855500763457525e-05, "loss": 0.9191953659057617, "memory(GiB)": 91.52, "step": 63200, "token_acc": 0.7524228418070672, "train_speed(iter/s)": 0.141349 }, { "epoch": 0.8201278753296634, "grad_norm": 0.7383813261985779, "learning_rate": 6.85500267857636e-05, "loss": 0.8297998428344726, "memory(GiB)": 91.52, "step": 63205, "token_acc": 0.7725436579454353, "train_speed(iter/s)": 0.141347 }, { "epoch": 0.8201927537313191, "grad_norm": 0.8235152363777161, "learning_rate": 6.854504572347945e-05, "loss": 0.8718032836914062, "memory(GiB)": 91.52, "step": 63210, "token_acc": 0.766862851390072, "train_speed(iter/s)": 0.141345 }, { "epoch": 0.8202576321329748, "grad_norm": 0.8109854459762573, "learning_rate": 6.854006444778011e-05, "loss": 0.8427949905395508, "memory(GiB)": 91.52, "step": 63215, "token_acc": 0.7572261827608555, "train_speed(iter/s)": 0.141344 }, { "epoch": 0.8203225105346305, "grad_norm": 0.7163186073303223, "learning_rate": 6.853508295872289e-05, "loss": 0.8632925033569336, "memory(GiB)": 91.52, "step": 63220, "token_acc": 0.7699766077906228, "train_speed(iter/s)": 0.141341 }, { "epoch": 0.8203873889362862, "grad_norm": 0.6693307161331177, "learning_rate": 6.853010125636515e-05, "loss": 0.86276216506958, "memory(GiB)": 91.52, "step": 63225, "token_acc": 0.758947577555133, "train_speed(iter/s)": 0.141339 }, { "epoch": 0.8204522673379419, "grad_norm": 0.8696900010108948, "learning_rate": 6.852511934076417e-05, "loss": 0.8722598075866699, "memory(GiB)": 91.52, "step": 63230, "token_acc": 0.7647099905716381, "train_speed(iter/s)": 0.141337 }, { "epoch": 0.8205171457395976, "grad_norm": 0.7812395095825195, "learning_rate": 6.852013721197734e-05, "loss": 0.8861769676208496, "memory(GiB)": 91.52, "step": 63235, "token_acc": 0.7561687170474517, "train_speed(iter/s)": 0.141336 }, { "epoch": 0.8205820241412533, "grad_norm": 0.7863688468933105, "learning_rate": 6.851515487006194e-05, "loss": 0.887812614440918, "memory(GiB)": 91.52, "step": 63240, "token_acc": 0.7635570242331808, "train_speed(iter/s)": 0.141334 }, { "epoch": 0.820646902542909, "grad_norm": 0.7846587896347046, "learning_rate": 6.851017231507533e-05, "loss": 0.914666748046875, "memory(GiB)": 91.52, "step": 63245, "token_acc": 0.7590919109543751, "train_speed(iter/s)": 0.141332 }, { "epoch": 0.8207117809445647, "grad_norm": 0.6777937412261963, "learning_rate": 6.850518954707485e-05, "loss": 0.8439291000366211, "memory(GiB)": 91.52, "step": 63250, "token_acc": 0.761212260009924, "train_speed(iter/s)": 0.14133 }, { "epoch": 0.8207766593462203, "grad_norm": 0.7679742574691772, "learning_rate": 6.850020656611785e-05, "loss": 0.8786628723144532, "memory(GiB)": 91.52, "step": 63255, "token_acc": 0.7680464676860632, "train_speed(iter/s)": 0.141329 }, { "epoch": 0.820841537747876, "grad_norm": 0.7310730218887329, "learning_rate": 6.849522337226166e-05, "loss": 0.837127685546875, "memory(GiB)": 91.52, "step": 63260, "token_acc": 0.7640832173467145, "train_speed(iter/s)": 0.141327 }, { "epoch": 0.8209064161495317, "grad_norm": 0.752825140953064, "learning_rate": 6.849023996556362e-05, "loss": 0.9375036239624024, "memory(GiB)": 91.52, "step": 63265, "token_acc": 0.7424590734935562, "train_speed(iter/s)": 0.141325 }, { "epoch": 0.8209712945511874, "grad_norm": 0.7266727685928345, "learning_rate": 6.84852563460811e-05, "loss": 0.8711776733398438, "memory(GiB)": 91.52, "step": 63270, "token_acc": 0.7619223287946878, "train_speed(iter/s)": 0.141323 }, { "epoch": 0.8210361729528431, "grad_norm": 0.7071147561073303, "learning_rate": 6.848027251387144e-05, "loss": 0.8634086608886719, "memory(GiB)": 91.52, "step": 63275, "token_acc": 0.7697942643391521, "train_speed(iter/s)": 0.141321 }, { "epoch": 0.8211010513544988, "grad_norm": 0.6920295357704163, "learning_rate": 6.847528846899198e-05, "loss": 0.8161903381347656, "memory(GiB)": 91.52, "step": 63280, "token_acc": 0.7486205777345017, "train_speed(iter/s)": 0.141319 }, { "epoch": 0.8211659297561545, "grad_norm": 0.7360131740570068, "learning_rate": 6.847030421150011e-05, "loss": 0.88406343460083, "memory(GiB)": 91.52, "step": 63285, "token_acc": 0.7760066480320315, "train_speed(iter/s)": 0.141319 }, { "epoch": 0.8212308081578102, "grad_norm": 0.7475764751434326, "learning_rate": 6.846531974145315e-05, "loss": 0.9009685516357422, "memory(GiB)": 91.52, "step": 63290, "token_acc": 0.745462780240044, "train_speed(iter/s)": 0.141317 }, { "epoch": 0.8212956865594659, "grad_norm": 0.6922751069068909, "learning_rate": 6.846033505890848e-05, "loss": 0.8565345764160156, "memory(GiB)": 91.52, "step": 63295, "token_acc": 0.7565238973195292, "train_speed(iter/s)": 0.141314 }, { "epoch": 0.8213605649611216, "grad_norm": 0.7204789519309998, "learning_rate": 6.845535016392346e-05, "loss": 0.8744606971740723, "memory(GiB)": 91.52, "step": 63300, "token_acc": 0.7578686964795432, "train_speed(iter/s)": 0.141313 }, { "epoch": 0.8214254433627773, "grad_norm": 0.7724459171295166, "learning_rate": 6.845036505655547e-05, "loss": 0.8318085670471191, "memory(GiB)": 91.52, "step": 63305, "token_acc": 0.7687568601069291, "train_speed(iter/s)": 0.141312 }, { "epoch": 0.821490321764433, "grad_norm": 0.7952983379364014, "learning_rate": 6.844537973686185e-05, "loss": 0.8718647003173828, "memory(GiB)": 91.52, "step": 63310, "token_acc": 0.7455699141066315, "train_speed(iter/s)": 0.14131 }, { "epoch": 0.8215552001660887, "grad_norm": 0.745179295539856, "learning_rate": 6.844039420490001e-05, "loss": 0.8403301239013672, "memory(GiB)": 91.52, "step": 63315, "token_acc": 0.762891046386192, "train_speed(iter/s)": 0.141309 }, { "epoch": 0.8216200785677444, "grad_norm": 0.7154320478439331, "learning_rate": 6.84354084607273e-05, "loss": 0.8825918197631836, "memory(GiB)": 91.52, "step": 63320, "token_acc": 0.7582474070938284, "train_speed(iter/s)": 0.141307 }, { "epoch": 0.8216849569694001, "grad_norm": 0.7680631279945374, "learning_rate": 6.843042250440107e-05, "loss": 0.8831413269042969, "memory(GiB)": 91.52, "step": 63325, "token_acc": 0.7591063235587473, "train_speed(iter/s)": 0.141305 }, { "epoch": 0.8217498353710558, "grad_norm": 0.7277478575706482, "learning_rate": 6.842543633597874e-05, "loss": 0.8324533462524414, "memory(GiB)": 91.52, "step": 63330, "token_acc": 0.7598403137694355, "train_speed(iter/s)": 0.141304 }, { "epoch": 0.8218147137727115, "grad_norm": 0.8264256119728088, "learning_rate": 6.842044995551767e-05, "loss": 0.8708703994750977, "memory(GiB)": 91.52, "step": 63335, "token_acc": 0.761935905820798, "train_speed(iter/s)": 0.141302 }, { "epoch": 0.8218795921743672, "grad_norm": 0.7526516914367676, "learning_rate": 6.841546336307525e-05, "loss": 0.8306998252868653, "memory(GiB)": 91.52, "step": 63340, "token_acc": 0.7650082111793687, "train_speed(iter/s)": 0.1413 }, { "epoch": 0.8219444705760229, "grad_norm": 0.6463926434516907, "learning_rate": 6.841047655870885e-05, "loss": 0.8564722061157226, "memory(GiB)": 91.52, "step": 63345, "token_acc": 0.7578995673266029, "train_speed(iter/s)": 0.141298 }, { "epoch": 0.8220093489776786, "grad_norm": 0.801821768283844, "learning_rate": 6.84054895424759e-05, "loss": 0.9047173500061035, "memory(GiB)": 91.52, "step": 63350, "token_acc": 0.7727105146733521, "train_speed(iter/s)": 0.141296 }, { "epoch": 0.8220742273793343, "grad_norm": 0.6821680665016174, "learning_rate": 6.840050231443374e-05, "loss": 0.877560043334961, "memory(GiB)": 91.52, "step": 63355, "token_acc": 0.752154759624593, "train_speed(iter/s)": 0.141294 }, { "epoch": 0.82213910578099, "grad_norm": 0.7326065301895142, "learning_rate": 6.839551487463977e-05, "loss": 0.8576591491699219, "memory(GiB)": 91.52, "step": 63360, "token_acc": 0.7837301587301587, "train_speed(iter/s)": 0.141293 }, { "epoch": 0.8222039841826457, "grad_norm": 0.6948901414871216, "learning_rate": 6.839052722315145e-05, "loss": 0.860816478729248, "memory(GiB)": 91.52, "step": 63365, "token_acc": 0.757208163852993, "train_speed(iter/s)": 0.14129 }, { "epoch": 0.8222688625843013, "grad_norm": 0.7150248289108276, "learning_rate": 6.838553936002607e-05, "loss": 0.8636113166809082, "memory(GiB)": 91.52, "step": 63370, "token_acc": 0.7558633013178467, "train_speed(iter/s)": 0.141288 }, { "epoch": 0.822333740985957, "grad_norm": 0.8926693797111511, "learning_rate": 6.838055128532111e-05, "loss": 0.8936468124389648, "memory(GiB)": 91.52, "step": 63375, "token_acc": 0.7470023980815348, "train_speed(iter/s)": 0.141287 }, { "epoch": 0.8223986193876127, "grad_norm": 0.7773441076278687, "learning_rate": 6.837556299909395e-05, "loss": 0.8598151206970215, "memory(GiB)": 91.52, "step": 63380, "token_acc": 0.7699313621964097, "train_speed(iter/s)": 0.141285 }, { "epoch": 0.8224634977892684, "grad_norm": 0.7449237704277039, "learning_rate": 6.8370574501402e-05, "loss": 0.8165811538696289, "memory(GiB)": 91.52, "step": 63385, "token_acc": 0.7817876133875509, "train_speed(iter/s)": 0.141283 }, { "epoch": 0.8225283761909241, "grad_norm": 0.7138910889625549, "learning_rate": 6.836558579230265e-05, "loss": 0.8958185195922852, "memory(GiB)": 91.52, "step": 63390, "token_acc": 0.7508154347959874, "train_speed(iter/s)": 0.141281 }, { "epoch": 0.8225932545925798, "grad_norm": 0.7184708714485168, "learning_rate": 6.836059687185332e-05, "loss": 0.88095121383667, "memory(GiB)": 91.52, "step": 63395, "token_acc": 0.7663277144612025, "train_speed(iter/s)": 0.141279 }, { "epoch": 0.8226581329942355, "grad_norm": 0.7233601212501526, "learning_rate": 6.835560774011144e-05, "loss": 0.8831549644470215, "memory(GiB)": 91.52, "step": 63400, "token_acc": 0.7567266775777414, "train_speed(iter/s)": 0.141278 }, { "epoch": 0.8227230113958912, "grad_norm": 0.8307979106903076, "learning_rate": 6.835061839713438e-05, "loss": 0.9014769554138183, "memory(GiB)": 91.52, "step": 63405, "token_acc": 0.7514927155481251, "train_speed(iter/s)": 0.141276 }, { "epoch": 0.8227878897975469, "grad_norm": 0.7253682017326355, "learning_rate": 6.83456288429796e-05, "loss": 0.8409488677978516, "memory(GiB)": 91.52, "step": 63410, "token_acc": 0.764102564102564, "train_speed(iter/s)": 0.141275 }, { "epoch": 0.8228527681992026, "grad_norm": 0.8476235866546631, "learning_rate": 6.834063907770449e-05, "loss": 0.8403682708740234, "memory(GiB)": 91.52, "step": 63415, "token_acc": 0.7729018102029621, "train_speed(iter/s)": 0.141273 }, { "epoch": 0.8229176466008583, "grad_norm": 0.7563409209251404, "learning_rate": 6.833564910136652e-05, "loss": 0.8699396133422852, "memory(GiB)": 91.52, "step": 63420, "token_acc": 0.7770542855120673, "train_speed(iter/s)": 0.141272 }, { "epoch": 0.822982525002514, "grad_norm": 0.6814383268356323, "learning_rate": 6.833065891402305e-05, "loss": 0.8535596847534179, "memory(GiB)": 91.52, "step": 63425, "token_acc": 0.7763566056176433, "train_speed(iter/s)": 0.14127 }, { "epoch": 0.8230474034041697, "grad_norm": 0.804157555103302, "learning_rate": 6.832566851573155e-05, "loss": 0.9053125381469727, "memory(GiB)": 91.52, "step": 63430, "token_acc": 0.7369820374813566, "train_speed(iter/s)": 0.141269 }, { "epoch": 0.8231122818058254, "grad_norm": 0.7590990662574768, "learning_rate": 6.832067790654943e-05, "loss": 0.88510160446167, "memory(GiB)": 91.52, "step": 63435, "token_acc": 0.7390586609336609, "train_speed(iter/s)": 0.141267 }, { "epoch": 0.8231771602074811, "grad_norm": 0.7125628590583801, "learning_rate": 6.831568708653412e-05, "loss": 0.8698092460632324, "memory(GiB)": 91.52, "step": 63440, "token_acc": 0.752461373944467, "train_speed(iter/s)": 0.141266 }, { "epoch": 0.8232420386091368, "grad_norm": 0.7542150616645813, "learning_rate": 6.831069605574307e-05, "loss": 0.8477096557617188, "memory(GiB)": 91.52, "step": 63445, "token_acc": 0.737711305598849, "train_speed(iter/s)": 0.141264 }, { "epoch": 0.8233069170107925, "grad_norm": 0.7166007161140442, "learning_rate": 6.83057048142337e-05, "loss": 0.8898479461669921, "memory(GiB)": 91.52, "step": 63450, "token_acc": 0.7562201207383885, "train_speed(iter/s)": 0.141262 }, { "epoch": 0.8233717954124482, "grad_norm": 0.7761049866676331, "learning_rate": 6.830071336206348e-05, "loss": 0.8515762329101563, "memory(GiB)": 91.52, "step": 63455, "token_acc": 0.7775014801657786, "train_speed(iter/s)": 0.14126 }, { "epoch": 0.8234366738141039, "grad_norm": 0.7317290902137756, "learning_rate": 6.829572169928982e-05, "loss": 0.8096218109130859, "memory(GiB)": 91.52, "step": 63460, "token_acc": 0.7608666306695464, "train_speed(iter/s)": 0.141258 }, { "epoch": 0.8235015522157596, "grad_norm": 0.7668846249580383, "learning_rate": 6.829072982597015e-05, "loss": 0.8447649002075195, "memory(GiB)": 91.52, "step": 63465, "token_acc": 0.7502207208936317, "train_speed(iter/s)": 0.141257 }, { "epoch": 0.8235664306174153, "grad_norm": 0.6596786379814148, "learning_rate": 6.828573774216197e-05, "loss": 0.8464113235473633, "memory(GiB)": 91.52, "step": 63470, "token_acc": 0.750566807738815, "train_speed(iter/s)": 0.141256 }, { "epoch": 0.823631309019071, "grad_norm": 0.747706413269043, "learning_rate": 6.828074544792269e-05, "loss": 0.8947817802429199, "memory(GiB)": 91.52, "step": 63475, "token_acc": 0.7558728539860615, "train_speed(iter/s)": 0.141254 }, { "epoch": 0.8236961874207267, "grad_norm": 0.7745622992515564, "learning_rate": 6.827575294330976e-05, "loss": 0.8837878227233886, "memory(GiB)": 91.52, "step": 63480, "token_acc": 0.7516775760569758, "train_speed(iter/s)": 0.141253 }, { "epoch": 0.8237610658223824, "grad_norm": 0.7097417712211609, "learning_rate": 6.827076022838062e-05, "loss": 0.854979419708252, "memory(GiB)": 91.52, "step": 63485, "token_acc": 0.7584471842719094, "train_speed(iter/s)": 0.141251 }, { "epoch": 0.8238259442240381, "grad_norm": 0.9452740550041199, "learning_rate": 6.826576730319278e-05, "loss": 0.9554712295532226, "memory(GiB)": 91.52, "step": 63490, "token_acc": 0.7561166824551661, "train_speed(iter/s)": 0.141249 }, { "epoch": 0.8238908226256938, "grad_norm": 0.7391461133956909, "learning_rate": 6.826077416780365e-05, "loss": 0.8985614776611328, "memory(GiB)": 91.52, "step": 63495, "token_acc": 0.749081199937176, "train_speed(iter/s)": 0.141248 }, { "epoch": 0.8239557010273495, "grad_norm": 0.7882450222969055, "learning_rate": 6.825578082227072e-05, "loss": 0.9044812202453614, "memory(GiB)": 91.52, "step": 63500, "token_acc": 0.7734482232728382, "train_speed(iter/s)": 0.141245 }, { "epoch": 0.8240205794290052, "grad_norm": 0.7319607138633728, "learning_rate": 6.825078726665142e-05, "loss": 0.8494882583618164, "memory(GiB)": 91.52, "step": 63505, "token_acc": 0.7804190946629105, "train_speed(iter/s)": 0.141244 }, { "epoch": 0.8240854578306609, "grad_norm": 0.7204556465148926, "learning_rate": 6.824579350100325e-05, "loss": 0.8678226470947266, "memory(GiB)": 91.52, "step": 63510, "token_acc": 0.7742621643180005, "train_speed(iter/s)": 0.141243 }, { "epoch": 0.8241503362323166, "grad_norm": 0.7763428688049316, "learning_rate": 6.824079952538367e-05, "loss": 0.9099250793457031, "memory(GiB)": 91.52, "step": 63515, "token_acc": 0.7504732287723094, "train_speed(iter/s)": 0.141241 }, { "epoch": 0.8242152146339723, "grad_norm": 0.7728615999221802, "learning_rate": 6.823580533985012e-05, "loss": 0.8795439720153808, "memory(GiB)": 91.52, "step": 63520, "token_acc": 0.7484818951945423, "train_speed(iter/s)": 0.141239 }, { "epoch": 0.824280093035628, "grad_norm": 0.7227303385734558, "learning_rate": 6.82308109444601e-05, "loss": 0.8921918869018555, "memory(GiB)": 91.52, "step": 63525, "token_acc": 0.7374696536676808, "train_speed(iter/s)": 0.141237 }, { "epoch": 0.8243449714372837, "grad_norm": 0.7902659177780151, "learning_rate": 6.82258163392711e-05, "loss": 0.8789294242858887, "memory(GiB)": 91.52, "step": 63530, "token_acc": 0.7741650780201934, "train_speed(iter/s)": 0.141235 }, { "epoch": 0.8244098498389394, "grad_norm": 0.6689080595970154, "learning_rate": 6.822082152434056e-05, "loss": 0.8500597953796387, "memory(GiB)": 91.52, "step": 63535, "token_acc": 0.7600117894116347, "train_speed(iter/s)": 0.141233 }, { "epoch": 0.8244747282405951, "grad_norm": 0.7246580719947815, "learning_rate": 6.821582649972598e-05, "loss": 0.8673433303833008, "memory(GiB)": 91.52, "step": 63540, "token_acc": 0.7429025942241801, "train_speed(iter/s)": 0.141231 }, { "epoch": 0.8245396066422508, "grad_norm": 0.8424232006072998, "learning_rate": 6.821083126548486e-05, "loss": 0.9410429000854492, "memory(GiB)": 91.52, "step": 63545, "token_acc": 0.7569630472276246, "train_speed(iter/s)": 0.14123 }, { "epoch": 0.8246044850439065, "grad_norm": 0.7990204691886902, "learning_rate": 6.820583582167465e-05, "loss": 0.897907829284668, "memory(GiB)": 91.52, "step": 63550, "token_acc": 0.7675392015993003, "train_speed(iter/s)": 0.141229 }, { "epoch": 0.8246693634455622, "grad_norm": 0.7097160220146179, "learning_rate": 6.820084016835285e-05, "loss": 0.8611915588378907, "memory(GiB)": 91.52, "step": 63555, "token_acc": 0.7596827239311279, "train_speed(iter/s)": 0.141227 }, { "epoch": 0.8247342418472179, "grad_norm": 0.7620795369148254, "learning_rate": 6.819584430557696e-05, "loss": 0.8893302917480469, "memory(GiB)": 91.52, "step": 63560, "token_acc": 0.7480651969981238, "train_speed(iter/s)": 0.141225 }, { "epoch": 0.8247991202488736, "grad_norm": 0.7320564985275269, "learning_rate": 6.819084823340445e-05, "loss": 0.8181373596191406, "memory(GiB)": 91.52, "step": 63565, "token_acc": 0.7707423580786026, "train_speed(iter/s)": 0.141223 }, { "epoch": 0.8248639986505293, "grad_norm": 0.6852485537528992, "learning_rate": 6.818585195189285e-05, "loss": 0.8572036743164062, "memory(GiB)": 91.52, "step": 63570, "token_acc": 0.7577937649880095, "train_speed(iter/s)": 0.141221 }, { "epoch": 0.824928877052185, "grad_norm": 0.7012558579444885, "learning_rate": 6.818085546109962e-05, "loss": 0.8620604515075684, "memory(GiB)": 91.52, "step": 63575, "token_acc": 0.7639080537138626, "train_speed(iter/s)": 0.141219 }, { "epoch": 0.8249937554538407, "grad_norm": 0.8699509501457214, "learning_rate": 6.81758587610823e-05, "loss": 0.8382322311401367, "memory(GiB)": 91.52, "step": 63580, "token_acc": 0.7688287272043617, "train_speed(iter/s)": 0.141217 }, { "epoch": 0.8250586338554964, "grad_norm": 0.6886647343635559, "learning_rate": 6.817086185189834e-05, "loss": 0.8710723876953125, "memory(GiB)": 91.52, "step": 63585, "token_acc": 0.7458408623590255, "train_speed(iter/s)": 0.141215 }, { "epoch": 0.8251235122571521, "grad_norm": 0.7486932277679443, "learning_rate": 6.816586473360527e-05, "loss": 0.859004020690918, "memory(GiB)": 91.52, "step": 63590, "token_acc": 0.7452373070093599, "train_speed(iter/s)": 0.141213 }, { "epoch": 0.8251883906588078, "grad_norm": 0.7103939652442932, "learning_rate": 6.816086740626061e-05, "loss": 0.8685233116149902, "memory(GiB)": 91.52, "step": 63595, "token_acc": 0.7491351159106341, "train_speed(iter/s)": 0.141211 }, { "epoch": 0.8252532690604635, "grad_norm": 0.6988767385482788, "learning_rate": 6.815586986992186e-05, "loss": 0.8578608512878418, "memory(GiB)": 91.52, "step": 63600, "token_acc": 0.7695718308035858, "train_speed(iter/s)": 0.141209 }, { "epoch": 0.8253181474621192, "grad_norm": 0.7809297442436218, "learning_rate": 6.815087212464652e-05, "loss": 0.8585890769958496, "memory(GiB)": 91.52, "step": 63605, "token_acc": 0.7473834832379395, "train_speed(iter/s)": 0.141207 }, { "epoch": 0.8253830258637748, "grad_norm": 0.7624459862709045, "learning_rate": 6.814587417049212e-05, "loss": 0.8651535034179687, "memory(GiB)": 91.52, "step": 63610, "token_acc": 0.7714388701379559, "train_speed(iter/s)": 0.141205 }, { "epoch": 0.8254479042654305, "grad_norm": 0.7216240763664246, "learning_rate": 6.814087600751617e-05, "loss": 0.8468763351440429, "memory(GiB)": 91.52, "step": 63615, "token_acc": 0.7669971433372542, "train_speed(iter/s)": 0.141203 }, { "epoch": 0.8255127826670862, "grad_norm": 0.7348352670669556, "learning_rate": 6.813587763577618e-05, "loss": 0.8905383110046386, "memory(GiB)": 91.52, "step": 63620, "token_acc": 0.7559928595715742, "train_speed(iter/s)": 0.141202 }, { "epoch": 0.8255776610687419, "grad_norm": 0.7381240725517273, "learning_rate": 6.813087905532967e-05, "loss": 0.8421624183654786, "memory(GiB)": 91.52, "step": 63625, "token_acc": 0.7569481458920277, "train_speed(iter/s)": 0.1412 }, { "epoch": 0.8256425394703976, "grad_norm": 0.761411726474762, "learning_rate": 6.812588026623417e-05, "loss": 0.8896328926086425, "memory(GiB)": 91.52, "step": 63630, "token_acc": 0.7611499387563946, "train_speed(iter/s)": 0.141199 }, { "epoch": 0.8257074178720533, "grad_norm": 0.7647085785865784, "learning_rate": 6.812088126854722e-05, "loss": 0.8699140548706055, "memory(GiB)": 91.52, "step": 63635, "token_acc": 0.7658062408103251, "train_speed(iter/s)": 0.141197 }, { "epoch": 0.825772296273709, "grad_norm": 0.7213128209114075, "learning_rate": 6.811588206232633e-05, "loss": 0.8742527961730957, "memory(GiB)": 91.52, "step": 63640, "token_acc": 0.7658981188613284, "train_speed(iter/s)": 0.141196 }, { "epoch": 0.8258371746753647, "grad_norm": 0.6726292371749878, "learning_rate": 6.811088264762904e-05, "loss": 0.8616649627685546, "memory(GiB)": 91.52, "step": 63645, "token_acc": 0.7565411614550096, "train_speed(iter/s)": 0.141194 }, { "epoch": 0.8259020530770204, "grad_norm": 0.7419055700302124, "learning_rate": 6.810588302451286e-05, "loss": 0.86972074508667, "memory(GiB)": 91.52, "step": 63650, "token_acc": 0.745766968001148, "train_speed(iter/s)": 0.141193 }, { "epoch": 0.8259669314786761, "grad_norm": 0.7247927188873291, "learning_rate": 6.810088319303536e-05, "loss": 0.8396167755126953, "memory(GiB)": 91.52, "step": 63655, "token_acc": 0.7716049382716049, "train_speed(iter/s)": 0.141191 }, { "epoch": 0.8260318098803318, "grad_norm": 0.8223465085029602, "learning_rate": 6.809588315325406e-05, "loss": 0.8581794738769531, "memory(GiB)": 91.52, "step": 63660, "token_acc": 0.7778986210914741, "train_speed(iter/s)": 0.14119 }, { "epoch": 0.8260966882819875, "grad_norm": 0.7343741655349731, "learning_rate": 6.809088290522649e-05, "loss": 0.8787021636962891, "memory(GiB)": 91.52, "step": 63665, "token_acc": 0.7495144957691774, "train_speed(iter/s)": 0.141188 }, { "epoch": 0.8261615666836432, "grad_norm": 0.6463984847068787, "learning_rate": 6.808588244901022e-05, "loss": 0.774223518371582, "memory(GiB)": 91.52, "step": 63670, "token_acc": 0.8048588164613998, "train_speed(iter/s)": 0.141186 }, { "epoch": 0.8262264450852989, "grad_norm": 0.7989123463630676, "learning_rate": 6.808088178466277e-05, "loss": 0.912137222290039, "memory(GiB)": 91.52, "step": 63675, "token_acc": 0.7533674116637439, "train_speed(iter/s)": 0.141185 }, { "epoch": 0.8262913234869546, "grad_norm": 0.721333920955658, "learning_rate": 6.80758809122417e-05, "loss": 0.8236038208007812, "memory(GiB)": 91.52, "step": 63680, "token_acc": 0.7816007236423331, "train_speed(iter/s)": 0.141183 }, { "epoch": 0.8263562018886103, "grad_norm": 0.696983814239502, "learning_rate": 6.807087983180453e-05, "loss": 0.8329919815063477, "memory(GiB)": 91.52, "step": 63685, "token_acc": 0.7725561020596372, "train_speed(iter/s)": 0.141181 }, { "epoch": 0.826421080290266, "grad_norm": 0.7481213212013245, "learning_rate": 6.806587854340889e-05, "loss": 0.8700797080993652, "memory(GiB)": 91.52, "step": 63690, "token_acc": 0.7672122909038697, "train_speed(iter/s)": 0.141179 }, { "epoch": 0.8264859586919217, "grad_norm": 0.796432614326477, "learning_rate": 6.806087704711223e-05, "loss": 0.9058300018310547, "memory(GiB)": 91.52, "step": 63695, "token_acc": 0.7454367225527312, "train_speed(iter/s)": 0.141178 }, { "epoch": 0.8265508370935774, "grad_norm": 0.7181226015090942, "learning_rate": 6.80558753429722e-05, "loss": 0.8499275207519531, "memory(GiB)": 91.52, "step": 63700, "token_acc": 0.7663731107949083, "train_speed(iter/s)": 0.141176 }, { "epoch": 0.826615715495233, "grad_norm": 0.8186410069465637, "learning_rate": 6.80508734310463e-05, "loss": 0.8948454856872559, "memory(GiB)": 91.52, "step": 63705, "token_acc": 0.762682626298187, "train_speed(iter/s)": 0.141175 }, { "epoch": 0.8266805938968887, "grad_norm": 0.5948630571365356, "learning_rate": 6.804587131139211e-05, "loss": 0.8335498809814453, "memory(GiB)": 91.52, "step": 63710, "token_acc": 0.7771327873950451, "train_speed(iter/s)": 0.141173 }, { "epoch": 0.8267454722985444, "grad_norm": 0.7846335768699646, "learning_rate": 6.804086898406719e-05, "loss": 0.9062928199768067, "memory(GiB)": 91.52, "step": 63715, "token_acc": 0.741757443718228, "train_speed(iter/s)": 0.141171 }, { "epoch": 0.8268103507002001, "grad_norm": 0.7828793525695801, "learning_rate": 6.803586644912911e-05, "loss": 0.8617207527160644, "memory(GiB)": 91.52, "step": 63720, "token_acc": 0.7651253164105672, "train_speed(iter/s)": 0.14117 }, { "epoch": 0.8268752291018558, "grad_norm": 0.6835479140281677, "learning_rate": 6.803086370663547e-05, "loss": 0.8704659461975097, "memory(GiB)": 91.52, "step": 63725, "token_acc": 0.7565305130983968, "train_speed(iter/s)": 0.141168 }, { "epoch": 0.8269401075035115, "grad_norm": 0.7413220405578613, "learning_rate": 6.802586075664376e-05, "loss": 0.8798391342163085, "memory(GiB)": 91.52, "step": 63730, "token_acc": 0.7513995438523741, "train_speed(iter/s)": 0.141166 }, { "epoch": 0.8270049859051672, "grad_norm": 0.6959645748138428, "learning_rate": 6.802085759921165e-05, "loss": 0.8672242164611816, "memory(GiB)": 91.52, "step": 63735, "token_acc": 0.7545157780195865, "train_speed(iter/s)": 0.141164 }, { "epoch": 0.8270698643068229, "grad_norm": 0.7239664793014526, "learning_rate": 6.801585423439664e-05, "loss": 0.8909158706665039, "memory(GiB)": 91.52, "step": 63740, "token_acc": 0.7400628329463188, "train_speed(iter/s)": 0.141163 }, { "epoch": 0.8271347427084786, "grad_norm": 0.8632171750068665, "learning_rate": 6.801085066225635e-05, "loss": 0.8450458526611329, "memory(GiB)": 91.52, "step": 63745, "token_acc": 0.7605101234058643, "train_speed(iter/s)": 0.141161 }, { "epoch": 0.8271996211101343, "grad_norm": 0.6954814195632935, "learning_rate": 6.800584688284835e-05, "loss": 0.8513222694396972, "memory(GiB)": 91.52, "step": 63750, "token_acc": 0.7453827768523212, "train_speed(iter/s)": 0.141159 }, { "epoch": 0.82726449951179, "grad_norm": 0.7301416993141174, "learning_rate": 6.800084289623022e-05, "loss": 0.8793344497680664, "memory(GiB)": 91.52, "step": 63755, "token_acc": 0.7626096909576497, "train_speed(iter/s)": 0.141158 }, { "epoch": 0.8273293779134457, "grad_norm": 0.7239577770233154, "learning_rate": 6.799583870245956e-05, "loss": 0.8518600463867188, "memory(GiB)": 91.52, "step": 63760, "token_acc": 0.766372253339078, "train_speed(iter/s)": 0.141156 }, { "epoch": 0.8273942563151014, "grad_norm": 0.7008599042892456, "learning_rate": 6.799083430159395e-05, "loss": 0.9153602600097657, "memory(GiB)": 91.52, "step": 63765, "token_acc": 0.7439097059132812, "train_speed(iter/s)": 0.141154 }, { "epoch": 0.8274591347167571, "grad_norm": 0.6958898305892944, "learning_rate": 6.798582969369096e-05, "loss": 0.8669207572937012, "memory(GiB)": 91.52, "step": 63770, "token_acc": 0.7613730045716856, "train_speed(iter/s)": 0.141152 }, { "epoch": 0.8275240131184128, "grad_norm": 0.8178365230560303, "learning_rate": 6.798082487880819e-05, "loss": 0.9097457885742187, "memory(GiB)": 91.52, "step": 63775, "token_acc": 0.7624409861863962, "train_speed(iter/s)": 0.141151 }, { "epoch": 0.8275888915200685, "grad_norm": 0.7586054801940918, "learning_rate": 6.797581985700328e-05, "loss": 0.8434060096740723, "memory(GiB)": 91.52, "step": 63780, "token_acc": 0.786455453273916, "train_speed(iter/s)": 0.141149 }, { "epoch": 0.8276537699217242, "grad_norm": 0.7581726312637329, "learning_rate": 6.797081462833376e-05, "loss": 0.8914562225341797, "memory(GiB)": 91.52, "step": 63785, "token_acc": 0.7679798524914553, "train_speed(iter/s)": 0.141147 }, { "epoch": 0.8277186483233799, "grad_norm": 0.6568841338157654, "learning_rate": 6.796580919285727e-05, "loss": 0.8660022735595703, "memory(GiB)": 91.52, "step": 63790, "token_acc": 0.7616346312716858, "train_speed(iter/s)": 0.141145 }, { "epoch": 0.8277835267250356, "grad_norm": 0.7114633321762085, "learning_rate": 6.796080355063141e-05, "loss": 0.8545818328857422, "memory(GiB)": 91.52, "step": 63795, "token_acc": 0.7685503685503685, "train_speed(iter/s)": 0.141143 }, { "epoch": 0.8278484051266913, "grad_norm": 0.7108785510063171, "learning_rate": 6.795579770171378e-05, "loss": 0.8863624572753906, "memory(GiB)": 91.52, "step": 63800, "token_acc": 0.7640179808699766, "train_speed(iter/s)": 0.14114 }, { "epoch": 0.827913283528347, "grad_norm": 0.8273184299468994, "learning_rate": 6.795079164616198e-05, "loss": 0.8868415832519532, "memory(GiB)": 91.52, "step": 63805, "token_acc": 0.7878705875278128, "train_speed(iter/s)": 0.141139 }, { "epoch": 0.8279781619300027, "grad_norm": 0.6713276505470276, "learning_rate": 6.794578538403362e-05, "loss": 0.8879904747009277, "memory(GiB)": 91.52, "step": 63810, "token_acc": 0.753029700104909, "train_speed(iter/s)": 0.141137 }, { "epoch": 0.8280430403316584, "grad_norm": 0.7469003796577454, "learning_rate": 6.794077891538632e-05, "loss": 0.8583745956420898, "memory(GiB)": 91.52, "step": 63815, "token_acc": 0.7543227665706052, "train_speed(iter/s)": 0.141135 }, { "epoch": 0.8281079187333141, "grad_norm": 0.7526448369026184, "learning_rate": 6.79357722402777e-05, "loss": 0.8750323295593262, "memory(GiB)": 91.52, "step": 63820, "token_acc": 0.7690901233336658, "train_speed(iter/s)": 0.141134 }, { "epoch": 0.8281727971349698, "grad_norm": 0.7140501737594604, "learning_rate": 6.793076535876536e-05, "loss": 0.868871021270752, "memory(GiB)": 91.52, "step": 63825, "token_acc": 0.7599946202212434, "train_speed(iter/s)": 0.141131 }, { "epoch": 0.8282376755366255, "grad_norm": 0.7869296073913574, "learning_rate": 6.792575827090693e-05, "loss": 0.915519905090332, "memory(GiB)": 91.52, "step": 63830, "token_acc": 0.7493098287044736, "train_speed(iter/s)": 0.14113 }, { "epoch": 0.8283025539382812, "grad_norm": 0.7980839610099792, "learning_rate": 6.792075097676003e-05, "loss": 0.8762467384338379, "memory(GiB)": 91.52, "step": 63835, "token_acc": 0.771383788625168, "train_speed(iter/s)": 0.141129 }, { "epoch": 0.8283674323399369, "grad_norm": 0.7369152307510376, "learning_rate": 6.791574347638228e-05, "loss": 0.845804500579834, "memory(GiB)": 91.52, "step": 63840, "token_acc": 0.7648561005269559, "train_speed(iter/s)": 0.141127 }, { "epoch": 0.8284323107415925, "grad_norm": 0.6530173420906067, "learning_rate": 6.79107357698313e-05, "loss": 0.8982128143310547, "memory(GiB)": 91.52, "step": 63845, "token_acc": 0.74220864857361, "train_speed(iter/s)": 0.141126 }, { "epoch": 0.8284971891432482, "grad_norm": 0.7385874390602112, "learning_rate": 6.790572785716474e-05, "loss": 0.8346458435058594, "memory(GiB)": 91.52, "step": 63850, "token_acc": 0.7731568998109641, "train_speed(iter/s)": 0.141123 }, { "epoch": 0.8285620675449039, "grad_norm": 0.775385320186615, "learning_rate": 6.790071973844019e-05, "loss": 0.8913099288940429, "memory(GiB)": 91.52, "step": 63855, "token_acc": 0.760524017467249, "train_speed(iter/s)": 0.141121 }, { "epoch": 0.8286269459465596, "grad_norm": 0.7122288346290588, "learning_rate": 6.789571141371532e-05, "loss": 0.8454612731933594, "memory(GiB)": 91.52, "step": 63860, "token_acc": 0.7819525176833112, "train_speed(iter/s)": 0.14112 }, { "epoch": 0.8286918243482153, "grad_norm": 0.7149189710617065, "learning_rate": 6.789070288304776e-05, "loss": 0.8432425498962403, "memory(GiB)": 91.52, "step": 63865, "token_acc": 0.7722185648452929, "train_speed(iter/s)": 0.141118 }, { "epoch": 0.828756702749871, "grad_norm": 0.7538477778434753, "learning_rate": 6.788569414649514e-05, "loss": 0.8759893417358399, "memory(GiB)": 91.52, "step": 63870, "token_acc": 0.7442213661283154, "train_speed(iter/s)": 0.141116 }, { "epoch": 0.8288215811515267, "grad_norm": 0.6770235300064087, "learning_rate": 6.788068520411512e-05, "loss": 0.8733613967895508, "memory(GiB)": 91.52, "step": 63875, "token_acc": 0.7581890256624244, "train_speed(iter/s)": 0.141114 }, { "epoch": 0.8288864595531824, "grad_norm": 0.7980383634567261, "learning_rate": 6.78756760559653e-05, "loss": 0.8508445739746093, "memory(GiB)": 91.52, "step": 63880, "token_acc": 0.7532660690597741, "train_speed(iter/s)": 0.141113 }, { "epoch": 0.8289513379548381, "grad_norm": 0.7066190838813782, "learning_rate": 6.787066670210337e-05, "loss": 0.8672225952148438, "memory(GiB)": 91.52, "step": 63885, "token_acc": 0.7609168667466987, "train_speed(iter/s)": 0.141111 }, { "epoch": 0.8290162163564938, "grad_norm": 0.7517988085746765, "learning_rate": 6.786565714258694e-05, "loss": 0.8605334281921386, "memory(GiB)": 91.52, "step": 63890, "token_acc": 0.7620865620865621, "train_speed(iter/s)": 0.14111 }, { "epoch": 0.8290810947581495, "grad_norm": 0.7478216886520386, "learning_rate": 6.786064737747368e-05, "loss": 0.8598264694213867, "memory(GiB)": 91.52, "step": 63895, "token_acc": 0.7611752980694696, "train_speed(iter/s)": 0.141108 }, { "epoch": 0.8291459731598052, "grad_norm": 0.785639226436615, "learning_rate": 6.785563740682126e-05, "loss": 0.8569280624389648, "memory(GiB)": 91.52, "step": 63900, "token_acc": 0.7523082688835133, "train_speed(iter/s)": 0.141107 }, { "epoch": 0.8292108515614609, "grad_norm": 0.7586345672607422, "learning_rate": 6.785062723068729e-05, "loss": 0.8596330642700195, "memory(GiB)": 91.52, "step": 63905, "token_acc": 0.7664783427495292, "train_speed(iter/s)": 0.141106 }, { "epoch": 0.8292757299631166, "grad_norm": 0.7791494727134705, "learning_rate": 6.784561684912947e-05, "loss": 0.852335262298584, "memory(GiB)": 91.52, "step": 63910, "token_acc": 0.7699249232589963, "train_speed(iter/s)": 0.141104 }, { "epoch": 0.8293406083647723, "grad_norm": 0.7796746492385864, "learning_rate": 6.784060626220542e-05, "loss": 0.8612459182739258, "memory(GiB)": 91.52, "step": 63915, "token_acc": 0.759383332193888, "train_speed(iter/s)": 0.141101 }, { "epoch": 0.829405486766428, "grad_norm": 0.8395439982414246, "learning_rate": 6.783559546997284e-05, "loss": 0.8682854652404786, "memory(GiB)": 91.52, "step": 63920, "token_acc": 0.7636460165245379, "train_speed(iter/s)": 0.1411 }, { "epoch": 0.8294703651680837, "grad_norm": 0.7395158410072327, "learning_rate": 6.783058447248936e-05, "loss": 0.8496218681335449, "memory(GiB)": 91.52, "step": 63925, "token_acc": 0.7722815796591831, "train_speed(iter/s)": 0.141097 }, { "epoch": 0.8295352435697394, "grad_norm": 0.7422481775283813, "learning_rate": 6.782557326981266e-05, "loss": 0.8708175659179688, "memory(GiB)": 91.52, "step": 63930, "token_acc": 0.7432762836185819, "train_speed(iter/s)": 0.141096 }, { "epoch": 0.8296001219713951, "grad_norm": 0.7016010880470276, "learning_rate": 6.782056186200043e-05, "loss": 0.8567045211791993, "memory(GiB)": 91.52, "step": 63935, "token_acc": 0.7744655224330022, "train_speed(iter/s)": 0.141094 }, { "epoch": 0.8296650003730508, "grad_norm": 0.7252867221832275, "learning_rate": 6.781555024911031e-05, "loss": 0.8581550598144532, "memory(GiB)": 91.52, "step": 63940, "token_acc": 0.7698782655353968, "train_speed(iter/s)": 0.141093 }, { "epoch": 0.8297298787747065, "grad_norm": 0.6526768207550049, "learning_rate": 6.781053843119999e-05, "loss": 0.8878571510314941, "memory(GiB)": 91.52, "step": 63945, "token_acc": 0.7554578532443905, "train_speed(iter/s)": 0.141091 }, { "epoch": 0.8297947571763622, "grad_norm": 0.7882559895515442, "learning_rate": 6.780552640832713e-05, "loss": 0.8693257331848144, "memory(GiB)": 91.52, "step": 63950, "token_acc": 0.7698182686078625, "train_speed(iter/s)": 0.14109 }, { "epoch": 0.8298596355780179, "grad_norm": 0.7436057329177856, "learning_rate": 6.780051418054941e-05, "loss": 0.852238655090332, "memory(GiB)": 91.52, "step": 63955, "token_acc": 0.7652044993070551, "train_speed(iter/s)": 0.141088 }, { "epoch": 0.8299245139796736, "grad_norm": 0.7545621991157532, "learning_rate": 6.779550174792454e-05, "loss": 0.8722893714904785, "memory(GiB)": 91.52, "step": 63960, "token_acc": 0.7538637890825725, "train_speed(iter/s)": 0.141086 }, { "epoch": 0.8299893923813293, "grad_norm": 0.7025913596153259, "learning_rate": 6.779048911051016e-05, "loss": 0.8332088470458985, "memory(GiB)": 91.52, "step": 63965, "token_acc": 0.7697609738401386, "train_speed(iter/s)": 0.141084 }, { "epoch": 0.830054270782985, "grad_norm": 0.826344907283783, "learning_rate": 6.7785476268364e-05, "loss": 0.8558015823364258, "memory(GiB)": 91.52, "step": 63970, "token_acc": 0.7589773164167628, "train_speed(iter/s)": 0.141083 }, { "epoch": 0.8301191491846407, "grad_norm": 0.7212422490119934, "learning_rate": 6.778046322154372e-05, "loss": 0.887544822692871, "memory(GiB)": 91.52, "step": 63975, "token_acc": 0.7702058769625512, "train_speed(iter/s)": 0.141081 }, { "epoch": 0.8301840275862964, "grad_norm": 0.7950279712677002, "learning_rate": 6.7775449970107e-05, "loss": 0.8543557167053223, "memory(GiB)": 91.52, "step": 63980, "token_acc": 0.7590965700629438, "train_speed(iter/s)": 0.141079 }, { "epoch": 0.8302489059879521, "grad_norm": 0.8041894435882568, "learning_rate": 6.777043651411154e-05, "loss": 0.8373965263366699, "memory(GiB)": 91.52, "step": 63985, "token_acc": 0.7829152905471749, "train_speed(iter/s)": 0.141077 }, { "epoch": 0.8303137843896078, "grad_norm": 0.8075464963912964, "learning_rate": 6.776542285361505e-05, "loss": 0.8728480339050293, "memory(GiB)": 91.52, "step": 63990, "token_acc": 0.7541708698688601, "train_speed(iter/s)": 0.141076 }, { "epoch": 0.8303786627912635, "grad_norm": 0.668430507183075, "learning_rate": 6.776040898867522e-05, "loss": 0.8961978912353515, "memory(GiB)": 91.52, "step": 63995, "token_acc": 0.7628607277289837, "train_speed(iter/s)": 0.141075 }, { "epoch": 0.8304435411929192, "grad_norm": 0.6470646858215332, "learning_rate": 6.775539491934975e-05, "loss": 0.8472244262695312, "memory(GiB)": 91.52, "step": 64000, "token_acc": 0.7542123562449853, "train_speed(iter/s)": 0.141073 }, { "epoch": 0.8305084195945749, "grad_norm": 0.6801638603210449, "learning_rate": 6.775038064569633e-05, "loss": 0.8592964172363281, "memory(GiB)": 91.52, "step": 64005, "token_acc": 0.7637370645678473, "train_speed(iter/s)": 0.141071 }, { "epoch": 0.8305732979962306, "grad_norm": 0.8037089109420776, "learning_rate": 6.774536616777268e-05, "loss": 0.8331743240356445, "memory(GiB)": 91.52, "step": 64010, "token_acc": 0.7718247715096124, "train_speed(iter/s)": 0.141069 }, { "epoch": 0.8306381763978863, "grad_norm": 0.6868287324905396, "learning_rate": 6.774035148563649e-05, "loss": 0.8138964653015137, "memory(GiB)": 91.52, "step": 64015, "token_acc": 0.7761397296653161, "train_speed(iter/s)": 0.141066 }, { "epoch": 0.830703054799542, "grad_norm": 0.8348903656005859, "learning_rate": 6.773533659934548e-05, "loss": 0.8880130767822265, "memory(GiB)": 91.52, "step": 64020, "token_acc": 0.7541373838131943, "train_speed(iter/s)": 0.141065 }, { "epoch": 0.8307679332011977, "grad_norm": 0.7481246590614319, "learning_rate": 6.773032150895735e-05, "loss": 0.8940852165222168, "memory(GiB)": 91.52, "step": 64025, "token_acc": 0.7566633928269042, "train_speed(iter/s)": 0.141063 }, { "epoch": 0.8308328116028534, "grad_norm": 0.7387808561325073, "learning_rate": 6.772530621452984e-05, "loss": 0.8224591255187989, "memory(GiB)": 91.52, "step": 64030, "token_acc": 0.7745931758530183, "train_speed(iter/s)": 0.141061 }, { "epoch": 0.8308976900045091, "grad_norm": 0.6902974247932434, "learning_rate": 6.772029071612063e-05, "loss": 0.8603061676025391, "memory(GiB)": 91.52, "step": 64035, "token_acc": 0.7513337752943776, "train_speed(iter/s)": 0.141059 }, { "epoch": 0.8309625684061648, "grad_norm": 0.7285555005073547, "learning_rate": 6.771527501378746e-05, "loss": 0.8745835304260254, "memory(GiB)": 91.52, "step": 64040, "token_acc": 0.7760756851390479, "train_speed(iter/s)": 0.141057 }, { "epoch": 0.8310274468078205, "grad_norm": 0.6424816846847534, "learning_rate": 6.771025910758803e-05, "loss": 0.8526420593261719, "memory(GiB)": 91.52, "step": 64045, "token_acc": 0.7814886203573282, "train_speed(iter/s)": 0.141056 }, { "epoch": 0.8310923252094762, "grad_norm": 0.7237600088119507, "learning_rate": 6.770524299758012e-05, "loss": 0.8713517189025879, "memory(GiB)": 91.52, "step": 64050, "token_acc": 0.7603011749131227, "train_speed(iter/s)": 0.141054 }, { "epoch": 0.8311572036111319, "grad_norm": 0.6803472638130188, "learning_rate": 6.770022668382138e-05, "loss": 0.8673925399780273, "memory(GiB)": 91.52, "step": 64055, "token_acc": 0.7575230199664589, "train_speed(iter/s)": 0.141052 }, { "epoch": 0.8312220820127876, "grad_norm": 0.7073240280151367, "learning_rate": 6.769521016636957e-05, "loss": 0.8383512496948242, "memory(GiB)": 91.52, "step": 64060, "token_acc": 0.7798411541265395, "train_speed(iter/s)": 0.14105 }, { "epoch": 0.8312869604144433, "grad_norm": 0.6790093779563904, "learning_rate": 6.769019344528244e-05, "loss": 0.8581677436828613, "memory(GiB)": 91.52, "step": 64065, "token_acc": 0.7501426560657359, "train_speed(iter/s)": 0.141048 }, { "epoch": 0.831351838816099, "grad_norm": 0.6899862885475159, "learning_rate": 6.768517652061768e-05, "loss": 0.883244514465332, "memory(GiB)": 91.52, "step": 64070, "token_acc": 0.7572579419364646, "train_speed(iter/s)": 0.141046 }, { "epoch": 0.8314167172177547, "grad_norm": 0.7355484366416931, "learning_rate": 6.768015939243307e-05, "loss": 0.8882840156555176, "memory(GiB)": 91.52, "step": 64075, "token_acc": 0.7799390496256443, "train_speed(iter/s)": 0.141044 }, { "epoch": 0.8314815956194104, "grad_norm": 0.7251093983650208, "learning_rate": 6.767514206078629e-05, "loss": 0.849498176574707, "memory(GiB)": 91.52, "step": 64080, "token_acc": 0.7591073504715153, "train_speed(iter/s)": 0.141043 }, { "epoch": 0.831546474021066, "grad_norm": 0.7744113206863403, "learning_rate": 6.767012452573515e-05, "loss": 0.875709342956543, "memory(GiB)": 91.52, "step": 64085, "token_acc": 0.7571736138072213, "train_speed(iter/s)": 0.141042 }, { "epoch": 0.8316113524227217, "grad_norm": 0.7348787784576416, "learning_rate": 6.766510678733732e-05, "loss": 0.8842446327209472, "memory(GiB)": 91.52, "step": 64090, "token_acc": 0.7543396073011098, "train_speed(iter/s)": 0.14104 }, { "epoch": 0.8316762308243774, "grad_norm": 0.7977176904678345, "learning_rate": 6.76600888456506e-05, "loss": 0.8757495880126953, "memory(GiB)": 91.52, "step": 64095, "token_acc": 0.7767105345225472, "train_speed(iter/s)": 0.141039 }, { "epoch": 0.8317411092260331, "grad_norm": 0.7317614555358887, "learning_rate": 6.765507070073272e-05, "loss": 0.8175577163696289, "memory(GiB)": 91.52, "step": 64100, "token_acc": 0.7586501118041662, "train_speed(iter/s)": 0.141037 }, { "epoch": 0.8318059876276888, "grad_norm": 0.7648282647132874, "learning_rate": 6.765005235264142e-05, "loss": 0.8471044540405274, "memory(GiB)": 91.52, "step": 64105, "token_acc": 0.7516078630863704, "train_speed(iter/s)": 0.141035 }, { "epoch": 0.8318708660293445, "grad_norm": 0.7159597873687744, "learning_rate": 6.764503380143444e-05, "loss": 0.8966794967651367, "memory(GiB)": 91.52, "step": 64110, "token_acc": 0.7520667021465318, "train_speed(iter/s)": 0.141033 }, { "epoch": 0.8319357444310002, "grad_norm": 0.699327826499939, "learning_rate": 6.764001504716953e-05, "loss": 0.8796474456787109, "memory(GiB)": 91.52, "step": 64115, "token_acc": 0.7760954347327369, "train_speed(iter/s)": 0.141032 }, { "epoch": 0.8320006228326559, "grad_norm": 0.7640075087547302, "learning_rate": 6.76349960899045e-05, "loss": 0.8746784210205079, "memory(GiB)": 91.52, "step": 64120, "token_acc": 0.7735346711702762, "train_speed(iter/s)": 0.14103 }, { "epoch": 0.8320655012343116, "grad_norm": 0.6337478160858154, "learning_rate": 6.762997692969705e-05, "loss": 0.8344356536865234, "memory(GiB)": 91.52, "step": 64125, "token_acc": 0.7711339515886035, "train_speed(iter/s)": 0.141028 }, { "epoch": 0.8321303796359673, "grad_norm": 0.7807773947715759, "learning_rate": 6.762495756660496e-05, "loss": 0.9120987892150879, "memory(GiB)": 91.52, "step": 64130, "token_acc": 0.7539970236959591, "train_speed(iter/s)": 0.141027 }, { "epoch": 0.832195258037623, "grad_norm": 0.6546946167945862, "learning_rate": 6.7619938000686e-05, "loss": 0.8387157440185546, "memory(GiB)": 91.52, "step": 64135, "token_acc": 0.7491557579206981, "train_speed(iter/s)": 0.141025 }, { "epoch": 0.8322601364392787, "grad_norm": 0.7144160270690918, "learning_rate": 6.761491823199793e-05, "loss": 0.8821191787719727, "memory(GiB)": 91.52, "step": 64140, "token_acc": 0.7483803986710963, "train_speed(iter/s)": 0.141023 }, { "epoch": 0.8323250148409344, "grad_norm": 0.7317438125610352, "learning_rate": 6.760989826059851e-05, "loss": 0.8487989425659179, "memory(GiB)": 91.52, "step": 64145, "token_acc": 0.7580718191633868, "train_speed(iter/s)": 0.141021 }, { "epoch": 0.83238989324259, "grad_norm": 0.7498019933700562, "learning_rate": 6.76048780865455e-05, "loss": 0.8558084487915039, "memory(GiB)": 91.52, "step": 64150, "token_acc": 0.7577565234778552, "train_speed(iter/s)": 0.141019 }, { "epoch": 0.8324547716442458, "grad_norm": 0.6817206740379333, "learning_rate": 6.759985770989672e-05, "loss": 0.8456470489501953, "memory(GiB)": 91.52, "step": 64155, "token_acc": 0.7770440629127147, "train_speed(iter/s)": 0.141017 }, { "epoch": 0.8325196500459014, "grad_norm": 0.7815641760826111, "learning_rate": 6.759483713070988e-05, "loss": 0.81708984375, "memory(GiB)": 91.52, "step": 64160, "token_acc": 0.7549959709911361, "train_speed(iter/s)": 0.141015 }, { "epoch": 0.8325845284475571, "grad_norm": 0.7436844110488892, "learning_rate": 6.75898163490428e-05, "loss": 0.8761405944824219, "memory(GiB)": 91.52, "step": 64165, "token_acc": 0.7648749154834348, "train_speed(iter/s)": 0.141014 }, { "epoch": 0.8326494068492128, "grad_norm": 0.7336458563804626, "learning_rate": 6.758479536495323e-05, "loss": 0.8058287620544433, "memory(GiB)": 91.52, "step": 64170, "token_acc": 0.7812742511252522, "train_speed(iter/s)": 0.141012 }, { "epoch": 0.8327142852508685, "grad_norm": 0.7559237480163574, "learning_rate": 6.757977417849899e-05, "loss": 0.880277156829834, "memory(GiB)": 91.52, "step": 64175, "token_acc": 0.7577267475447718, "train_speed(iter/s)": 0.14101 }, { "epoch": 0.8327791636525242, "grad_norm": 0.7013562321662903, "learning_rate": 6.757475278973781e-05, "loss": 0.8662714004516602, "memory(GiB)": 91.52, "step": 64180, "token_acc": 0.7610844306738962, "train_speed(iter/s)": 0.141008 }, { "epoch": 0.83284404205418, "grad_norm": 0.6991225481033325, "learning_rate": 6.756973119872753e-05, "loss": 0.9173455238342285, "memory(GiB)": 91.52, "step": 64185, "token_acc": 0.7419230903566065, "train_speed(iter/s)": 0.141007 }, { "epoch": 0.8329089204558356, "grad_norm": 0.8440846800804138, "learning_rate": 6.756470940552593e-05, "loss": 0.8564294815063477, "memory(GiB)": 91.52, "step": 64190, "token_acc": 0.7464778397417082, "train_speed(iter/s)": 0.141006 }, { "epoch": 0.8329737988574913, "grad_norm": 0.7245286107063293, "learning_rate": 6.755968741019075e-05, "loss": 0.8962549209594727, "memory(GiB)": 91.52, "step": 64195, "token_acc": 0.7442758342845632, "train_speed(iter/s)": 0.141004 }, { "epoch": 0.833038677259147, "grad_norm": 0.6737253069877625, "learning_rate": 6.755466521277984e-05, "loss": 0.8738594055175781, "memory(GiB)": 91.52, "step": 64200, "token_acc": 0.7657571544432858, "train_speed(iter/s)": 0.141002 }, { "epoch": 0.8331035556608027, "grad_norm": 0.7460862994194031, "learning_rate": 6.754964281335094e-05, "loss": 0.816403579711914, "memory(GiB)": 91.52, "step": 64205, "token_acc": 0.7611624591897733, "train_speed(iter/s)": 0.141001 }, { "epoch": 0.8331684340624584, "grad_norm": 0.7339704036712646, "learning_rate": 6.754462021196192e-05, "loss": 0.8786109924316406, "memory(GiB)": 91.52, "step": 64210, "token_acc": 0.7620727447085417, "train_speed(iter/s)": 0.140999 }, { "epoch": 0.8332333124641141, "grad_norm": 0.7158899903297424, "learning_rate": 6.753959740867053e-05, "loss": 0.8512198448181152, "memory(GiB)": 91.52, "step": 64215, "token_acc": 0.73659793814433, "train_speed(iter/s)": 0.140996 }, { "epoch": 0.8332981908657698, "grad_norm": 0.7694886326789856, "learning_rate": 6.753457440353456e-05, "loss": 0.8591794013977051, "memory(GiB)": 91.52, "step": 64220, "token_acc": 0.7585457016649677, "train_speed(iter/s)": 0.140994 }, { "epoch": 0.8333630692674255, "grad_norm": 0.6986316442489624, "learning_rate": 6.752955119661187e-05, "loss": 0.8450529098510742, "memory(GiB)": 91.52, "step": 64225, "token_acc": 0.769298958410283, "train_speed(iter/s)": 0.140993 }, { "epoch": 0.8334279476690812, "grad_norm": 0.8366994857788086, "learning_rate": 6.75245277879602e-05, "loss": 0.902804946899414, "memory(GiB)": 91.52, "step": 64230, "token_acc": 0.7466095645967167, "train_speed(iter/s)": 0.140992 }, { "epoch": 0.8334928260707369, "grad_norm": 0.740619957447052, "learning_rate": 6.751950417763741e-05, "loss": 0.863039207458496, "memory(GiB)": 91.52, "step": 64235, "token_acc": 0.7604930286042835, "train_speed(iter/s)": 0.140991 }, { "epoch": 0.8335577044723926, "grad_norm": 0.7493857145309448, "learning_rate": 6.751448036570128e-05, "loss": 0.8867301940917969, "memory(GiB)": 91.52, "step": 64240, "token_acc": 0.7560060576856887, "train_speed(iter/s)": 0.140988 }, { "epoch": 0.8336225828740483, "grad_norm": 0.6915146708488464, "learning_rate": 6.750945635220966e-05, "loss": 0.8649563789367676, "memory(GiB)": 91.52, "step": 64245, "token_acc": 0.7610316040548599, "train_speed(iter/s)": 0.140987 }, { "epoch": 0.833687461275704, "grad_norm": 0.7770071625709534, "learning_rate": 6.750443213722033e-05, "loss": 0.8585182189941406, "memory(GiB)": 91.52, "step": 64250, "token_acc": 0.7692281571530443, "train_speed(iter/s)": 0.140986 }, { "epoch": 0.8337523396773597, "grad_norm": 0.70233553647995, "learning_rate": 6.749940772079111e-05, "loss": 0.8759763717651368, "memory(GiB)": 91.52, "step": 64255, "token_acc": 0.7451536804677082, "train_speed(iter/s)": 0.140985 }, { "epoch": 0.8338172180790154, "grad_norm": 0.7436473965644836, "learning_rate": 6.749438310297986e-05, "loss": 0.8692109107971191, "memory(GiB)": 91.52, "step": 64260, "token_acc": 0.7596206146314931, "train_speed(iter/s)": 0.140983 }, { "epoch": 0.8338820964806711, "grad_norm": 0.7159560322761536, "learning_rate": 6.748935828384436e-05, "loss": 0.8646312713623047, "memory(GiB)": 91.52, "step": 64265, "token_acc": 0.7811935567105311, "train_speed(iter/s)": 0.140982 }, { "epoch": 0.8339469748823268, "grad_norm": 0.7244982719421387, "learning_rate": 6.748433326344244e-05, "loss": 0.8606866836547852, "memory(GiB)": 91.52, "step": 64270, "token_acc": 0.7688625176994134, "train_speed(iter/s)": 0.14098 }, { "epoch": 0.8340118532839825, "grad_norm": 0.7505826950073242, "learning_rate": 6.747930804183195e-05, "loss": 0.8953705787658691, "memory(GiB)": 91.52, "step": 64275, "token_acc": 0.7512700086264736, "train_speed(iter/s)": 0.140979 }, { "epoch": 0.8340767316856382, "grad_norm": 0.7309601306915283, "learning_rate": 6.747428261907072e-05, "loss": 0.9153135299682618, "memory(GiB)": 91.52, "step": 64280, "token_acc": 0.7582288269508792, "train_speed(iter/s)": 0.140978 }, { "epoch": 0.8341416100872939, "grad_norm": 0.6864126920700073, "learning_rate": 6.746925699521653e-05, "loss": 0.8642974853515625, "memory(GiB)": 91.52, "step": 64285, "token_acc": 0.7589401460777184, "train_speed(iter/s)": 0.140975 }, { "epoch": 0.8342064884889496, "grad_norm": 0.756309449672699, "learning_rate": 6.74642311703273e-05, "loss": 0.8440922737121582, "memory(GiB)": 91.52, "step": 64290, "token_acc": 0.7799043062200957, "train_speed(iter/s)": 0.140974 }, { "epoch": 0.8342713668906053, "grad_norm": 0.8779170513153076, "learning_rate": 6.74592051444608e-05, "loss": 0.8742502212524415, "memory(GiB)": 91.52, "step": 64295, "token_acc": 0.7593110787057108, "train_speed(iter/s)": 0.140973 }, { "epoch": 0.834336245292261, "grad_norm": 0.762387752532959, "learning_rate": 6.745417891767489e-05, "loss": 0.9025052070617676, "memory(GiB)": 91.52, "step": 64300, "token_acc": 0.7356572805300406, "train_speed(iter/s)": 0.140971 }, { "epoch": 0.8344011236939167, "grad_norm": 0.7435747385025024, "learning_rate": 6.744915249002741e-05, "loss": 0.8708749771118164, "memory(GiB)": 91.52, "step": 64305, "token_acc": 0.7528778379910034, "train_speed(iter/s)": 0.140969 }, { "epoch": 0.8344660020955724, "grad_norm": 0.7569200396537781, "learning_rate": 6.744412586157621e-05, "loss": 0.9176985740661621, "memory(GiB)": 91.52, "step": 64310, "token_acc": 0.7674531744101192, "train_speed(iter/s)": 0.140967 }, { "epoch": 0.8345308804972281, "grad_norm": 0.7559080123901367, "learning_rate": 6.743909903237914e-05, "loss": 0.8937630653381348, "memory(GiB)": 91.52, "step": 64315, "token_acc": 0.7540295943982032, "train_speed(iter/s)": 0.140965 }, { "epoch": 0.8345957588988838, "grad_norm": 0.7094064950942993, "learning_rate": 6.743407200249403e-05, "loss": 0.8120622634887695, "memory(GiB)": 91.52, "step": 64320, "token_acc": 0.7704887099817599, "train_speed(iter/s)": 0.140964 }, { "epoch": 0.8346606373005394, "grad_norm": 0.7520509958267212, "learning_rate": 6.742904477197876e-05, "loss": 0.8871295928955079, "memory(GiB)": 91.52, "step": 64325, "token_acc": 0.772816246635674, "train_speed(iter/s)": 0.140962 }, { "epoch": 0.8347255157021951, "grad_norm": 0.717253565788269, "learning_rate": 6.742401734089114e-05, "loss": 0.9122465133666993, "memory(GiB)": 91.52, "step": 64330, "token_acc": 0.7550309322335206, "train_speed(iter/s)": 0.140961 }, { "epoch": 0.8347903941038508, "grad_norm": 0.7590875029563904, "learning_rate": 6.741898970928906e-05, "loss": 0.841555404663086, "memory(GiB)": 91.52, "step": 64335, "token_acc": 0.7793152446038297, "train_speed(iter/s)": 0.140959 }, { "epoch": 0.8348552725055065, "grad_norm": 0.8138423562049866, "learning_rate": 6.741396187723037e-05, "loss": 0.8557754516601562, "memory(GiB)": 91.52, "step": 64340, "token_acc": 0.7548835020004707, "train_speed(iter/s)": 0.140958 }, { "epoch": 0.8349201509071622, "grad_norm": 0.6251400709152222, "learning_rate": 6.740893384477293e-05, "loss": 0.9010824203491211, "memory(GiB)": 91.52, "step": 64345, "token_acc": 0.7696318251615898, "train_speed(iter/s)": 0.140956 }, { "epoch": 0.8349850293088179, "grad_norm": 0.7592899203300476, "learning_rate": 6.740390561197458e-05, "loss": 0.9205756187438965, "memory(GiB)": 91.52, "step": 64350, "token_acc": 0.7399672827259755, "train_speed(iter/s)": 0.140955 }, { "epoch": 0.8350499077104736, "grad_norm": 0.6756477952003479, "learning_rate": 6.739887717889322e-05, "loss": 0.8608142852783203, "memory(GiB)": 91.52, "step": 64355, "token_acc": 0.7535174953959485, "train_speed(iter/s)": 0.140953 }, { "epoch": 0.8351147861121293, "grad_norm": 0.7950403094291687, "learning_rate": 6.739384854558671e-05, "loss": 0.8193092346191406, "memory(GiB)": 91.52, "step": 64360, "token_acc": 0.7790602401054469, "train_speed(iter/s)": 0.140952 }, { "epoch": 0.835179664513785, "grad_norm": 0.7000020742416382, "learning_rate": 6.738881971211291e-05, "loss": 0.8591906547546386, "memory(GiB)": 91.52, "step": 64365, "token_acc": 0.7744682505735506, "train_speed(iter/s)": 0.140951 }, { "epoch": 0.8352445429154407, "grad_norm": 0.7906826734542847, "learning_rate": 6.738379067852968e-05, "loss": 0.8690031051635743, "memory(GiB)": 91.52, "step": 64370, "token_acc": 0.7675795152287116, "train_speed(iter/s)": 0.140949 }, { "epoch": 0.8353094213170964, "grad_norm": 0.8885117173194885, "learning_rate": 6.737876144489491e-05, "loss": 0.9035126686096191, "memory(GiB)": 91.52, "step": 64375, "token_acc": 0.7442310731784416, "train_speed(iter/s)": 0.140947 }, { "epoch": 0.8353742997187521, "grad_norm": 0.6917139887809753, "learning_rate": 6.737373201126647e-05, "loss": 0.8624090194702149, "memory(GiB)": 91.52, "step": 64380, "token_acc": 0.7553343337115251, "train_speed(iter/s)": 0.140945 }, { "epoch": 0.8354391781204078, "grad_norm": 0.775879442691803, "learning_rate": 6.736870237770223e-05, "loss": 0.9126296997070312, "memory(GiB)": 91.52, "step": 64385, "token_acc": 0.7274805366904091, "train_speed(iter/s)": 0.140943 }, { "epoch": 0.8355040565220635, "grad_norm": 0.7740457057952881, "learning_rate": 6.736367254426011e-05, "loss": 0.8788453102111816, "memory(GiB)": 91.52, "step": 64390, "token_acc": 0.7578809547789304, "train_speed(iter/s)": 0.140941 }, { "epoch": 0.8355689349237192, "grad_norm": 0.7696103453636169, "learning_rate": 6.735864251099794e-05, "loss": 0.90579833984375, "memory(GiB)": 91.52, "step": 64395, "token_acc": 0.7647470739271066, "train_speed(iter/s)": 0.14094 }, { "epoch": 0.8356338133253749, "grad_norm": 0.741668164730072, "learning_rate": 6.735361227797363e-05, "loss": 0.8348098754882812, "memory(GiB)": 91.52, "step": 64400, "token_acc": 0.7725597625803086, "train_speed(iter/s)": 0.140938 }, { "epoch": 0.8356986917270306, "grad_norm": 0.748552680015564, "learning_rate": 6.734858184524506e-05, "loss": 0.8989049911499023, "memory(GiB)": 91.52, "step": 64405, "token_acc": 0.7619000265181649, "train_speed(iter/s)": 0.140936 }, { "epoch": 0.8357635701286863, "grad_norm": 0.8511648178100586, "learning_rate": 6.734355121287016e-05, "loss": 0.8867265701293945, "memory(GiB)": 91.52, "step": 64410, "token_acc": 0.7398662846227316, "train_speed(iter/s)": 0.140935 }, { "epoch": 0.835828448530342, "grad_norm": 0.7586705684661865, "learning_rate": 6.733852038090675e-05, "loss": 0.8280782699584961, "memory(GiB)": 91.52, "step": 64415, "token_acc": 0.7950580830646101, "train_speed(iter/s)": 0.140933 }, { "epoch": 0.8358933269319977, "grad_norm": 0.6995985507965088, "learning_rate": 6.73334893494128e-05, "loss": 0.8625925064086915, "memory(GiB)": 91.52, "step": 64420, "token_acc": 0.75247409023611, "train_speed(iter/s)": 0.140932 }, { "epoch": 0.8359582053336534, "grad_norm": 0.6930602192878723, "learning_rate": 6.732845811844614e-05, "loss": 0.8456624984741211, "memory(GiB)": 91.52, "step": 64425, "token_acc": 0.7602596040382851, "train_speed(iter/s)": 0.14093 }, { "epoch": 0.8360230837353091, "grad_norm": 0.7202129364013672, "learning_rate": 6.73234266880647e-05, "loss": 0.8413159370422363, "memory(GiB)": 91.52, "step": 64430, "token_acc": 0.7684675154123956, "train_speed(iter/s)": 0.140928 }, { "epoch": 0.8360879621369648, "grad_norm": 0.7504859566688538, "learning_rate": 6.73183950583264e-05, "loss": 0.8496368408203125, "memory(GiB)": 91.52, "step": 64435, "token_acc": 0.7571840042869583, "train_speed(iter/s)": 0.140927 }, { "epoch": 0.8361528405386205, "grad_norm": 0.7030398845672607, "learning_rate": 6.73133632292891e-05, "loss": 0.8286825180053711, "memory(GiB)": 91.52, "step": 64440, "token_acc": 0.7601363803907114, "train_speed(iter/s)": 0.140925 }, { "epoch": 0.8362177189402762, "grad_norm": 0.6812193393707275, "learning_rate": 6.730833120101076e-05, "loss": 0.8878840446472168, "memory(GiB)": 91.52, "step": 64445, "token_acc": 0.7395412198980489, "train_speed(iter/s)": 0.140924 }, { "epoch": 0.8362825973419319, "grad_norm": 0.707030177116394, "learning_rate": 6.730329897354924e-05, "loss": 0.8712323188781739, "memory(GiB)": 91.52, "step": 64450, "token_acc": 0.7721154513286617, "train_speed(iter/s)": 0.140922 }, { "epoch": 0.8363474757435876, "grad_norm": 0.799346923828125, "learning_rate": 6.729826654696247e-05, "loss": 0.9381863594055175, "memory(GiB)": 91.52, "step": 64455, "token_acc": 0.7318951165371809, "train_speed(iter/s)": 0.140921 }, { "epoch": 0.8364123541452433, "grad_norm": 0.6986408233642578, "learning_rate": 6.729323392130837e-05, "loss": 0.8583284378051758, "memory(GiB)": 91.52, "step": 64460, "token_acc": 0.7560701172568992, "train_speed(iter/s)": 0.140918 }, { "epoch": 0.836477232546899, "grad_norm": 0.6799761056900024, "learning_rate": 6.728820109664484e-05, "loss": 0.8409400939941406, "memory(GiB)": 91.52, "step": 64465, "token_acc": 0.78802775024777, "train_speed(iter/s)": 0.140917 }, { "epoch": 0.8365421109485547, "grad_norm": 0.8327632546424866, "learning_rate": 6.728316807302982e-05, "loss": 0.8485307693481445, "memory(GiB)": 91.52, "step": 64470, "token_acc": 0.7514272208283335, "train_speed(iter/s)": 0.140915 }, { "epoch": 0.8366069893502104, "grad_norm": 0.692932665348053, "learning_rate": 6.727813485052117e-05, "loss": 0.8766189575195312, "memory(GiB)": 91.52, "step": 64475, "token_acc": 0.7626367441696859, "train_speed(iter/s)": 0.140914 }, { "epoch": 0.8366718677518661, "grad_norm": 0.8236255645751953, "learning_rate": 6.727310142917691e-05, "loss": 0.864384651184082, "memory(GiB)": 91.52, "step": 64480, "token_acc": 0.7597798165137615, "train_speed(iter/s)": 0.140912 }, { "epoch": 0.8367367461535218, "grad_norm": 0.7668225765228271, "learning_rate": 6.726806780905487e-05, "loss": 0.8817682266235352, "memory(GiB)": 91.52, "step": 64485, "token_acc": 0.767400074583856, "train_speed(iter/s)": 0.140911 }, { "epoch": 0.8368016245551775, "grad_norm": 0.7999758124351501, "learning_rate": 6.726303399021304e-05, "loss": 0.8648783683776855, "memory(GiB)": 91.52, "step": 64490, "token_acc": 0.7572349471951721, "train_speed(iter/s)": 0.140909 }, { "epoch": 0.8368665029568332, "grad_norm": 0.7461546659469604, "learning_rate": 6.725799997270932e-05, "loss": 0.8593869209289551, "memory(GiB)": 91.52, "step": 64495, "token_acc": 0.7835312862108922, "train_speed(iter/s)": 0.140907 }, { "epoch": 0.8369313813584889, "grad_norm": 0.7309044599533081, "learning_rate": 6.725296575660166e-05, "loss": 0.8843133926391602, "memory(GiB)": 91.52, "step": 64500, "token_acc": 0.7498010082249934, "train_speed(iter/s)": 0.140906 }, { "epoch": 0.8369962597601446, "grad_norm": 0.6911014318466187, "learning_rate": 6.724793134194795e-05, "loss": 0.8810935974121094, "memory(GiB)": 91.52, "step": 64505, "token_acc": 0.7464586395830549, "train_speed(iter/s)": 0.140905 }, { "epoch": 0.8370611381618003, "grad_norm": 0.7838214039802551, "learning_rate": 6.724289672880616e-05, "loss": 0.8397335052490235, "memory(GiB)": 91.52, "step": 64510, "token_acc": 0.7608668472598458, "train_speed(iter/s)": 0.140903 }, { "epoch": 0.837126016563456, "grad_norm": 0.7299644947052002, "learning_rate": 6.723786191723427e-05, "loss": 0.9013346672058106, "memory(GiB)": 91.52, "step": 64515, "token_acc": 0.7515523465703972, "train_speed(iter/s)": 0.140901 }, { "epoch": 0.8371908949651117, "grad_norm": 0.7447495460510254, "learning_rate": 6.723282690729013e-05, "loss": 0.8912502288818359, "memory(GiB)": 91.52, "step": 64520, "token_acc": 0.7387776423472053, "train_speed(iter/s)": 0.1409 }, { "epoch": 0.8372557733667674, "grad_norm": 0.7843104004859924, "learning_rate": 6.722779169903174e-05, "loss": 0.9011295318603516, "memory(GiB)": 91.52, "step": 64525, "token_acc": 0.7572052401746725, "train_speed(iter/s)": 0.140898 }, { "epoch": 0.8373206517684231, "grad_norm": 0.8246099352836609, "learning_rate": 6.722275629251702e-05, "loss": 0.8096090316772461, "memory(GiB)": 91.52, "step": 64530, "token_acc": 0.7939006394490901, "train_speed(iter/s)": 0.140897 }, { "epoch": 0.8373855301700788, "grad_norm": 0.7242750525474548, "learning_rate": 6.721772068780395e-05, "loss": 0.8882194519042969, "memory(GiB)": 91.52, "step": 64535, "token_acc": 0.7532193462250742, "train_speed(iter/s)": 0.140896 }, { "epoch": 0.8374504085717345, "grad_norm": 0.7449865341186523, "learning_rate": 6.721268488495045e-05, "loss": 0.9071764945983887, "memory(GiB)": 91.52, "step": 64540, "token_acc": 0.7666893346934149, "train_speed(iter/s)": 0.140894 }, { "epoch": 0.8375152869733902, "grad_norm": 0.7957205176353455, "learning_rate": 6.720764888401447e-05, "loss": 0.8469439506530761, "memory(GiB)": 91.52, "step": 64545, "token_acc": 0.7722630586092499, "train_speed(iter/s)": 0.140893 }, { "epoch": 0.8375801653750459, "grad_norm": 0.7031036615371704, "learning_rate": 6.720261268505399e-05, "loss": 0.876464557647705, "memory(GiB)": 91.52, "step": 64550, "token_acc": 0.7506843880182503, "train_speed(iter/s)": 0.140891 }, { "epoch": 0.8376450437767016, "grad_norm": 0.7380620241165161, "learning_rate": 6.719757628812693e-05, "loss": 0.8956474304199219, "memory(GiB)": 91.52, "step": 64555, "token_acc": 0.7543958154791808, "train_speed(iter/s)": 0.140889 }, { "epoch": 0.8377099221783573, "grad_norm": 0.7887600064277649, "learning_rate": 6.71925396932913e-05, "loss": 0.9190163612365723, "memory(GiB)": 91.52, "step": 64560, "token_acc": 0.7471354678190271, "train_speed(iter/s)": 0.140888 }, { "epoch": 0.8377748005800129, "grad_norm": 0.8094664216041565, "learning_rate": 6.7187502900605e-05, "loss": 0.8355288505554199, "memory(GiB)": 91.52, "step": 64565, "token_acc": 0.7630209154862929, "train_speed(iter/s)": 0.140886 }, { "epoch": 0.8378396789816686, "grad_norm": 0.7508272528648376, "learning_rate": 6.718246591012605e-05, "loss": 0.8523741722106933, "memory(GiB)": 91.52, "step": 64570, "token_acc": 0.7616665449499744, "train_speed(iter/s)": 0.140884 }, { "epoch": 0.8379045573833243, "grad_norm": 0.7767741084098816, "learning_rate": 6.717742872191237e-05, "loss": 0.8946708679199219, "memory(GiB)": 91.52, "step": 64575, "token_acc": 0.7590757908145213, "train_speed(iter/s)": 0.140883 }, { "epoch": 0.83796943578498, "grad_norm": 0.727934718132019, "learning_rate": 6.717239133602194e-05, "loss": 0.8419417381286621, "memory(GiB)": 91.52, "step": 64580, "token_acc": 0.7677214360587002, "train_speed(iter/s)": 0.140881 }, { "epoch": 0.8380343141866357, "grad_norm": 0.7480772733688354, "learning_rate": 6.716735375251275e-05, "loss": 0.8403667449951172, "memory(GiB)": 91.52, "step": 64585, "token_acc": 0.7616739865717603, "train_speed(iter/s)": 0.14088 }, { "epoch": 0.8380991925882914, "grad_norm": 0.8021636009216309, "learning_rate": 6.716231597144274e-05, "loss": 0.8523717880249023, "memory(GiB)": 91.52, "step": 64590, "token_acc": 0.7702728781504496, "train_speed(iter/s)": 0.140879 }, { "epoch": 0.838164070989947, "grad_norm": 0.7433884143829346, "learning_rate": 6.715727799286992e-05, "loss": 0.8258439064025879, "memory(GiB)": 91.52, "step": 64595, "token_acc": 0.7733749229821318, "train_speed(iter/s)": 0.140877 }, { "epoch": 0.8382289493916028, "grad_norm": 0.7974027991294861, "learning_rate": 6.715223981685223e-05, "loss": 0.8960650444030762, "memory(GiB)": 91.52, "step": 64600, "token_acc": 0.773015873015873, "train_speed(iter/s)": 0.140875 }, { "epoch": 0.8382938277932585, "grad_norm": 0.6896650791168213, "learning_rate": 6.71472014434477e-05, "loss": 0.8750109672546387, "memory(GiB)": 91.52, "step": 64605, "token_acc": 0.7482805292379084, "train_speed(iter/s)": 0.140872 }, { "epoch": 0.8383587061949141, "grad_norm": 0.6674891114234924, "learning_rate": 6.714216287271425e-05, "loss": 0.82059326171875, "memory(GiB)": 91.52, "step": 64610, "token_acc": 0.7786317684531229, "train_speed(iter/s)": 0.140871 }, { "epoch": 0.8384235845965698, "grad_norm": 0.6843224167823792, "learning_rate": 6.71371241047099e-05, "loss": 0.8704688072204589, "memory(GiB)": 91.52, "step": 64615, "token_acc": 0.7555094915993891, "train_speed(iter/s)": 0.140869 }, { "epoch": 0.8384884629982255, "grad_norm": 0.7127337455749512, "learning_rate": 6.713208513949264e-05, "loss": 0.8601785659790039, "memory(GiB)": 91.52, "step": 64620, "token_acc": 0.7627993634281722, "train_speed(iter/s)": 0.140868 }, { "epoch": 0.8385533413998812, "grad_norm": 0.7829787135124207, "learning_rate": 6.712704597712044e-05, "loss": 0.8713285446166992, "memory(GiB)": 91.52, "step": 64625, "token_acc": 0.748321347012509, "train_speed(iter/s)": 0.140866 }, { "epoch": 0.838618219801537, "grad_norm": 0.6904255151748657, "learning_rate": 6.71220066176513e-05, "loss": 0.8873682022094727, "memory(GiB)": 91.52, "step": 64630, "token_acc": 0.7549445088620176, "train_speed(iter/s)": 0.140865 }, { "epoch": 0.8386830982031926, "grad_norm": 0.6900518536567688, "learning_rate": 6.71169670611432e-05, "loss": 0.8711580276489258, "memory(GiB)": 91.52, "step": 64635, "token_acc": 0.7648020553544319, "train_speed(iter/s)": 0.140863 }, { "epoch": 0.8387479766048483, "grad_norm": 0.7214821577072144, "learning_rate": 6.711192730765415e-05, "loss": 0.8496932983398438, "memory(GiB)": 91.52, "step": 64640, "token_acc": 0.7866404131218189, "train_speed(iter/s)": 0.140861 }, { "epoch": 0.838812855006504, "grad_norm": 0.740415096282959, "learning_rate": 6.710688735724215e-05, "loss": 0.8682954788208008, "memory(GiB)": 91.52, "step": 64645, "token_acc": 0.7631262835664742, "train_speed(iter/s)": 0.14086 }, { "epoch": 0.8388777334081597, "grad_norm": 0.7912203073501587, "learning_rate": 6.710184720996518e-05, "loss": 0.8546919822692871, "memory(GiB)": 91.52, "step": 64650, "token_acc": 0.7728441731478326, "train_speed(iter/s)": 0.140858 }, { "epoch": 0.8389426118098154, "grad_norm": 0.7172320485115051, "learning_rate": 6.709680686588126e-05, "loss": 0.8935529708862304, "memory(GiB)": 91.52, "step": 64655, "token_acc": 0.7536420078724181, "train_speed(iter/s)": 0.140857 }, { "epoch": 0.8390074902114711, "grad_norm": 0.7234422564506531, "learning_rate": 6.709176632504839e-05, "loss": 0.8876298904418946, "memory(GiB)": 91.52, "step": 64660, "token_acc": 0.7618209175016807, "train_speed(iter/s)": 0.140855 }, { "epoch": 0.8390723686131268, "grad_norm": 0.7604021430015564, "learning_rate": 6.708672558752455e-05, "loss": 0.879581356048584, "memory(GiB)": 91.52, "step": 64665, "token_acc": 0.760239445494644, "train_speed(iter/s)": 0.140854 }, { "epoch": 0.8391372470147825, "grad_norm": 0.686144232749939, "learning_rate": 6.708168465336779e-05, "loss": 0.8645121574401855, "memory(GiB)": 91.52, "step": 64670, "token_acc": 0.7830545454545454, "train_speed(iter/s)": 0.140852 }, { "epoch": 0.8392021254164382, "grad_norm": 0.7732443809509277, "learning_rate": 6.70766435226361e-05, "loss": 0.8766299247741699, "memory(GiB)": 91.52, "step": 64675, "token_acc": 0.7507617782639269, "train_speed(iter/s)": 0.140851 }, { "epoch": 0.8392670038180939, "grad_norm": 0.7396679520606995, "learning_rate": 6.70716021953875e-05, "loss": 0.8577972412109375, "memory(GiB)": 91.52, "step": 64680, "token_acc": 0.7609801869428705, "train_speed(iter/s)": 0.140849 }, { "epoch": 0.8393318822197496, "grad_norm": 0.7054516673088074, "learning_rate": 6.706656067168e-05, "loss": 0.859689998626709, "memory(GiB)": 91.52, "step": 64685, "token_acc": 0.7732137473620742, "train_speed(iter/s)": 0.140848 }, { "epoch": 0.8393967606214053, "grad_norm": 0.7085914611816406, "learning_rate": 6.706151895157162e-05, "loss": 0.8625207901000976, "memory(GiB)": 91.52, "step": 64690, "token_acc": 0.752702559949294, "train_speed(iter/s)": 0.140846 }, { "epoch": 0.839461639023061, "grad_norm": 0.6967060565948486, "learning_rate": 6.705647703512037e-05, "loss": 0.8706855773925781, "memory(GiB)": 91.52, "step": 64695, "token_acc": 0.7515879431805058, "train_speed(iter/s)": 0.140844 }, { "epoch": 0.8395265174247167, "grad_norm": 0.8477922081947327, "learning_rate": 6.705143492238429e-05, "loss": 0.84304838180542, "memory(GiB)": 91.52, "step": 64700, "token_acc": 0.7558888734931412, "train_speed(iter/s)": 0.140842 }, { "epoch": 0.8395913958263724, "grad_norm": 0.73748379945755, "learning_rate": 6.704639261342138e-05, "loss": 0.8611367225646973, "memory(GiB)": 91.52, "step": 64705, "token_acc": 0.7678018575851393, "train_speed(iter/s)": 0.14084 }, { "epoch": 0.8396562742280281, "grad_norm": 0.744046688079834, "learning_rate": 6.704135010828969e-05, "loss": 0.8484930992126465, "memory(GiB)": 91.52, "step": 64710, "token_acc": 0.7543037623212551, "train_speed(iter/s)": 0.140838 }, { "epoch": 0.8397211526296838, "grad_norm": 0.7811969518661499, "learning_rate": 6.703630740704723e-05, "loss": 0.8395198822021485, "memory(GiB)": 91.52, "step": 64715, "token_acc": 0.7663763066202091, "train_speed(iter/s)": 0.140837 }, { "epoch": 0.8397860310313395, "grad_norm": 0.6313298940658569, "learning_rate": 6.703126450975206e-05, "loss": 0.8464631080627442, "memory(GiB)": 91.52, "step": 64720, "token_acc": 0.7611746262379077, "train_speed(iter/s)": 0.140835 }, { "epoch": 0.8398509094329952, "grad_norm": 0.7594013810157776, "learning_rate": 6.702622141646217e-05, "loss": 0.8516729354858399, "memory(GiB)": 91.52, "step": 64725, "token_acc": 0.7888504421376393, "train_speed(iter/s)": 0.140834 }, { "epoch": 0.8399157878346509, "grad_norm": 0.7498282194137573, "learning_rate": 6.702117812723565e-05, "loss": 0.8851361274719238, "memory(GiB)": 91.52, "step": 64730, "token_acc": 0.7627495975613933, "train_speed(iter/s)": 0.140832 }, { "epoch": 0.8399806662363066, "grad_norm": 0.7218058109283447, "learning_rate": 6.701613464213049e-05, "loss": 0.8833364486694336, "memory(GiB)": 91.52, "step": 64735, "token_acc": 0.7581493907822708, "train_speed(iter/s)": 0.140831 }, { "epoch": 0.8400455446379623, "grad_norm": 0.8459907174110413, "learning_rate": 6.701109096120474e-05, "loss": 0.8949658393859863, "memory(GiB)": 91.52, "step": 64740, "token_acc": 0.7569413612923056, "train_speed(iter/s)": 0.140829 }, { "epoch": 0.840110423039618, "grad_norm": 0.8244332671165466, "learning_rate": 6.700604708451645e-05, "loss": 0.859608268737793, "memory(GiB)": 91.52, "step": 64745, "token_acc": 0.7656756847139083, "train_speed(iter/s)": 0.140827 }, { "epoch": 0.8401753014412737, "grad_norm": 0.7477829456329346, "learning_rate": 6.700100301212369e-05, "loss": 0.8457416534423828, "memory(GiB)": 91.52, "step": 64750, "token_acc": 0.7556331425985194, "train_speed(iter/s)": 0.140826 }, { "epoch": 0.8402401798429294, "grad_norm": 0.8226105570793152, "learning_rate": 6.699595874408447e-05, "loss": 0.8769523620605468, "memory(GiB)": 91.52, "step": 64755, "token_acc": 0.7514526840066409, "train_speed(iter/s)": 0.140824 }, { "epoch": 0.8403050582445851, "grad_norm": 0.77409827709198, "learning_rate": 6.699091428045684e-05, "loss": 0.8425205230712891, "memory(GiB)": 91.52, "step": 64760, "token_acc": 0.7763153372968143, "train_speed(iter/s)": 0.140822 }, { "epoch": 0.8403699366462408, "grad_norm": 0.7199395298957825, "learning_rate": 6.698586962129886e-05, "loss": 0.831463623046875, "memory(GiB)": 91.52, "step": 64765, "token_acc": 0.7697363050044902, "train_speed(iter/s)": 0.140821 }, { "epoch": 0.8404348150478965, "grad_norm": 0.748197615146637, "learning_rate": 6.69808247666686e-05, "loss": 0.8365188598632812, "memory(GiB)": 91.52, "step": 64770, "token_acc": 0.7769490972816563, "train_speed(iter/s)": 0.140819 }, { "epoch": 0.8404996934495522, "grad_norm": 0.7809455394744873, "learning_rate": 6.697577971662409e-05, "loss": 0.8661764144897461, "memory(GiB)": 91.52, "step": 64775, "token_acc": 0.7553687482339644, "train_speed(iter/s)": 0.140817 }, { "epoch": 0.8405645718512079, "grad_norm": 0.8164986371994019, "learning_rate": 6.69707344712234e-05, "loss": 0.8563508033752442, "memory(GiB)": 91.52, "step": 64780, "token_acc": 0.7668175077004982, "train_speed(iter/s)": 0.140816 }, { "epoch": 0.8406294502528636, "grad_norm": 0.7409636378288269, "learning_rate": 6.696568903052459e-05, "loss": 0.8565458297729492, "memory(GiB)": 91.52, "step": 64785, "token_acc": 0.7690713882087022, "train_speed(iter/s)": 0.140814 }, { "epoch": 0.8406943286545193, "grad_norm": 0.7825327515602112, "learning_rate": 6.696064339458573e-05, "loss": 0.8329559326171875, "memory(GiB)": 91.52, "step": 64790, "token_acc": 0.7760500446827524, "train_speed(iter/s)": 0.140812 }, { "epoch": 0.840759207056175, "grad_norm": 0.748777449131012, "learning_rate": 6.695559756346488e-05, "loss": 0.8759857177734375, "memory(GiB)": 91.52, "step": 64795, "token_acc": 0.7630953238705959, "train_speed(iter/s)": 0.140811 }, { "epoch": 0.8408240854578306, "grad_norm": 0.7589394450187683, "learning_rate": 6.695055153722008e-05, "loss": 0.88114013671875, "memory(GiB)": 91.52, "step": 64800, "token_acc": 0.7769033192834562, "train_speed(iter/s)": 0.140809 }, { "epoch": 0.8408889638594863, "grad_norm": 0.7346898913383484, "learning_rate": 6.694550531590947e-05, "loss": 0.8636304855346679, "memory(GiB)": 91.52, "step": 64805, "token_acc": 0.7763408521303258, "train_speed(iter/s)": 0.140807 }, { "epoch": 0.840953842261142, "grad_norm": 0.7267071604728699, "learning_rate": 6.694045889959104e-05, "loss": 0.8591608047485352, "memory(GiB)": 91.52, "step": 64810, "token_acc": 0.7717133722023698, "train_speed(iter/s)": 0.140806 }, { "epoch": 0.8410187206627977, "grad_norm": 0.6822188496589661, "learning_rate": 6.69354122883229e-05, "loss": 0.8465784072875977, "memory(GiB)": 91.52, "step": 64815, "token_acc": 0.7727514635444386, "train_speed(iter/s)": 0.140804 }, { "epoch": 0.8410835990644534, "grad_norm": 0.7409098744392395, "learning_rate": 6.693036548216314e-05, "loss": 0.873079490661621, "memory(GiB)": 91.52, "step": 64820, "token_acc": 0.7700452266591873, "train_speed(iter/s)": 0.140802 }, { "epoch": 0.8411484774661091, "grad_norm": 0.7486234307289124, "learning_rate": 6.692531848116981e-05, "loss": 0.890102481842041, "memory(GiB)": 91.52, "step": 64825, "token_acc": 0.770510083792653, "train_speed(iter/s)": 0.1408 }, { "epoch": 0.8412133558677648, "grad_norm": 0.7151257991790771, "learning_rate": 6.692027128540102e-05, "loss": 0.8130706787109375, "memory(GiB)": 91.52, "step": 64830, "token_acc": 0.7675045189454657, "train_speed(iter/s)": 0.140798 }, { "epoch": 0.8412782342694205, "grad_norm": 0.6972052454948425, "learning_rate": 6.691522389491481e-05, "loss": 0.8476034164428711, "memory(GiB)": 91.52, "step": 64835, "token_acc": 0.7675679227307073, "train_speed(iter/s)": 0.140797 }, { "epoch": 0.8413431126710762, "grad_norm": 0.7342599630355835, "learning_rate": 6.691017630976932e-05, "loss": 0.8794329643249512, "memory(GiB)": 91.52, "step": 64840, "token_acc": 0.7448011863296203, "train_speed(iter/s)": 0.140795 }, { "epoch": 0.8414079910727319, "grad_norm": 0.677608847618103, "learning_rate": 6.690512853002259e-05, "loss": 0.8727886199951171, "memory(GiB)": 91.52, "step": 64845, "token_acc": 0.7717384662389245, "train_speed(iter/s)": 0.140793 }, { "epoch": 0.8414728694743876, "grad_norm": 0.6808120012283325, "learning_rate": 6.690008055573274e-05, "loss": 0.8710500717163085, "memory(GiB)": 91.52, "step": 64850, "token_acc": 0.7517282932707554, "train_speed(iter/s)": 0.140792 }, { "epoch": 0.8415377478760433, "grad_norm": 0.7695837616920471, "learning_rate": 6.689503238695785e-05, "loss": 0.8714179039001465, "memory(GiB)": 91.52, "step": 64855, "token_acc": 0.7540940005319351, "train_speed(iter/s)": 0.14079 }, { "epoch": 0.841602626277699, "grad_norm": 0.7108356356620789, "learning_rate": 6.688998402375601e-05, "loss": 0.8348133087158203, "memory(GiB)": 91.52, "step": 64860, "token_acc": 0.7664060819048986, "train_speed(iter/s)": 0.140788 }, { "epoch": 0.8416675046793547, "grad_norm": 0.8390040397644043, "learning_rate": 6.688493546618533e-05, "loss": 0.9032649040222168, "memory(GiB)": 91.52, "step": 64865, "token_acc": 0.7804785415875427, "train_speed(iter/s)": 0.140787 }, { "epoch": 0.8417323830810104, "grad_norm": 0.76939857006073, "learning_rate": 6.687988671430388e-05, "loss": 0.8699039459228516, "memory(GiB)": 91.52, "step": 64870, "token_acc": 0.7703126041458375, "train_speed(iter/s)": 0.140785 }, { "epoch": 0.8417972614826661, "grad_norm": 0.7653834819793701, "learning_rate": 6.68748377681698e-05, "loss": 0.8874153137207031, "memory(GiB)": 91.52, "step": 64875, "token_acc": 0.7786048064085447, "train_speed(iter/s)": 0.140784 }, { "epoch": 0.8418621398843218, "grad_norm": 0.7772528529167175, "learning_rate": 6.686978862784117e-05, "loss": 0.885401725769043, "memory(GiB)": 91.52, "step": 64880, "token_acc": 0.7785196425560613, "train_speed(iter/s)": 0.140782 }, { "epoch": 0.8419270182859775, "grad_norm": 0.6769713163375854, "learning_rate": 6.686473929337609e-05, "loss": 0.871366024017334, "memory(GiB)": 91.52, "step": 64885, "token_acc": 0.7602725233803055, "train_speed(iter/s)": 0.140781 }, { "epoch": 0.8419918966876332, "grad_norm": 0.7491025924682617, "learning_rate": 6.685968976483267e-05, "loss": 0.8734813690185547, "memory(GiB)": 91.52, "step": 64890, "token_acc": 0.7484595226290813, "train_speed(iter/s)": 0.140779 }, { "epoch": 0.8420567750892889, "grad_norm": 0.7222789525985718, "learning_rate": 6.685464004226904e-05, "loss": 0.846070384979248, "memory(GiB)": 91.52, "step": 64895, "token_acc": 0.7727804117685263, "train_speed(iter/s)": 0.140778 }, { "epoch": 0.8421216534909446, "grad_norm": 0.7483490705490112, "learning_rate": 6.68495901257433e-05, "loss": 0.8403759002685547, "memory(GiB)": 91.52, "step": 64900, "token_acc": 0.7619911260809017, "train_speed(iter/s)": 0.140776 }, { "epoch": 0.8421865318926003, "grad_norm": 0.6658879518508911, "learning_rate": 6.684454001531355e-05, "loss": 0.8773675918579101, "memory(GiB)": 91.52, "step": 64905, "token_acc": 0.7541852417620102, "train_speed(iter/s)": 0.140774 }, { "epoch": 0.842251410294256, "grad_norm": 0.81508469581604, "learning_rate": 6.683948971103792e-05, "loss": 0.8945215225219727, "memory(GiB)": 91.52, "step": 64910, "token_acc": 0.757315609711068, "train_speed(iter/s)": 0.140772 }, { "epoch": 0.8423162886959117, "grad_norm": 0.7195653915405273, "learning_rate": 6.683443921297452e-05, "loss": 0.8702194213867187, "memory(GiB)": 91.52, "step": 64915, "token_acc": 0.7476418465056452, "train_speed(iter/s)": 0.140771 }, { "epoch": 0.8423811670975674, "grad_norm": 0.6930524706840515, "learning_rate": 6.682938852118149e-05, "loss": 0.8418285369873046, "memory(GiB)": 91.52, "step": 64920, "token_acc": 0.7505230722410317, "train_speed(iter/s)": 0.140769 }, { "epoch": 0.8424460454992231, "grad_norm": 0.796880304813385, "learning_rate": 6.682433763571692e-05, "loss": 0.9110261917114257, "memory(GiB)": 91.52, "step": 64925, "token_acc": 0.7510468674504751, "train_speed(iter/s)": 0.140768 }, { "epoch": 0.8425109239008788, "grad_norm": 0.7884401082992554, "learning_rate": 6.681928655663898e-05, "loss": 0.8920313835144043, "memory(GiB)": 91.52, "step": 64930, "token_acc": 0.7535502747448798, "train_speed(iter/s)": 0.140767 }, { "epoch": 0.8425758023025345, "grad_norm": 0.6970030665397644, "learning_rate": 6.681423528400577e-05, "loss": 0.8558965682983398, "memory(GiB)": 91.52, "step": 64935, "token_acc": 0.7885127835794022, "train_speed(iter/s)": 0.140765 }, { "epoch": 0.8426406807041902, "grad_norm": 0.811181366443634, "learning_rate": 6.680918381787541e-05, "loss": 0.9146442413330078, "memory(GiB)": 91.52, "step": 64940, "token_acc": 0.7634944657111487, "train_speed(iter/s)": 0.140763 }, { "epoch": 0.8427055591058459, "grad_norm": 0.7923644185066223, "learning_rate": 6.680413215830606e-05, "loss": 0.8544485092163085, "memory(GiB)": 91.52, "step": 64945, "token_acc": 0.7592292454529084, "train_speed(iter/s)": 0.140761 }, { "epoch": 0.8427704375075016, "grad_norm": 0.7341347932815552, "learning_rate": 6.679908030535583e-05, "loss": 0.8699944496154786, "memory(GiB)": 91.52, "step": 64950, "token_acc": 0.7759976802348763, "train_speed(iter/s)": 0.14076 }, { "epoch": 0.8428353159091573, "grad_norm": 0.7474686503410339, "learning_rate": 6.679402825908286e-05, "loss": 0.7956286430358886, "memory(GiB)": 91.52, "step": 64955, "token_acc": 0.7712227058264145, "train_speed(iter/s)": 0.140758 }, { "epoch": 0.842900194310813, "grad_norm": 0.6396183967590332, "learning_rate": 6.678897601954528e-05, "loss": 0.800782299041748, "memory(GiB)": 91.52, "step": 64960, "token_acc": 0.7695410840486517, "train_speed(iter/s)": 0.140756 }, { "epoch": 0.8429650727124687, "grad_norm": 0.724795937538147, "learning_rate": 6.678392358680128e-05, "loss": 0.8492708206176758, "memory(GiB)": 91.52, "step": 64965, "token_acc": 0.7743546120275169, "train_speed(iter/s)": 0.140755 }, { "epoch": 0.8430299511141244, "grad_norm": 0.738426923751831, "learning_rate": 6.677887096090895e-05, "loss": 0.8426202774047852, "memory(GiB)": 91.52, "step": 64970, "token_acc": 0.7621359223300971, "train_speed(iter/s)": 0.140753 }, { "epoch": 0.8430948295157801, "grad_norm": 0.7768236994743347, "learning_rate": 6.677381814192643e-05, "loss": 0.8846255302429199, "memory(GiB)": 91.52, "step": 64975, "token_acc": 0.7847795310935396, "train_speed(iter/s)": 0.140751 }, { "epoch": 0.8431597079174358, "grad_norm": 0.6943116784095764, "learning_rate": 6.676876512991192e-05, "loss": 0.8367972373962402, "memory(GiB)": 91.52, "step": 64980, "token_acc": 0.7650391247388734, "train_speed(iter/s)": 0.14075 }, { "epoch": 0.8432245863190915, "grad_norm": 0.7380064725875854, "learning_rate": 6.676371192492354e-05, "loss": 0.8427502632141113, "memory(GiB)": 91.52, "step": 64985, "token_acc": 0.7746020389912359, "train_speed(iter/s)": 0.140748 }, { "epoch": 0.8432894647207472, "grad_norm": 0.6997427344322205, "learning_rate": 6.675865852701944e-05, "loss": 0.8373055458068848, "memory(GiB)": 91.52, "step": 64990, "token_acc": 0.7833345811185146, "train_speed(iter/s)": 0.140746 }, { "epoch": 0.8433543431224029, "grad_norm": 0.7747727036476135, "learning_rate": 6.675360493625775e-05, "loss": 0.9020028114318848, "memory(GiB)": 91.52, "step": 64995, "token_acc": 0.7394315653484533, "train_speed(iter/s)": 0.140744 }, { "epoch": 0.8434192215240586, "grad_norm": 0.7839322686195374, "learning_rate": 6.674855115269668e-05, "loss": 0.8681669235229492, "memory(GiB)": 91.52, "step": 65000, "token_acc": 0.7744377162629758, "train_speed(iter/s)": 0.140743 }, { "epoch": 0.8434840999257143, "grad_norm": 0.7400146126747131, "learning_rate": 6.674349717639435e-05, "loss": 0.8736299514770508, "memory(GiB)": 91.52, "step": 65005, "token_acc": 0.7724624654615666, "train_speed(iter/s)": 0.140742 }, { "epoch": 0.84354897832737, "grad_norm": 0.7486026883125305, "learning_rate": 6.673844300740895e-05, "loss": 0.8580636978149414, "memory(GiB)": 91.52, "step": 65010, "token_acc": 0.7529772777087748, "train_speed(iter/s)": 0.14074 }, { "epoch": 0.8436138567290257, "grad_norm": 0.7406033277511597, "learning_rate": 6.67333886457986e-05, "loss": 0.8769918441772461, "memory(GiB)": 91.52, "step": 65015, "token_acc": 0.7567109544468547, "train_speed(iter/s)": 0.140738 }, { "epoch": 0.8436787351306814, "grad_norm": 0.6794759631156921, "learning_rate": 6.67283340916215e-05, "loss": 0.8273484230041503, "memory(GiB)": 91.52, "step": 65020, "token_acc": 0.780279292870682, "train_speed(iter/s)": 0.140737 }, { "epoch": 0.8437436135323371, "grad_norm": 0.6514108777046204, "learning_rate": 6.672327934493582e-05, "loss": 0.8486364364624024, "memory(GiB)": 91.52, "step": 65025, "token_acc": 0.7604623782510971, "train_speed(iter/s)": 0.140735 }, { "epoch": 0.8438084919339928, "grad_norm": 0.8052451610565186, "learning_rate": 6.67182244057997e-05, "loss": 0.8648838043212891, "memory(GiB)": 91.52, "step": 65030, "token_acc": 0.7648798240958374, "train_speed(iter/s)": 0.140734 }, { "epoch": 0.8438733703356485, "grad_norm": 0.684964656829834, "learning_rate": 6.671316927427134e-05, "loss": 0.8833181381225585, "memory(GiB)": 91.52, "step": 65035, "token_acc": 0.757067412239826, "train_speed(iter/s)": 0.140732 }, { "epoch": 0.843938248737304, "grad_norm": 0.7193038463592529, "learning_rate": 6.67081139504089e-05, "loss": 0.9124269485473633, "memory(GiB)": 91.52, "step": 65040, "token_acc": 0.7466222144235831, "train_speed(iter/s)": 0.14073 }, { "epoch": 0.8440031271389598, "grad_norm": 0.8322339057922363, "learning_rate": 6.670305843427057e-05, "loss": 0.8608837127685547, "memory(GiB)": 91.52, "step": 65045, "token_acc": 0.7660399529964748, "train_speed(iter/s)": 0.140729 }, { "epoch": 0.8440680055406155, "grad_norm": 0.8384381532669067, "learning_rate": 6.66980027259145e-05, "loss": 0.8501036643981934, "memory(GiB)": 91.52, "step": 65050, "token_acc": 0.7730990853044284, "train_speed(iter/s)": 0.140728 }, { "epoch": 0.8441328839422712, "grad_norm": 0.636682391166687, "learning_rate": 6.66929468253989e-05, "loss": 0.846397876739502, "memory(GiB)": 91.52, "step": 65055, "token_acc": 0.7582299389236739, "train_speed(iter/s)": 0.140726 }, { "epoch": 0.8441977623439268, "grad_norm": 0.7100024223327637, "learning_rate": 6.668789073278195e-05, "loss": 0.8884684562683105, "memory(GiB)": 91.52, "step": 65060, "token_acc": 0.7555199126369314, "train_speed(iter/s)": 0.140725 }, { "epoch": 0.8442626407455825, "grad_norm": 0.677127480506897, "learning_rate": 6.66828344481218e-05, "loss": 0.916876220703125, "memory(GiB)": 91.52, "step": 65065, "token_acc": 0.7512921047941543, "train_speed(iter/s)": 0.140723 }, { "epoch": 0.8443275191472382, "grad_norm": 0.7228124737739563, "learning_rate": 6.667777797147668e-05, "loss": 0.8452978134155273, "memory(GiB)": 91.52, "step": 65070, "token_acc": 0.7779446613931444, "train_speed(iter/s)": 0.140721 }, { "epoch": 0.844392397548894, "grad_norm": 0.7631689310073853, "learning_rate": 6.667272130290478e-05, "loss": 0.8979846954345703, "memory(GiB)": 91.52, "step": 65075, "token_acc": 0.7534802784222738, "train_speed(iter/s)": 0.140719 }, { "epoch": 0.8444572759505496, "grad_norm": 0.7556486129760742, "learning_rate": 6.666766444246426e-05, "loss": 0.9082680702209472, "memory(GiB)": 91.52, "step": 65080, "token_acc": 0.730271940196041, "train_speed(iter/s)": 0.140717 }, { "epoch": 0.8445221543522053, "grad_norm": 0.7901049256324768, "learning_rate": 6.666260739021334e-05, "loss": 0.8415935516357422, "memory(GiB)": 91.52, "step": 65085, "token_acc": 0.7771934680321063, "train_speed(iter/s)": 0.140715 }, { "epoch": 0.844587032753861, "grad_norm": 0.6856578588485718, "learning_rate": 6.665755014621019e-05, "loss": 0.8196335792541504, "memory(GiB)": 91.52, "step": 65090, "token_acc": 0.7572013504776955, "train_speed(iter/s)": 0.140714 }, { "epoch": 0.8446519111555167, "grad_norm": 0.6721110343933105, "learning_rate": 6.665249271051305e-05, "loss": 0.8451138496398926, "memory(GiB)": 91.52, "step": 65095, "token_acc": 0.7889497262319562, "train_speed(iter/s)": 0.140712 }, { "epoch": 0.8447167895571724, "grad_norm": 0.7342441082000732, "learning_rate": 6.664743508318007e-05, "loss": 0.8174023628234863, "memory(GiB)": 91.52, "step": 65100, "token_acc": 0.748483212096185, "train_speed(iter/s)": 0.14071 }, { "epoch": 0.8447816679588281, "grad_norm": 0.7976331114768982, "learning_rate": 6.66423772642695e-05, "loss": 0.8630203247070313, "memory(GiB)": 91.52, "step": 65105, "token_acc": 0.7578381606449687, "train_speed(iter/s)": 0.140708 }, { "epoch": 0.8448465463604838, "grad_norm": 0.7305362820625305, "learning_rate": 6.66373192538395e-05, "loss": 0.880635929107666, "memory(GiB)": 91.52, "step": 65110, "token_acc": 0.7581923403000479, "train_speed(iter/s)": 0.140707 }, { "epoch": 0.8449114247621395, "grad_norm": 0.7115488648414612, "learning_rate": 6.663226105194832e-05, "loss": 0.8541112899780273, "memory(GiB)": 91.52, "step": 65115, "token_acc": 0.771791485394655, "train_speed(iter/s)": 0.140706 }, { "epoch": 0.8449763031637952, "grad_norm": 0.7136640548706055, "learning_rate": 6.662720265865414e-05, "loss": 0.8646989822387695, "memory(GiB)": 91.52, "step": 65120, "token_acc": 0.756149770151244, "train_speed(iter/s)": 0.140704 }, { "epoch": 0.8450411815654509, "grad_norm": 0.7247327566146851, "learning_rate": 6.662214407401518e-05, "loss": 0.8251842498779297, "memory(GiB)": 91.52, "step": 65125, "token_acc": 0.7569091934574168, "train_speed(iter/s)": 0.140702 }, { "epoch": 0.8451060599671066, "grad_norm": 0.7331775426864624, "learning_rate": 6.661708529808968e-05, "loss": 0.8484691619873047, "memory(GiB)": 91.52, "step": 65130, "token_acc": 0.7590405054127867, "train_speed(iter/s)": 0.1407 }, { "epoch": 0.8451709383687623, "grad_norm": 0.654033899307251, "learning_rate": 6.66120263309358e-05, "loss": 0.9074302673339844, "memory(GiB)": 91.52, "step": 65135, "token_acc": 0.7487742757083732, "train_speed(iter/s)": 0.140699 }, { "epoch": 0.845235816770418, "grad_norm": 0.7160117030143738, "learning_rate": 6.660696717261181e-05, "loss": 0.8856058120727539, "memory(GiB)": 91.52, "step": 65140, "token_acc": 0.7527508868979617, "train_speed(iter/s)": 0.140697 }, { "epoch": 0.8453006951720737, "grad_norm": 0.7052870988845825, "learning_rate": 6.660190782317591e-05, "loss": 0.8617424011230469, "memory(GiB)": 91.52, "step": 65145, "token_acc": 0.7648435975808024, "train_speed(iter/s)": 0.140696 }, { "epoch": 0.8453655735737294, "grad_norm": 0.6963012218475342, "learning_rate": 6.65968482826863e-05, "loss": 0.8173819541931152, "memory(GiB)": 91.52, "step": 65150, "token_acc": 0.7909129154396298, "train_speed(iter/s)": 0.140694 }, { "epoch": 0.8454304519753851, "grad_norm": 0.7631714344024658, "learning_rate": 6.659178855120126e-05, "loss": 0.8285799026489258, "memory(GiB)": 91.52, "step": 65155, "token_acc": 0.7579087012721953, "train_speed(iter/s)": 0.140693 }, { "epoch": 0.8454953303770408, "grad_norm": 0.6951537132263184, "learning_rate": 6.658672862877896e-05, "loss": 0.8920890808105468, "memory(GiB)": 91.52, "step": 65160, "token_acc": 0.7659951596959471, "train_speed(iter/s)": 0.140691 }, { "epoch": 0.8455602087786965, "grad_norm": 0.7960256934165955, "learning_rate": 6.658166851547768e-05, "loss": 0.8145855903625489, "memory(GiB)": 91.52, "step": 65165, "token_acc": 0.7658332139274968, "train_speed(iter/s)": 0.14069 }, { "epoch": 0.8456250871803522, "grad_norm": 0.7164859771728516, "learning_rate": 6.657660821135561e-05, "loss": 0.8306775093078613, "memory(GiB)": 91.52, "step": 65170, "token_acc": 0.7611303487509276, "train_speed(iter/s)": 0.140688 }, { "epoch": 0.8456899655820079, "grad_norm": 0.752190113067627, "learning_rate": 6.6571547716471e-05, "loss": 0.8075915336608886, "memory(GiB)": 91.52, "step": 65175, "token_acc": 0.802399463277951, "train_speed(iter/s)": 0.140686 }, { "epoch": 0.8457548439836636, "grad_norm": 0.786845326423645, "learning_rate": 6.656648703088208e-05, "loss": 0.8633560180664063, "memory(GiB)": 91.52, "step": 65180, "token_acc": 0.7631991814461119, "train_speed(iter/s)": 0.140685 }, { "epoch": 0.8458197223853193, "grad_norm": 0.72025066614151, "learning_rate": 6.656142615464711e-05, "loss": 0.8611396789550781, "memory(GiB)": 91.52, "step": 65185, "token_acc": 0.7636069077246561, "train_speed(iter/s)": 0.140683 }, { "epoch": 0.845884600786975, "grad_norm": 0.7322587370872498, "learning_rate": 6.65563650878243e-05, "loss": 0.8547083854675293, "memory(GiB)": 91.52, "step": 65190, "token_acc": 0.7567586350684551, "train_speed(iter/s)": 0.140681 }, { "epoch": 0.8459494791886307, "grad_norm": 0.7019060254096985, "learning_rate": 6.65513038304719e-05, "loss": 0.8214842796325683, "memory(GiB)": 91.52, "step": 65195, "token_acc": 0.7751194687519394, "train_speed(iter/s)": 0.140679 }, { "epoch": 0.8460143575902864, "grad_norm": 0.7614770531654358, "learning_rate": 6.654624238264818e-05, "loss": 0.892115592956543, "memory(GiB)": 91.52, "step": 65200, "token_acc": 0.7699634511020047, "train_speed(iter/s)": 0.140678 }, { "epoch": 0.8460792359919421, "grad_norm": 0.7709058523178101, "learning_rate": 6.654118074441135e-05, "loss": 0.9209282875061036, "memory(GiB)": 91.52, "step": 65205, "token_acc": 0.749120518597573, "train_speed(iter/s)": 0.140676 }, { "epoch": 0.8461441143935978, "grad_norm": 0.6979911923408508, "learning_rate": 6.653611891581968e-05, "loss": 0.8858754158020019, "memory(GiB)": 91.52, "step": 65210, "token_acc": 0.7491865304578131, "train_speed(iter/s)": 0.140674 }, { "epoch": 0.8462089927952535, "grad_norm": 0.7603749632835388, "learning_rate": 6.653105689693144e-05, "loss": 0.9057207107543945, "memory(GiB)": 91.52, "step": 65215, "token_acc": 0.7684004970708326, "train_speed(iter/s)": 0.140672 }, { "epoch": 0.8462738711969092, "grad_norm": 0.739033043384552, "learning_rate": 6.652599468780484e-05, "loss": 0.8536346435546875, "memory(GiB)": 91.52, "step": 65220, "token_acc": 0.7555538384267666, "train_speed(iter/s)": 0.14067 }, { "epoch": 0.8463387495985649, "grad_norm": 0.7657029032707214, "learning_rate": 6.652093228849816e-05, "loss": 0.7879762649536133, "memory(GiB)": 91.52, "step": 65225, "token_acc": 0.7795015774696807, "train_speed(iter/s)": 0.140668 }, { "epoch": 0.8464036280002206, "grad_norm": 0.7126618027687073, "learning_rate": 6.651586969906965e-05, "loss": 0.8603550910949707, "memory(GiB)": 91.52, "step": 65230, "token_acc": 0.768039562502256, "train_speed(iter/s)": 0.140666 }, { "epoch": 0.8464685064018763, "grad_norm": 0.7773609161376953, "learning_rate": 6.651080691957758e-05, "loss": 0.8890963554382324, "memory(GiB)": 91.52, "step": 65235, "token_acc": 0.7512910029820351, "train_speed(iter/s)": 0.140664 }, { "epoch": 0.846533384803532, "grad_norm": 0.7547711730003357, "learning_rate": 6.65057439500802e-05, "loss": 0.9257941246032715, "memory(GiB)": 91.52, "step": 65240, "token_acc": 0.7480465297174982, "train_speed(iter/s)": 0.140663 }, { "epoch": 0.8465982632051877, "grad_norm": 0.6688463687896729, "learning_rate": 6.65006807906358e-05, "loss": 0.8356891632080078, "memory(GiB)": 91.52, "step": 65245, "token_acc": 0.765774977735264, "train_speed(iter/s)": 0.140661 }, { "epoch": 0.8466631416068434, "grad_norm": 0.739414632320404, "learning_rate": 6.649561744130261e-05, "loss": 0.8619887351989746, "memory(GiB)": 91.52, "step": 65250, "token_acc": 0.7500894582408931, "train_speed(iter/s)": 0.140659 }, { "epoch": 0.8467280200084991, "grad_norm": 0.6987814903259277, "learning_rate": 6.649055390213891e-05, "loss": 0.9002818107604981, "memory(GiB)": 91.52, "step": 65255, "token_acc": 0.7482372176993562, "train_speed(iter/s)": 0.140657 }, { "epoch": 0.8467928984101548, "grad_norm": 0.7384982109069824, "learning_rate": 6.6485490173203e-05, "loss": 0.8938034057617188, "memory(GiB)": 91.52, "step": 65260, "token_acc": 0.7577433628318584, "train_speed(iter/s)": 0.140656 }, { "epoch": 0.8468577768118105, "grad_norm": 0.7464979290962219, "learning_rate": 6.648042625455311e-05, "loss": 0.8586275100708007, "memory(GiB)": 91.52, "step": 65265, "token_acc": 0.7564649336582724, "train_speed(iter/s)": 0.140654 }, { "epoch": 0.8469226552134662, "grad_norm": 0.801104724407196, "learning_rate": 6.647536214624755e-05, "loss": 0.8894190788269043, "memory(GiB)": 91.52, "step": 65270, "token_acc": 0.7384434884630426, "train_speed(iter/s)": 0.140653 }, { "epoch": 0.8469875336151219, "grad_norm": 0.7545903921127319, "learning_rate": 6.647029784834457e-05, "loss": 0.8493927001953125, "memory(GiB)": 91.52, "step": 65275, "token_acc": 0.7787821934163398, "train_speed(iter/s)": 0.140651 }, { "epoch": 0.8470524120167775, "grad_norm": 0.7521262764930725, "learning_rate": 6.646523336090247e-05, "loss": 0.9009214401245117, "memory(GiB)": 91.52, "step": 65280, "token_acc": 0.7474094373554852, "train_speed(iter/s)": 0.14065 }, { "epoch": 0.8471172904184332, "grad_norm": 0.7105584740638733, "learning_rate": 6.646016868397953e-05, "loss": 0.8740335464477539, "memory(GiB)": 91.52, "step": 65285, "token_acc": 0.7616862943428943, "train_speed(iter/s)": 0.140648 }, { "epoch": 0.8471821688200889, "grad_norm": 0.7486082315444946, "learning_rate": 6.645510381763403e-05, "loss": 0.851688003540039, "memory(GiB)": 91.52, "step": 65290, "token_acc": 0.772915988277434, "train_speed(iter/s)": 0.140647 }, { "epoch": 0.8472470472217446, "grad_norm": 0.7533857822418213, "learning_rate": 6.645003876192424e-05, "loss": 0.8589319229125977, "memory(GiB)": 91.52, "step": 65295, "token_acc": 0.7693880332560992, "train_speed(iter/s)": 0.140645 }, { "epoch": 0.8473119256234003, "grad_norm": 0.7510777115821838, "learning_rate": 6.644497351690845e-05, "loss": 0.8292013168334961, "memory(GiB)": 91.52, "step": 65300, "token_acc": 0.7692336240489887, "train_speed(iter/s)": 0.140643 }, { "epoch": 0.847376804025056, "grad_norm": 0.7613644599914551, "learning_rate": 6.643990808264499e-05, "loss": 0.7919160842895507, "memory(GiB)": 91.52, "step": 65305, "token_acc": 0.7696391112633246, "train_speed(iter/s)": 0.140642 }, { "epoch": 0.8474416824267117, "grad_norm": 0.7595734596252441, "learning_rate": 6.643484245919211e-05, "loss": 0.9051626205444336, "memory(GiB)": 91.52, "step": 65310, "token_acc": 0.753919224126478, "train_speed(iter/s)": 0.14064 }, { "epoch": 0.8475065608283674, "grad_norm": 0.8125998973846436, "learning_rate": 6.642977664660813e-05, "loss": 0.8881616592407227, "memory(GiB)": 91.52, "step": 65315, "token_acc": 0.7543557915202274, "train_speed(iter/s)": 0.14064 }, { "epoch": 0.8475714392300231, "grad_norm": 0.7792202234268188, "learning_rate": 6.642471064495133e-05, "loss": 0.8546641349792481, "memory(GiB)": 91.52, "step": 65320, "token_acc": 0.7763229087761064, "train_speed(iter/s)": 0.140638 }, { "epoch": 0.8476363176316788, "grad_norm": 0.7093503475189209, "learning_rate": 6.641964445428002e-05, "loss": 0.8713069915771484, "memory(GiB)": 91.52, "step": 65325, "token_acc": 0.7764567229366702, "train_speed(iter/s)": 0.140637 }, { "epoch": 0.8477011960333345, "grad_norm": 0.9254414439201355, "learning_rate": 6.641457807465251e-05, "loss": 0.7935442924499512, "memory(GiB)": 91.52, "step": 65330, "token_acc": 0.7977450755465242, "train_speed(iter/s)": 0.140635 }, { "epoch": 0.8477660744349902, "grad_norm": 0.7843657732009888, "learning_rate": 6.640951150612708e-05, "loss": 0.8517422676086426, "memory(GiB)": 91.52, "step": 65335, "token_acc": 0.7764138535729943, "train_speed(iter/s)": 0.140634 }, { "epoch": 0.8478309528366459, "grad_norm": 0.7801127433776855, "learning_rate": 6.640444474876204e-05, "loss": 0.9275605201721191, "memory(GiB)": 91.52, "step": 65340, "token_acc": 0.7371555592836472, "train_speed(iter/s)": 0.140632 }, { "epoch": 0.8478958312383016, "grad_norm": 0.8364083766937256, "learning_rate": 6.639937780261573e-05, "loss": 0.8704706192016601, "memory(GiB)": 91.52, "step": 65345, "token_acc": 0.7633516781644833, "train_speed(iter/s)": 0.140631 }, { "epoch": 0.8479607096399573, "grad_norm": 0.6682745814323425, "learning_rate": 6.639431066774641e-05, "loss": 0.8378520965576172, "memory(GiB)": 91.52, "step": 65350, "token_acc": 0.7639922801213124, "train_speed(iter/s)": 0.140629 }, { "epoch": 0.848025588041613, "grad_norm": 0.911178469657898, "learning_rate": 6.638924334421242e-05, "loss": 0.8853172302246094, "memory(GiB)": 91.52, "step": 65355, "token_acc": 0.7575403641360358, "train_speed(iter/s)": 0.140627 }, { "epoch": 0.8480904664432687, "grad_norm": 0.8498573303222656, "learning_rate": 6.63841758320721e-05, "loss": 0.8760749816894531, "memory(GiB)": 91.52, "step": 65360, "token_acc": 0.7617713914596526, "train_speed(iter/s)": 0.140626 }, { "epoch": 0.8481553448449244, "grad_norm": 0.7953355312347412, "learning_rate": 6.637910813138373e-05, "loss": 0.8842327117919921, "memory(GiB)": 91.52, "step": 65365, "token_acc": 0.7713952771018268, "train_speed(iter/s)": 0.140625 }, { "epoch": 0.8482202232465801, "grad_norm": 0.7674110531806946, "learning_rate": 6.637404024220562e-05, "loss": 0.8725340843200684, "memory(GiB)": 91.52, "step": 65370, "token_acc": 0.7760049882629108, "train_speed(iter/s)": 0.140624 }, { "epoch": 0.8482851016482358, "grad_norm": 0.7788484692573547, "learning_rate": 6.636897216459611e-05, "loss": 0.9178015708923339, "memory(GiB)": 91.52, "step": 65375, "token_acc": 0.7476752306391737, "train_speed(iter/s)": 0.140623 }, { "epoch": 0.8483499800498915, "grad_norm": 0.7259120345115662, "learning_rate": 6.636390389861354e-05, "loss": 0.9099445343017578, "memory(GiB)": 91.52, "step": 65380, "token_acc": 0.7779287281847633, "train_speed(iter/s)": 0.140621 }, { "epoch": 0.8484148584515472, "grad_norm": 0.6976791620254517, "learning_rate": 6.63588354443162e-05, "loss": 0.8718635559082031, "memory(GiB)": 91.52, "step": 65385, "token_acc": 0.7579679041609276, "train_speed(iter/s)": 0.14062 }, { "epoch": 0.8484797368532029, "grad_norm": 0.706053614616394, "learning_rate": 6.635376680176243e-05, "loss": 0.8163206100463867, "memory(GiB)": 91.52, "step": 65390, "token_acc": 0.7778930459808581, "train_speed(iter/s)": 0.140618 }, { "epoch": 0.8485446152548586, "grad_norm": 0.6542239785194397, "learning_rate": 6.634869797101057e-05, "loss": 0.8348872184753418, "memory(GiB)": 91.52, "step": 65395, "token_acc": 0.7763919204517895, "train_speed(iter/s)": 0.140616 }, { "epoch": 0.8486094936565143, "grad_norm": 0.7119666337966919, "learning_rate": 6.634362895211897e-05, "loss": 0.8607450485229492, "memory(GiB)": 91.52, "step": 65400, "token_acc": 0.7536236792197236, "train_speed(iter/s)": 0.140614 }, { "epoch": 0.84867437205817, "grad_norm": 0.7386363744735718, "learning_rate": 6.63385597451459e-05, "loss": 0.8487238883972168, "memory(GiB)": 91.52, "step": 65405, "token_acc": 0.7572254335260116, "train_speed(iter/s)": 0.140613 }, { "epoch": 0.8487392504598257, "grad_norm": 0.7061283588409424, "learning_rate": 6.633349035014977e-05, "loss": 0.8554143905639648, "memory(GiB)": 91.52, "step": 65410, "token_acc": 0.7653750183472773, "train_speed(iter/s)": 0.140611 }, { "epoch": 0.8488041288614814, "grad_norm": 0.5956602096557617, "learning_rate": 6.632842076718887e-05, "loss": 0.8660760879516601, "memory(GiB)": 91.52, "step": 65415, "token_acc": 0.752972499817638, "train_speed(iter/s)": 0.140609 }, { "epoch": 0.8488690072631371, "grad_norm": 0.6230396032333374, "learning_rate": 6.632335099632155e-05, "loss": 0.8543548583984375, "memory(GiB)": 91.52, "step": 65420, "token_acc": 0.758767707330491, "train_speed(iter/s)": 0.140607 }, { "epoch": 0.8489338856647928, "grad_norm": 0.7624167799949646, "learning_rate": 6.631828103760616e-05, "loss": 0.8814785003662109, "memory(GiB)": 91.52, "step": 65425, "token_acc": 0.7750074537865236, "train_speed(iter/s)": 0.140606 }, { "epoch": 0.8489987640664485, "grad_norm": 0.6989186406135559, "learning_rate": 6.631321089110105e-05, "loss": 0.8023616790771484, "memory(GiB)": 91.52, "step": 65430, "token_acc": 0.7868471696861106, "train_speed(iter/s)": 0.140604 }, { "epoch": 0.8490636424681042, "grad_norm": 0.7302767038345337, "learning_rate": 6.630814055686454e-05, "loss": 0.8902528762817383, "memory(GiB)": 91.52, "step": 65435, "token_acc": 0.7454160078054032, "train_speed(iter/s)": 0.140603 }, { "epoch": 0.8491285208697599, "grad_norm": 0.7201006412506104, "learning_rate": 6.630307003495503e-05, "loss": 0.8494041442871094, "memory(GiB)": 91.52, "step": 65440, "token_acc": 0.7617459443706893, "train_speed(iter/s)": 0.140601 }, { "epoch": 0.8491933992714156, "grad_norm": 0.7069109082221985, "learning_rate": 6.629799932543082e-05, "loss": 0.8333550453186035, "memory(GiB)": 91.52, "step": 65445, "token_acc": 0.763805845433951, "train_speed(iter/s)": 0.140599 }, { "epoch": 0.8492582776730713, "grad_norm": 0.7668904662132263, "learning_rate": 6.629292842835028e-05, "loss": 0.8535859107971191, "memory(GiB)": 91.52, "step": 65450, "token_acc": 0.7656830816803184, "train_speed(iter/s)": 0.140597 }, { "epoch": 0.849323156074727, "grad_norm": 0.7694442868232727, "learning_rate": 6.628785734377178e-05, "loss": 0.8828314781188965, "memory(GiB)": 91.52, "step": 65455, "token_acc": 0.7430751428364794, "train_speed(iter/s)": 0.140595 }, { "epoch": 0.8493880344763827, "grad_norm": 0.7618711590766907, "learning_rate": 6.628278607175364e-05, "loss": 0.8050567626953125, "memory(GiB)": 91.52, "step": 65460, "token_acc": 0.7536906351325577, "train_speed(iter/s)": 0.140594 }, { "epoch": 0.8494529128780384, "grad_norm": 0.7403846383094788, "learning_rate": 6.627771461235427e-05, "loss": 0.8374934196472168, "memory(GiB)": 91.52, "step": 65465, "token_acc": 0.7708044030482641, "train_speed(iter/s)": 0.140592 }, { "epoch": 0.8495177912796941, "grad_norm": 0.6668291091918945, "learning_rate": 6.627264296563201e-05, "loss": 0.8908623695373535, "memory(GiB)": 91.52, "step": 65470, "token_acc": 0.7443997071742313, "train_speed(iter/s)": 0.140591 }, { "epoch": 0.8495826696813498, "grad_norm": 0.7133838534355164, "learning_rate": 6.626757113164521e-05, "loss": 0.8451328277587891, "memory(GiB)": 91.52, "step": 65475, "token_acc": 0.7501108647450111, "train_speed(iter/s)": 0.140588 }, { "epoch": 0.8496475480830055, "grad_norm": 0.7324261665344238, "learning_rate": 6.626249911045226e-05, "loss": 0.8335601806640625, "memory(GiB)": 91.52, "step": 65480, "token_acc": 0.7534269475091943, "train_speed(iter/s)": 0.140587 }, { "epoch": 0.8497124264846612, "grad_norm": 0.7888419032096863, "learning_rate": 6.625742690211149e-05, "loss": 0.9346167564392089, "memory(GiB)": 91.52, "step": 65485, "token_acc": 0.7437129132523311, "train_speed(iter/s)": 0.140585 }, { "epoch": 0.8497773048863169, "grad_norm": 0.7382223606109619, "learning_rate": 6.625235450668134e-05, "loss": 0.9056227684020997, "memory(GiB)": 91.52, "step": 65490, "token_acc": 0.7425401109611636, "train_speed(iter/s)": 0.140583 }, { "epoch": 0.8498421832879726, "grad_norm": 0.7259542942047119, "learning_rate": 6.624728192422011e-05, "loss": 0.8464030265808106, "memory(GiB)": 91.52, "step": 65495, "token_acc": 0.7773990653220423, "train_speed(iter/s)": 0.140581 }, { "epoch": 0.8499070616896283, "grad_norm": 0.7210988998413086, "learning_rate": 6.62422091547862e-05, "loss": 0.8444908142089844, "memory(GiB)": 91.52, "step": 65500, "token_acc": 0.7656389452332657, "train_speed(iter/s)": 0.140579 }, { "epoch": 0.849971940091284, "grad_norm": 0.703453779220581, "learning_rate": 6.623713619843801e-05, "loss": 0.8555397033691406, "memory(GiB)": 91.52, "step": 65505, "token_acc": 0.7548703192440798, "train_speed(iter/s)": 0.140577 }, { "epoch": 0.8500368184929397, "grad_norm": 0.8980903029441833, "learning_rate": 6.623206305523391e-05, "loss": 0.8707602500915528, "memory(GiB)": 91.52, "step": 65510, "token_acc": 0.7882712405819242, "train_speed(iter/s)": 0.140575 }, { "epoch": 0.8501016968945954, "grad_norm": 0.7100391983985901, "learning_rate": 6.622698972523224e-05, "loss": 0.8424135208129883, "memory(GiB)": 91.52, "step": 65515, "token_acc": 0.7776667855868712, "train_speed(iter/s)": 0.140573 }, { "epoch": 0.850166575296251, "grad_norm": 0.8094674348831177, "learning_rate": 6.622191620849144e-05, "loss": 0.8706123352050781, "memory(GiB)": 91.52, "step": 65520, "token_acc": 0.7631103074141049, "train_speed(iter/s)": 0.140571 }, { "epoch": 0.8502314536979066, "grad_norm": 0.7311353087425232, "learning_rate": 6.621684250506988e-05, "loss": 0.8798871040344238, "memory(GiB)": 91.52, "step": 65525, "token_acc": 0.7520329533752352, "train_speed(iter/s)": 0.14057 }, { "epoch": 0.8502963320995623, "grad_norm": 0.6793537735939026, "learning_rate": 6.621176861502592e-05, "loss": 0.8960996627807617, "memory(GiB)": 91.52, "step": 65530, "token_acc": 0.7582834577651631, "train_speed(iter/s)": 0.140568 }, { "epoch": 0.850361210501218, "grad_norm": 0.7448985576629639, "learning_rate": 6.620669453841798e-05, "loss": 0.8455638885498047, "memory(GiB)": 91.52, "step": 65535, "token_acc": 0.7692618881247472, "train_speed(iter/s)": 0.140566 }, { "epoch": 0.8504260889028737, "grad_norm": 0.7746062874794006, "learning_rate": 6.620162027530444e-05, "loss": 0.8835893630981445, "memory(GiB)": 91.52, "step": 65540, "token_acc": 0.7393721973094171, "train_speed(iter/s)": 0.140565 }, { "epoch": 0.8504909673045294, "grad_norm": 0.7635877132415771, "learning_rate": 6.619654582574371e-05, "loss": 0.8822128295898437, "memory(GiB)": 91.52, "step": 65545, "token_acc": 0.7398062855594689, "train_speed(iter/s)": 0.140564 }, { "epoch": 0.8505558457061851, "grad_norm": 0.7029401063919067, "learning_rate": 6.619147118979415e-05, "loss": 0.8591382026672363, "memory(GiB)": 91.52, "step": 65550, "token_acc": 0.7471203670088121, "train_speed(iter/s)": 0.140562 }, { "epoch": 0.8506207241078408, "grad_norm": 0.7125536799430847, "learning_rate": 6.618639636751417e-05, "loss": 0.8110288619995117, "memory(GiB)": 91.52, "step": 65555, "token_acc": 0.787949472475418, "train_speed(iter/s)": 0.14056 }, { "epoch": 0.8506856025094965, "grad_norm": 0.7399584650993347, "learning_rate": 6.618132135896224e-05, "loss": 0.8530935287475586, "memory(GiB)": 91.52, "step": 65560, "token_acc": 0.7729559055696296, "train_speed(iter/s)": 0.140559 }, { "epoch": 0.8507504809111522, "grad_norm": 0.6637998819351196, "learning_rate": 6.617624616419665e-05, "loss": 0.8559122085571289, "memory(GiB)": 91.52, "step": 65565, "token_acc": 0.7615114671582007, "train_speed(iter/s)": 0.140557 }, { "epoch": 0.8508153593128079, "grad_norm": 0.7135409116744995, "learning_rate": 6.617117078327588e-05, "loss": 0.8694984436035156, "memory(GiB)": 91.52, "step": 65570, "token_acc": 0.7626605866619898, "train_speed(iter/s)": 0.140556 }, { "epoch": 0.8508802377144636, "grad_norm": 0.763127863407135, "learning_rate": 6.616609521625832e-05, "loss": 0.9001298904418945, "memory(GiB)": 91.52, "step": 65575, "token_acc": 0.7687407680945347, "train_speed(iter/s)": 0.140554 }, { "epoch": 0.8509451161161193, "grad_norm": 0.7613846659660339, "learning_rate": 6.616101946320238e-05, "loss": 0.8598167419433593, "memory(GiB)": 91.52, "step": 65580, "token_acc": 0.7783186745755778, "train_speed(iter/s)": 0.140552 }, { "epoch": 0.851009994517775, "grad_norm": 0.6648562550544739, "learning_rate": 6.615594352416646e-05, "loss": 0.8630617141723633, "memory(GiB)": 91.52, "step": 65585, "token_acc": 0.7554807253849661, "train_speed(iter/s)": 0.140551 }, { "epoch": 0.8510748729194307, "grad_norm": 0.6943988800048828, "learning_rate": 6.615086739920897e-05, "loss": 0.8499210357666016, "memory(GiB)": 91.52, "step": 65590, "token_acc": 0.7740202966432475, "train_speed(iter/s)": 0.140549 }, { "epoch": 0.8511397513210864, "grad_norm": 0.7437912225723267, "learning_rate": 6.614579108838837e-05, "loss": 0.851300048828125, "memory(GiB)": 91.52, "step": 65595, "token_acc": 0.7730906901281236, "train_speed(iter/s)": 0.140547 }, { "epoch": 0.8512046297227421, "grad_norm": 0.7045382857322693, "learning_rate": 6.614071459176301e-05, "loss": 0.891804313659668, "memory(GiB)": 91.52, "step": 65600, "token_acc": 0.7524012361145912, "train_speed(iter/s)": 0.140545 }, { "epoch": 0.8512695081243978, "grad_norm": 0.7250906825065613, "learning_rate": 6.613563790939137e-05, "loss": 0.839473819732666, "memory(GiB)": 91.52, "step": 65605, "token_acc": 0.7694136291600634, "train_speed(iter/s)": 0.140543 }, { "epoch": 0.8513343865260535, "grad_norm": 0.8270531892776489, "learning_rate": 6.613056104133183e-05, "loss": 0.8396799087524414, "memory(GiB)": 91.52, "step": 65610, "token_acc": 0.760246566453985, "train_speed(iter/s)": 0.140542 }, { "epoch": 0.8513992649277092, "grad_norm": 0.7588285803794861, "learning_rate": 6.612548398764286e-05, "loss": 0.8445014953613281, "memory(GiB)": 91.52, "step": 65615, "token_acc": 0.7521185620864451, "train_speed(iter/s)": 0.14054 }, { "epoch": 0.8514641433293649, "grad_norm": 0.7556076645851135, "learning_rate": 6.612040674838283e-05, "loss": 0.8978570938110352, "memory(GiB)": 91.52, "step": 65620, "token_acc": 0.7389411599494453, "train_speed(iter/s)": 0.140539 }, { "epoch": 0.8515290217310206, "grad_norm": 0.7718615531921387, "learning_rate": 6.61153293236102e-05, "loss": 0.8856884002685547, "memory(GiB)": 91.52, "step": 65625, "token_acc": 0.759147115349748, "train_speed(iter/s)": 0.140537 }, { "epoch": 0.8515939001326763, "grad_norm": 0.7129701375961304, "learning_rate": 6.611025171338342e-05, "loss": 0.8893978118896484, "memory(GiB)": 91.52, "step": 65630, "token_acc": 0.7517615176151762, "train_speed(iter/s)": 0.140535 }, { "epoch": 0.851658778534332, "grad_norm": 0.6965407729148865, "learning_rate": 6.610517391776088e-05, "loss": 0.8303045272827149, "memory(GiB)": 91.52, "step": 65635, "token_acc": 0.7680672743178034, "train_speed(iter/s)": 0.140534 }, { "epoch": 0.8517236569359877, "grad_norm": 0.7659681439399719, "learning_rate": 6.610009593680103e-05, "loss": 0.8735393524169922, "memory(GiB)": 91.52, "step": 65640, "token_acc": 0.7579863374812211, "train_speed(iter/s)": 0.140531 }, { "epoch": 0.8517885353376434, "grad_norm": 0.693173348903656, "learning_rate": 6.609501777056233e-05, "loss": 0.8451663970947265, "memory(GiB)": 91.52, "step": 65645, "token_acc": 0.7669918646484944, "train_speed(iter/s)": 0.14053 }, { "epoch": 0.8518534137392991, "grad_norm": 0.7315745949745178, "learning_rate": 6.608993941910319e-05, "loss": 0.8730329513549805, "memory(GiB)": 91.52, "step": 65650, "token_acc": 0.7730942818102771, "train_speed(iter/s)": 0.140529 }, { "epoch": 0.8519182921409548, "grad_norm": 0.7189863920211792, "learning_rate": 6.608486088248205e-05, "loss": 0.8725899696350098, "memory(GiB)": 91.52, "step": 65655, "token_acc": 0.7588139723801787, "train_speed(iter/s)": 0.140527 }, { "epoch": 0.8519831705426105, "grad_norm": 0.6830651164054871, "learning_rate": 6.607978216075739e-05, "loss": 0.8845930099487305, "memory(GiB)": 91.52, "step": 65660, "token_acc": 0.7545113511415821, "train_speed(iter/s)": 0.140525 }, { "epoch": 0.8520480489442662, "grad_norm": 0.7144474387168884, "learning_rate": 6.607470325398761e-05, "loss": 0.8660726547241211, "memory(GiB)": 91.52, "step": 65665, "token_acc": 0.7804374200741057, "train_speed(iter/s)": 0.140523 }, { "epoch": 0.8521129273459219, "grad_norm": 0.6827306747436523, "learning_rate": 6.60696241622312e-05, "loss": 0.8559167861938477, "memory(GiB)": 91.52, "step": 65670, "token_acc": 0.7613541872896457, "train_speed(iter/s)": 0.140522 }, { "epoch": 0.8521778057475776, "grad_norm": 0.7953634858131409, "learning_rate": 6.606454488554659e-05, "loss": 0.8818485260009765, "memory(GiB)": 91.52, "step": 65675, "token_acc": 0.7474676365911846, "train_speed(iter/s)": 0.14052 }, { "epoch": 0.8522426841492333, "grad_norm": 0.7157529592514038, "learning_rate": 6.605946542399222e-05, "loss": 0.8378629684448242, "memory(GiB)": 91.52, "step": 65680, "token_acc": 0.7647223192455467, "train_speed(iter/s)": 0.140518 }, { "epoch": 0.852307562550889, "grad_norm": 0.827804684638977, "learning_rate": 6.605438577762656e-05, "loss": 0.8589423179626465, "memory(GiB)": 91.52, "step": 65685, "token_acc": 0.7666847347381259, "train_speed(iter/s)": 0.140517 }, { "epoch": 0.8523724409525447, "grad_norm": 0.7535344362258911, "learning_rate": 6.604930594650807e-05, "loss": 0.8851262092590332, "memory(GiB)": 91.52, "step": 65690, "token_acc": 0.7714364742762309, "train_speed(iter/s)": 0.140515 }, { "epoch": 0.8524373193542004, "grad_norm": 0.7363076210021973, "learning_rate": 6.604422593069517e-05, "loss": 0.8966161727905273, "memory(GiB)": 91.52, "step": 65695, "token_acc": 0.7549918651087117, "train_speed(iter/s)": 0.140513 }, { "epoch": 0.8525021977558561, "grad_norm": 0.6740930080413818, "learning_rate": 6.603914573024637e-05, "loss": 0.8990084648132324, "memory(GiB)": 91.52, "step": 65700, "token_acc": 0.7657408073684968, "train_speed(iter/s)": 0.140511 }, { "epoch": 0.8525670761575118, "grad_norm": 0.7268183827400208, "learning_rate": 6.603406534522012e-05, "loss": 0.8923395156860352, "memory(GiB)": 91.52, "step": 65705, "token_acc": 0.7648045689809031, "train_speed(iter/s)": 0.14051 }, { "epoch": 0.8526319545591675, "grad_norm": 0.746602475643158, "learning_rate": 6.602898477567488e-05, "loss": 0.8977558135986328, "memory(GiB)": 91.52, "step": 65710, "token_acc": 0.73548084553784, "train_speed(iter/s)": 0.140509 }, { "epoch": 0.8526968329608232, "grad_norm": 0.7736890316009521, "learning_rate": 6.60239040216691e-05, "loss": 0.8517322540283203, "memory(GiB)": 91.52, "step": 65715, "token_acc": 0.7769720003102458, "train_speed(iter/s)": 0.140508 }, { "epoch": 0.8527617113624789, "grad_norm": 0.7487781643867493, "learning_rate": 6.601882308326126e-05, "loss": 0.8936285972595215, "memory(GiB)": 91.52, "step": 65720, "token_acc": 0.7669882192248592, "train_speed(iter/s)": 0.140506 }, { "epoch": 0.8528265897641346, "grad_norm": 0.7670671939849854, "learning_rate": 6.601374196050986e-05, "loss": 0.8656753540039063, "memory(GiB)": 91.52, "step": 65725, "token_acc": 0.7912877904717495, "train_speed(iter/s)": 0.140505 }, { "epoch": 0.8528914681657903, "grad_norm": 0.6773935556411743, "learning_rate": 6.600866065347332e-05, "loss": 0.8597637176513672, "memory(GiB)": 91.52, "step": 65730, "token_acc": 0.751501760684941, "train_speed(iter/s)": 0.140503 }, { "epoch": 0.852956346567446, "grad_norm": 0.6258790493011475, "learning_rate": 6.600357916221016e-05, "loss": 0.8616824150085449, "memory(GiB)": 91.52, "step": 65735, "token_acc": 0.7739919063448475, "train_speed(iter/s)": 0.140501 }, { "epoch": 0.8530212249691017, "grad_norm": 0.783849835395813, "learning_rate": 6.599849748677884e-05, "loss": 0.8488435745239258, "memory(GiB)": 91.52, "step": 65740, "token_acc": 0.7758506407423774, "train_speed(iter/s)": 0.140499 }, { "epoch": 0.8530861033707574, "grad_norm": 0.705532431602478, "learning_rate": 6.599341562723784e-05, "loss": 0.8641989707946778, "memory(GiB)": 91.52, "step": 65745, "token_acc": 0.7669193195216439, "train_speed(iter/s)": 0.140497 }, { "epoch": 0.8531509817724131, "grad_norm": 0.6580520868301392, "learning_rate": 6.598833358364564e-05, "loss": 0.8434577941894531, "memory(GiB)": 91.52, "step": 65750, "token_acc": 0.7773556231003039, "train_speed(iter/s)": 0.140495 }, { "epoch": 0.8532158601740687, "grad_norm": 0.760486900806427, "learning_rate": 6.598325135606072e-05, "loss": 0.8700689315795899, "memory(GiB)": 91.52, "step": 65755, "token_acc": 0.7637678721614802, "train_speed(iter/s)": 0.140493 }, { "epoch": 0.8532807385757244, "grad_norm": 0.7648151516914368, "learning_rate": 6.597816894454158e-05, "loss": 0.8922430038452148, "memory(GiB)": 91.52, "step": 65760, "token_acc": 0.7677601985419575, "train_speed(iter/s)": 0.140491 }, { "epoch": 0.8533456169773801, "grad_norm": 0.6736800074577332, "learning_rate": 6.597308634914669e-05, "loss": 0.8569840431213379, "memory(GiB)": 91.52, "step": 65765, "token_acc": 0.7745787261101494, "train_speed(iter/s)": 0.14049 }, { "epoch": 0.8534104953790358, "grad_norm": 0.7890423536300659, "learning_rate": 6.596800356993458e-05, "loss": 0.8589742660522461, "memory(GiB)": 91.52, "step": 65770, "token_acc": 0.7519845733569018, "train_speed(iter/s)": 0.140488 }, { "epoch": 0.8534753737806915, "grad_norm": 0.7210888266563416, "learning_rate": 6.596292060696368e-05, "loss": 0.8608826637268067, "memory(GiB)": 91.52, "step": 65775, "token_acc": 0.7720168122414133, "train_speed(iter/s)": 0.140486 }, { "epoch": 0.8535402521823472, "grad_norm": 0.7677304744720459, "learning_rate": 6.595783746029254e-05, "loss": 0.8488053321838379, "memory(GiB)": 91.52, "step": 65780, "token_acc": 0.7647080950947939, "train_speed(iter/s)": 0.140485 }, { "epoch": 0.8536051305840029, "grad_norm": 0.7397280335426331, "learning_rate": 6.595275412997961e-05, "loss": 0.909034538269043, "memory(GiB)": 91.52, "step": 65785, "token_acc": 0.7574665293511843, "train_speed(iter/s)": 0.140484 }, { "epoch": 0.8536700089856586, "grad_norm": 0.7273995876312256, "learning_rate": 6.594767061608343e-05, "loss": 0.8842684745788574, "memory(GiB)": 91.52, "step": 65790, "token_acc": 0.7584260794064251, "train_speed(iter/s)": 0.140482 }, { "epoch": 0.8537348873873143, "grad_norm": 0.7138885855674744, "learning_rate": 6.594258691866249e-05, "loss": 0.8627468109130859, "memory(GiB)": 91.52, "step": 65795, "token_acc": 0.7603005960093289, "train_speed(iter/s)": 0.14048 }, { "epoch": 0.85379976578897, "grad_norm": 0.7054521441459656, "learning_rate": 6.593750303777526e-05, "loss": 0.8319792747497559, "memory(GiB)": 91.52, "step": 65800, "token_acc": 0.7529012480840814, "train_speed(iter/s)": 0.140478 }, { "epoch": 0.8538646441906257, "grad_norm": 0.7317428588867188, "learning_rate": 6.593241897348028e-05, "loss": 0.8564448356628418, "memory(GiB)": 91.52, "step": 65805, "token_acc": 0.7689771011187475, "train_speed(iter/s)": 0.140476 }, { "epoch": 0.8539295225922814, "grad_norm": 0.8114141821861267, "learning_rate": 6.592733472583606e-05, "loss": 0.8608118057250976, "memory(GiB)": 91.52, "step": 65810, "token_acc": 0.7671601119457947, "train_speed(iter/s)": 0.140474 }, { "epoch": 0.8539944009939371, "grad_norm": 0.7358788847923279, "learning_rate": 6.59222502949011e-05, "loss": 0.8193886756896973, "memory(GiB)": 91.52, "step": 65815, "token_acc": 0.789804604372219, "train_speed(iter/s)": 0.140472 }, { "epoch": 0.8540592793955928, "grad_norm": 0.7318307757377625, "learning_rate": 6.591716568073389e-05, "loss": 0.8571182250976562, "memory(GiB)": 91.52, "step": 65820, "token_acc": 0.7511666844786434, "train_speed(iter/s)": 0.140471 }, { "epoch": 0.8541241577972485, "grad_norm": 0.7186247706413269, "learning_rate": 6.591208088339296e-05, "loss": 0.9155073165893555, "memory(GiB)": 91.52, "step": 65825, "token_acc": 0.7257232884560297, "train_speed(iter/s)": 0.140469 }, { "epoch": 0.8541890361989042, "grad_norm": 0.7927209138870239, "learning_rate": 6.590699590293685e-05, "loss": 0.8328107833862305, "memory(GiB)": 91.52, "step": 65830, "token_acc": 0.7668820107350626, "train_speed(iter/s)": 0.140468 }, { "epoch": 0.8542539146005599, "grad_norm": 0.7399669885635376, "learning_rate": 6.590191073942404e-05, "loss": 0.8690044403076171, "memory(GiB)": 91.52, "step": 65835, "token_acc": 0.7519582245430809, "train_speed(iter/s)": 0.140466 }, { "epoch": 0.8543187930022156, "grad_norm": 0.7676267027854919, "learning_rate": 6.589682539291307e-05, "loss": 0.8680387496948242, "memory(GiB)": 91.52, "step": 65840, "token_acc": 0.7557265442167223, "train_speed(iter/s)": 0.140464 }, { "epoch": 0.8543836714038713, "grad_norm": 0.6638776063919067, "learning_rate": 6.589173986346243e-05, "loss": 0.8775125503540039, "memory(GiB)": 91.52, "step": 65845, "token_acc": 0.7648842541232157, "train_speed(iter/s)": 0.140462 }, { "epoch": 0.854448549805527, "grad_norm": 0.7449305653572083, "learning_rate": 6.588665415113071e-05, "loss": 0.8911853790283203, "memory(GiB)": 91.52, "step": 65850, "token_acc": 0.7808502862711658, "train_speed(iter/s)": 0.140459 }, { "epoch": 0.8545134282071827, "grad_norm": 0.6642366647720337, "learning_rate": 6.588156825597636e-05, "loss": 0.8604895591735839, "memory(GiB)": 91.52, "step": 65855, "token_acc": 0.7758472092943083, "train_speed(iter/s)": 0.140458 }, { "epoch": 0.8545783066088384, "grad_norm": 0.8711189031600952, "learning_rate": 6.587648217805797e-05, "loss": 0.879517650604248, "memory(GiB)": 91.52, "step": 65860, "token_acc": 0.7803365513542505, "train_speed(iter/s)": 0.140456 }, { "epoch": 0.8546431850104941, "grad_norm": 0.7830655574798584, "learning_rate": 6.587139591743404e-05, "loss": 0.8210573196411133, "memory(GiB)": 91.52, "step": 65865, "token_acc": 0.7564420760639463, "train_speed(iter/s)": 0.140455 }, { "epoch": 0.8547080634121498, "grad_norm": 0.7603683471679688, "learning_rate": 6.586630947416312e-05, "loss": 0.8573740005493165, "memory(GiB)": 91.52, "step": 65870, "token_acc": 0.7719714964370546, "train_speed(iter/s)": 0.140453 }, { "epoch": 0.8547729418138055, "grad_norm": 0.7397005558013916, "learning_rate": 6.586122284830372e-05, "loss": 0.841305160522461, "memory(GiB)": 91.52, "step": 65875, "token_acc": 0.7716886761901335, "train_speed(iter/s)": 0.140452 }, { "epoch": 0.8548378202154612, "grad_norm": 0.7302353978157043, "learning_rate": 6.585613603991437e-05, "loss": 0.8714720726013183, "memory(GiB)": 91.52, "step": 65880, "token_acc": 0.7635613376508009, "train_speed(iter/s)": 0.14045 }, { "epoch": 0.8549026986171169, "grad_norm": 0.7417938113212585, "learning_rate": 6.585104904905365e-05, "loss": 0.8120645523071289, "memory(GiB)": 91.52, "step": 65885, "token_acc": 0.7807999388776407, "train_speed(iter/s)": 0.140448 }, { "epoch": 0.8549675770187726, "grad_norm": 0.7548043727874756, "learning_rate": 6.584596187578007e-05, "loss": 0.860812759399414, "memory(GiB)": 91.52, "step": 65890, "token_acc": 0.7834742383155208, "train_speed(iter/s)": 0.140446 }, { "epoch": 0.8550324554204283, "grad_norm": 0.7076545357704163, "learning_rate": 6.584087452015218e-05, "loss": 0.8731428146362304, "memory(GiB)": 91.52, "step": 65895, "token_acc": 0.7557747489239598, "train_speed(iter/s)": 0.140444 }, { "epoch": 0.855097333822084, "grad_norm": 0.7161476016044617, "learning_rate": 6.583578698222853e-05, "loss": 0.8647807121276856, "memory(GiB)": 91.52, "step": 65900, "token_acc": 0.7525647143027556, "train_speed(iter/s)": 0.140442 }, { "epoch": 0.8551622122237397, "grad_norm": 0.7420089840888977, "learning_rate": 6.583069926206767e-05, "loss": 0.859288215637207, "memory(GiB)": 91.52, "step": 65905, "token_acc": 0.7736656596173213, "train_speed(iter/s)": 0.14044 }, { "epoch": 0.8552270906253954, "grad_norm": 0.7039480209350586, "learning_rate": 6.582561135972813e-05, "loss": 0.8781749725341796, "memory(GiB)": 91.52, "step": 65910, "token_acc": 0.7577297333200491, "train_speed(iter/s)": 0.140438 }, { "epoch": 0.8552919690270511, "grad_norm": 0.7055671811103821, "learning_rate": 6.582052327526847e-05, "loss": 0.90286865234375, "memory(GiB)": 91.52, "step": 65915, "token_acc": 0.7504951683572415, "train_speed(iter/s)": 0.140436 }, { "epoch": 0.8553568474287068, "grad_norm": 0.7588949799537659, "learning_rate": 6.581543500874725e-05, "loss": 0.8938490867614746, "memory(GiB)": 91.52, "step": 65920, "token_acc": 0.7610258611438996, "train_speed(iter/s)": 0.140435 }, { "epoch": 0.8554217258303625, "grad_norm": 0.8390007615089417, "learning_rate": 6.581034656022302e-05, "loss": 0.8628023147583008, "memory(GiB)": 91.52, "step": 65925, "token_acc": 0.7803350820781858, "train_speed(iter/s)": 0.140434 }, { "epoch": 0.8554866042320182, "grad_norm": 0.7100173234939575, "learning_rate": 6.580525792975433e-05, "loss": 0.8539457321166992, "memory(GiB)": 91.52, "step": 65930, "token_acc": 0.7578792230368075, "train_speed(iter/s)": 0.140432 }, { "epoch": 0.8555514826336739, "grad_norm": 0.7678571343421936, "learning_rate": 6.580016911739977e-05, "loss": 0.8831490516662598, "memory(GiB)": 91.52, "step": 65935, "token_acc": 0.7548309178743962, "train_speed(iter/s)": 0.140431 }, { "epoch": 0.8556163610353296, "grad_norm": 0.7832606434822083, "learning_rate": 6.579508012321788e-05, "loss": 0.8668292999267578, "memory(GiB)": 91.52, "step": 65940, "token_acc": 0.7512337845459673, "train_speed(iter/s)": 0.140429 }, { "epoch": 0.8556812394369853, "grad_norm": 0.6851749420166016, "learning_rate": 6.578999094726721e-05, "loss": 0.8394315719604493, "memory(GiB)": 91.52, "step": 65945, "token_acc": 0.7849569526340558, "train_speed(iter/s)": 0.140427 }, { "epoch": 0.855746117838641, "grad_norm": 0.7435061931610107, "learning_rate": 6.578490158960634e-05, "loss": 0.8711116790771485, "memory(GiB)": 91.52, "step": 65950, "token_acc": 0.7525066502967055, "train_speed(iter/s)": 0.140425 }, { "epoch": 0.8558109962402967, "grad_norm": 0.7100793719291687, "learning_rate": 6.577981205029386e-05, "loss": 0.8497459411621093, "memory(GiB)": 91.52, "step": 65955, "token_acc": 0.7593670167470905, "train_speed(iter/s)": 0.140423 }, { "epoch": 0.8558758746419524, "grad_norm": 0.7193495631217957, "learning_rate": 6.577472232938829e-05, "loss": 0.8676223754882812, "memory(GiB)": 91.52, "step": 65960, "token_acc": 0.77573960416523, "train_speed(iter/s)": 0.140421 }, { "epoch": 0.8559407530436081, "grad_norm": 0.7649718523025513, "learning_rate": 6.576963242694824e-05, "loss": 0.8747910499572754, "memory(GiB)": 91.52, "step": 65965, "token_acc": 0.7547064305684995, "train_speed(iter/s)": 0.140419 }, { "epoch": 0.8560056314452638, "grad_norm": 0.7198649644851685, "learning_rate": 6.576454234303229e-05, "loss": 0.8032513618469238, "memory(GiB)": 91.52, "step": 65970, "token_acc": 0.7590743963122386, "train_speed(iter/s)": 0.140418 }, { "epoch": 0.8560705098469195, "grad_norm": 0.7955090999603271, "learning_rate": 6.575945207769898e-05, "loss": 0.8499862670898437, "memory(GiB)": 91.52, "step": 65975, "token_acc": 0.7667614646285105, "train_speed(iter/s)": 0.140416 }, { "epoch": 0.8561353882485752, "grad_norm": 0.7794146537780762, "learning_rate": 6.575436163100693e-05, "loss": 0.9183039665222168, "memory(GiB)": 91.52, "step": 65980, "token_acc": 0.7699607868982107, "train_speed(iter/s)": 0.140415 }, { "epoch": 0.8562002666502309, "grad_norm": 0.6978190541267395, "learning_rate": 6.574927100301469e-05, "loss": 0.8858144760131836, "memory(GiB)": 91.52, "step": 65985, "token_acc": 0.754214566423036, "train_speed(iter/s)": 0.140413 }, { "epoch": 0.8562651450518866, "grad_norm": 0.7787580490112305, "learning_rate": 6.574418019378085e-05, "loss": 0.8582653045654297, "memory(GiB)": 91.52, "step": 65990, "token_acc": 0.7528357989794795, "train_speed(iter/s)": 0.140411 }, { "epoch": 0.8563300234535421, "grad_norm": 0.6702723503112793, "learning_rate": 6.5739089203364e-05, "loss": 0.8451070785522461, "memory(GiB)": 91.52, "step": 65995, "token_acc": 0.773464640667009, "train_speed(iter/s)": 0.140409 }, { "epoch": 0.8563949018551978, "grad_norm": 0.7973697185516357, "learning_rate": 6.573399803182273e-05, "loss": 0.8537904739379882, "memory(GiB)": 91.52, "step": 66000, "token_acc": 0.7691795023696683, "train_speed(iter/s)": 0.140407 }, { "epoch": 0.8564597802568535, "grad_norm": 0.7072076201438904, "learning_rate": 6.572890667921561e-05, "loss": 0.8804004669189454, "memory(GiB)": 91.52, "step": 66005, "token_acc": 0.7614871056420998, "train_speed(iter/s)": 0.140405 }, { "epoch": 0.8565246586585092, "grad_norm": 0.6902002096176147, "learning_rate": 6.572381514560127e-05, "loss": 0.8333877563476563, "memory(GiB)": 91.52, "step": 66010, "token_acc": 0.8071803907995343, "train_speed(iter/s)": 0.140403 }, { "epoch": 0.8565895370601649, "grad_norm": 0.8021546602249146, "learning_rate": 6.571872343103826e-05, "loss": 0.8360302925109864, "memory(GiB)": 91.52, "step": 66015, "token_acc": 0.7809391456992393, "train_speed(iter/s)": 0.140401 }, { "epoch": 0.8566544154618206, "grad_norm": 0.6713190674781799, "learning_rate": 6.571363153558518e-05, "loss": 0.8320562362670898, "memory(GiB)": 91.52, "step": 66020, "token_acc": 0.7647178169276029, "train_speed(iter/s)": 0.140399 }, { "epoch": 0.8567192938634763, "grad_norm": 0.7822867035865784, "learning_rate": 6.570853945930065e-05, "loss": 0.8746782302856445, "memory(GiB)": 91.52, "step": 66025, "token_acc": 0.7704244637151986, "train_speed(iter/s)": 0.140397 }, { "epoch": 0.856784172265132, "grad_norm": 0.7423926591873169, "learning_rate": 6.570344720224326e-05, "loss": 0.8954513549804688, "memory(GiB)": 91.52, "step": 66030, "token_acc": 0.7665794944008583, "train_speed(iter/s)": 0.140395 }, { "epoch": 0.8568490506667877, "grad_norm": 0.7912923097610474, "learning_rate": 6.569835476447162e-05, "loss": 0.8658254623413086, "memory(GiB)": 91.52, "step": 66035, "token_acc": 0.7781355990270308, "train_speed(iter/s)": 0.140393 }, { "epoch": 0.8569139290684434, "grad_norm": 0.754493236541748, "learning_rate": 6.569326214604432e-05, "loss": 0.8358943939208985, "memory(GiB)": 91.52, "step": 66040, "token_acc": 0.7687706653787569, "train_speed(iter/s)": 0.140392 }, { "epoch": 0.8569788074700991, "grad_norm": 0.6612057089805603, "learning_rate": 6.568816934701995e-05, "loss": 0.8244222640991211, "memory(GiB)": 91.52, "step": 66045, "token_acc": 0.7622933063759116, "train_speed(iter/s)": 0.14039 }, { "epoch": 0.8570436858717548, "grad_norm": 0.6099957227706909, "learning_rate": 6.568307636745716e-05, "loss": 0.8336252212524414, "memory(GiB)": 91.52, "step": 66050, "token_acc": 0.7884848893685915, "train_speed(iter/s)": 0.140388 }, { "epoch": 0.8571085642734105, "grad_norm": 0.6509794592857361, "learning_rate": 6.567798320741452e-05, "loss": 0.8598285675048828, "memory(GiB)": 91.52, "step": 66055, "token_acc": 0.7660182641621501, "train_speed(iter/s)": 0.140387 }, { "epoch": 0.8571734426750662, "grad_norm": 0.6754139065742493, "learning_rate": 6.567288986695067e-05, "loss": 0.8724787712097168, "memory(GiB)": 91.52, "step": 66060, "token_acc": 0.7844806007509387, "train_speed(iter/s)": 0.140385 }, { "epoch": 0.8572383210767219, "grad_norm": 0.7637208104133606, "learning_rate": 6.566779634612421e-05, "loss": 0.8288361549377441, "memory(GiB)": 91.52, "step": 66065, "token_acc": 0.777127604968689, "train_speed(iter/s)": 0.140383 }, { "epoch": 0.8573031994783776, "grad_norm": 0.7692098021507263, "learning_rate": 6.566270264499375e-05, "loss": 0.8928651809692383, "memory(GiB)": 91.52, "step": 66070, "token_acc": 0.7701646664806029, "train_speed(iter/s)": 0.140381 }, { "epoch": 0.8573680778800333, "grad_norm": 0.7405693531036377, "learning_rate": 6.565760876361792e-05, "loss": 0.8422852516174316, "memory(GiB)": 91.52, "step": 66075, "token_acc": 0.7483124963315138, "train_speed(iter/s)": 0.140379 }, { "epoch": 0.857432956281689, "grad_norm": 0.8000903129577637, "learning_rate": 6.565251470205534e-05, "loss": 0.8518928527832031, "memory(GiB)": 91.52, "step": 66080, "token_acc": 0.7702315804733084, "train_speed(iter/s)": 0.140377 }, { "epoch": 0.8574978346833447, "grad_norm": 0.6584418416023254, "learning_rate": 6.564742046036464e-05, "loss": 0.8643508911132812, "memory(GiB)": 91.52, "step": 66085, "token_acc": 0.7482142857142857, "train_speed(iter/s)": 0.140376 }, { "epoch": 0.8575627130850004, "grad_norm": 0.7677443623542786, "learning_rate": 6.56423260386044e-05, "loss": 0.850469970703125, "memory(GiB)": 91.52, "step": 66090, "token_acc": 0.7483376914456481, "train_speed(iter/s)": 0.140374 }, { "epoch": 0.8576275914866561, "grad_norm": 0.7741209268569946, "learning_rate": 6.563723143683332e-05, "loss": 0.8262266159057617, "memory(GiB)": 91.52, "step": 66095, "token_acc": 0.7685938913240103, "train_speed(iter/s)": 0.140372 }, { "epoch": 0.8576924698883118, "grad_norm": 0.7531353831291199, "learning_rate": 6.563213665510996e-05, "loss": 0.8685493469238281, "memory(GiB)": 91.52, "step": 66100, "token_acc": 0.7651236289273099, "train_speed(iter/s)": 0.14037 }, { "epoch": 0.8577573482899675, "grad_norm": 0.6911781430244446, "learning_rate": 6.562704169349298e-05, "loss": 0.8904548645019531, "memory(GiB)": 91.52, "step": 66105, "token_acc": 0.7665667759810019, "train_speed(iter/s)": 0.140368 }, { "epoch": 0.8578222266916232, "grad_norm": 0.6678725481033325, "learning_rate": 6.562194655204101e-05, "loss": 0.8594751358032227, "memory(GiB)": 91.52, "step": 66110, "token_acc": 0.7728727660709522, "train_speed(iter/s)": 0.140366 }, { "epoch": 0.8578871050932789, "grad_norm": 0.802118182182312, "learning_rate": 6.561685123081267e-05, "loss": 0.8580950736999512, "memory(GiB)": 91.52, "step": 66115, "token_acc": 0.7762766277631007, "train_speed(iter/s)": 0.140365 }, { "epoch": 0.8579519834949346, "grad_norm": 0.7916209697723389, "learning_rate": 6.561175572986664e-05, "loss": 0.8440752029418945, "memory(GiB)": 91.52, "step": 66120, "token_acc": 0.765786738723758, "train_speed(iter/s)": 0.140363 }, { "epoch": 0.8580168618965903, "grad_norm": 0.6478772163391113, "learning_rate": 6.560666004926151e-05, "loss": 0.8459980010986328, "memory(GiB)": 91.52, "step": 66125, "token_acc": 0.7595655152167531, "train_speed(iter/s)": 0.140361 }, { "epoch": 0.858081740298246, "grad_norm": 0.7979156374931335, "learning_rate": 6.560156418905596e-05, "loss": 0.8812155723571777, "memory(GiB)": 91.52, "step": 66130, "token_acc": 0.7542409586371183, "train_speed(iter/s)": 0.140359 }, { "epoch": 0.8581466186999017, "grad_norm": 0.7411620616912842, "learning_rate": 6.55964681493086e-05, "loss": 0.8517522811889648, "memory(GiB)": 91.52, "step": 66135, "token_acc": 0.7555084165868345, "train_speed(iter/s)": 0.140358 }, { "epoch": 0.8582114971015574, "grad_norm": 0.7292603254318237, "learning_rate": 6.559137193007809e-05, "loss": 0.8595251083374024, "memory(GiB)": 91.52, "step": 66140, "token_acc": 0.7597158378604263, "train_speed(iter/s)": 0.140356 }, { "epoch": 0.8582763755032131, "grad_norm": 0.7544111609458923, "learning_rate": 6.558627553142307e-05, "loss": 0.8443733215332031, "memory(GiB)": 91.52, "step": 66145, "token_acc": 0.7646869524797082, "train_speed(iter/s)": 0.140354 }, { "epoch": 0.8583412539048688, "grad_norm": 0.6261063814163208, "learning_rate": 6.55811789534022e-05, "loss": 0.8384210586547851, "memory(GiB)": 91.52, "step": 66150, "token_acc": 0.774045906603806, "train_speed(iter/s)": 0.140352 }, { "epoch": 0.8584061323065245, "grad_norm": 0.8521345257759094, "learning_rate": 6.557608219607414e-05, "loss": 0.8797636032104492, "memory(GiB)": 91.52, "step": 66155, "token_acc": 0.7636078998073218, "train_speed(iter/s)": 0.140351 }, { "epoch": 0.8584710107081802, "grad_norm": 0.7263272404670715, "learning_rate": 6.55709852594975e-05, "loss": 0.8568046569824219, "memory(GiB)": 91.52, "step": 66160, "token_acc": 0.7642997045447001, "train_speed(iter/s)": 0.140349 }, { "epoch": 0.8585358891098359, "grad_norm": 0.6891801953315735, "learning_rate": 6.556588814373096e-05, "loss": 0.8678895950317382, "memory(GiB)": 91.52, "step": 66165, "token_acc": 0.7637107588625658, "train_speed(iter/s)": 0.140347 }, { "epoch": 0.8586007675114916, "grad_norm": 0.7690956592559814, "learning_rate": 6.55607908488332e-05, "loss": 0.9001070022583008, "memory(GiB)": 91.52, "step": 66170, "token_acc": 0.7503442577802258, "train_speed(iter/s)": 0.140345 }, { "epoch": 0.8586656459131473, "grad_norm": 0.6282302141189575, "learning_rate": 6.555569337486287e-05, "loss": 0.884073543548584, "memory(GiB)": 91.52, "step": 66175, "token_acc": 0.7544886923789322, "train_speed(iter/s)": 0.140344 }, { "epoch": 0.858730524314803, "grad_norm": 0.782384991645813, "learning_rate": 6.55505957218786e-05, "loss": 0.9127891540527344, "memory(GiB)": 91.52, "step": 66180, "token_acc": 0.7716530444148768, "train_speed(iter/s)": 0.140342 }, { "epoch": 0.8587954027164587, "grad_norm": 0.7362395524978638, "learning_rate": 6.554549788993909e-05, "loss": 0.8565921783447266, "memory(GiB)": 91.52, "step": 66185, "token_acc": 0.7576544502617801, "train_speed(iter/s)": 0.14034 }, { "epoch": 0.8588602811181144, "grad_norm": 0.761981725692749, "learning_rate": 6.5540399879103e-05, "loss": 0.8691999435424804, "memory(GiB)": 91.52, "step": 66190, "token_acc": 0.7534880012756119, "train_speed(iter/s)": 0.140339 }, { "epoch": 0.8589251595197701, "grad_norm": 0.7467203736305237, "learning_rate": 6.553530168942896e-05, "loss": 0.9064212799072265, "memory(GiB)": 91.52, "step": 66195, "token_acc": 0.7421649606765996, "train_speed(iter/s)": 0.140338 }, { "epoch": 0.8589900379214258, "grad_norm": 0.7520986795425415, "learning_rate": 6.553020332097569e-05, "loss": 0.8250028610229492, "memory(GiB)": 91.52, "step": 66200, "token_acc": 0.7830202260131002, "train_speed(iter/s)": 0.140336 }, { "epoch": 0.8590549163230815, "grad_norm": 0.7092737555503845, "learning_rate": 6.552510477380182e-05, "loss": 0.848704719543457, "memory(GiB)": 91.52, "step": 66205, "token_acc": 0.7882594856482916, "train_speed(iter/s)": 0.140333 }, { "epoch": 0.8591197947247372, "grad_norm": 0.7399857044219971, "learning_rate": 6.552000604796607e-05, "loss": 0.8519807815551758, "memory(GiB)": 91.52, "step": 66210, "token_acc": 0.7863945578231293, "train_speed(iter/s)": 0.140332 }, { "epoch": 0.8591846731263929, "grad_norm": 0.6926164627075195, "learning_rate": 6.551490714352707e-05, "loss": 0.889037799835205, "memory(GiB)": 91.52, "step": 66215, "token_acc": 0.756781979082864, "train_speed(iter/s)": 0.14033 }, { "epoch": 0.8592495515280486, "grad_norm": 0.805820882320404, "learning_rate": 6.550980806054353e-05, "loss": 0.852328872680664, "memory(GiB)": 91.52, "step": 66220, "token_acc": 0.7689099361341617, "train_speed(iter/s)": 0.140329 }, { "epoch": 0.8593144299297043, "grad_norm": 0.8067695498466492, "learning_rate": 6.550470879907411e-05, "loss": 0.8523008346557617, "memory(GiB)": 91.52, "step": 66225, "token_acc": 0.7658000700280112, "train_speed(iter/s)": 0.140327 }, { "epoch": 0.85937930833136, "grad_norm": 0.6833878755569458, "learning_rate": 6.549960935917749e-05, "loss": 0.8509489059448242, "memory(GiB)": 91.52, "step": 66230, "token_acc": 0.7821066890279301, "train_speed(iter/s)": 0.140325 }, { "epoch": 0.8594441867330156, "grad_norm": 0.7462003231048584, "learning_rate": 6.549450974091238e-05, "loss": 0.8666774749755859, "memory(GiB)": 91.52, "step": 66235, "token_acc": 0.7696568382475429, "train_speed(iter/s)": 0.140324 }, { "epoch": 0.8595090651346713, "grad_norm": 0.7924595475196838, "learning_rate": 6.548940994433744e-05, "loss": 0.8962087631225586, "memory(GiB)": 91.52, "step": 66240, "token_acc": 0.7667886727027547, "train_speed(iter/s)": 0.140322 }, { "epoch": 0.859573943536327, "grad_norm": 0.8133381605148315, "learning_rate": 6.548430996951136e-05, "loss": 0.9235237121582032, "memory(GiB)": 91.52, "step": 66245, "token_acc": 0.7534387123665395, "train_speed(iter/s)": 0.140321 }, { "epoch": 0.8596388219379827, "grad_norm": 0.8060815930366516, "learning_rate": 6.547920981649285e-05, "loss": 0.8727385520935058, "memory(GiB)": 91.52, "step": 66250, "token_acc": 0.762466755319149, "train_speed(iter/s)": 0.14032 }, { "epoch": 0.8597037003396384, "grad_norm": 0.7547279000282288, "learning_rate": 6.547410948534058e-05, "loss": 0.9002529144287109, "memory(GiB)": 91.52, "step": 66255, "token_acc": 0.7566415441803362, "train_speed(iter/s)": 0.140318 }, { "epoch": 0.8597685787412941, "grad_norm": 0.7722742557525635, "learning_rate": 6.546900897611326e-05, "loss": 0.8436994552612305, "memory(GiB)": 91.52, "step": 66260, "token_acc": 0.7799716412619638, "train_speed(iter/s)": 0.140317 }, { "epoch": 0.8598334571429498, "grad_norm": 0.7749985456466675, "learning_rate": 6.546390828886958e-05, "loss": 0.8656486511230469, "memory(GiB)": 91.52, "step": 66265, "token_acc": 0.7781326519748749, "train_speed(iter/s)": 0.140316 }, { "epoch": 0.8598983355446055, "grad_norm": 0.8559497594833374, "learning_rate": 6.545880742366823e-05, "loss": 0.8660861015319824, "memory(GiB)": 91.52, "step": 66270, "token_acc": 0.7614391281630216, "train_speed(iter/s)": 0.140314 }, { "epoch": 0.8599632139462612, "grad_norm": 0.7442132830619812, "learning_rate": 6.545370638056793e-05, "loss": 0.8662506103515625, "memory(GiB)": 91.52, "step": 66275, "token_acc": 0.7550444709942917, "train_speed(iter/s)": 0.140313 }, { "epoch": 0.8600280923479169, "grad_norm": 0.7059295773506165, "learning_rate": 6.544860515962738e-05, "loss": 0.8216476440429688, "memory(GiB)": 91.52, "step": 66280, "token_acc": 0.7513384051845591, "train_speed(iter/s)": 0.140311 }, { "epoch": 0.8600929707495726, "grad_norm": 0.6907270550727844, "learning_rate": 6.544350376090524e-05, "loss": 0.8710421562194824, "memory(GiB)": 91.52, "step": 66285, "token_acc": 0.7674754361339483, "train_speed(iter/s)": 0.140309 }, { "epoch": 0.8601578491512283, "grad_norm": 0.8418139219284058, "learning_rate": 6.543840218446029e-05, "loss": 0.8907369613647461, "memory(GiB)": 91.52, "step": 66290, "token_acc": 0.749523089659144, "train_speed(iter/s)": 0.140308 }, { "epoch": 0.860222727552884, "grad_norm": 0.7116448283195496, "learning_rate": 6.543330043035119e-05, "loss": 0.852662181854248, "memory(GiB)": 91.52, "step": 66295, "token_acc": 0.7595054891986736, "train_speed(iter/s)": 0.140306 }, { "epoch": 0.8602876059545397, "grad_norm": 0.6806530952453613, "learning_rate": 6.542819849863666e-05, "loss": 0.8607177734375, "memory(GiB)": 91.52, "step": 66300, "token_acc": 0.7646845540246555, "train_speed(iter/s)": 0.140304 }, { "epoch": 0.8603524843561954, "grad_norm": 0.7665959000587463, "learning_rate": 6.542309638937542e-05, "loss": 0.886811637878418, "memory(GiB)": 91.52, "step": 66305, "token_acc": 0.7585092794349688, "train_speed(iter/s)": 0.140303 }, { "epoch": 0.8604173627578511, "grad_norm": 0.7428498864173889, "learning_rate": 6.541799410262617e-05, "loss": 0.8498006820678711, "memory(GiB)": 91.52, "step": 66310, "token_acc": 0.7776632823155023, "train_speed(iter/s)": 0.140301 }, { "epoch": 0.8604822411595068, "grad_norm": 0.7764753103256226, "learning_rate": 6.541289163844765e-05, "loss": 0.9176692962646484, "memory(GiB)": 91.52, "step": 66315, "token_acc": 0.7627977152710421, "train_speed(iter/s)": 0.1403 }, { "epoch": 0.8605471195611625, "grad_norm": 0.7296240329742432, "learning_rate": 6.540778899689854e-05, "loss": 0.9081106185913086, "memory(GiB)": 91.52, "step": 66320, "token_acc": 0.7424848016659389, "train_speed(iter/s)": 0.140298 }, { "epoch": 0.8606119979628182, "grad_norm": 0.7138558030128479, "learning_rate": 6.54026861780376e-05, "loss": 0.8330318450927734, "memory(GiB)": 91.52, "step": 66325, "token_acc": 0.7751748251748252, "train_speed(iter/s)": 0.140296 }, { "epoch": 0.8606768763644739, "grad_norm": 0.7162910103797913, "learning_rate": 6.539758318192355e-05, "loss": 0.8927547454833984, "memory(GiB)": 91.52, "step": 66330, "token_acc": 0.7418132418132418, "train_speed(iter/s)": 0.140295 }, { "epoch": 0.8607417547661296, "grad_norm": 0.7846259474754333, "learning_rate": 6.539248000861509e-05, "loss": 0.8375732421875, "memory(GiB)": 91.52, "step": 66335, "token_acc": 0.776859799713877, "train_speed(iter/s)": 0.140293 }, { "epoch": 0.8608066331677853, "grad_norm": 0.6416395306587219, "learning_rate": 6.538737665817096e-05, "loss": 0.8390129089355469, "memory(GiB)": 91.52, "step": 66340, "token_acc": 0.7644191714053615, "train_speed(iter/s)": 0.140292 }, { "epoch": 0.860871511569441, "grad_norm": 0.7855197787284851, "learning_rate": 6.53822731306499e-05, "loss": 0.8336265563964844, "memory(GiB)": 91.52, "step": 66345, "token_acc": 0.7742362076565732, "train_speed(iter/s)": 0.14029 }, { "epoch": 0.8609363899710967, "grad_norm": 0.7166980504989624, "learning_rate": 6.537716942611061e-05, "loss": 0.8354092597961426, "memory(GiB)": 91.52, "step": 66350, "token_acc": 0.7808068023236946, "train_speed(iter/s)": 0.140288 }, { "epoch": 0.8610012683727524, "grad_norm": 0.6513367295265198, "learning_rate": 6.537206554461188e-05, "loss": 0.8082191467285156, "memory(GiB)": 91.52, "step": 66355, "token_acc": 0.7750478621569878, "train_speed(iter/s)": 0.140286 }, { "epoch": 0.8610661467744081, "grad_norm": 0.8331059217453003, "learning_rate": 6.536696148621238e-05, "loss": 0.8276158332824707, "memory(GiB)": 91.52, "step": 66360, "token_acc": 0.7794002784663973, "train_speed(iter/s)": 0.140284 }, { "epoch": 0.8611310251760638, "grad_norm": 0.7720792293548584, "learning_rate": 6.536185725097088e-05, "loss": 0.8687108993530274, "memory(GiB)": 91.52, "step": 66365, "token_acc": 0.7589177210665523, "train_speed(iter/s)": 0.140283 }, { "epoch": 0.8611959035777195, "grad_norm": 0.7256072759628296, "learning_rate": 6.535675283894612e-05, "loss": 0.9182623863220215, "memory(GiB)": 91.52, "step": 66370, "token_acc": 0.7522466250251864, "train_speed(iter/s)": 0.140281 }, { "epoch": 0.8612607819793752, "grad_norm": 0.776056170463562, "learning_rate": 6.535164825019684e-05, "loss": 0.8883872985839844, "memory(GiB)": 91.52, "step": 66375, "token_acc": 0.7588117592735109, "train_speed(iter/s)": 0.14028 }, { "epoch": 0.8613256603810309, "grad_norm": 0.7637200951576233, "learning_rate": 6.534654348478178e-05, "loss": 0.8384562492370605, "memory(GiB)": 91.52, "step": 66380, "token_acc": 0.7658378877684551, "train_speed(iter/s)": 0.140279 }, { "epoch": 0.8613905387826866, "grad_norm": 0.7607345581054688, "learning_rate": 6.534143854275967e-05, "loss": 0.90179443359375, "memory(GiB)": 91.52, "step": 66385, "token_acc": 0.7661562418894368, "train_speed(iter/s)": 0.140277 }, { "epoch": 0.8614554171843423, "grad_norm": 0.7131507396697998, "learning_rate": 6.533633342418929e-05, "loss": 0.8537729263305665, "memory(GiB)": 91.52, "step": 66390, "token_acc": 0.7641503474415666, "train_speed(iter/s)": 0.140275 }, { "epoch": 0.861520295585998, "grad_norm": 0.7792508006095886, "learning_rate": 6.533122812912937e-05, "loss": 0.8668207168579102, "memory(GiB)": 91.52, "step": 66395, "token_acc": 0.7675347562169571, "train_speed(iter/s)": 0.140274 }, { "epoch": 0.8615851739876537, "grad_norm": 0.8064051270484924, "learning_rate": 6.532612265763866e-05, "loss": 0.8897810935974121, "memory(GiB)": 91.52, "step": 66400, "token_acc": 0.7580692273988608, "train_speed(iter/s)": 0.140272 }, { "epoch": 0.8616500523893094, "grad_norm": 0.6826730966567993, "learning_rate": 6.532101700977592e-05, "loss": 0.8855924606323242, "memory(GiB)": 91.52, "step": 66405, "token_acc": 0.7578102189781022, "train_speed(iter/s)": 0.14027 }, { "epoch": 0.8617149307909651, "grad_norm": 0.6980426907539368, "learning_rate": 6.53159111855999e-05, "loss": 0.8249834060668946, "memory(GiB)": 91.52, "step": 66410, "token_acc": 0.7903945983228664, "train_speed(iter/s)": 0.140269 }, { "epoch": 0.8617798091926208, "grad_norm": 0.7249055504798889, "learning_rate": 6.531080518516935e-05, "loss": 0.8542363166809082, "memory(GiB)": 91.52, "step": 66415, "token_acc": 0.7538339920948617, "train_speed(iter/s)": 0.140267 }, { "epoch": 0.8618446875942765, "grad_norm": 0.6765106320381165, "learning_rate": 6.530569900854304e-05, "loss": 0.812753963470459, "memory(GiB)": 91.52, "step": 66420, "token_acc": 0.7637053257214358, "train_speed(iter/s)": 0.140266 }, { "epoch": 0.8619095659959322, "grad_norm": 0.7046322226524353, "learning_rate": 6.530059265577975e-05, "loss": 0.9030969619750977, "memory(GiB)": 91.52, "step": 66425, "token_acc": 0.7589242053789731, "train_speed(iter/s)": 0.140264 }, { "epoch": 0.8619744443975879, "grad_norm": 0.7371827960014343, "learning_rate": 6.529548612693821e-05, "loss": 0.8369421005249024, "memory(GiB)": 91.52, "step": 66430, "token_acc": 0.7666606574124151, "train_speed(iter/s)": 0.140263 }, { "epoch": 0.8620393227992436, "grad_norm": 0.7373276352882385, "learning_rate": 6.52903794220772e-05, "loss": 0.8439912796020508, "memory(GiB)": 91.52, "step": 66435, "token_acc": 0.7730999247494951, "train_speed(iter/s)": 0.140261 }, { "epoch": 0.8621042012008993, "grad_norm": 0.912154495716095, "learning_rate": 6.52852725412555e-05, "loss": 0.8379386901855469, "memory(GiB)": 91.52, "step": 66440, "token_acc": 0.788172426077663, "train_speed(iter/s)": 0.14026 }, { "epoch": 0.862169079602555, "grad_norm": 0.7164894342422485, "learning_rate": 6.528016548453186e-05, "loss": 0.9243949890136719, "memory(GiB)": 91.52, "step": 66445, "token_acc": 0.7599514563106796, "train_speed(iter/s)": 0.140258 }, { "epoch": 0.8622339580042107, "grad_norm": 0.7738752365112305, "learning_rate": 6.527505825196505e-05, "loss": 0.878292465209961, "memory(GiB)": 91.52, "step": 66450, "token_acc": 0.7635437054544112, "train_speed(iter/s)": 0.140256 }, { "epoch": 0.8622988364058664, "grad_norm": 0.8284975290298462, "learning_rate": 6.526995084361386e-05, "loss": 0.86910400390625, "memory(GiB)": 91.52, "step": 66455, "token_acc": 0.7674884285854972, "train_speed(iter/s)": 0.140255 }, { "epoch": 0.862363714807522, "grad_norm": 0.7501404285430908, "learning_rate": 6.526484325953706e-05, "loss": 0.8832945823669434, "memory(GiB)": 91.52, "step": 66460, "token_acc": 0.7572052686489307, "train_speed(iter/s)": 0.140253 }, { "epoch": 0.8624285932091778, "grad_norm": 0.8040058016777039, "learning_rate": 6.525973549979343e-05, "loss": 0.8616159439086915, "memory(GiB)": 91.52, "step": 66465, "token_acc": 0.7609327379686724, "train_speed(iter/s)": 0.140251 }, { "epoch": 0.8624934716108335, "grad_norm": 0.7680944800376892, "learning_rate": 6.525462756444173e-05, "loss": 0.8447248458862304, "memory(GiB)": 91.52, "step": 66470, "token_acc": 0.780634310427679, "train_speed(iter/s)": 0.14025 }, { "epoch": 0.862558350012489, "grad_norm": 0.6957091689109802, "learning_rate": 6.524951945354077e-05, "loss": 0.8772035598754883, "memory(GiB)": 91.52, "step": 66475, "token_acc": 0.7672083397209872, "train_speed(iter/s)": 0.140248 }, { "epoch": 0.8626232284141447, "grad_norm": 0.6899989247322083, "learning_rate": 6.524441116714932e-05, "loss": 0.8428255081176758, "memory(GiB)": 91.52, "step": 66480, "token_acc": 0.760229651716044, "train_speed(iter/s)": 0.140247 }, { "epoch": 0.8626881068158004, "grad_norm": 0.7590208053588867, "learning_rate": 6.523930270532617e-05, "loss": 0.8559525489807129, "memory(GiB)": 91.52, "step": 66485, "token_acc": 0.7719315997166852, "train_speed(iter/s)": 0.140245 }, { "epoch": 0.8627529852174561, "grad_norm": 0.7832653522491455, "learning_rate": 6.523419406813009e-05, "loss": 0.8315220832824707, "memory(GiB)": 91.52, "step": 66490, "token_acc": 0.7837951461876552, "train_speed(iter/s)": 0.140244 }, { "epoch": 0.8628178636191118, "grad_norm": 0.8191497921943665, "learning_rate": 6.52290852556199e-05, "loss": 0.8607784271240234, "memory(GiB)": 91.52, "step": 66495, "token_acc": 0.7501224289911851, "train_speed(iter/s)": 0.140242 }, { "epoch": 0.8628827420207675, "grad_norm": 0.7633795142173767, "learning_rate": 6.52239762678544e-05, "loss": 0.8667120933532715, "memory(GiB)": 91.52, "step": 66500, "token_acc": 0.7760575741618395, "train_speed(iter/s)": 0.14024 }, { "epoch": 0.8629476204224232, "grad_norm": 0.7061238884925842, "learning_rate": 6.521886710489232e-05, "loss": 0.863283920288086, "memory(GiB)": 91.52, "step": 66505, "token_acc": 0.7615924484354408, "train_speed(iter/s)": 0.140238 }, { "epoch": 0.8630124988240789, "grad_norm": 0.7422081828117371, "learning_rate": 6.521375776679251e-05, "loss": 0.8319025039672852, "memory(GiB)": 91.52, "step": 66510, "token_acc": 0.7624640543256072, "train_speed(iter/s)": 0.140237 }, { "epoch": 0.8630773772257346, "grad_norm": 0.7712645530700684, "learning_rate": 6.520864825361377e-05, "loss": 0.853703498840332, "memory(GiB)": 91.52, "step": 66515, "token_acc": 0.7658513674626973, "train_speed(iter/s)": 0.140235 }, { "epoch": 0.8631422556273903, "grad_norm": 0.7347180247306824, "learning_rate": 6.520353856541486e-05, "loss": 0.8910161972045898, "memory(GiB)": 91.52, "step": 66520, "token_acc": 0.7541514063222273, "train_speed(iter/s)": 0.140233 }, { "epoch": 0.863207134029046, "grad_norm": 0.8106746673583984, "learning_rate": 6.519842870225463e-05, "loss": 0.8430613517761231, "memory(GiB)": 91.52, "step": 66525, "token_acc": 0.7697254027098849, "train_speed(iter/s)": 0.140232 }, { "epoch": 0.8632720124307017, "grad_norm": 0.7068238258361816, "learning_rate": 6.519331866419183e-05, "loss": 0.8533317565917968, "memory(GiB)": 91.52, "step": 66530, "token_acc": 0.7493953319627524, "train_speed(iter/s)": 0.14023 }, { "epoch": 0.8633368908323574, "grad_norm": 0.6730921864509583, "learning_rate": 6.518820845128533e-05, "loss": 0.8612017631530762, "memory(GiB)": 91.52, "step": 66535, "token_acc": 0.7660161347728082, "train_speed(iter/s)": 0.140228 }, { "epoch": 0.8634017692340131, "grad_norm": 0.6829478740692139, "learning_rate": 6.518309806359388e-05, "loss": 0.8831104278564453, "memory(GiB)": 91.52, "step": 66540, "token_acc": 0.7634883125017795, "train_speed(iter/s)": 0.140226 }, { "epoch": 0.8634666476356688, "grad_norm": 0.7569975852966309, "learning_rate": 6.517798750117633e-05, "loss": 0.8815729141235351, "memory(GiB)": 91.52, "step": 66545, "token_acc": 0.7600014721578153, "train_speed(iter/s)": 0.140224 }, { "epoch": 0.8635315260373245, "grad_norm": 0.7880076766014099, "learning_rate": 6.517287676409148e-05, "loss": 0.8229223251342773, "memory(GiB)": 91.52, "step": 66550, "token_acc": 0.7939357099438624, "train_speed(iter/s)": 0.140223 }, { "epoch": 0.8635964044389802, "grad_norm": 0.746493935585022, "learning_rate": 6.516776585239812e-05, "loss": 0.8364302635192871, "memory(GiB)": 91.52, "step": 66555, "token_acc": 0.7766371077762619, "train_speed(iter/s)": 0.140222 }, { "epoch": 0.8636612828406359, "grad_norm": 0.7108744382858276, "learning_rate": 6.516265476615511e-05, "loss": 0.7981068611145019, "memory(GiB)": 91.52, "step": 66560, "token_acc": 0.7827162190011678, "train_speed(iter/s)": 0.14022 }, { "epoch": 0.8637261612422916, "grad_norm": 0.6885388493537903, "learning_rate": 6.515754350542122e-05, "loss": 0.8672279357910156, "memory(GiB)": 91.52, "step": 66565, "token_acc": 0.7723981900452489, "train_speed(iter/s)": 0.140219 }, { "epoch": 0.8637910396439473, "grad_norm": 0.8014969229698181, "learning_rate": 6.515243207025532e-05, "loss": 0.8479887008666992, "memory(GiB)": 91.52, "step": 66570, "token_acc": 0.7770205293306016, "train_speed(iter/s)": 0.140218 }, { "epoch": 0.863855918045603, "grad_norm": 0.7432116866111755, "learning_rate": 6.51473204607162e-05, "loss": 0.8366364479064942, "memory(GiB)": 91.52, "step": 66575, "token_acc": 0.7630132659507265, "train_speed(iter/s)": 0.140216 }, { "epoch": 0.8639207964472587, "grad_norm": 0.7071408033370972, "learning_rate": 6.514220867686268e-05, "loss": 0.8253178596496582, "memory(GiB)": 91.52, "step": 66580, "token_acc": 0.7837166159123525, "train_speed(iter/s)": 0.140215 }, { "epoch": 0.8639856748489144, "grad_norm": 0.7152737379074097, "learning_rate": 6.513709671875361e-05, "loss": 0.869618034362793, "memory(GiB)": 91.52, "step": 66585, "token_acc": 0.7449455131574337, "train_speed(iter/s)": 0.140213 }, { "epoch": 0.8640505532505701, "grad_norm": 0.7603737115859985, "learning_rate": 6.51319845864478e-05, "loss": 0.8476505279541016, "memory(GiB)": 91.52, "step": 66590, "token_acc": 0.7696665798384996, "train_speed(iter/s)": 0.140211 }, { "epoch": 0.8641154316522258, "grad_norm": 0.6725230813026428, "learning_rate": 6.512687228000409e-05, "loss": 0.8250058174133301, "memory(GiB)": 91.52, "step": 66595, "token_acc": 0.7773003738630657, "train_speed(iter/s)": 0.14021 }, { "epoch": 0.8641803100538815, "grad_norm": 0.7843555212020874, "learning_rate": 6.512175979948128e-05, "loss": 0.8669999122619629, "memory(GiB)": 91.52, "step": 66600, "token_acc": 0.7412142963959922, "train_speed(iter/s)": 0.140208 }, { "epoch": 0.8642451884555372, "grad_norm": 0.7026744484901428, "learning_rate": 6.511664714493827e-05, "loss": 0.8721113204956055, "memory(GiB)": 91.52, "step": 66605, "token_acc": 0.7649600844494293, "train_speed(iter/s)": 0.140206 }, { "epoch": 0.8643100668571929, "grad_norm": 0.7660624384880066, "learning_rate": 6.511153431643382e-05, "loss": 0.8603621482849121, "memory(GiB)": 91.52, "step": 66610, "token_acc": 0.7768334229872691, "train_speed(iter/s)": 0.140204 }, { "epoch": 0.8643749452588486, "grad_norm": 0.7327598333358765, "learning_rate": 6.510642131402685e-05, "loss": 0.8822291374206543, "memory(GiB)": 91.52, "step": 66615, "token_acc": 0.7655892448512586, "train_speed(iter/s)": 0.140203 }, { "epoch": 0.8644398236605043, "grad_norm": 0.7266519069671631, "learning_rate": 6.510130813777613e-05, "loss": 0.8691753387451172, "memory(GiB)": 91.52, "step": 66620, "token_acc": 0.7534580376979861, "train_speed(iter/s)": 0.140201 }, { "epoch": 0.86450470206216, "grad_norm": 0.7517868876457214, "learning_rate": 6.509619478774053e-05, "loss": 0.8806852340698242, "memory(GiB)": 91.52, "step": 66625, "token_acc": 0.7732128951876654, "train_speed(iter/s)": 0.140199 }, { "epoch": 0.8645695804638157, "grad_norm": 0.7170104384422302, "learning_rate": 6.50910812639789e-05, "loss": 0.9094697952270507, "memory(GiB)": 91.52, "step": 66630, "token_acc": 0.740970559471217, "train_speed(iter/s)": 0.140197 }, { "epoch": 0.8646344588654714, "grad_norm": 0.7619460225105286, "learning_rate": 6.508596756655005e-05, "loss": 0.8662799835205078, "memory(GiB)": 91.52, "step": 66635, "token_acc": 0.7889209144391361, "train_speed(iter/s)": 0.140196 }, { "epoch": 0.8646993372671271, "grad_norm": 0.7266871333122253, "learning_rate": 6.508085369551289e-05, "loss": 0.8330839157104493, "memory(GiB)": 91.52, "step": 66640, "token_acc": 0.769720148567646, "train_speed(iter/s)": 0.140194 }, { "epoch": 0.8647642156687828, "grad_norm": 0.691510021686554, "learning_rate": 6.507573965092621e-05, "loss": 0.8376252174377441, "memory(GiB)": 91.52, "step": 66645, "token_acc": 0.7547270306258322, "train_speed(iter/s)": 0.140192 }, { "epoch": 0.8648290940704385, "grad_norm": 0.6656435132026672, "learning_rate": 6.507062543284892e-05, "loss": 0.8766542434692383, "memory(GiB)": 91.52, "step": 66650, "token_acc": 0.7598145686776672, "train_speed(iter/s)": 0.140191 }, { "epoch": 0.8648939724720942, "grad_norm": 0.8327745795249939, "learning_rate": 6.506551104133981e-05, "loss": 0.8498401641845703, "memory(GiB)": 91.52, "step": 66655, "token_acc": 0.7502451781627983, "train_speed(iter/s)": 0.140189 }, { "epoch": 0.8649588508737499, "grad_norm": 0.811335563659668, "learning_rate": 6.506039647645779e-05, "loss": 0.8644763946533203, "memory(GiB)": 91.52, "step": 66660, "token_acc": 0.7650153149553869, "train_speed(iter/s)": 0.140187 }, { "epoch": 0.8650237292754056, "grad_norm": 0.8031563758850098, "learning_rate": 6.505528173826169e-05, "loss": 0.8554600715637207, "memory(GiB)": 91.52, "step": 66665, "token_acc": 0.7499900330901408, "train_speed(iter/s)": 0.140186 }, { "epoch": 0.8650886076770613, "grad_norm": 0.7872311472892761, "learning_rate": 6.505016682681036e-05, "loss": 0.8900316238403321, "memory(GiB)": 91.52, "step": 66670, "token_acc": 0.7533630634282976, "train_speed(iter/s)": 0.140185 }, { "epoch": 0.865153486078717, "grad_norm": 0.658281147480011, "learning_rate": 6.50450517421627e-05, "loss": 0.8508575439453125, "memory(GiB)": 91.52, "step": 66675, "token_acc": 0.7736204771227613, "train_speed(iter/s)": 0.140183 }, { "epoch": 0.8652183644803727, "grad_norm": 0.7669715881347656, "learning_rate": 6.503993648437754e-05, "loss": 0.8368317604064941, "memory(GiB)": 91.52, "step": 66680, "token_acc": 0.7673143692513857, "train_speed(iter/s)": 0.140181 }, { "epoch": 0.8652832428820284, "grad_norm": 0.7943132519721985, "learning_rate": 6.503482105351377e-05, "loss": 0.8760133743286133, "memory(GiB)": 91.52, "step": 66685, "token_acc": 0.7519270349893055, "train_speed(iter/s)": 0.14018 }, { "epoch": 0.8653481212836841, "grad_norm": 0.7591431736946106, "learning_rate": 6.502970544963025e-05, "loss": 0.8783681869506836, "memory(GiB)": 91.52, "step": 66690, "token_acc": 0.773248494334615, "train_speed(iter/s)": 0.140179 }, { "epoch": 0.8654129996853398, "grad_norm": 0.7725286483764648, "learning_rate": 6.502458967278584e-05, "loss": 0.8648011207580566, "memory(GiB)": 91.52, "step": 66695, "token_acc": 0.7734434291612277, "train_speed(iter/s)": 0.140177 }, { "epoch": 0.8654778780869955, "grad_norm": 0.6719540953636169, "learning_rate": 6.501947372303943e-05, "loss": 0.873448371887207, "memory(GiB)": 91.52, "step": 66700, "token_acc": 0.750783872514663, "train_speed(iter/s)": 0.140175 }, { "epoch": 0.8655427564886512, "grad_norm": 0.7385801672935486, "learning_rate": 6.501435760044985e-05, "loss": 0.832547378540039, "memory(GiB)": 91.52, "step": 66705, "token_acc": 0.7747859971892168, "train_speed(iter/s)": 0.140173 }, { "epoch": 0.8656076348903068, "grad_norm": 0.7337302565574646, "learning_rate": 6.500924130507605e-05, "loss": 0.8758506774902344, "memory(GiB)": 91.52, "step": 66710, "token_acc": 0.7648450356555129, "train_speed(iter/s)": 0.140171 }, { "epoch": 0.8656725132919625, "grad_norm": 0.6670817136764526, "learning_rate": 6.500412483697683e-05, "loss": 0.809054183959961, "memory(GiB)": 91.52, "step": 66715, "token_acc": 0.7681179246896269, "train_speed(iter/s)": 0.140169 }, { "epoch": 0.8657373916936182, "grad_norm": 0.6989973187446594, "learning_rate": 6.499900819621113e-05, "loss": 0.8983688354492188, "memory(GiB)": 91.52, "step": 66720, "token_acc": 0.7312725330484711, "train_speed(iter/s)": 0.140168 }, { "epoch": 0.8658022700952739, "grad_norm": 0.7141016125679016, "learning_rate": 6.49938913828378e-05, "loss": 0.8863353729248047, "memory(GiB)": 91.52, "step": 66725, "token_acc": 0.7485048689607665, "train_speed(iter/s)": 0.140166 }, { "epoch": 0.8658671484969296, "grad_norm": 0.704646646976471, "learning_rate": 6.498877439691576e-05, "loss": 0.8325448036193848, "memory(GiB)": 91.52, "step": 66730, "token_acc": 0.7830140014738394, "train_speed(iter/s)": 0.140165 }, { "epoch": 0.8659320268985853, "grad_norm": 0.7180471420288086, "learning_rate": 6.498365723850384e-05, "loss": 0.8747671127319336, "memory(GiB)": 91.52, "step": 66735, "token_acc": 0.7681388621022179, "train_speed(iter/s)": 0.140163 }, { "epoch": 0.865996905300241, "grad_norm": 0.6614169478416443, "learning_rate": 6.497853990766095e-05, "loss": 0.8466773986816406, "memory(GiB)": 91.52, "step": 66740, "token_acc": 0.7638483965014577, "train_speed(iter/s)": 0.140162 }, { "epoch": 0.8660617837018967, "grad_norm": 0.7803350687026978, "learning_rate": 6.497342240444599e-05, "loss": 0.8829748153686523, "memory(GiB)": 91.52, "step": 66745, "token_acc": 0.7558119628856096, "train_speed(iter/s)": 0.140161 }, { "epoch": 0.8661266621035524, "grad_norm": 0.7142752408981323, "learning_rate": 6.496830472891787e-05, "loss": 0.8643274307250977, "memory(GiB)": 91.52, "step": 66750, "token_acc": 0.7592475084916577, "train_speed(iter/s)": 0.140159 }, { "epoch": 0.8661915405052081, "grad_norm": 0.6984593868255615, "learning_rate": 6.496318688113545e-05, "loss": 0.8608963966369629, "memory(GiB)": 91.52, "step": 66755, "token_acc": 0.7694040135802884, "train_speed(iter/s)": 0.140157 }, { "epoch": 0.8662564189068638, "grad_norm": 0.7959275841712952, "learning_rate": 6.495806886115764e-05, "loss": 0.8280204772949219, "memory(GiB)": 91.52, "step": 66760, "token_acc": 0.7815955615075569, "train_speed(iter/s)": 0.140156 }, { "epoch": 0.8663212973085195, "grad_norm": 0.7457959651947021, "learning_rate": 6.495295066904332e-05, "loss": 0.8067468643188477, "memory(GiB)": 91.52, "step": 66765, "token_acc": 0.7657704746659697, "train_speed(iter/s)": 0.140154 }, { "epoch": 0.8663861757101752, "grad_norm": 0.7063586115837097, "learning_rate": 6.494783230485144e-05, "loss": 0.8224875450134277, "memory(GiB)": 91.52, "step": 66770, "token_acc": 0.7949004150824933, "train_speed(iter/s)": 0.140152 }, { "epoch": 0.8664510541118309, "grad_norm": 0.7094593644142151, "learning_rate": 6.494271376864083e-05, "loss": 0.8324735641479493, "memory(GiB)": 91.52, "step": 66775, "token_acc": 0.7495471446277052, "train_speed(iter/s)": 0.140151 }, { "epoch": 0.8665159325134866, "grad_norm": 0.7382513880729675, "learning_rate": 6.493759506047047e-05, "loss": 0.8520358085632325, "memory(GiB)": 91.52, "step": 66780, "token_acc": 0.7650709586660598, "train_speed(iter/s)": 0.14015 }, { "epoch": 0.8665808109151423, "grad_norm": 0.8045856356620789, "learning_rate": 6.493247618039921e-05, "loss": 0.8326704978942872, "memory(GiB)": 91.52, "step": 66785, "token_acc": 0.7828399397892625, "train_speed(iter/s)": 0.140148 }, { "epoch": 0.866645689316798, "grad_norm": 0.8544278144836426, "learning_rate": 6.492735712848596e-05, "loss": 0.8645730018615723, "memory(GiB)": 91.52, "step": 66790, "token_acc": 0.743562428828564, "train_speed(iter/s)": 0.140147 }, { "epoch": 0.8667105677184537, "grad_norm": 0.7835596203804016, "learning_rate": 6.492223790478967e-05, "loss": 0.8443152427673339, "memory(GiB)": 91.52, "step": 66795, "token_acc": 0.7657393333097671, "train_speed(iter/s)": 0.140145 }, { "epoch": 0.8667754461201094, "grad_norm": 0.7944957613945007, "learning_rate": 6.491711850936922e-05, "loss": 0.8805994033813477, "memory(GiB)": 91.52, "step": 66800, "token_acc": 0.7560599993839899, "train_speed(iter/s)": 0.140144 }, { "epoch": 0.8668403245217651, "grad_norm": 0.7038577198982239, "learning_rate": 6.491199894228352e-05, "loss": 0.8866126060485839, "memory(GiB)": 91.52, "step": 66805, "token_acc": 0.7646479746235462, "train_speed(iter/s)": 0.140142 }, { "epoch": 0.8669052029234208, "grad_norm": 0.7831175923347473, "learning_rate": 6.49068792035915e-05, "loss": 0.8440700531005859, "memory(GiB)": 91.52, "step": 66810, "token_acc": 0.7906552955754793, "train_speed(iter/s)": 0.140141 }, { "epoch": 0.8669700813250765, "grad_norm": 0.6795807480812073, "learning_rate": 6.490175929335208e-05, "loss": 0.8270795822143555, "memory(GiB)": 91.52, "step": 66815, "token_acc": 0.7568308007013442, "train_speed(iter/s)": 0.140139 }, { "epoch": 0.8670349597267322, "grad_norm": 0.7258338332176208, "learning_rate": 6.489663921162417e-05, "loss": 0.9105942726135254, "memory(GiB)": 91.52, "step": 66820, "token_acc": 0.754770206022187, "train_speed(iter/s)": 0.140138 }, { "epoch": 0.8670998381283879, "grad_norm": 0.8134309649467468, "learning_rate": 6.48915189584667e-05, "loss": 0.8376189231872558, "memory(GiB)": 91.52, "step": 66825, "token_acc": 0.7750573571208438, "train_speed(iter/s)": 0.140137 }, { "epoch": 0.8671647165300436, "grad_norm": 0.6799453496932983, "learning_rate": 6.488639853393856e-05, "loss": 0.8674768447875977, "memory(GiB)": 91.52, "step": 66830, "token_acc": 0.7767032022351171, "train_speed(iter/s)": 0.140135 }, { "epoch": 0.8672295949316993, "grad_norm": 0.7738502025604248, "learning_rate": 6.488127793809873e-05, "loss": 0.9164963722229004, "memory(GiB)": 91.52, "step": 66835, "token_acc": 0.7468803125113033, "train_speed(iter/s)": 0.140134 }, { "epoch": 0.867294473333355, "grad_norm": 0.7635161280632019, "learning_rate": 6.487615717100612e-05, "loss": 0.8921154022216797, "memory(GiB)": 91.52, "step": 66840, "token_acc": 0.7546643358994184, "train_speed(iter/s)": 0.140132 }, { "epoch": 0.8673593517350107, "grad_norm": 0.7211715579032898, "learning_rate": 6.487103623271962e-05, "loss": 0.8261401176452636, "memory(GiB)": 91.52, "step": 66845, "token_acc": 0.7775358813094663, "train_speed(iter/s)": 0.14013 }, { "epoch": 0.8674242301366664, "grad_norm": 0.7101284861564636, "learning_rate": 6.486591512329821e-05, "loss": 0.8571211814880371, "memory(GiB)": 91.52, "step": 66850, "token_acc": 0.7644252038172281, "train_speed(iter/s)": 0.140129 }, { "epoch": 0.8674891085383221, "grad_norm": 0.7840072512626648, "learning_rate": 6.48607938428008e-05, "loss": 0.8589162826538086, "memory(GiB)": 91.52, "step": 66855, "token_acc": 0.7447995566229447, "train_speed(iter/s)": 0.140127 }, { "epoch": 0.8675539869399778, "grad_norm": 0.7739602327346802, "learning_rate": 6.485567239128634e-05, "loss": 0.8691696166992188, "memory(GiB)": 91.52, "step": 66860, "token_acc": 0.7358173076923077, "train_speed(iter/s)": 0.140125 }, { "epoch": 0.8676188653416335, "grad_norm": 0.7688442468643188, "learning_rate": 6.485055076881375e-05, "loss": 0.8675409317016601, "memory(GiB)": 91.52, "step": 66865, "token_acc": 0.7652838845345058, "train_speed(iter/s)": 0.140123 }, { "epoch": 0.8676837437432892, "grad_norm": 0.7034215331077576, "learning_rate": 6.484542897544198e-05, "loss": 0.8256536483764648, "memory(GiB)": 91.52, "step": 66870, "token_acc": 0.7473690903888341, "train_speed(iter/s)": 0.140122 }, { "epoch": 0.8677486221449449, "grad_norm": 0.7796287536621094, "learning_rate": 6.484030701122998e-05, "loss": 0.8532722473144532, "memory(GiB)": 91.52, "step": 66875, "token_acc": 0.7554087361358346, "train_speed(iter/s)": 0.14012 }, { "epoch": 0.8678135005466006, "grad_norm": 0.8054693341255188, "learning_rate": 6.483518487623666e-05, "loss": 0.804533576965332, "memory(GiB)": 91.52, "step": 66880, "token_acc": 0.775448256388536, "train_speed(iter/s)": 0.140118 }, { "epoch": 0.8678783789482563, "grad_norm": 0.8168720006942749, "learning_rate": 6.4830062570521e-05, "loss": 0.8942934989929199, "memory(GiB)": 91.52, "step": 66885, "token_acc": 0.7626575115068286, "train_speed(iter/s)": 0.140116 }, { "epoch": 0.867943257349912, "grad_norm": 0.7664195895195007, "learning_rate": 6.482494009414194e-05, "loss": 0.8875487327575684, "memory(GiB)": 91.52, "step": 66890, "token_acc": 0.7670240613906121, "train_speed(iter/s)": 0.140114 }, { "epoch": 0.8680081357515677, "grad_norm": 0.6952790021896362, "learning_rate": 6.481981744715842e-05, "loss": 0.8986139297485352, "memory(GiB)": 91.52, "step": 66895, "token_acc": 0.7758269007719074, "train_speed(iter/s)": 0.140113 }, { "epoch": 0.8680730141532234, "grad_norm": 0.7347103953361511, "learning_rate": 6.481469462962939e-05, "loss": 0.8559139251708985, "memory(GiB)": 91.52, "step": 66900, "token_acc": 0.7714893128016548, "train_speed(iter/s)": 0.140111 }, { "epoch": 0.868137892554879, "grad_norm": 0.7356231808662415, "learning_rate": 6.48095716416138e-05, "loss": 0.8570040702819824, "memory(GiB)": 91.52, "step": 66905, "token_acc": 0.7538140103283794, "train_speed(iter/s)": 0.14011 }, { "epoch": 0.8682027709565348, "grad_norm": 0.8095526695251465, "learning_rate": 6.480444848317063e-05, "loss": 0.8758125305175781, "memory(GiB)": 91.52, "step": 66910, "token_acc": 0.7529760923330585, "train_speed(iter/s)": 0.140109 }, { "epoch": 0.8682676493581905, "grad_norm": 0.6992304921150208, "learning_rate": 6.47993251543588e-05, "loss": 0.8701773643493652, "memory(GiB)": 91.52, "step": 66915, "token_acc": 0.7723871244942533, "train_speed(iter/s)": 0.140108 }, { "epoch": 0.8683325277598462, "grad_norm": 0.7844579815864563, "learning_rate": 6.479420165523731e-05, "loss": 0.872974967956543, "memory(GiB)": 91.52, "step": 66920, "token_acc": 0.7682375505171587, "train_speed(iter/s)": 0.140107 }, { "epoch": 0.8683974061615019, "grad_norm": 0.7069546580314636, "learning_rate": 6.478907798586509e-05, "loss": 0.8563192367553711, "memory(GiB)": 91.52, "step": 66925, "token_acc": 0.7657240263997552, "train_speed(iter/s)": 0.140105 }, { "epoch": 0.8684622845631575, "grad_norm": 0.7733551263809204, "learning_rate": 6.478395414630112e-05, "loss": 0.8571832656860352, "memory(GiB)": 91.52, "step": 66930, "token_acc": 0.769890767511446, "train_speed(iter/s)": 0.140104 }, { "epoch": 0.8685271629648132, "grad_norm": 0.7455161213874817, "learning_rate": 6.477883013660434e-05, "loss": 0.8281076431274415, "memory(GiB)": 91.52, "step": 66935, "token_acc": 0.7652268914120722, "train_speed(iter/s)": 0.140102 }, { "epoch": 0.868592041366469, "grad_norm": 0.683985710144043, "learning_rate": 6.477370595683375e-05, "loss": 0.8550222396850586, "memory(GiB)": 91.52, "step": 66940, "token_acc": 0.7782287577350456, "train_speed(iter/s)": 0.1401 }, { "epoch": 0.8686569197681246, "grad_norm": 0.7299941182136536, "learning_rate": 6.476858160704829e-05, "loss": 0.8899957656860351, "memory(GiB)": 91.52, "step": 66945, "token_acc": 0.7511625669478208, "train_speed(iter/s)": 0.140098 }, { "epoch": 0.8687217981697802, "grad_norm": 0.8513585329055786, "learning_rate": 6.476345708730695e-05, "loss": 0.851019287109375, "memory(GiB)": 91.52, "step": 66950, "token_acc": 0.7520075046904315, "train_speed(iter/s)": 0.140096 }, { "epoch": 0.8687866765714359, "grad_norm": 0.8020530939102173, "learning_rate": 6.475833239766868e-05, "loss": 0.8666899681091309, "memory(GiB)": 91.52, "step": 66955, "token_acc": 0.7572081125525306, "train_speed(iter/s)": 0.140095 }, { "epoch": 0.8688515549730916, "grad_norm": 0.7342022657394409, "learning_rate": 6.475320753819248e-05, "loss": 0.865757942199707, "memory(GiB)": 91.52, "step": 66960, "token_acc": 0.7595725734639359, "train_speed(iter/s)": 0.140094 }, { "epoch": 0.8689164333747473, "grad_norm": 0.6915749907493591, "learning_rate": 6.474808250893734e-05, "loss": 0.8352108955383301, "memory(GiB)": 91.52, "step": 66965, "token_acc": 0.7786631860960362, "train_speed(iter/s)": 0.140092 }, { "epoch": 0.868981311776403, "grad_norm": 0.7000858783721924, "learning_rate": 6.47429573099622e-05, "loss": 0.868824577331543, "memory(GiB)": 91.52, "step": 66970, "token_acc": 0.7453967467591446, "train_speed(iter/s)": 0.14009 }, { "epoch": 0.8690461901780587, "grad_norm": 0.8000009059906006, "learning_rate": 6.473783194132605e-05, "loss": 0.905783748626709, "memory(GiB)": 91.52, "step": 66975, "token_acc": 0.7659059700516576, "train_speed(iter/s)": 0.140089 }, { "epoch": 0.8691110685797144, "grad_norm": 0.8706633448600769, "learning_rate": 6.473270640308788e-05, "loss": 0.9228535652160644, "memory(GiB)": 91.52, "step": 66980, "token_acc": 0.765373771241332, "train_speed(iter/s)": 0.140087 }, { "epoch": 0.8691759469813701, "grad_norm": 0.7443991303443909, "learning_rate": 6.472758069530668e-05, "loss": 0.8802703857421875, "memory(GiB)": 91.52, "step": 66985, "token_acc": 0.7621886929649154, "train_speed(iter/s)": 0.140086 }, { "epoch": 0.8692408253830258, "grad_norm": 0.7319223880767822, "learning_rate": 6.472245481804142e-05, "loss": 0.8577253341674804, "memory(GiB)": 91.52, "step": 66990, "token_acc": 0.7577367370222475, "train_speed(iter/s)": 0.140084 }, { "epoch": 0.8693057037846815, "grad_norm": 0.7224172949790955, "learning_rate": 6.471732877135111e-05, "loss": 0.8264554023742676, "memory(GiB)": 91.52, "step": 66995, "token_acc": 0.77509429178156, "train_speed(iter/s)": 0.140083 }, { "epoch": 0.8693705821863372, "grad_norm": 0.7395082116127014, "learning_rate": 6.471220255529472e-05, "loss": 0.8714728355407715, "memory(GiB)": 91.52, "step": 67000, "token_acc": 0.7634241748294701, "train_speed(iter/s)": 0.140081 }, { "epoch": 0.8694354605879929, "grad_norm": 0.8267716765403748, "learning_rate": 6.470707616993126e-05, "loss": 0.8466587066650391, "memory(GiB)": 91.52, "step": 67005, "token_acc": 0.7676928737887894, "train_speed(iter/s)": 0.14008 }, { "epoch": 0.8695003389896486, "grad_norm": 0.7934616208076477, "learning_rate": 6.470194961531969e-05, "loss": 0.8678089141845703, "memory(GiB)": 91.52, "step": 67010, "token_acc": 0.7763359335131589, "train_speed(iter/s)": 0.140079 }, { "epoch": 0.8695652173913043, "grad_norm": 0.7605084776878357, "learning_rate": 6.469682289151905e-05, "loss": 0.8751232147216796, "memory(GiB)": 91.52, "step": 67015, "token_acc": 0.7536696412311055, "train_speed(iter/s)": 0.140077 }, { "epoch": 0.86963009579296, "grad_norm": 0.7611925601959229, "learning_rate": 6.46916959985883e-05, "loss": 0.8806203842163086, "memory(GiB)": 91.52, "step": 67020, "token_acc": 0.7579230737855366, "train_speed(iter/s)": 0.140076 }, { "epoch": 0.8696949741946157, "grad_norm": 0.739993691444397, "learning_rate": 6.468656893658649e-05, "loss": 0.8622711181640625, "memory(GiB)": 91.52, "step": 67025, "token_acc": 0.7539308472531202, "train_speed(iter/s)": 0.140074 }, { "epoch": 0.8697598525962714, "grad_norm": 0.7162253260612488, "learning_rate": 6.468144170557255e-05, "loss": 0.8298593521118164, "memory(GiB)": 91.52, "step": 67030, "token_acc": 0.7626752584619743, "train_speed(iter/s)": 0.140073 }, { "epoch": 0.8698247309979271, "grad_norm": 0.7919167280197144, "learning_rate": 6.467631430560555e-05, "loss": 0.8066434860229492, "memory(GiB)": 91.52, "step": 67035, "token_acc": 0.7758735164743962, "train_speed(iter/s)": 0.140071 }, { "epoch": 0.8698896093995828, "grad_norm": 0.7842411398887634, "learning_rate": 6.467118673674447e-05, "loss": 0.8675956726074219, "memory(GiB)": 91.52, "step": 67040, "token_acc": 0.7551761020704408, "train_speed(iter/s)": 0.14007 }, { "epoch": 0.8699544878012385, "grad_norm": 0.7340635657310486, "learning_rate": 6.46660589990483e-05, "loss": 0.9198299407958984, "memory(GiB)": 91.52, "step": 67045, "token_acc": 0.7497122593088001, "train_speed(iter/s)": 0.140068 }, { "epoch": 0.8700193662028942, "grad_norm": 0.7117020487785339, "learning_rate": 6.466093109257608e-05, "loss": 0.86337890625, "memory(GiB)": 91.52, "step": 67050, "token_acc": 0.7436622633049357, "train_speed(iter/s)": 0.140067 }, { "epoch": 0.8700842446045499, "grad_norm": 0.6760711073875427, "learning_rate": 6.465580301738681e-05, "loss": 0.8259500503540039, "memory(GiB)": 91.52, "step": 67055, "token_acc": 0.7824915184374901, "train_speed(iter/s)": 0.140065 }, { "epoch": 0.8701491230062056, "grad_norm": 0.6745884418487549, "learning_rate": 6.465067477353949e-05, "loss": 0.8777094841003418, "memory(GiB)": 91.52, "step": 67060, "token_acc": 0.7804530051721063, "train_speed(iter/s)": 0.140063 }, { "epoch": 0.8702140014078613, "grad_norm": 0.8079754710197449, "learning_rate": 6.464554636109316e-05, "loss": 0.8508829116821289, "memory(GiB)": 91.52, "step": 67065, "token_acc": 0.7595249374027204, "train_speed(iter/s)": 0.140062 }, { "epoch": 0.870278879809517, "grad_norm": 0.8063573837280273, "learning_rate": 6.464041778010682e-05, "loss": 0.837884521484375, "memory(GiB)": 91.52, "step": 67070, "token_acc": 0.7645442792501616, "train_speed(iter/s)": 0.14006 }, { "epoch": 0.8703437582111727, "grad_norm": 0.7654719948768616, "learning_rate": 6.46352890306395e-05, "loss": 0.8458666801452637, "memory(GiB)": 91.52, "step": 67075, "token_acc": 0.7449262142880412, "train_speed(iter/s)": 0.140058 }, { "epoch": 0.8704086366128284, "grad_norm": 0.7401890158653259, "learning_rate": 6.463016011275022e-05, "loss": 0.8675125122070313, "memory(GiB)": 91.52, "step": 67080, "token_acc": 0.7721941929232516, "train_speed(iter/s)": 0.140057 }, { "epoch": 0.8704735150144841, "grad_norm": 0.7109560966491699, "learning_rate": 6.462503102649798e-05, "loss": 0.8984142303466797, "memory(GiB)": 91.52, "step": 67085, "token_acc": 0.7524154165056389, "train_speed(iter/s)": 0.140056 }, { "epoch": 0.8705383934161398, "grad_norm": 0.6861050128936768, "learning_rate": 6.461990177194184e-05, "loss": 0.8802515029907226, "memory(GiB)": 91.52, "step": 67090, "token_acc": 0.7738717514540234, "train_speed(iter/s)": 0.140054 }, { "epoch": 0.8706032718177955, "grad_norm": 0.7669393420219421, "learning_rate": 6.461477234914081e-05, "loss": 0.8712693214416504, "memory(GiB)": 91.52, "step": 67095, "token_acc": 0.7524588567533352, "train_speed(iter/s)": 0.140052 }, { "epoch": 0.8706681502194512, "grad_norm": 0.6788253784179688, "learning_rate": 6.460964275815392e-05, "loss": 0.8383424758911133, "memory(GiB)": 91.52, "step": 67100, "token_acc": 0.7718420582454887, "train_speed(iter/s)": 0.140051 }, { "epoch": 0.8707330286211069, "grad_norm": 0.7493159174919128, "learning_rate": 6.460451299904017e-05, "loss": 0.860074806213379, "memory(GiB)": 91.52, "step": 67105, "token_acc": 0.7843378591484288, "train_speed(iter/s)": 0.140049 }, { "epoch": 0.8707979070227626, "grad_norm": 0.6316481828689575, "learning_rate": 6.459938307185867e-05, "loss": 0.8342406272888183, "memory(GiB)": 91.52, "step": 67110, "token_acc": 0.76589451294493, "train_speed(iter/s)": 0.140047 }, { "epoch": 0.8708627854244183, "grad_norm": 0.7777091264724731, "learning_rate": 6.45942529766684e-05, "loss": 0.878233528137207, "memory(GiB)": 91.52, "step": 67115, "token_acc": 0.7433416935387556, "train_speed(iter/s)": 0.140045 }, { "epoch": 0.870927663826074, "grad_norm": 0.7726870775222778, "learning_rate": 6.45891227135284e-05, "loss": 0.8829219818115235, "memory(GiB)": 91.52, "step": 67120, "token_acc": 0.7536730641958967, "train_speed(iter/s)": 0.140044 }, { "epoch": 0.8709925422277297, "grad_norm": 0.7005096673965454, "learning_rate": 6.45839922824977e-05, "loss": 0.8735195159912109, "memory(GiB)": 91.52, "step": 67125, "token_acc": 0.7540049839800641, "train_speed(iter/s)": 0.140042 }, { "epoch": 0.8710574206293854, "grad_norm": 0.7978891730308533, "learning_rate": 6.457886168363538e-05, "loss": 0.9482110977172852, "memory(GiB)": 91.52, "step": 67130, "token_acc": 0.7611496531219029, "train_speed(iter/s)": 0.140041 }, { "epoch": 0.8711222990310411, "grad_norm": 0.7399487495422363, "learning_rate": 6.457373091700043e-05, "loss": 0.897196102142334, "memory(GiB)": 91.52, "step": 67135, "token_acc": 0.7648842337375965, "train_speed(iter/s)": 0.14004 }, { "epoch": 0.8711871774326968, "grad_norm": 0.76608806848526, "learning_rate": 6.456859998265195e-05, "loss": 0.8636404037475586, "memory(GiB)": 91.52, "step": 67140, "token_acc": 0.7712907028142036, "train_speed(iter/s)": 0.140039 }, { "epoch": 0.8712520558343525, "grad_norm": 0.7938820719718933, "learning_rate": 6.456346888064893e-05, "loss": 0.8813409805297852, "memory(GiB)": 91.52, "step": 67145, "token_acc": 0.75984302624807, "train_speed(iter/s)": 0.140038 }, { "epoch": 0.8713169342360082, "grad_norm": 0.7795860767364502, "learning_rate": 6.455833761105047e-05, "loss": 0.8920215606689453, "memory(GiB)": 91.52, "step": 67150, "token_acc": 0.7640718122286807, "train_speed(iter/s)": 0.140036 }, { "epoch": 0.8713818126376639, "grad_norm": 0.7114712595939636, "learning_rate": 6.45532061739156e-05, "loss": 0.8523626327514648, "memory(GiB)": 91.52, "step": 67155, "token_acc": 0.7550225174869846, "train_speed(iter/s)": 0.140034 }, { "epoch": 0.8714466910393196, "grad_norm": 0.9052438735961914, "learning_rate": 6.454807456930335e-05, "loss": 0.8592912673950195, "memory(GiB)": 91.52, "step": 67160, "token_acc": 0.7649487115336502, "train_speed(iter/s)": 0.140032 }, { "epoch": 0.8715115694409753, "grad_norm": 0.8069605827331543, "learning_rate": 6.454294279727282e-05, "loss": 0.8960714340209961, "memory(GiB)": 91.52, "step": 67165, "token_acc": 0.755919265105447, "train_speed(iter/s)": 0.140031 }, { "epoch": 0.871576447842631, "grad_norm": 0.7410001158714294, "learning_rate": 6.453781085788302e-05, "loss": 0.8824882507324219, "memory(GiB)": 91.52, "step": 67170, "token_acc": 0.7482039131763987, "train_speed(iter/s)": 0.140029 }, { "epoch": 0.8716413262442867, "grad_norm": 0.7164748907089233, "learning_rate": 6.453267875119303e-05, "loss": 0.9015844345092774, "memory(GiB)": 91.52, "step": 67175, "token_acc": 0.7454185893210283, "train_speed(iter/s)": 0.140028 }, { "epoch": 0.8717062046459424, "grad_norm": 0.7493430972099304, "learning_rate": 6.452754647726191e-05, "loss": 0.8508926391601562, "memory(GiB)": 91.52, "step": 67180, "token_acc": 0.7769712140175219, "train_speed(iter/s)": 0.140026 }, { "epoch": 0.8717710830475981, "grad_norm": 0.724523663520813, "learning_rate": 6.452241403614874e-05, "loss": 0.8855371475219727, "memory(GiB)": 91.52, "step": 67185, "token_acc": 0.7713059701492537, "train_speed(iter/s)": 0.140025 }, { "epoch": 0.8718359614492537, "grad_norm": 0.7454750537872314, "learning_rate": 6.451728142791253e-05, "loss": 0.8758140563964844, "memory(GiB)": 91.52, "step": 67190, "token_acc": 0.7635952404366211, "train_speed(iter/s)": 0.140023 }, { "epoch": 0.8719008398509094, "grad_norm": 0.75112384557724, "learning_rate": 6.451214865261239e-05, "loss": 0.8285808563232422, "memory(GiB)": 91.52, "step": 67195, "token_acc": 0.7805729760547321, "train_speed(iter/s)": 0.140022 }, { "epoch": 0.8719657182525651, "grad_norm": 0.7820273637771606, "learning_rate": 6.45070157103074e-05, "loss": 0.8264006614685059, "memory(GiB)": 91.52, "step": 67200, "token_acc": 0.7635222847252272, "train_speed(iter/s)": 0.14002 }, { "epoch": 0.8720305966542208, "grad_norm": 0.7840689420700073, "learning_rate": 6.450188260105658e-05, "loss": 0.894994068145752, "memory(GiB)": 91.52, "step": 67205, "token_acc": 0.7568876452862677, "train_speed(iter/s)": 0.140019 }, { "epoch": 0.8720954750558765, "grad_norm": 0.7680323123931885, "learning_rate": 6.449674932491905e-05, "loss": 0.8438383102416992, "memory(GiB)": 91.52, "step": 67210, "token_acc": 0.7847238008638326, "train_speed(iter/s)": 0.140017 }, { "epoch": 0.8721603534575322, "grad_norm": 0.7413983345031738, "learning_rate": 6.449161588195385e-05, "loss": 0.8063791275024415, "memory(GiB)": 91.52, "step": 67215, "token_acc": 0.7926376224248565, "train_speed(iter/s)": 0.140015 }, { "epoch": 0.8722252318591879, "grad_norm": 0.6457734107971191, "learning_rate": 6.448648227222008e-05, "loss": 0.8573743820190429, "memory(GiB)": 91.52, "step": 67220, "token_acc": 0.7510461025735975, "train_speed(iter/s)": 0.140013 }, { "epoch": 0.8722901102608436, "grad_norm": 0.7805152535438538, "learning_rate": 6.448134849577679e-05, "loss": 0.8880779266357421, "memory(GiB)": 91.52, "step": 67225, "token_acc": 0.7497385962619265, "train_speed(iter/s)": 0.140012 }, { "epoch": 0.8723549886624993, "grad_norm": 0.772103488445282, "learning_rate": 6.447621455268307e-05, "loss": 0.8806018829345703, "memory(GiB)": 91.52, "step": 67230, "token_acc": 0.747526113249038, "train_speed(iter/s)": 0.140011 }, { "epoch": 0.872419867064155, "grad_norm": 0.8353951573371887, "learning_rate": 6.4471080442998e-05, "loss": 0.8749972343444824, "memory(GiB)": 91.52, "step": 67235, "token_acc": 0.7638863224805056, "train_speed(iter/s)": 0.140009 }, { "epoch": 0.8724847454658107, "grad_norm": 0.774868369102478, "learning_rate": 6.446594616678068e-05, "loss": 0.8434252738952637, "memory(GiB)": 91.52, "step": 67240, "token_acc": 0.7663840183960908, "train_speed(iter/s)": 0.140007 }, { "epoch": 0.8725496238674664, "grad_norm": 0.7071653008460999, "learning_rate": 6.446081172409018e-05, "loss": 0.8603574752807617, "memory(GiB)": 91.52, "step": 67245, "token_acc": 0.7493586893320108, "train_speed(iter/s)": 0.140006 }, { "epoch": 0.8726145022691221, "grad_norm": 0.7146113514900208, "learning_rate": 6.44556771149856e-05, "loss": 0.817742919921875, "memory(GiB)": 91.52, "step": 67250, "token_acc": 0.7793427230046949, "train_speed(iter/s)": 0.140004 }, { "epoch": 0.8726793806707778, "grad_norm": 0.6468207836151123, "learning_rate": 6.445054233952601e-05, "loss": 0.8385737419128418, "memory(GiB)": 91.52, "step": 67255, "token_acc": 0.7661290322580645, "train_speed(iter/s)": 0.140003 }, { "epoch": 0.8727442590724335, "grad_norm": 0.7690474390983582, "learning_rate": 6.44454073977705e-05, "loss": 0.838896369934082, "memory(GiB)": 91.52, "step": 67260, "token_acc": 0.7781318950723216, "train_speed(iter/s)": 0.140001 }, { "epoch": 0.8728091374740892, "grad_norm": 0.7566207647323608, "learning_rate": 6.444027228977818e-05, "loss": 0.8597028732299805, "memory(GiB)": 91.52, "step": 67265, "token_acc": 0.7544367862018446, "train_speed(iter/s)": 0.139999 }, { "epoch": 0.8728740158757449, "grad_norm": 0.7156299948692322, "learning_rate": 6.443513701560814e-05, "loss": 0.8843227386474609, "memory(GiB)": 91.52, "step": 67270, "token_acc": 0.7464468629961588, "train_speed(iter/s)": 0.139998 }, { "epoch": 0.8729388942774006, "grad_norm": 0.6983166337013245, "learning_rate": 6.443000157531947e-05, "loss": 0.8362244606018067, "memory(GiB)": 91.52, "step": 67275, "token_acc": 0.7687439306164819, "train_speed(iter/s)": 0.139996 }, { "epoch": 0.8730037726790563, "grad_norm": 0.7367948293685913, "learning_rate": 6.442486596897126e-05, "loss": 0.902376651763916, "memory(GiB)": 91.52, "step": 67280, "token_acc": 0.750612860644918, "train_speed(iter/s)": 0.139994 }, { "epoch": 0.873068651080712, "grad_norm": 0.7574191093444824, "learning_rate": 6.441973019662263e-05, "loss": 0.9064878463745117, "memory(GiB)": 91.52, "step": 67285, "token_acc": 0.757453470168725, "train_speed(iter/s)": 0.139993 }, { "epoch": 0.8731335294823677, "grad_norm": 0.7968020439147949, "learning_rate": 6.441459425833267e-05, "loss": 0.8462155342102051, "memory(GiB)": 91.52, "step": 67290, "token_acc": 0.7580581450466108, "train_speed(iter/s)": 0.139991 }, { "epoch": 0.8731984078840234, "grad_norm": 0.7342252731323242, "learning_rate": 6.440945815416048e-05, "loss": 0.8583254814147949, "memory(GiB)": 91.52, "step": 67295, "token_acc": 0.7748147604724525, "train_speed(iter/s)": 0.13999 }, { "epoch": 0.8732632862856791, "grad_norm": 0.7151498198509216, "learning_rate": 6.440432188416518e-05, "loss": 0.8388236999511719, "memory(GiB)": 91.52, "step": 67300, "token_acc": 0.7933935240581602, "train_speed(iter/s)": 0.139988 }, { "epoch": 0.8733281646873348, "grad_norm": 0.7786622643470764, "learning_rate": 6.439918544840588e-05, "loss": 0.8662759780883789, "memory(GiB)": 91.52, "step": 67305, "token_acc": 0.7596530677802763, "train_speed(iter/s)": 0.139986 }, { "epoch": 0.8733930430889905, "grad_norm": 0.7545990347862244, "learning_rate": 6.439404884694166e-05, "loss": 0.9071410179138184, "memory(GiB)": 91.52, "step": 67310, "token_acc": 0.7539971765323878, "train_speed(iter/s)": 0.139985 }, { "epoch": 0.8734579214906462, "grad_norm": 0.6979559659957886, "learning_rate": 6.438891207983167e-05, "loss": 0.8761692047119141, "memory(GiB)": 91.52, "step": 67315, "token_acc": 0.7753540266056358, "train_speed(iter/s)": 0.139983 }, { "epoch": 0.8735227998923019, "grad_norm": 0.7496432662010193, "learning_rate": 6.4383775147135e-05, "loss": 0.8386353492736817, "memory(GiB)": 91.52, "step": 67320, "token_acc": 0.7732496625196104, "train_speed(iter/s)": 0.139982 }, { "epoch": 0.8735876782939576, "grad_norm": 0.6756300926208496, "learning_rate": 6.437863804891078e-05, "loss": 0.8454330444335938, "memory(GiB)": 91.52, "step": 67325, "token_acc": 0.7820997658923105, "train_speed(iter/s)": 0.139981 }, { "epoch": 0.8736525566956133, "grad_norm": 0.70381098985672, "learning_rate": 6.43735007852181e-05, "loss": 0.8624097824096679, "memory(GiB)": 91.52, "step": 67330, "token_acc": 0.7781549004396173, "train_speed(iter/s)": 0.139979 }, { "epoch": 0.873717435097269, "grad_norm": 0.7186304926872253, "learning_rate": 6.436836335611611e-05, "loss": 0.8336906433105469, "memory(GiB)": 91.52, "step": 67335, "token_acc": 0.7852307692307692, "train_speed(iter/s)": 0.139977 }, { "epoch": 0.8737823134989247, "grad_norm": 0.7370535135269165, "learning_rate": 6.436322576166393e-05, "loss": 0.8886812210083008, "memory(GiB)": 91.52, "step": 67340, "token_acc": 0.7604446893332901, "train_speed(iter/s)": 0.139975 }, { "epoch": 0.8738471919005804, "grad_norm": 0.7454913258552551, "learning_rate": 6.435808800192066e-05, "loss": 0.8440752029418945, "memory(GiB)": 91.52, "step": 67345, "token_acc": 0.7766660062083086, "train_speed(iter/s)": 0.139973 }, { "epoch": 0.873912070302236, "grad_norm": 0.7617204189300537, "learning_rate": 6.435295007694544e-05, "loss": 0.8389387130737305, "memory(GiB)": 91.52, "step": 67350, "token_acc": 0.7545629915417718, "train_speed(iter/s)": 0.139971 }, { "epoch": 0.8739769487038918, "grad_norm": 0.867487370967865, "learning_rate": 6.434781198679738e-05, "loss": 0.8519985198974609, "memory(GiB)": 91.52, "step": 67355, "token_acc": 0.760092931139549, "train_speed(iter/s)": 0.139969 }, { "epoch": 0.8740418271055475, "grad_norm": 0.6950706243515015, "learning_rate": 6.434267373153564e-05, "loss": 0.892400074005127, "memory(GiB)": 91.52, "step": 67360, "token_acc": 0.7357616524415395, "train_speed(iter/s)": 0.139967 }, { "epoch": 0.8741067055072032, "grad_norm": 0.7267050743103027, "learning_rate": 6.433753531121933e-05, "loss": 0.8425424575805665, "memory(GiB)": 91.52, "step": 67365, "token_acc": 0.7704133064516129, "train_speed(iter/s)": 0.139966 }, { "epoch": 0.8741715839088589, "grad_norm": 0.7936752438545227, "learning_rate": 6.433239672590758e-05, "loss": 0.854179573059082, "memory(GiB)": 91.52, "step": 67370, "token_acc": 0.7633046497026997, "train_speed(iter/s)": 0.139965 }, { "epoch": 0.8742364623105146, "grad_norm": 0.776966392993927, "learning_rate": 6.432725797565953e-05, "loss": 0.8568344116210938, "memory(GiB)": 91.52, "step": 67375, "token_acc": 0.7538036965757653, "train_speed(iter/s)": 0.139963 }, { "epoch": 0.8743013407121702, "grad_norm": 0.7794657349586487, "learning_rate": 6.432211906053434e-05, "loss": 0.8454649925231934, "memory(GiB)": 91.52, "step": 67380, "token_acc": 0.7788743086433142, "train_speed(iter/s)": 0.139962 }, { "epoch": 0.874366219113826, "grad_norm": 0.7282135486602783, "learning_rate": 6.43169799805911e-05, "loss": 0.8475650787353516, "memory(GiB)": 91.52, "step": 67385, "token_acc": 0.768977184002776, "train_speed(iter/s)": 0.13996 }, { "epoch": 0.8744310975154816, "grad_norm": 0.7830336093902588, "learning_rate": 6.431184073588899e-05, "loss": 0.865504264831543, "memory(GiB)": 91.52, "step": 67390, "token_acc": 0.7650269651903906, "train_speed(iter/s)": 0.139958 }, { "epoch": 0.8744959759171373, "grad_norm": 0.7792172431945801, "learning_rate": 6.430670132648712e-05, "loss": 0.841864013671875, "memory(GiB)": 91.52, "step": 67395, "token_acc": 0.7642046570071073, "train_speed(iter/s)": 0.139956 }, { "epoch": 0.874560854318793, "grad_norm": 0.691892683506012, "learning_rate": 6.430156175244468e-05, "loss": 0.8324921607971192, "memory(GiB)": 91.52, "step": 67400, "token_acc": 0.7524135898002801, "train_speed(iter/s)": 0.139955 }, { "epoch": 0.8746257327204487, "grad_norm": 0.7541003823280334, "learning_rate": 6.429642201382076e-05, "loss": 0.8642833709716797, "memory(GiB)": 91.52, "step": 67405, "token_acc": 0.7803034690449991, "train_speed(iter/s)": 0.139953 }, { "epoch": 0.8746906111221044, "grad_norm": 0.7229104042053223, "learning_rate": 6.429128211067454e-05, "loss": 0.8332615852355957, "memory(GiB)": 91.52, "step": 67410, "token_acc": 0.776325903151422, "train_speed(iter/s)": 0.139952 }, { "epoch": 0.8747554895237601, "grad_norm": 0.6621785759925842, "learning_rate": 6.428614204306519e-05, "loss": 0.8627229690551758, "memory(GiB)": 91.52, "step": 67415, "token_acc": 0.7581924801655744, "train_speed(iter/s)": 0.13995 }, { "epoch": 0.8748203679254158, "grad_norm": 0.804410994052887, "learning_rate": 6.428100181105181e-05, "loss": 0.870820426940918, "memory(GiB)": 91.52, "step": 67420, "token_acc": 0.748730175183995, "train_speed(iter/s)": 0.139948 }, { "epoch": 0.8748852463270714, "grad_norm": 0.750777542591095, "learning_rate": 6.42758614146936e-05, "loss": 0.8590756416320801, "memory(GiB)": 91.52, "step": 67425, "token_acc": 0.7500513705667201, "train_speed(iter/s)": 0.139947 }, { "epoch": 0.8749501247287271, "grad_norm": 0.7155404686927795, "learning_rate": 6.427072085404968e-05, "loss": 0.8664229393005372, "memory(GiB)": 91.52, "step": 67430, "token_acc": 0.7521957116950538, "train_speed(iter/s)": 0.139946 }, { "epoch": 0.8750150031303828, "grad_norm": 0.7167901992797852, "learning_rate": 6.426558012917923e-05, "loss": 0.867333984375, "memory(GiB)": 91.52, "step": 67435, "token_acc": 0.7660678642714571, "train_speed(iter/s)": 0.139944 }, { "epoch": 0.8750798815320385, "grad_norm": 0.7078713178634644, "learning_rate": 6.42604392401414e-05, "loss": 0.8094279289245605, "memory(GiB)": 91.52, "step": 67440, "token_acc": 0.7817876175828893, "train_speed(iter/s)": 0.139943 }, { "epoch": 0.8751447599336942, "grad_norm": 0.6939792037010193, "learning_rate": 6.425529818699535e-05, "loss": 0.888491439819336, "memory(GiB)": 91.52, "step": 67445, "token_acc": 0.7565077814222905, "train_speed(iter/s)": 0.139941 }, { "epoch": 0.8752096383353499, "grad_norm": 0.7634954452514648, "learning_rate": 6.425015696980025e-05, "loss": 0.8647634506225585, "memory(GiB)": 91.52, "step": 67450, "token_acc": 0.7629278363818187, "train_speed(iter/s)": 0.13994 }, { "epoch": 0.8752745167370056, "grad_norm": 0.7693793177604675, "learning_rate": 6.424501558861527e-05, "loss": 0.9049559593200683, "memory(GiB)": 91.52, "step": 67455, "token_acc": 0.7542025820039783, "train_speed(iter/s)": 0.139939 }, { "epoch": 0.8753393951386613, "grad_norm": 0.7015870213508606, "learning_rate": 6.423987404349954e-05, "loss": 0.7934399604797363, "memory(GiB)": 91.52, "step": 67460, "token_acc": 0.7728248707170291, "train_speed(iter/s)": 0.139937 }, { "epoch": 0.875404273540317, "grad_norm": 0.6950875520706177, "learning_rate": 6.423473233451228e-05, "loss": 0.8426228523254394, "memory(GiB)": 91.52, "step": 67465, "token_acc": 0.7727767903216559, "train_speed(iter/s)": 0.139935 }, { "epoch": 0.8754691519419727, "grad_norm": 0.8200104832649231, "learning_rate": 6.422959046171263e-05, "loss": 0.8574979782104493, "memory(GiB)": 91.52, "step": 67470, "token_acc": 0.7758393680052666, "train_speed(iter/s)": 0.139934 }, { "epoch": 0.8755340303436284, "grad_norm": 0.7570304274559021, "learning_rate": 6.422444842515978e-05, "loss": 0.8099980354309082, "memory(GiB)": 91.52, "step": 67475, "token_acc": 0.7936546477277326, "train_speed(iter/s)": 0.139932 }, { "epoch": 0.8755989087452841, "grad_norm": 0.7289131879806519, "learning_rate": 6.421930622491288e-05, "loss": 0.906191062927246, "memory(GiB)": 91.52, "step": 67480, "token_acc": 0.7502620654954086, "train_speed(iter/s)": 0.13993 }, { "epoch": 0.8756637871469398, "grad_norm": 0.7010074257850647, "learning_rate": 6.421416386103111e-05, "loss": 0.8440034866333008, "memory(GiB)": 91.52, "step": 67485, "token_acc": 0.7872021521906226, "train_speed(iter/s)": 0.139929 }, { "epoch": 0.8757286655485955, "grad_norm": 0.8280858993530273, "learning_rate": 6.420902133357368e-05, "loss": 0.9035736083984375, "memory(GiB)": 91.52, "step": 67490, "token_acc": 0.7656377879106746, "train_speed(iter/s)": 0.139927 }, { "epoch": 0.8757935439502512, "grad_norm": 0.704589307308197, "learning_rate": 6.420387864259973e-05, "loss": 0.848147201538086, "memory(GiB)": 91.52, "step": 67495, "token_acc": 0.7598601257523284, "train_speed(iter/s)": 0.139926 }, { "epoch": 0.8758584223519069, "grad_norm": 0.7632250785827637, "learning_rate": 6.419873578816847e-05, "loss": 0.8720203399658203, "memory(GiB)": 91.52, "step": 67500, "token_acc": 0.7677364465661484, "train_speed(iter/s)": 0.139924 }, { "epoch": 0.8759233007535626, "grad_norm": 0.6556815505027771, "learning_rate": 6.419359277033907e-05, "loss": 0.8821886062622071, "memory(GiB)": 91.52, "step": 67505, "token_acc": 0.7518045727696192, "train_speed(iter/s)": 0.139923 }, { "epoch": 0.8759881791552183, "grad_norm": 0.800948977470398, "learning_rate": 6.41884495891707e-05, "loss": 0.8576574325561523, "memory(GiB)": 91.52, "step": 67510, "token_acc": 0.774317617866005, "train_speed(iter/s)": 0.139921 }, { "epoch": 0.876053057556874, "grad_norm": 0.7523237466812134, "learning_rate": 6.418330624472258e-05, "loss": 0.8620084762573242, "memory(GiB)": 91.52, "step": 67515, "token_acc": 0.7620980091883615, "train_speed(iter/s)": 0.139919 }, { "epoch": 0.8761179359585297, "grad_norm": 0.7840429544448853, "learning_rate": 6.417816273705388e-05, "loss": 0.8703799247741699, "memory(GiB)": 91.52, "step": 67520, "token_acc": 0.7432299359921221, "train_speed(iter/s)": 0.139918 }, { "epoch": 0.8761828143601854, "grad_norm": 0.701776921749115, "learning_rate": 6.417301906622382e-05, "loss": 0.835388469696045, "memory(GiB)": 91.52, "step": 67525, "token_acc": 0.7607885119421167, "train_speed(iter/s)": 0.139916 }, { "epoch": 0.8762476927618411, "grad_norm": 0.7335100769996643, "learning_rate": 6.416787523229153e-05, "loss": 0.8648380279541016, "memory(GiB)": 91.52, "step": 67530, "token_acc": 0.7422979340340703, "train_speed(iter/s)": 0.139914 }, { "epoch": 0.8763125711634968, "grad_norm": 0.7731450200080872, "learning_rate": 6.416273123531626e-05, "loss": 0.8132391929626465, "memory(GiB)": 91.52, "step": 67535, "token_acc": 0.7894077595066804, "train_speed(iter/s)": 0.139913 }, { "epoch": 0.8763774495651525, "grad_norm": 0.7338220477104187, "learning_rate": 6.415758707535719e-05, "loss": 0.8501548767089844, "memory(GiB)": 91.52, "step": 67540, "token_acc": 0.7852857721929469, "train_speed(iter/s)": 0.139911 }, { "epoch": 0.8764423279668082, "grad_norm": 0.6650730967521667, "learning_rate": 6.415244275247352e-05, "loss": 0.8375580787658692, "memory(GiB)": 91.52, "step": 67545, "token_acc": 0.7676479085743636, "train_speed(iter/s)": 0.13991 }, { "epoch": 0.8765072063684639, "grad_norm": 0.7164019346237183, "learning_rate": 6.414729826672446e-05, "loss": 0.8803620338439941, "memory(GiB)": 91.52, "step": 67550, "token_acc": 0.7635120468851747, "train_speed(iter/s)": 0.139908 }, { "epoch": 0.8765720847701196, "grad_norm": 0.7853261232376099, "learning_rate": 6.41421536181692e-05, "loss": 0.8719259262084961, "memory(GiB)": 91.52, "step": 67555, "token_acc": 0.7518842206542655, "train_speed(iter/s)": 0.139907 }, { "epoch": 0.8766369631717753, "grad_norm": 0.6608323454856873, "learning_rate": 6.413700880686694e-05, "loss": 0.8605763435363769, "memory(GiB)": 91.52, "step": 67560, "token_acc": 0.759009702756815, "train_speed(iter/s)": 0.139906 }, { "epoch": 0.876701841573431, "grad_norm": 0.7156742215156555, "learning_rate": 6.413186383287688e-05, "loss": 0.8768473625183105, "memory(GiB)": 91.52, "step": 67565, "token_acc": 0.7665593715855064, "train_speed(iter/s)": 0.139904 }, { "epoch": 0.8767667199750867, "grad_norm": 0.6283711791038513, "learning_rate": 6.412671869625826e-05, "loss": 0.8274030685424805, "memory(GiB)": 91.52, "step": 67570, "token_acc": 0.7667145000326137, "train_speed(iter/s)": 0.139902 }, { "epoch": 0.8768315983767424, "grad_norm": 0.7040917873382568, "learning_rate": 6.412157339707027e-05, "loss": 0.839175796508789, "memory(GiB)": 91.52, "step": 67575, "token_acc": 0.7474786520012651, "train_speed(iter/s)": 0.139901 }, { "epoch": 0.8768964767783981, "grad_norm": 0.71263587474823, "learning_rate": 6.411642793537211e-05, "loss": 0.8501834869384766, "memory(GiB)": 91.52, "step": 67580, "token_acc": 0.7548339346840596, "train_speed(iter/s)": 0.1399 }, { "epoch": 0.8769613551800538, "grad_norm": 0.7155781388282776, "learning_rate": 6.4111282311223e-05, "loss": 0.8219873428344726, "memory(GiB)": 91.52, "step": 67585, "token_acc": 0.7931201382886777, "train_speed(iter/s)": 0.139898 }, { "epoch": 0.8770262335817095, "grad_norm": 0.7437348961830139, "learning_rate": 6.410613652468218e-05, "loss": 0.8559057235717773, "memory(GiB)": 91.52, "step": 67590, "token_acc": 0.7734345991561181, "train_speed(iter/s)": 0.139897 }, { "epoch": 0.8770911119833652, "grad_norm": 0.6996463537216187, "learning_rate": 6.410099057580885e-05, "loss": 0.9192399978637695, "memory(GiB)": 91.52, "step": 67595, "token_acc": 0.7693736390418855, "train_speed(iter/s)": 0.139895 }, { "epoch": 0.8771559903850209, "grad_norm": 0.7860105633735657, "learning_rate": 6.409584446466221e-05, "loss": 0.8496252059936523, "memory(GiB)": 91.52, "step": 67600, "token_acc": 0.7510833371345162, "train_speed(iter/s)": 0.139894 }, { "epoch": 0.8772208687866766, "grad_norm": 0.7978865504264832, "learning_rate": 6.409069819130152e-05, "loss": 0.8486154556274415, "memory(GiB)": 91.52, "step": 67605, "token_acc": 0.7913448534936138, "train_speed(iter/s)": 0.139892 }, { "epoch": 0.8772857471883323, "grad_norm": 0.7446401119232178, "learning_rate": 6.408555175578596e-05, "loss": 0.8737314224243165, "memory(GiB)": 91.52, "step": 67610, "token_acc": 0.7652723663897669, "train_speed(iter/s)": 0.13989 }, { "epoch": 0.877350625589988, "grad_norm": 0.7178497314453125, "learning_rate": 6.408040515817479e-05, "loss": 0.8612399101257324, "memory(GiB)": 91.52, "step": 67615, "token_acc": 0.7587783951317232, "train_speed(iter/s)": 0.139889 }, { "epoch": 0.8774155039916437, "grad_norm": 0.7435327172279358, "learning_rate": 6.407525839852722e-05, "loss": 0.881764030456543, "memory(GiB)": 91.52, "step": 67620, "token_acc": 0.7356678904264332, "train_speed(iter/s)": 0.139888 }, { "epoch": 0.8774803823932994, "grad_norm": 0.7757409811019897, "learning_rate": 6.407011147690247e-05, "loss": 0.8441462516784668, "memory(GiB)": 91.52, "step": 67625, "token_acc": 0.7771677927927928, "train_speed(iter/s)": 0.139887 }, { "epoch": 0.8775452607949551, "grad_norm": 0.7586643695831299, "learning_rate": 6.406496439335982e-05, "loss": 0.8768833160400391, "memory(GiB)": 91.52, "step": 67630, "token_acc": 0.7635765673175745, "train_speed(iter/s)": 0.139885 }, { "epoch": 0.8776101391966108, "grad_norm": 0.7413197159767151, "learning_rate": 6.405981714795843e-05, "loss": 0.8755827903747558, "memory(GiB)": 91.52, "step": 67635, "token_acc": 0.7431685256799514, "train_speed(iter/s)": 0.139883 }, { "epoch": 0.8776750175982665, "grad_norm": 0.7313454151153564, "learning_rate": 6.405466974075758e-05, "loss": 0.8912555694580078, "memory(GiB)": 91.52, "step": 67640, "token_acc": 0.770285033270434, "train_speed(iter/s)": 0.139881 }, { "epoch": 0.8777398959999222, "grad_norm": 0.760227620601654, "learning_rate": 6.40495221718165e-05, "loss": 0.9161862373352051, "memory(GiB)": 91.52, "step": 67645, "token_acc": 0.7557719259423176, "train_speed(iter/s)": 0.13988 }, { "epoch": 0.8778047744015779, "grad_norm": 0.8354359269142151, "learning_rate": 6.404437444119443e-05, "loss": 0.8496750831604004, "memory(GiB)": 91.52, "step": 67650, "token_acc": 0.7333699332052338, "train_speed(iter/s)": 0.139879 }, { "epoch": 0.8778696528032336, "grad_norm": 0.6699796915054321, "learning_rate": 6.403922654895058e-05, "loss": 0.8419986724853515, "memory(GiB)": 91.52, "step": 67655, "token_acc": 0.7689762737498819, "train_speed(iter/s)": 0.139877 }, { "epoch": 0.8779345312048893, "grad_norm": 0.7431786060333252, "learning_rate": 6.403407849514423e-05, "loss": 0.9195865631103516, "memory(GiB)": 91.52, "step": 67660, "token_acc": 0.7519567107933134, "train_speed(iter/s)": 0.139876 }, { "epoch": 0.8779994096065449, "grad_norm": 0.6974471807479858, "learning_rate": 6.402893027983459e-05, "loss": 0.8264005661010743, "memory(GiB)": 91.52, "step": 67665, "token_acc": 0.7543940264215968, "train_speed(iter/s)": 0.139875 }, { "epoch": 0.8780642880082006, "grad_norm": 0.7695308327674866, "learning_rate": 6.402378190308093e-05, "loss": 0.8143251419067383, "memory(GiB)": 91.52, "step": 67670, "token_acc": 0.7597983901301437, "train_speed(iter/s)": 0.139873 }, { "epoch": 0.8781291664098563, "grad_norm": 0.8690938949584961, "learning_rate": 6.401863336494252e-05, "loss": 0.8923063278198242, "memory(GiB)": 91.52, "step": 67675, "token_acc": 0.7502743376704812, "train_speed(iter/s)": 0.139872 }, { "epoch": 0.878194044811512, "grad_norm": 0.7366236448287964, "learning_rate": 6.401348466547855e-05, "loss": 0.8563332557678223, "memory(GiB)": 91.52, "step": 67680, "token_acc": 0.7785444813465321, "train_speed(iter/s)": 0.13987 }, { "epoch": 0.8782589232131677, "grad_norm": 0.7184737324714661, "learning_rate": 6.400833580474831e-05, "loss": 0.8883050918579102, "memory(GiB)": 91.52, "step": 67685, "token_acc": 0.7604547436198397, "train_speed(iter/s)": 0.139868 }, { "epoch": 0.8783238016148234, "grad_norm": 0.7291764616966248, "learning_rate": 6.400318678281104e-05, "loss": 0.8461168289184571, "memory(GiB)": 91.52, "step": 67690, "token_acc": 0.7654385054488843, "train_speed(iter/s)": 0.139867 }, { "epoch": 0.8783886800164791, "grad_norm": 0.7803084850311279, "learning_rate": 6.399803759972599e-05, "loss": 0.859132957458496, "memory(GiB)": 91.52, "step": 67695, "token_acc": 0.7609795918367347, "train_speed(iter/s)": 0.139865 }, { "epoch": 0.8784535584181348, "grad_norm": 0.7661375403404236, "learning_rate": 6.399288825555242e-05, "loss": 0.8375985145568847, "memory(GiB)": 91.52, "step": 67700, "token_acc": 0.7733388454600121, "train_speed(iter/s)": 0.139864 }, { "epoch": 0.8785184368197905, "grad_norm": 0.7714676260948181, "learning_rate": 6.398773875034961e-05, "loss": 0.8844818115234375, "memory(GiB)": 91.52, "step": 67705, "token_acc": 0.7633044787354158, "train_speed(iter/s)": 0.139863 }, { "epoch": 0.8785833152214462, "grad_norm": 0.697472333908081, "learning_rate": 6.398258908417681e-05, "loss": 0.8676235198974609, "memory(GiB)": 91.52, "step": 67710, "token_acc": 0.7799477327306117, "train_speed(iter/s)": 0.139861 }, { "epoch": 0.8786481936231019, "grad_norm": 0.7776253819465637, "learning_rate": 6.397743925709325e-05, "loss": 0.9195581436157226, "memory(GiB)": 91.52, "step": 67715, "token_acc": 0.7552988592608586, "train_speed(iter/s)": 0.13986 }, { "epoch": 0.8787130720247576, "grad_norm": 0.7311967611312866, "learning_rate": 6.397228926915823e-05, "loss": 0.8434934616088867, "memory(GiB)": 91.52, "step": 67720, "token_acc": 0.7730221969265794, "train_speed(iter/s)": 0.139859 }, { "epoch": 0.8787779504264133, "grad_norm": 0.7039095163345337, "learning_rate": 6.396713912043102e-05, "loss": 0.856060791015625, "memory(GiB)": 91.52, "step": 67725, "token_acc": 0.7581634972346389, "train_speed(iter/s)": 0.139857 }, { "epoch": 0.878842828828069, "grad_norm": 0.7054641842842102, "learning_rate": 6.396198881097084e-05, "loss": 0.8462177276611328, "memory(GiB)": 91.52, "step": 67730, "token_acc": 0.7843545089191285, "train_speed(iter/s)": 0.139856 }, { "epoch": 0.8789077072297247, "grad_norm": 0.6948849558830261, "learning_rate": 6.395683834083701e-05, "loss": 0.8388677597045898, "memory(GiB)": 91.52, "step": 67735, "token_acc": 0.7509843903810997, "train_speed(iter/s)": 0.139854 }, { "epoch": 0.8789725856313804, "grad_norm": 0.7452324628829956, "learning_rate": 6.39516877100888e-05, "loss": 0.8656463623046875, "memory(GiB)": 91.52, "step": 67740, "token_acc": 0.7520304750952347, "train_speed(iter/s)": 0.139853 }, { "epoch": 0.8790374640330361, "grad_norm": 0.7323300838470459, "learning_rate": 6.394653691878543e-05, "loss": 0.8223881721496582, "memory(GiB)": 91.52, "step": 67745, "token_acc": 0.7784865495370707, "train_speed(iter/s)": 0.139851 }, { "epoch": 0.8791023424346918, "grad_norm": 0.7295562028884888, "learning_rate": 6.394138596698624e-05, "loss": 0.8770761489868164, "memory(GiB)": 91.52, "step": 67750, "token_acc": 0.7561950439648282, "train_speed(iter/s)": 0.13985 }, { "epoch": 0.8791672208363475, "grad_norm": 0.7133144736289978, "learning_rate": 6.393623485475045e-05, "loss": 0.856219482421875, "memory(GiB)": 91.52, "step": 67755, "token_acc": 0.7588959415272167, "train_speed(iter/s)": 0.139848 }, { "epoch": 0.8792320992380032, "grad_norm": 0.7820491790771484, "learning_rate": 6.39310835821374e-05, "loss": 0.8665787696838378, "memory(GiB)": 91.52, "step": 67760, "token_acc": 0.7593643145788002, "train_speed(iter/s)": 0.139847 }, { "epoch": 0.8792969776396589, "grad_norm": 0.7497772574424744, "learning_rate": 6.39259321492063e-05, "loss": 0.8496556282043457, "memory(GiB)": 91.52, "step": 67765, "token_acc": 0.7691317892455641, "train_speed(iter/s)": 0.139846 }, { "epoch": 0.8793618560413146, "grad_norm": 0.7089889049530029, "learning_rate": 6.39207805560165e-05, "loss": 0.864896011352539, "memory(GiB)": 91.52, "step": 67770, "token_acc": 0.7685852133731993, "train_speed(iter/s)": 0.139844 }, { "epoch": 0.8794267344429703, "grad_norm": 0.7370043396949768, "learning_rate": 6.391562880262724e-05, "loss": 0.856965446472168, "memory(GiB)": 91.52, "step": 67775, "token_acc": 0.7722891152962162, "train_speed(iter/s)": 0.139842 }, { "epoch": 0.879491612844626, "grad_norm": 0.6717169284820557, "learning_rate": 6.391047688909781e-05, "loss": 0.8311608314514161, "memory(GiB)": 91.52, "step": 67780, "token_acc": 0.7663104611923509, "train_speed(iter/s)": 0.139841 }, { "epoch": 0.8795564912462817, "grad_norm": 0.7561540603637695, "learning_rate": 6.390532481548751e-05, "loss": 0.8973829269409179, "memory(GiB)": 91.52, "step": 67785, "token_acc": 0.7664089900508195, "train_speed(iter/s)": 0.13984 }, { "epoch": 0.8796213696479374, "grad_norm": 0.8152998089790344, "learning_rate": 6.390017258185563e-05, "loss": 0.8875886917114257, "memory(GiB)": 91.52, "step": 67790, "token_acc": 0.767139944923286, "train_speed(iter/s)": 0.139839 }, { "epoch": 0.8796862480495931, "grad_norm": 0.7461992502212524, "learning_rate": 6.389502018826147e-05, "loss": 0.8349370956420898, "memory(GiB)": 91.52, "step": 67795, "token_acc": 0.7543580159921571, "train_speed(iter/s)": 0.139837 }, { "epoch": 0.8797511264512488, "grad_norm": 0.8108386993408203, "learning_rate": 6.388986763476431e-05, "loss": 0.8864916801452637, "memory(GiB)": 91.52, "step": 67800, "token_acc": 0.7480957935908873, "train_speed(iter/s)": 0.139836 }, { "epoch": 0.8798160048529045, "grad_norm": 0.7784479856491089, "learning_rate": 6.388471492142344e-05, "loss": 0.8234514236450196, "memory(GiB)": 91.52, "step": 67805, "token_acc": 0.7867824409068982, "train_speed(iter/s)": 0.139834 }, { "epoch": 0.8798808832545602, "grad_norm": 0.8129894733428955, "learning_rate": 6.387956204829816e-05, "loss": 0.8519318580627442, "memory(GiB)": 91.52, "step": 67810, "token_acc": 0.7583259878128233, "train_speed(iter/s)": 0.139832 }, { "epoch": 0.8799457616562159, "grad_norm": 0.8106945753097534, "learning_rate": 6.387440901544779e-05, "loss": 0.891725730895996, "memory(GiB)": 91.52, "step": 67815, "token_acc": 0.7677842292564151, "train_speed(iter/s)": 0.139831 }, { "epoch": 0.8800106400578716, "grad_norm": 0.7517030239105225, "learning_rate": 6.38692558229316e-05, "loss": 0.884330940246582, "memory(GiB)": 91.52, "step": 67820, "token_acc": 0.7590285600689153, "train_speed(iter/s)": 0.13983 }, { "epoch": 0.8800755184595273, "grad_norm": 0.7471114993095398, "learning_rate": 6.386410247080891e-05, "loss": 0.864409065246582, "memory(GiB)": 91.52, "step": 67825, "token_acc": 0.7563879930538328, "train_speed(iter/s)": 0.139828 }, { "epoch": 0.880140396861183, "grad_norm": 0.6800961494445801, "learning_rate": 6.385894895913904e-05, "loss": 0.8374227523803711, "memory(GiB)": 91.52, "step": 67830, "token_acc": 0.7676497944732815, "train_speed(iter/s)": 0.139826 }, { "epoch": 0.8802052752628386, "grad_norm": 0.6934287548065186, "learning_rate": 6.385379528798124e-05, "loss": 0.8444233894348144, "memory(GiB)": 91.52, "step": 67835, "token_acc": 0.7716695299078556, "train_speed(iter/s)": 0.139824 }, { "epoch": 0.8802701536644943, "grad_norm": 0.6084915995597839, "learning_rate": 6.384864145739488e-05, "loss": 0.8405320167541503, "memory(GiB)": 91.52, "step": 67840, "token_acc": 0.7645376239232895, "train_speed(iter/s)": 0.139823 }, { "epoch": 0.88033503206615, "grad_norm": 0.6756540536880493, "learning_rate": 6.384348746743922e-05, "loss": 0.8016226768493653, "memory(GiB)": 91.52, "step": 67845, "token_acc": 0.7837413898929277, "train_speed(iter/s)": 0.139821 }, { "epoch": 0.8803999104678057, "grad_norm": 0.7542067170143127, "learning_rate": 6.383833331817362e-05, "loss": 0.8684827804565429, "memory(GiB)": 91.52, "step": 67850, "token_acc": 0.7733315782545742, "train_speed(iter/s)": 0.13982 }, { "epoch": 0.8804647888694614, "grad_norm": 0.6996771097183228, "learning_rate": 6.383317900965737e-05, "loss": 0.8662117004394532, "memory(GiB)": 91.52, "step": 67855, "token_acc": 0.769180504305043, "train_speed(iter/s)": 0.139818 }, { "epoch": 0.8805296672711171, "grad_norm": 0.8315101265907288, "learning_rate": 6.382802454194978e-05, "loss": 0.8171677589416504, "memory(GiB)": 91.52, "step": 67860, "token_acc": 0.7734417135372524, "train_speed(iter/s)": 0.139817 }, { "epoch": 0.8805945456727728, "grad_norm": 0.7328597903251648, "learning_rate": 6.382286991511019e-05, "loss": 0.8877395629882813, "memory(GiB)": 91.52, "step": 67865, "token_acc": 0.7454153551389079, "train_speed(iter/s)": 0.139815 }, { "epoch": 0.8806594240744285, "grad_norm": 0.7379259467124939, "learning_rate": 6.381771512919787e-05, "loss": 0.863421630859375, "memory(GiB)": 91.52, "step": 67870, "token_acc": 0.7928593733735818, "train_speed(iter/s)": 0.139813 }, { "epoch": 0.8807243024760842, "grad_norm": 0.7147771716117859, "learning_rate": 6.381256018427221e-05, "loss": 0.8315227508544922, "memory(GiB)": 91.52, "step": 67875, "token_acc": 0.7718414270408015, "train_speed(iter/s)": 0.139812 }, { "epoch": 0.8807891808777399, "grad_norm": 0.7294259071350098, "learning_rate": 6.380740508039247e-05, "loss": 0.8352361679077148, "memory(GiB)": 91.52, "step": 67880, "token_acc": 0.7710635246569538, "train_speed(iter/s)": 0.13981 }, { "epoch": 0.8808540592793956, "grad_norm": 0.6972950100898743, "learning_rate": 6.380224981761802e-05, "loss": 0.8678708076477051, "memory(GiB)": 91.52, "step": 67885, "token_acc": 0.7692331232021543, "train_speed(iter/s)": 0.139809 }, { "epoch": 0.8809189376810513, "grad_norm": 0.7594492435455322, "learning_rate": 6.379709439600815e-05, "loss": 0.8331785202026367, "memory(GiB)": 91.52, "step": 67890, "token_acc": 0.7619461337966985, "train_speed(iter/s)": 0.139807 }, { "epoch": 0.880983816082707, "grad_norm": 0.7238640785217285, "learning_rate": 6.37919388156222e-05, "loss": 0.894290542602539, "memory(GiB)": 91.52, "step": 67895, "token_acc": 0.7633933851066346, "train_speed(iter/s)": 0.139806 }, { "epoch": 0.8810486944843627, "grad_norm": 0.7696444392204285, "learning_rate": 6.378678307651952e-05, "loss": 0.8488592147827149, "memory(GiB)": 91.52, "step": 67900, "token_acc": 0.7783477654204457, "train_speed(iter/s)": 0.139804 }, { "epoch": 0.8811135728860183, "grad_norm": 0.6765942573547363, "learning_rate": 6.378162717875944e-05, "loss": 0.8456615447998047, "memory(GiB)": 91.52, "step": 67905, "token_acc": 0.7743131511623595, "train_speed(iter/s)": 0.139802 }, { "epoch": 0.881178451287674, "grad_norm": 0.7710259556770325, "learning_rate": 6.377647112240127e-05, "loss": 0.843657112121582, "memory(GiB)": 91.52, "step": 67910, "token_acc": 0.7688311688311689, "train_speed(iter/s)": 0.139801 }, { "epoch": 0.8812433296893297, "grad_norm": 0.7247003316879272, "learning_rate": 6.377131490750434e-05, "loss": 0.8926772117614746, "memory(GiB)": 91.52, "step": 67915, "token_acc": 0.7484061930783242, "train_speed(iter/s)": 0.139799 }, { "epoch": 0.8813082080909854, "grad_norm": 0.7780964970588684, "learning_rate": 6.376615853412804e-05, "loss": 0.8676454544067382, "memory(GiB)": 91.52, "step": 67920, "token_acc": 0.7808835027365129, "train_speed(iter/s)": 0.139797 }, { "epoch": 0.8813730864926411, "grad_norm": 0.6766973733901978, "learning_rate": 6.376100200233164e-05, "loss": 0.8509298324584961, "memory(GiB)": 91.52, "step": 67925, "token_acc": 0.7689513998943476, "train_speed(iter/s)": 0.139796 }, { "epoch": 0.8814379648942968, "grad_norm": 0.7231237888336182, "learning_rate": 6.375584531217453e-05, "loss": 0.8802103996276855, "memory(GiB)": 91.52, "step": 67930, "token_acc": 0.7528552174221466, "train_speed(iter/s)": 0.139794 }, { "epoch": 0.8815028432959525, "grad_norm": 0.7262580394744873, "learning_rate": 6.375068846371604e-05, "loss": 0.8950569152832031, "memory(GiB)": 91.52, "step": 67935, "token_acc": 0.7644714491049261, "train_speed(iter/s)": 0.139793 }, { "epoch": 0.8815677216976082, "grad_norm": 0.7963489294052124, "learning_rate": 6.37455314570155e-05, "loss": 0.851406478881836, "memory(GiB)": 91.52, "step": 67940, "token_acc": 0.7746299596319598, "train_speed(iter/s)": 0.139791 }, { "epoch": 0.8816326000992639, "grad_norm": 0.6702601313591003, "learning_rate": 6.374037429213229e-05, "loss": 0.8361873626708984, "memory(GiB)": 91.52, "step": 67945, "token_acc": 0.767049777742181, "train_speed(iter/s)": 0.139789 }, { "epoch": 0.8816974785009196, "grad_norm": 0.7630237340927124, "learning_rate": 6.373521696912572e-05, "loss": 0.8758119583129883, "memory(GiB)": 91.52, "step": 67950, "token_acc": 0.7608800147901645, "train_speed(iter/s)": 0.139788 }, { "epoch": 0.8817623569025753, "grad_norm": 0.7200526595115662, "learning_rate": 6.373005948805516e-05, "loss": 0.831439208984375, "memory(GiB)": 91.52, "step": 67955, "token_acc": 0.7741064396354499, "train_speed(iter/s)": 0.139787 }, { "epoch": 0.881827235304231, "grad_norm": 0.7245494723320007, "learning_rate": 6.372490184897995e-05, "loss": 0.8600656509399414, "memory(GiB)": 91.52, "step": 67960, "token_acc": 0.7717519368774323, "train_speed(iter/s)": 0.139786 }, { "epoch": 0.8818921137058867, "grad_norm": 0.658902108669281, "learning_rate": 6.371974405195947e-05, "loss": 0.7842615604400635, "memory(GiB)": 91.52, "step": 67965, "token_acc": 0.771160795604783, "train_speed(iter/s)": 0.139784 }, { "epoch": 0.8819569921075424, "grad_norm": 0.623285174369812, "learning_rate": 6.371458609705306e-05, "loss": 0.8290412902832032, "memory(GiB)": 91.52, "step": 67970, "token_acc": 0.7803164077984792, "train_speed(iter/s)": 0.139783 }, { "epoch": 0.8820218705091981, "grad_norm": 0.6981462240219116, "learning_rate": 6.370942798432006e-05, "loss": 0.8796319961547852, "memory(GiB)": 91.52, "step": 67975, "token_acc": 0.7618645855304788, "train_speed(iter/s)": 0.139782 }, { "epoch": 0.8820867489108538, "grad_norm": 0.853570818901062, "learning_rate": 6.370426971381984e-05, "loss": 0.9049274444580078, "memory(GiB)": 91.52, "step": 67980, "token_acc": 0.746983163523376, "train_speed(iter/s)": 0.13978 }, { "epoch": 0.8821516273125095, "grad_norm": 0.8080524206161499, "learning_rate": 6.369911128561178e-05, "loss": 0.9005721092224122, "memory(GiB)": 91.52, "step": 67985, "token_acc": 0.7471548367774783, "train_speed(iter/s)": 0.139778 }, { "epoch": 0.8822165057141652, "grad_norm": 0.7290884256362915, "learning_rate": 6.369395269975523e-05, "loss": 0.8486082077026367, "memory(GiB)": 91.52, "step": 67990, "token_acc": 0.7582600340010348, "train_speed(iter/s)": 0.139777 }, { "epoch": 0.8822813841158209, "grad_norm": 0.797365665435791, "learning_rate": 6.368879395630954e-05, "loss": 0.8819149017333985, "memory(GiB)": 91.52, "step": 67995, "token_acc": 0.7843414858599087, "train_speed(iter/s)": 0.139775 }, { "epoch": 0.8823462625174766, "grad_norm": 0.7210580706596375, "learning_rate": 6.36836350553341e-05, "loss": 0.8392444610595703, "memory(GiB)": 91.52, "step": 68000, "token_acc": 0.7724793735141938, "train_speed(iter/s)": 0.139774 }, { "epoch": 0.8824111409191323, "grad_norm": 0.712000846862793, "learning_rate": 6.367847599688828e-05, "loss": 0.849397087097168, "memory(GiB)": 91.52, "step": 68005, "token_acc": 0.7627138643067847, "train_speed(iter/s)": 0.139772 }, { "epoch": 0.882476019320788, "grad_norm": 0.6714279055595398, "learning_rate": 6.367331678103142e-05, "loss": 0.8376059532165527, "memory(GiB)": 91.52, "step": 68010, "token_acc": 0.7712146143255314, "train_speed(iter/s)": 0.139771 }, { "epoch": 0.8825408977224437, "grad_norm": 0.710709273815155, "learning_rate": 6.366815740782293e-05, "loss": 0.8162797927856446, "memory(GiB)": 91.52, "step": 68015, "token_acc": 0.7638674323024456, "train_speed(iter/s)": 0.139769 }, { "epoch": 0.8826057761240994, "grad_norm": 0.6943836808204651, "learning_rate": 6.366299787732214e-05, "loss": 0.8600849151611328, "memory(GiB)": 91.52, "step": 68020, "token_acc": 0.7583046535986833, "train_speed(iter/s)": 0.139768 }, { "epoch": 0.8826706545257551, "grad_norm": 0.6855455636978149, "learning_rate": 6.365783818958846e-05, "loss": 0.8662748336791992, "memory(GiB)": 91.52, "step": 68025, "token_acc": 0.7720389897008337, "train_speed(iter/s)": 0.139767 }, { "epoch": 0.8827355329274108, "grad_norm": 0.9056350588798523, "learning_rate": 6.365267834468125e-05, "loss": 0.8797643661499024, "memory(GiB)": 91.52, "step": 68030, "token_acc": 0.7610401255323919, "train_speed(iter/s)": 0.139766 }, { "epoch": 0.8828004113290665, "grad_norm": 0.7261734008789062, "learning_rate": 6.364751834265991e-05, "loss": 0.8163793563842774, "memory(GiB)": 91.52, "step": 68035, "token_acc": 0.7731169504366203, "train_speed(iter/s)": 0.139764 }, { "epoch": 0.8828652897307222, "grad_norm": 0.7384578585624695, "learning_rate": 6.364235818358381e-05, "loss": 0.8541627883911133, "memory(GiB)": 91.52, "step": 68040, "token_acc": 0.7568169801344512, "train_speed(iter/s)": 0.139763 }, { "epoch": 0.8829301681323779, "grad_norm": 0.6826533079147339, "learning_rate": 6.363719786751232e-05, "loss": 0.8538431167602539, "memory(GiB)": 91.52, "step": 68045, "token_acc": 0.7552063894901844, "train_speed(iter/s)": 0.139761 }, { "epoch": 0.8829950465340336, "grad_norm": 0.8509007096290588, "learning_rate": 6.363203739450485e-05, "loss": 0.8614583969116211, "memory(GiB)": 91.52, "step": 68050, "token_acc": 0.7608929416318981, "train_speed(iter/s)": 0.13976 }, { "epoch": 0.8830599249356893, "grad_norm": 0.797907829284668, "learning_rate": 6.362687676462075e-05, "loss": 0.8579206466674805, "memory(GiB)": 91.52, "step": 68055, "token_acc": 0.787540815203434, "train_speed(iter/s)": 0.139758 }, { "epoch": 0.883124803337345, "grad_norm": 0.7061070799827576, "learning_rate": 6.362171597791944e-05, "loss": 0.8728673934936524, "memory(GiB)": 91.52, "step": 68060, "token_acc": 0.7609909281228192, "train_speed(iter/s)": 0.139757 }, { "epoch": 0.8831896817390007, "grad_norm": 0.7448669075965881, "learning_rate": 6.36165550344603e-05, "loss": 0.8569986343383789, "memory(GiB)": 91.52, "step": 68065, "token_acc": 0.7656176608736496, "train_speed(iter/s)": 0.139755 }, { "epoch": 0.8832545601406564, "grad_norm": 0.7269135117530823, "learning_rate": 6.361139393430271e-05, "loss": 0.829596996307373, "memory(GiB)": 91.52, "step": 68070, "token_acc": 0.7666849766547652, "train_speed(iter/s)": 0.139753 }, { "epoch": 0.8833194385423121, "grad_norm": 0.7397145628929138, "learning_rate": 6.360623267750609e-05, "loss": 0.836207389831543, "memory(GiB)": 91.52, "step": 68075, "token_acc": 0.7710367271036727, "train_speed(iter/s)": 0.139751 }, { "epoch": 0.8833843169439678, "grad_norm": 0.8416038751602173, "learning_rate": 6.36010712641298e-05, "loss": 0.8854763984680176, "memory(GiB)": 91.52, "step": 68080, "token_acc": 0.74904832836809, "train_speed(iter/s)": 0.13975 }, { "epoch": 0.8834491953456235, "grad_norm": 0.7495226263999939, "learning_rate": 6.359590969423328e-05, "loss": 0.8214269638061523, "memory(GiB)": 91.52, "step": 68085, "token_acc": 0.7700875099443119, "train_speed(iter/s)": 0.139748 }, { "epoch": 0.8835140737472792, "grad_norm": 0.7316780686378479, "learning_rate": 6.359074796787589e-05, "loss": 0.8303356170654297, "memory(GiB)": 91.52, "step": 68090, "token_acc": 0.7867141529885413, "train_speed(iter/s)": 0.139746 }, { "epoch": 0.8835789521489349, "grad_norm": 0.7486875057220459, "learning_rate": 6.358558608511705e-05, "loss": 0.7926001071929931, "memory(GiB)": 91.52, "step": 68095, "token_acc": 0.774935976319553, "train_speed(iter/s)": 0.139745 }, { "epoch": 0.8836438305505906, "grad_norm": 0.770230233669281, "learning_rate": 6.358042404601617e-05, "loss": 0.8605814933776855, "memory(GiB)": 91.52, "step": 68100, "token_acc": 0.7611097570883094, "train_speed(iter/s)": 0.139743 }, { "epoch": 0.8837087089522463, "grad_norm": 0.7483569383621216, "learning_rate": 6.357526185063262e-05, "loss": 0.8097345352172851, "memory(GiB)": 91.52, "step": 68105, "token_acc": 0.7856366430525882, "train_speed(iter/s)": 0.139742 }, { "epoch": 0.883773587353902, "grad_norm": 0.7204040288925171, "learning_rate": 6.357009949902586e-05, "loss": 0.8566308975219726, "memory(GiB)": 91.52, "step": 68110, "token_acc": 0.7594261281317362, "train_speed(iter/s)": 0.13974 }, { "epoch": 0.8838384657555577, "grad_norm": 0.7238483428955078, "learning_rate": 6.356493699125526e-05, "loss": 0.8777094841003418, "memory(GiB)": 91.52, "step": 68115, "token_acc": 0.7633867948765002, "train_speed(iter/s)": 0.139738 }, { "epoch": 0.8839033441572134, "grad_norm": 0.7990375757217407, "learning_rate": 6.355977432738023e-05, "loss": 0.8734490394592285, "memory(GiB)": 91.52, "step": 68120, "token_acc": 0.758706647398844, "train_speed(iter/s)": 0.139737 }, { "epoch": 0.8839682225588691, "grad_norm": 0.7913734912872314, "learning_rate": 6.355461150746018e-05, "loss": 0.8517242431640625, "memory(GiB)": 91.52, "step": 68125, "token_acc": 0.7813149302463639, "train_speed(iter/s)": 0.139736 }, { "epoch": 0.8840331009605248, "grad_norm": 0.708053469657898, "learning_rate": 6.354944853155456e-05, "loss": 0.8630756378173828, "memory(GiB)": 91.52, "step": 68130, "token_acc": 0.7636891284242753, "train_speed(iter/s)": 0.139734 }, { "epoch": 0.8840979793621805, "grad_norm": 0.7312325239181519, "learning_rate": 6.354428539972275e-05, "loss": 0.8672266960144043, "memory(GiB)": 91.52, "step": 68135, "token_acc": 0.7718098415346122, "train_speed(iter/s)": 0.139732 }, { "epoch": 0.8841628577638362, "grad_norm": 0.7730637788772583, "learning_rate": 6.353912211202416e-05, "loss": 0.860351848602295, "memory(GiB)": 91.52, "step": 68140, "token_acc": 0.7574292176897278, "train_speed(iter/s)": 0.139731 }, { "epoch": 0.8842277361654918, "grad_norm": 0.7614228129386902, "learning_rate": 6.353395866851823e-05, "loss": 0.876249122619629, "memory(GiB)": 91.52, "step": 68145, "token_acc": 0.7596212583995113, "train_speed(iter/s)": 0.139729 }, { "epoch": 0.8842926145671475, "grad_norm": 0.7631303071975708, "learning_rate": 6.352879506926438e-05, "loss": 0.8278395652770996, "memory(GiB)": 91.52, "step": 68150, "token_acc": 0.7758640312606699, "train_speed(iter/s)": 0.139728 }, { "epoch": 0.8843574929688032, "grad_norm": 0.7271153330802917, "learning_rate": 6.352363131432204e-05, "loss": 0.8838772773742676, "memory(GiB)": 91.52, "step": 68155, "token_acc": 0.7768598075934494, "train_speed(iter/s)": 0.139726 }, { "epoch": 0.8844223713704589, "grad_norm": 0.7745200991630554, "learning_rate": 6.351846740375059e-05, "loss": 0.8493131637573242, "memory(GiB)": 91.52, "step": 68160, "token_acc": 0.7705016974726518, "train_speed(iter/s)": 0.139725 }, { "epoch": 0.8844872497721146, "grad_norm": 0.856650710105896, "learning_rate": 6.35133033376095e-05, "loss": 0.8892314910888672, "memory(GiB)": 91.52, "step": 68165, "token_acc": 0.768060989875358, "train_speed(iter/s)": 0.139724 }, { "epoch": 0.8845521281737703, "grad_norm": 0.8185497522354126, "learning_rate": 6.350813911595819e-05, "loss": 0.8549862861633301, "memory(GiB)": 91.52, "step": 68170, "token_acc": 0.7688537603976116, "train_speed(iter/s)": 0.139723 }, { "epoch": 0.884617006575426, "grad_norm": 0.8133055567741394, "learning_rate": 6.35029747388561e-05, "loss": 0.8730661392211914, "memory(GiB)": 91.52, "step": 68175, "token_acc": 0.7701837003097026, "train_speed(iter/s)": 0.139721 }, { "epoch": 0.8846818849770817, "grad_norm": 0.8340266346931458, "learning_rate": 6.349781020636263e-05, "loss": 0.8453762054443359, "memory(GiB)": 91.52, "step": 68180, "token_acc": 0.773006808697562, "train_speed(iter/s)": 0.13972 }, { "epoch": 0.8847467633787374, "grad_norm": 0.7361058592796326, "learning_rate": 6.349264551853722e-05, "loss": 0.8062629699707031, "memory(GiB)": 91.52, "step": 68185, "token_acc": 0.774613561245952, "train_speed(iter/s)": 0.139718 }, { "epoch": 0.8848116417803931, "grad_norm": 0.7541522979736328, "learning_rate": 6.348748067543932e-05, "loss": 0.8572731018066406, "memory(GiB)": 91.52, "step": 68190, "token_acc": 0.7564800901577761, "train_speed(iter/s)": 0.139716 }, { "epoch": 0.8848765201820488, "grad_norm": 0.6896083354949951, "learning_rate": 6.348231567712837e-05, "loss": 0.8082611083984375, "memory(GiB)": 91.52, "step": 68195, "token_acc": 0.7789996687644916, "train_speed(iter/s)": 0.139714 }, { "epoch": 0.8849413985837045, "grad_norm": 0.7636362314224243, "learning_rate": 6.347715052366378e-05, "loss": 0.865785026550293, "memory(GiB)": 91.52, "step": 68200, "token_acc": 0.7500092179491906, "train_speed(iter/s)": 0.139712 }, { "epoch": 0.8850062769853602, "grad_norm": 0.6746329665184021, "learning_rate": 6.347198521510502e-05, "loss": 0.882805061340332, "memory(GiB)": 91.52, "step": 68205, "token_acc": 0.7532910775231594, "train_speed(iter/s)": 0.139711 }, { "epoch": 0.8850711553870159, "grad_norm": 0.7141187191009521, "learning_rate": 6.346681975151154e-05, "loss": 0.8555349349975586, "memory(GiB)": 91.52, "step": 68210, "token_acc": 0.7634128166915052, "train_speed(iter/s)": 0.139709 }, { "epoch": 0.8851360337886716, "grad_norm": 0.7156448364257812, "learning_rate": 6.346165413294273e-05, "loss": 0.8347945213317871, "memory(GiB)": 91.52, "step": 68215, "token_acc": 0.7767157199834961, "train_speed(iter/s)": 0.139708 }, { "epoch": 0.8852009121903273, "grad_norm": 0.6673872470855713, "learning_rate": 6.34564883594581e-05, "loss": 0.8432574272155762, "memory(GiB)": 91.52, "step": 68220, "token_acc": 0.7466605073719059, "train_speed(iter/s)": 0.139706 }, { "epoch": 0.885265790591983, "grad_norm": 0.7238833904266357, "learning_rate": 6.345132243111706e-05, "loss": 0.8530572891235352, "memory(GiB)": 91.52, "step": 68225, "token_acc": 0.7593814210014058, "train_speed(iter/s)": 0.139704 }, { "epoch": 0.8853306689936387, "grad_norm": 0.7964547276496887, "learning_rate": 6.344615634797905e-05, "loss": 0.8137373924255371, "memory(GiB)": 91.52, "step": 68230, "token_acc": 0.7694587681188843, "train_speed(iter/s)": 0.139703 }, { "epoch": 0.8853955473952944, "grad_norm": 0.8017942905426025, "learning_rate": 6.344099011010355e-05, "loss": 0.8745172500610352, "memory(GiB)": 91.52, "step": 68235, "token_acc": 0.7431290014502494, "train_speed(iter/s)": 0.139702 }, { "epoch": 0.8854604257969501, "grad_norm": 0.7247254848480225, "learning_rate": 6.343582371754998e-05, "loss": 0.8405421257019043, "memory(GiB)": 91.52, "step": 68240, "token_acc": 0.7645489036074851, "train_speed(iter/s)": 0.1397 }, { "epoch": 0.8855253041986058, "grad_norm": 0.7125424146652222, "learning_rate": 6.343065717037786e-05, "loss": 0.8416605949401855, "memory(GiB)": 91.52, "step": 68245, "token_acc": 0.742371280113368, "train_speed(iter/s)": 0.139698 }, { "epoch": 0.8855901826002615, "grad_norm": 0.7046422958374023, "learning_rate": 6.342549046864657e-05, "loss": 0.8481867790222168, "memory(GiB)": 91.52, "step": 68250, "token_acc": 0.7720756782647371, "train_speed(iter/s)": 0.139697 }, { "epoch": 0.8856550610019172, "grad_norm": 0.7718664407730103, "learning_rate": 6.34203236124156e-05, "loss": 0.8954275131225586, "memory(GiB)": 91.52, "step": 68255, "token_acc": 0.7583812176120139, "train_speed(iter/s)": 0.139696 }, { "epoch": 0.8857199394035729, "grad_norm": 0.6876785755157471, "learning_rate": 6.341515660174443e-05, "loss": 0.8480454444885254, "memory(GiB)": 91.52, "step": 68260, "token_acc": 0.7572692402870577, "train_speed(iter/s)": 0.139694 }, { "epoch": 0.8857848178052286, "grad_norm": 0.754570484161377, "learning_rate": 6.340998943669249e-05, "loss": 0.8391996383666992, "memory(GiB)": 91.52, "step": 68265, "token_acc": 0.7726674023708661, "train_speed(iter/s)": 0.139693 }, { "epoch": 0.8858496962068843, "grad_norm": 0.7718404531478882, "learning_rate": 6.340482211731927e-05, "loss": 0.8475954055786132, "memory(GiB)": 91.52, "step": 68270, "token_acc": 0.76449630646532, "train_speed(iter/s)": 0.139692 }, { "epoch": 0.88591457460854, "grad_norm": 0.7448495030403137, "learning_rate": 6.339965464368419e-05, "loss": 0.8485703468322754, "memory(GiB)": 91.52, "step": 68275, "token_acc": 0.762752573389249, "train_speed(iter/s)": 0.13969 }, { "epoch": 0.8859794530101957, "grad_norm": 0.8109685778617859, "learning_rate": 6.339448701584676e-05, "loss": 0.8630230903625489, "memory(GiB)": 91.52, "step": 68280, "token_acc": 0.7533999277891443, "train_speed(iter/s)": 0.139689 }, { "epoch": 0.8860443314118513, "grad_norm": 0.7355169057846069, "learning_rate": 6.338931923386643e-05, "loss": 0.852734375, "memory(GiB)": 91.52, "step": 68285, "token_acc": 0.7743277848911652, "train_speed(iter/s)": 0.139687 }, { "epoch": 0.886109209813507, "grad_norm": 0.7559932470321655, "learning_rate": 6.338415129780268e-05, "loss": 0.8323505401611329, "memory(GiB)": 91.52, "step": 68290, "token_acc": 0.77232330413987, "train_speed(iter/s)": 0.139686 }, { "epoch": 0.8861740882151627, "grad_norm": 0.7201526761054993, "learning_rate": 6.337898320771498e-05, "loss": 0.8456392288208008, "memory(GiB)": 91.52, "step": 68295, "token_acc": 0.7733950747211114, "train_speed(iter/s)": 0.139685 }, { "epoch": 0.8862389666168184, "grad_norm": 0.7509788274765015, "learning_rate": 6.337381496366282e-05, "loss": 0.8526444435119629, "memory(GiB)": 91.52, "step": 68300, "token_acc": 0.7704882301656495, "train_speed(iter/s)": 0.139683 }, { "epoch": 0.8863038450184741, "grad_norm": 0.7662303447723389, "learning_rate": 6.336864656570562e-05, "loss": 0.865053653717041, "memory(GiB)": 91.52, "step": 68305, "token_acc": 0.756201784903948, "train_speed(iter/s)": 0.139682 }, { "epoch": 0.8863687234201298, "grad_norm": 0.6977439522743225, "learning_rate": 6.336347801390292e-05, "loss": 0.868254566192627, "memory(GiB)": 91.52, "step": 68310, "token_acc": 0.7539888442080684, "train_speed(iter/s)": 0.139681 }, { "epoch": 0.8864336018217855, "grad_norm": 0.7176382541656494, "learning_rate": 6.335830930831418e-05, "loss": 0.8421899795532226, "memory(GiB)": 91.52, "step": 68315, "token_acc": 0.7425760159136118, "train_speed(iter/s)": 0.139679 }, { "epoch": 0.8864984802234412, "grad_norm": 0.694147527217865, "learning_rate": 6.335314044899885e-05, "loss": 0.851813793182373, "memory(GiB)": 91.52, "step": 68320, "token_acc": 0.7816764132553606, "train_speed(iter/s)": 0.139678 }, { "epoch": 0.8865633586250969, "grad_norm": 0.7347067594528198, "learning_rate": 6.334797143601647e-05, "loss": 0.8738033294677734, "memory(GiB)": 91.52, "step": 68325, "token_acc": 0.7685112031096925, "train_speed(iter/s)": 0.139677 }, { "epoch": 0.8866282370267526, "grad_norm": 0.71959388256073, "learning_rate": 6.334280226942649e-05, "loss": 0.8994359970092773, "memory(GiB)": 91.52, "step": 68330, "token_acc": 0.7623889086384643, "train_speed(iter/s)": 0.139675 }, { "epoch": 0.8866931154284083, "grad_norm": 0.7463111281394958, "learning_rate": 6.333763294928838e-05, "loss": 0.8723102569580078, "memory(GiB)": 91.52, "step": 68335, "token_acc": 0.7538158542589857, "train_speed(iter/s)": 0.139673 }, { "epoch": 0.886757993830064, "grad_norm": 0.7778328061103821, "learning_rate": 6.333246347566167e-05, "loss": 0.8743602752685546, "memory(GiB)": 91.52, "step": 68340, "token_acc": 0.764200750588385, "train_speed(iter/s)": 0.139671 }, { "epoch": 0.8868228722317197, "grad_norm": 0.7818707227706909, "learning_rate": 6.33272938486058e-05, "loss": 0.8482200622558593, "memory(GiB)": 91.52, "step": 68345, "token_acc": 0.7503556862373139, "train_speed(iter/s)": 0.139669 }, { "epoch": 0.8868877506333754, "grad_norm": 0.7747035622596741, "learning_rate": 6.33221240681803e-05, "loss": 0.8516457557678223, "memory(GiB)": 91.52, "step": 68350, "token_acc": 0.7635227690354482, "train_speed(iter/s)": 0.139668 }, { "epoch": 0.8869526290350311, "grad_norm": 0.7534372806549072, "learning_rate": 6.331695413444467e-05, "loss": 0.8966213226318359, "memory(GiB)": 91.52, "step": 68355, "token_acc": 0.7478342308592836, "train_speed(iter/s)": 0.139667 }, { "epoch": 0.8870175074366868, "grad_norm": 0.6303355693817139, "learning_rate": 6.331178404745839e-05, "loss": 0.8478371620178222, "memory(GiB)": 91.52, "step": 68360, "token_acc": 0.7669560329752686, "train_speed(iter/s)": 0.139665 }, { "epoch": 0.8870823858383425, "grad_norm": 0.8650449514389038, "learning_rate": 6.330661380728093e-05, "loss": 0.8254100799560546, "memory(GiB)": 91.52, "step": 68365, "token_acc": 0.7532208540407078, "train_speed(iter/s)": 0.139664 }, { "epoch": 0.8871472642399982, "grad_norm": 0.7972579002380371, "learning_rate": 6.330144341397184e-05, "loss": 0.852885627746582, "memory(GiB)": 91.52, "step": 68370, "token_acc": 0.7864019253910951, "train_speed(iter/s)": 0.139663 }, { "epoch": 0.8872121426416539, "grad_norm": 0.7509109377861023, "learning_rate": 6.32962728675906e-05, "loss": 0.8953657150268555, "memory(GiB)": 91.52, "step": 68375, "token_acc": 0.7603855603855604, "train_speed(iter/s)": 0.139662 }, { "epoch": 0.8872770210433095, "grad_norm": 0.6660052537918091, "learning_rate": 6.329110216819667e-05, "loss": 0.8660097122192383, "memory(GiB)": 91.52, "step": 68380, "token_acc": 0.7564825666569001, "train_speed(iter/s)": 0.13966 }, { "epoch": 0.8873418994449652, "grad_norm": 0.7446800470352173, "learning_rate": 6.328593131584962e-05, "loss": 0.8613733291625977, "memory(GiB)": 91.52, "step": 68385, "token_acc": 0.765840608279358, "train_speed(iter/s)": 0.139659 }, { "epoch": 0.8874067778466209, "grad_norm": 0.7220522165298462, "learning_rate": 6.328076031060892e-05, "loss": 0.8985748291015625, "memory(GiB)": 91.52, "step": 68390, "token_acc": 0.755067186543357, "train_speed(iter/s)": 0.139657 }, { "epoch": 0.8874716562482766, "grad_norm": 0.7321758270263672, "learning_rate": 6.32755891525341e-05, "loss": 0.9127204895019532, "memory(GiB)": 91.52, "step": 68395, "token_acc": 0.7568937759051827, "train_speed(iter/s)": 0.139656 }, { "epoch": 0.8875365346499323, "grad_norm": 0.7271766662597656, "learning_rate": 6.327041784168464e-05, "loss": 0.8454894065856934, "memory(GiB)": 91.52, "step": 68400, "token_acc": 0.7525194245711209, "train_speed(iter/s)": 0.139654 }, { "epoch": 0.887601413051588, "grad_norm": 0.6704089641571045, "learning_rate": 6.326524637812007e-05, "loss": 0.8387478828430176, "memory(GiB)": 91.52, "step": 68405, "token_acc": 0.7670946782178217, "train_speed(iter/s)": 0.139653 }, { "epoch": 0.8876662914532437, "grad_norm": 0.8398227691650391, "learning_rate": 6.32600747618999e-05, "loss": 0.8882047653198242, "memory(GiB)": 91.52, "step": 68410, "token_acc": 0.7333170459304761, "train_speed(iter/s)": 0.139652 }, { "epoch": 0.8877311698548994, "grad_norm": 0.7479211091995239, "learning_rate": 6.325490299308363e-05, "loss": 0.8709342956542969, "memory(GiB)": 91.52, "step": 68415, "token_acc": 0.7673303834808259, "train_speed(iter/s)": 0.13965 }, { "epoch": 0.8877960482565551, "grad_norm": 0.6605532169342041, "learning_rate": 6.32497310717308e-05, "loss": 0.8582326889038085, "memory(GiB)": 91.52, "step": 68420, "token_acc": 0.7745547456226046, "train_speed(iter/s)": 0.139649 }, { "epoch": 0.8878609266582108, "grad_norm": 0.7922559380531311, "learning_rate": 6.324455899790091e-05, "loss": 0.8671165466308594, "memory(GiB)": 91.52, "step": 68425, "token_acc": 0.7845046173110033, "train_speed(iter/s)": 0.139648 }, { "epoch": 0.8879258050598665, "grad_norm": 0.765718400478363, "learning_rate": 6.323938677165349e-05, "loss": 0.8878198623657226, "memory(GiB)": 91.52, "step": 68430, "token_acc": 0.7528026905829597, "train_speed(iter/s)": 0.139646 }, { "epoch": 0.8879906834615222, "grad_norm": 0.7868002653121948, "learning_rate": 6.323421439304807e-05, "loss": 0.8561637878417969, "memory(GiB)": 91.52, "step": 68435, "token_acc": 0.7683509730283373, "train_speed(iter/s)": 0.139645 }, { "epoch": 0.8880555618631779, "grad_norm": 0.680243194103241, "learning_rate": 6.322904186214415e-05, "loss": 0.8301568031311035, "memory(GiB)": 91.52, "step": 68440, "token_acc": 0.7822522498464497, "train_speed(iter/s)": 0.139643 }, { "epoch": 0.8881204402648336, "grad_norm": 0.7546965479850769, "learning_rate": 6.322386917900129e-05, "loss": 0.8041513442993165, "memory(GiB)": 91.52, "step": 68445, "token_acc": 0.7617595986880186, "train_speed(iter/s)": 0.139641 }, { "epoch": 0.8881853186664893, "grad_norm": 0.7421364188194275, "learning_rate": 6.321869634367896e-05, "loss": 0.9336803436279297, "memory(GiB)": 91.52, "step": 68450, "token_acc": 0.7478563576401612, "train_speed(iter/s)": 0.13964 }, { "epoch": 0.888250197068145, "grad_norm": 0.9485267996788025, "learning_rate": 6.321352335623674e-05, "loss": 0.8782629013061524, "memory(GiB)": 91.52, "step": 68455, "token_acc": 0.7560899158158696, "train_speed(iter/s)": 0.139639 }, { "epoch": 0.8883150754698007, "grad_norm": 0.7675119042396545, "learning_rate": 6.320835021673415e-05, "loss": 0.8819581985473632, "memory(GiB)": 91.52, "step": 68460, "token_acc": 0.7557667103538663, "train_speed(iter/s)": 0.139638 }, { "epoch": 0.8883799538714564, "grad_norm": 0.6880348920822144, "learning_rate": 6.320317692523072e-05, "loss": 0.8578874588012695, "memory(GiB)": 91.52, "step": 68465, "token_acc": 0.7624359089682469, "train_speed(iter/s)": 0.139636 }, { "epoch": 0.8884448322731121, "grad_norm": 0.730205237865448, "learning_rate": 6.319800348178596e-05, "loss": 0.9098426818847656, "memory(GiB)": 91.52, "step": 68470, "token_acc": 0.7550408622104849, "train_speed(iter/s)": 0.139634 }, { "epoch": 0.8885097106747678, "grad_norm": 0.772228479385376, "learning_rate": 6.319282988645943e-05, "loss": 0.8732165336608887, "memory(GiB)": 91.52, "step": 68475, "token_acc": 0.753008463369479, "train_speed(iter/s)": 0.139633 }, { "epoch": 0.8885745890764235, "grad_norm": 0.7231215238571167, "learning_rate": 6.318765613931068e-05, "loss": 0.8513688087463379, "memory(GiB)": 91.52, "step": 68480, "token_acc": 0.7596452328159645, "train_speed(iter/s)": 0.139632 }, { "epoch": 0.8886394674780792, "grad_norm": 0.7223733067512512, "learning_rate": 6.31824822403992e-05, "loss": 0.8658025741577149, "memory(GiB)": 91.52, "step": 68485, "token_acc": 0.7702430818958409, "train_speed(iter/s)": 0.13963 }, { "epoch": 0.8887043458797349, "grad_norm": 0.8377257585525513, "learning_rate": 6.31773081897846e-05, "loss": 0.890107536315918, "memory(GiB)": 91.52, "step": 68490, "token_acc": 0.749448193226966, "train_speed(iter/s)": 0.139628 }, { "epoch": 0.8887692242813906, "grad_norm": 0.7010579705238342, "learning_rate": 6.317213398752637e-05, "loss": 0.7858800888061523, "memory(GiB)": 91.52, "step": 68495, "token_acc": 0.7697351644748825, "train_speed(iter/s)": 0.139627 }, { "epoch": 0.8888341026830463, "grad_norm": 0.7418546080589294, "learning_rate": 6.316695963368407e-05, "loss": 0.8448317527770997, "memory(GiB)": 91.52, "step": 68500, "token_acc": 0.7698506500483507, "train_speed(iter/s)": 0.139626 }, { "epoch": 0.888898981084702, "grad_norm": 0.823096513748169, "learning_rate": 6.316178512831725e-05, "loss": 0.8867792129516602, "memory(GiB)": 91.52, "step": 68505, "token_acc": 0.7880578699218498, "train_speed(iter/s)": 0.139624 }, { "epoch": 0.8889638594863577, "grad_norm": 0.7791020274162292, "learning_rate": 6.315661047148542e-05, "loss": 0.8267025947570801, "memory(GiB)": 91.52, "step": 68510, "token_acc": 0.766361209322462, "train_speed(iter/s)": 0.139623 }, { "epoch": 0.8890287378880134, "grad_norm": 0.7671592831611633, "learning_rate": 6.315143566324821e-05, "loss": 0.8387688636779785, "memory(GiB)": 91.52, "step": 68515, "token_acc": 0.7717160676396837, "train_speed(iter/s)": 0.139622 }, { "epoch": 0.8890936162896691, "grad_norm": 0.7600715756416321, "learning_rate": 6.31462607036651e-05, "loss": 0.8510026931762695, "memory(GiB)": 91.52, "step": 68520, "token_acc": 0.7682283182380921, "train_speed(iter/s)": 0.13962 }, { "epoch": 0.8891584946913248, "grad_norm": 0.7035605907440186, "learning_rate": 6.314108559279568e-05, "loss": 0.8218960762023926, "memory(GiB)": 91.52, "step": 68525, "token_acc": 0.7610124046082678, "train_speed(iter/s)": 0.139619 }, { "epoch": 0.8892233730929805, "grad_norm": 0.8924851417541504, "learning_rate": 6.313591033069948e-05, "loss": 0.8916937828063964, "memory(GiB)": 91.52, "step": 68530, "token_acc": 0.7413166324730951, "train_speed(iter/s)": 0.139618 }, { "epoch": 0.8892882514946362, "grad_norm": 0.7418481707572937, "learning_rate": 6.313073491743608e-05, "loss": 0.8993584632873535, "memory(GiB)": 91.52, "step": 68535, "token_acc": 0.7604941384091769, "train_speed(iter/s)": 0.139616 }, { "epoch": 0.8893531298962919, "grad_norm": 0.7582166194915771, "learning_rate": 6.312555935306502e-05, "loss": 0.8632495880126954, "memory(GiB)": 91.52, "step": 68540, "token_acc": 0.7656341965862272, "train_speed(iter/s)": 0.139614 }, { "epoch": 0.8894180082979476, "grad_norm": 0.7807517051696777, "learning_rate": 6.312038363764587e-05, "loss": 0.8737518310546875, "memory(GiB)": 91.52, "step": 68545, "token_acc": 0.7671153925322136, "train_speed(iter/s)": 0.139613 }, { "epoch": 0.8894828866996033, "grad_norm": 0.6667810082435608, "learning_rate": 6.311520777123821e-05, "loss": 0.90469388961792, "memory(GiB)": 91.52, "step": 68550, "token_acc": 0.7427288468875649, "train_speed(iter/s)": 0.139612 }, { "epoch": 0.889547765101259, "grad_norm": 0.7598174214363098, "learning_rate": 6.311003175390155e-05, "loss": 0.8555234909057617, "memory(GiB)": 91.52, "step": 68555, "token_acc": 0.7574668264323436, "train_speed(iter/s)": 0.139611 }, { "epoch": 0.8896126435029147, "grad_norm": 0.7820584774017334, "learning_rate": 6.31048555856955e-05, "loss": 0.8470624923706055, "memory(GiB)": 91.52, "step": 68560, "token_acc": 0.7689326659131277, "train_speed(iter/s)": 0.139609 }, { "epoch": 0.8896775219045704, "grad_norm": 0.8060624599456787, "learning_rate": 6.30996792666796e-05, "loss": 0.8321612358093262, "memory(GiB)": 91.52, "step": 68565, "token_acc": 0.7984989451111957, "train_speed(iter/s)": 0.139607 }, { "epoch": 0.8897424003062261, "grad_norm": 0.7512158155441284, "learning_rate": 6.309450279691347e-05, "loss": 0.9039176940917969, "memory(GiB)": 91.52, "step": 68570, "token_acc": 0.7315884885565375, "train_speed(iter/s)": 0.139606 }, { "epoch": 0.8898072787078818, "grad_norm": 0.7642936706542969, "learning_rate": 6.308932617645663e-05, "loss": 0.8478323936462402, "memory(GiB)": 91.52, "step": 68575, "token_acc": 0.7782610311884317, "train_speed(iter/s)": 0.139604 }, { "epoch": 0.8898721571095375, "grad_norm": 0.7064590454101562, "learning_rate": 6.308414940536866e-05, "loss": 0.8585103988647461, "memory(GiB)": 91.52, "step": 68580, "token_acc": 0.7540731632339379, "train_speed(iter/s)": 0.139603 }, { "epoch": 0.8899370355111932, "grad_norm": 0.6844192147254944, "learning_rate": 6.307897248370916e-05, "loss": 0.825889778137207, "memory(GiB)": 91.52, "step": 68585, "token_acc": 0.7563801950589595, "train_speed(iter/s)": 0.139601 }, { "epoch": 0.8900019139128489, "grad_norm": 0.6709383726119995, "learning_rate": 6.307379541153764e-05, "loss": 0.8436723709106445, "memory(GiB)": 91.52, "step": 68590, "token_acc": 0.7483986767086648, "train_speed(iter/s)": 0.139599 }, { "epoch": 0.8900667923145046, "grad_norm": 0.7396286129951477, "learning_rate": 6.306861818891377e-05, "loss": 0.8214107513427734, "memory(GiB)": 91.52, "step": 68595, "token_acc": 0.7535165781541553, "train_speed(iter/s)": 0.139598 }, { "epoch": 0.8901316707161603, "grad_norm": 0.7368354797363281, "learning_rate": 6.306344081589705e-05, "loss": 0.839928150177002, "memory(GiB)": 91.52, "step": 68600, "token_acc": 0.7669363348575449, "train_speed(iter/s)": 0.139597 }, { "epoch": 0.890196549117816, "grad_norm": 0.6683627367019653, "learning_rate": 6.305826329254713e-05, "loss": 0.8795299530029297, "memory(GiB)": 91.52, "step": 68605, "token_acc": 0.7825463668130096, "train_speed(iter/s)": 0.139595 }, { "epoch": 0.8902614275194717, "grad_norm": 0.7508337497711182, "learning_rate": 6.305308561892353e-05, "loss": 0.822120475769043, "memory(GiB)": 91.52, "step": 68610, "token_acc": 0.7646815550041357, "train_speed(iter/s)": 0.139594 }, { "epoch": 0.8903263059211274, "grad_norm": 0.7003523111343384, "learning_rate": 6.304790779508587e-05, "loss": 0.8935146331787109, "memory(GiB)": 91.52, "step": 68615, "token_acc": 0.7285958009818599, "train_speed(iter/s)": 0.139593 }, { "epoch": 0.890391184322783, "grad_norm": 0.7516027092933655, "learning_rate": 6.304272982109372e-05, "loss": 0.843205451965332, "memory(GiB)": 91.52, "step": 68620, "token_acc": 0.7632793661729246, "train_speed(iter/s)": 0.139591 }, { "epoch": 0.8904560627244387, "grad_norm": 0.7744770050048828, "learning_rate": 6.303755169700668e-05, "loss": 0.8658239364624023, "memory(GiB)": 91.52, "step": 68625, "token_acc": 0.750724237825907, "train_speed(iter/s)": 0.13959 }, { "epoch": 0.8905209411260944, "grad_norm": 0.7696170806884766, "learning_rate": 6.303237342288433e-05, "loss": 0.9242531776428222, "memory(GiB)": 91.52, "step": 68630, "token_acc": 0.7554307003575399, "train_speed(iter/s)": 0.139588 }, { "epoch": 0.8905858195277501, "grad_norm": 0.7536959648132324, "learning_rate": 6.302719499878626e-05, "loss": 0.8529413223266602, "memory(GiB)": 91.52, "step": 68635, "token_acc": 0.7720633883352394, "train_speed(iter/s)": 0.139587 }, { "epoch": 0.8906506979294058, "grad_norm": 0.7393856048583984, "learning_rate": 6.30220164247721e-05, "loss": 0.890785026550293, "memory(GiB)": 91.52, "step": 68640, "token_acc": 0.7629251700680272, "train_speed(iter/s)": 0.139585 }, { "epoch": 0.8907155763310615, "grad_norm": 0.6932315230369568, "learning_rate": 6.301683770090137e-05, "loss": 0.8489953994750976, "memory(GiB)": 91.52, "step": 68645, "token_acc": 0.7702774108322324, "train_speed(iter/s)": 0.139584 }, { "epoch": 0.8907804547327172, "grad_norm": 0.6844789981842041, "learning_rate": 6.301165882723375e-05, "loss": 0.8374561309814453, "memory(GiB)": 91.52, "step": 68650, "token_acc": 0.7749662885698422, "train_speed(iter/s)": 0.139582 }, { "epoch": 0.8908453331343729, "grad_norm": 0.8305494785308838, "learning_rate": 6.300647980382876e-05, "loss": 0.8877767562866211, "memory(GiB)": 91.52, "step": 68655, "token_acc": 0.7843503697082961, "train_speed(iter/s)": 0.139581 }, { "epoch": 0.8909102115360286, "grad_norm": 0.7348185181617737, "learning_rate": 6.300130063074606e-05, "loss": 0.847220516204834, "memory(GiB)": 91.52, "step": 68660, "token_acc": 0.7573587907716786, "train_speed(iter/s)": 0.139579 }, { "epoch": 0.8909750899376843, "grad_norm": 0.777722179889679, "learning_rate": 6.299612130804522e-05, "loss": 0.855195140838623, "memory(GiB)": 91.52, "step": 68665, "token_acc": 0.765816963405821, "train_speed(iter/s)": 0.139578 }, { "epoch": 0.89103996833934, "grad_norm": 0.7185273170471191, "learning_rate": 6.299094183578587e-05, "loss": 0.8616115570068359, "memory(GiB)": 91.52, "step": 68670, "token_acc": 0.7622743886355929, "train_speed(iter/s)": 0.139576 }, { "epoch": 0.8911048467409957, "grad_norm": 0.7551803588867188, "learning_rate": 6.298576221402759e-05, "loss": 0.8571426391601562, "memory(GiB)": 91.52, "step": 68675, "token_acc": 0.7517612777101546, "train_speed(iter/s)": 0.139574 }, { "epoch": 0.8911697251426514, "grad_norm": 0.7407359480857849, "learning_rate": 6.298058244282997e-05, "loss": 0.878333854675293, "memory(GiB)": 91.52, "step": 68680, "token_acc": 0.7908112990638857, "train_speed(iter/s)": 0.139573 }, { "epoch": 0.8912346035443071, "grad_norm": 0.7643107771873474, "learning_rate": 6.297540252225267e-05, "loss": 0.8858253479003906, "memory(GiB)": 91.52, "step": 68685, "token_acc": 0.7595404440304301, "train_speed(iter/s)": 0.139571 }, { "epoch": 0.8912994819459628, "grad_norm": 0.7703189849853516, "learning_rate": 6.297022245235527e-05, "loss": 0.8383358955383301, "memory(GiB)": 91.52, "step": 68690, "token_acc": 0.7538267318920625, "train_speed(iter/s)": 0.13957 }, { "epoch": 0.8913643603476185, "grad_norm": 0.7179060578346252, "learning_rate": 6.296504223319737e-05, "loss": 0.8777501106262207, "memory(GiB)": 91.52, "step": 68695, "token_acc": 0.7592260943601186, "train_speed(iter/s)": 0.139568 }, { "epoch": 0.8914292387492742, "grad_norm": 0.7645868062973022, "learning_rate": 6.29598618648386e-05, "loss": 0.9001344680786133, "memory(GiB)": 91.52, "step": 68700, "token_acc": 0.765245962046306, "train_speed(iter/s)": 0.139567 }, { "epoch": 0.8914941171509299, "grad_norm": 0.711142361164093, "learning_rate": 6.295468134733858e-05, "loss": 0.8737531661987304, "memory(GiB)": 91.52, "step": 68705, "token_acc": 0.7719023779724656, "train_speed(iter/s)": 0.139564 }, { "epoch": 0.8915589955525856, "grad_norm": 0.73566734790802, "learning_rate": 6.294950068075693e-05, "loss": 0.8475525856018067, "memory(GiB)": 91.52, "step": 68710, "token_acc": 0.7625971514633149, "train_speed(iter/s)": 0.139563 }, { "epoch": 0.8916238739542413, "grad_norm": 0.6970223188400269, "learning_rate": 6.294431986515326e-05, "loss": 0.8406702995300293, "memory(GiB)": 91.52, "step": 68715, "token_acc": 0.7625795160041735, "train_speed(iter/s)": 0.139562 }, { "epoch": 0.891688752355897, "grad_norm": 0.773530125617981, "learning_rate": 6.293913890058717e-05, "loss": 0.8328742980957031, "memory(GiB)": 91.52, "step": 68720, "token_acc": 0.7634839151266256, "train_speed(iter/s)": 0.13956 }, { "epoch": 0.8917536307575527, "grad_norm": 0.6666741967201233, "learning_rate": 6.293395778711831e-05, "loss": 0.8246502876281738, "memory(GiB)": 91.52, "step": 68725, "token_acc": 0.7925513830938031, "train_speed(iter/s)": 0.139559 }, { "epoch": 0.8918185091592084, "grad_norm": 0.6642742156982422, "learning_rate": 6.292877652480631e-05, "loss": 0.8662862777709961, "memory(GiB)": 91.52, "step": 68730, "token_acc": 0.7588042899520803, "train_speed(iter/s)": 0.139557 }, { "epoch": 0.891883387560864, "grad_norm": 0.6587001085281372, "learning_rate": 6.292359511371077e-05, "loss": 0.8438087463378906, "memory(GiB)": 91.52, "step": 68735, "token_acc": 0.7708177554211134, "train_speed(iter/s)": 0.139556 }, { "epoch": 0.8919482659625197, "grad_norm": 0.7308135628700256, "learning_rate": 6.291841355389132e-05, "loss": 0.8700445175170899, "memory(GiB)": 91.52, "step": 68740, "token_acc": 0.7586534545948147, "train_speed(iter/s)": 0.139554 }, { "epoch": 0.8920131443641754, "grad_norm": 0.804404079914093, "learning_rate": 6.291323184540762e-05, "loss": 0.9019601821899415, "memory(GiB)": 91.52, "step": 68745, "token_acc": 0.7520228771975433, "train_speed(iter/s)": 0.139553 }, { "epoch": 0.8920780227658311, "grad_norm": 0.7700491547584534, "learning_rate": 6.290804998831926e-05, "loss": 0.847142505645752, "memory(GiB)": 91.52, "step": 68750, "token_acc": 0.7532515947311739, "train_speed(iter/s)": 0.139551 }, { "epoch": 0.8921429011674868, "grad_norm": 0.6722217798233032, "learning_rate": 6.29028679826859e-05, "loss": 0.869204330444336, "memory(GiB)": 91.52, "step": 68755, "token_acc": 0.7752339473839092, "train_speed(iter/s)": 0.139549 }, { "epoch": 0.8922077795691425, "grad_norm": 0.7181503176689148, "learning_rate": 6.289768582856716e-05, "loss": 0.8584987640380859, "memory(GiB)": 91.52, "step": 68760, "token_acc": 0.7695001094194517, "train_speed(iter/s)": 0.139548 }, { "epoch": 0.8922726579707982, "grad_norm": 0.7519037127494812, "learning_rate": 6.28925035260227e-05, "loss": 0.8794525146484375, "memory(GiB)": 91.52, "step": 68765, "token_acc": 0.7423919057781994, "train_speed(iter/s)": 0.139547 }, { "epoch": 0.8923375363724539, "grad_norm": 0.7204583883285522, "learning_rate": 6.288732107511213e-05, "loss": 0.8626267433166503, "memory(GiB)": 91.52, "step": 68770, "token_acc": 0.7448858699944713, "train_speed(iter/s)": 0.139546 }, { "epoch": 0.8924024147741096, "grad_norm": 0.7017295956611633, "learning_rate": 6.288213847589508e-05, "loss": 0.8651779174804688, "memory(GiB)": 91.52, "step": 68775, "token_acc": 0.7716610225489386, "train_speed(iter/s)": 0.139545 }, { "epoch": 0.8924672931757653, "grad_norm": 0.7566962242126465, "learning_rate": 6.287695572843124e-05, "loss": 0.8818632125854492, "memory(GiB)": 91.52, "step": 68780, "token_acc": 0.7619067187178961, "train_speed(iter/s)": 0.139544 }, { "epoch": 0.892532171577421, "grad_norm": 0.6875782012939453, "learning_rate": 6.28717728327802e-05, "loss": 0.8614376068115235, "memory(GiB)": 91.52, "step": 68785, "token_acc": 0.7486706287144198, "train_speed(iter/s)": 0.139542 }, { "epoch": 0.8925970499790767, "grad_norm": 0.7295048236846924, "learning_rate": 6.286658978900163e-05, "loss": 0.8643569946289062, "memory(GiB)": 91.52, "step": 68790, "token_acc": 0.7572645199020432, "train_speed(iter/s)": 0.139541 }, { "epoch": 0.8926619283807324, "grad_norm": 0.7821467518806458, "learning_rate": 6.286140659715518e-05, "loss": 0.8336012840270997, "memory(GiB)": 91.52, "step": 68795, "token_acc": 0.7591216319540018, "train_speed(iter/s)": 0.13954 }, { "epoch": 0.8927268067823881, "grad_norm": 0.6761305928230286, "learning_rate": 6.285622325730047e-05, "loss": 0.8702890396118164, "memory(GiB)": 91.52, "step": 68800, "token_acc": 0.7695258127377896, "train_speed(iter/s)": 0.139538 }, { "epoch": 0.8927916851840438, "grad_norm": 0.8189013600349426, "learning_rate": 6.28510397694972e-05, "loss": 0.871192741394043, "memory(GiB)": 91.52, "step": 68805, "token_acc": 0.7565055762081785, "train_speed(iter/s)": 0.139536 }, { "epoch": 0.8928565635856995, "grad_norm": 0.7195077538490295, "learning_rate": 6.284585613380498e-05, "loss": 0.8284858703613281, "memory(GiB)": 91.52, "step": 68810, "token_acc": 0.7819025522041764, "train_speed(iter/s)": 0.139535 }, { "epoch": 0.8929214419873552, "grad_norm": 0.7026680707931519, "learning_rate": 6.284067235028347e-05, "loss": 0.8714858055114746, "memory(GiB)": 91.52, "step": 68815, "token_acc": 0.7660128180229171, "train_speed(iter/s)": 0.139534 }, { "epoch": 0.8929863203890109, "grad_norm": 0.7417279481887817, "learning_rate": 6.283548841899233e-05, "loss": 0.8680194854736328, "memory(GiB)": 91.52, "step": 68820, "token_acc": 0.7686298731969625, "train_speed(iter/s)": 0.139532 }, { "epoch": 0.8930511987906666, "grad_norm": 0.6657299399375916, "learning_rate": 6.283030433999122e-05, "loss": 0.8617300033569336, "memory(GiB)": 91.52, "step": 68825, "token_acc": 0.7719506852744227, "train_speed(iter/s)": 0.139531 }, { "epoch": 0.8931160771923223, "grad_norm": 0.7088177800178528, "learning_rate": 6.282512011333979e-05, "loss": 0.8579037666320801, "memory(GiB)": 91.52, "step": 68830, "token_acc": 0.7654060886969346, "train_speed(iter/s)": 0.13953 }, { "epoch": 0.893180955593978, "grad_norm": 0.7671042680740356, "learning_rate": 6.28199357390977e-05, "loss": 0.8672776222229004, "memory(GiB)": 91.52, "step": 68835, "token_acc": 0.7644613664943412, "train_speed(iter/s)": 0.139529 }, { "epoch": 0.8932458339956337, "grad_norm": 0.7269892692565918, "learning_rate": 6.281475121732463e-05, "loss": 0.8601649284362793, "memory(GiB)": 91.52, "step": 68840, "token_acc": 0.7751536962063278, "train_speed(iter/s)": 0.139527 }, { "epoch": 0.8933107123972894, "grad_norm": 0.733464777469635, "learning_rate": 6.28095665480802e-05, "loss": 0.8652891159057617, "memory(GiB)": 91.52, "step": 68845, "token_acc": 0.7772477064220183, "train_speed(iter/s)": 0.139525 }, { "epoch": 0.8933755907989451, "grad_norm": 0.7928869128227234, "learning_rate": 6.280438173142412e-05, "loss": 0.8965716361999512, "memory(GiB)": 91.52, "step": 68850, "token_acc": 0.7615360247791348, "train_speed(iter/s)": 0.139524 }, { "epoch": 0.8934404692006008, "grad_norm": 0.7425705194473267, "learning_rate": 6.279919676741603e-05, "loss": 0.847654914855957, "memory(GiB)": 91.52, "step": 68855, "token_acc": 0.7788757553874806, "train_speed(iter/s)": 0.139523 }, { "epoch": 0.8935053476022564, "grad_norm": 0.7021850943565369, "learning_rate": 6.279401165611563e-05, "loss": 0.8771503448486329, "memory(GiB)": 91.52, "step": 68860, "token_acc": 0.7629002883593868, "train_speed(iter/s)": 0.139521 }, { "epoch": 0.8935702260039121, "grad_norm": 0.7187042832374573, "learning_rate": 6.278882639758255e-05, "loss": 0.8731130599975586, "memory(GiB)": 91.52, "step": 68865, "token_acc": 0.7606695822651184, "train_speed(iter/s)": 0.13952 }, { "epoch": 0.8936351044055678, "grad_norm": 0.724524974822998, "learning_rate": 6.278364099187646e-05, "loss": 0.8536840438842773, "memory(GiB)": 91.52, "step": 68870, "token_acc": 0.7606919899369506, "train_speed(iter/s)": 0.139518 }, { "epoch": 0.8936999828072235, "grad_norm": 0.8122673630714417, "learning_rate": 6.277845543905709e-05, "loss": 0.8824709892272949, "memory(GiB)": 91.52, "step": 68875, "token_acc": 0.7603398166778449, "train_speed(iter/s)": 0.139517 }, { "epoch": 0.8937648612088792, "grad_norm": 0.7530056238174438, "learning_rate": 6.277326973918406e-05, "loss": 0.8778018951416016, "memory(GiB)": 91.52, "step": 68880, "token_acc": 0.7655285890580008, "train_speed(iter/s)": 0.139515 }, { "epoch": 0.8938297396105349, "grad_norm": 0.746494710445404, "learning_rate": 6.276808389231706e-05, "loss": 0.8555618286132812, "memory(GiB)": 91.52, "step": 68885, "token_acc": 0.7590658882402002, "train_speed(iter/s)": 0.139513 }, { "epoch": 0.8938946180121906, "grad_norm": 0.7250851988792419, "learning_rate": 6.276289789851579e-05, "loss": 0.8670454025268555, "memory(GiB)": 91.52, "step": 68890, "token_acc": 0.7695137060554437, "train_speed(iter/s)": 0.139511 }, { "epoch": 0.8939594964138463, "grad_norm": 0.8189341425895691, "learning_rate": 6.275771175783989e-05, "loss": 0.8879295349121094, "memory(GiB)": 91.52, "step": 68895, "token_acc": 0.7712464841385694, "train_speed(iter/s)": 0.13951 }, { "epoch": 0.894024374815502, "grad_norm": 0.7681145668029785, "learning_rate": 6.275252547034908e-05, "loss": 0.8776051521301269, "memory(GiB)": 91.52, "step": 68900, "token_acc": 0.7748542618559949, "train_speed(iter/s)": 0.139509 }, { "epoch": 0.8940892532171577, "grad_norm": 0.750320553779602, "learning_rate": 6.2747339036103e-05, "loss": 0.839175796508789, "memory(GiB)": 91.52, "step": 68905, "token_acc": 0.7504503880266076, "train_speed(iter/s)": 0.139508 }, { "epoch": 0.8941541316188134, "grad_norm": 0.6530441641807556, "learning_rate": 6.27421524551614e-05, "loss": 0.8377780914306641, "memory(GiB)": 91.52, "step": 68910, "token_acc": 0.7613921290744576, "train_speed(iter/s)": 0.139506 }, { "epoch": 0.8942190100204691, "grad_norm": 0.7162914276123047, "learning_rate": 6.273696572758391e-05, "loss": 0.8556342124938965, "memory(GiB)": 91.52, "step": 68915, "token_acc": 0.7661843368536552, "train_speed(iter/s)": 0.139505 }, { "epoch": 0.8942838884221248, "grad_norm": 0.7502429485321045, "learning_rate": 6.273177885343024e-05, "loss": 0.8850618362426758, "memory(GiB)": 91.52, "step": 68920, "token_acc": 0.7627070245573958, "train_speed(iter/s)": 0.139504 }, { "epoch": 0.8943487668237805, "grad_norm": 0.7466179728507996, "learning_rate": 6.272659183276008e-05, "loss": 0.8299604415893554, "memory(GiB)": 91.52, "step": 68925, "token_acc": 0.7815868909012506, "train_speed(iter/s)": 0.139503 }, { "epoch": 0.8944136452254362, "grad_norm": 0.708891749382019, "learning_rate": 6.272140466563313e-05, "loss": 0.8591149330139161, "memory(GiB)": 91.52, "step": 68930, "token_acc": 0.7682460895405414, "train_speed(iter/s)": 0.139502 }, { "epoch": 0.8944785236270919, "grad_norm": 0.7330150008201599, "learning_rate": 6.271621735210906e-05, "loss": 0.8663342475891114, "memory(GiB)": 91.52, "step": 68935, "token_acc": 0.7733318835852271, "train_speed(iter/s)": 0.139501 }, { "epoch": 0.8945434020287476, "grad_norm": 0.7752655744552612, "learning_rate": 6.271102989224758e-05, "loss": 0.8162946701049805, "memory(GiB)": 91.52, "step": 68940, "token_acc": 0.76576852418861, "train_speed(iter/s)": 0.139499 }, { "epoch": 0.8946082804304033, "grad_norm": 0.7617775201797485, "learning_rate": 6.270584228610839e-05, "loss": 0.8747300148010254, "memory(GiB)": 91.52, "step": 68945, "token_acc": 0.7471506635441062, "train_speed(iter/s)": 0.139498 }, { "epoch": 0.894673158832059, "grad_norm": 0.8255077600479126, "learning_rate": 6.270065453375118e-05, "loss": 0.8850166320800781, "memory(GiB)": 91.52, "step": 68950, "token_acc": 0.767024959008927, "train_speed(iter/s)": 0.139497 }, { "epoch": 0.8947380372337147, "grad_norm": 0.9481325149536133, "learning_rate": 6.269546663523566e-05, "loss": 0.8409578323364257, "memory(GiB)": 91.52, "step": 68955, "token_acc": 0.7524520948359857, "train_speed(iter/s)": 0.139495 }, { "epoch": 0.8948029156353704, "grad_norm": 0.7710725665092468, "learning_rate": 6.26902785906215e-05, "loss": 0.8851251602172852, "memory(GiB)": 91.52, "step": 68960, "token_acc": 0.7411527140803614, "train_speed(iter/s)": 0.139494 }, { "epoch": 0.8948677940370261, "grad_norm": 0.7230789065361023, "learning_rate": 6.268509039996848e-05, "loss": 0.842432689666748, "memory(GiB)": 91.52, "step": 68965, "token_acc": 0.7621361009900262, "train_speed(iter/s)": 0.139493 }, { "epoch": 0.8949326724386818, "grad_norm": 0.692322313785553, "learning_rate": 6.267990206333623e-05, "loss": 0.8025209426879882, "memory(GiB)": 91.52, "step": 68970, "token_acc": 0.7867134171360364, "train_speed(iter/s)": 0.139491 }, { "epoch": 0.8949975508403375, "grad_norm": 0.7048906087875366, "learning_rate": 6.267471358078449e-05, "loss": 0.8704473495483398, "memory(GiB)": 91.52, "step": 68975, "token_acc": 0.7623309114537185, "train_speed(iter/s)": 0.13949 }, { "epoch": 0.8950624292419932, "grad_norm": 0.8223145008087158, "learning_rate": 6.266952495237295e-05, "loss": 0.8340736389160156, "memory(GiB)": 91.52, "step": 68980, "token_acc": 0.7615257862076219, "train_speed(iter/s)": 0.139488 }, { "epoch": 0.8951273076436489, "grad_norm": 0.7318288087844849, "learning_rate": 6.266433617816134e-05, "loss": 0.8475198745727539, "memory(GiB)": 91.52, "step": 68985, "token_acc": 0.765805848405713, "train_speed(iter/s)": 0.139487 }, { "epoch": 0.8951921860453046, "grad_norm": 0.7423206567764282, "learning_rate": 6.265914725820937e-05, "loss": 0.871098518371582, "memory(GiB)": 91.52, "step": 68990, "token_acc": 0.7593901900132567, "train_speed(iter/s)": 0.139486 }, { "epoch": 0.8952570644469603, "grad_norm": 0.7112571001052856, "learning_rate": 6.265395819257673e-05, "loss": 0.8368736267089844, "memory(GiB)": 91.52, "step": 68995, "token_acc": 0.7932851844622292, "train_speed(iter/s)": 0.139484 }, { "epoch": 0.895321942848616, "grad_norm": 0.6445283889770508, "learning_rate": 6.264876898132319e-05, "loss": 0.8850494384765625, "memory(GiB)": 91.52, "step": 69000, "token_acc": 0.7680802874101843, "train_speed(iter/s)": 0.139483 }, { "epoch": 0.8953868212502717, "grad_norm": 0.7111735343933105, "learning_rate": 6.26435796245084e-05, "loss": 0.8340898513793945, "memory(GiB)": 91.52, "step": 69005, "token_acc": 0.7772876487484612, "train_speed(iter/s)": 0.139481 }, { "epoch": 0.8954516996519274, "grad_norm": 0.7678154110908508, "learning_rate": 6.26383901221921e-05, "loss": 0.9030298233032227, "memory(GiB)": 91.52, "step": 69010, "token_acc": 0.7512426900584795, "train_speed(iter/s)": 0.13948 }, { "epoch": 0.8955165780535831, "grad_norm": 0.7534869313240051, "learning_rate": 6.263320047443405e-05, "loss": 0.881654167175293, "memory(GiB)": 91.52, "step": 69015, "token_acc": 0.7501478518003131, "train_speed(iter/s)": 0.139479 }, { "epoch": 0.8955814564552388, "grad_norm": 0.6967270374298096, "learning_rate": 6.262801068129391e-05, "loss": 0.8679634094238281, "memory(GiB)": 91.52, "step": 69020, "token_acc": 0.762693226090235, "train_speed(iter/s)": 0.139478 }, { "epoch": 0.8956463348568945, "grad_norm": 0.6799808740615845, "learning_rate": 6.262282074283147e-05, "loss": 0.7849989891052246, "memory(GiB)": 91.52, "step": 69025, "token_acc": 0.7810819672131147, "train_speed(iter/s)": 0.139476 }, { "epoch": 0.8957112132585502, "grad_norm": 0.7788699269294739, "learning_rate": 6.26176306591064e-05, "loss": 0.8624431610107421, "memory(GiB)": 91.52, "step": 69030, "token_acc": 0.7651438240270727, "train_speed(iter/s)": 0.139475 }, { "epoch": 0.8957760916602059, "grad_norm": 0.745018720626831, "learning_rate": 6.261244043017846e-05, "loss": 0.8697917938232422, "memory(GiB)": 91.52, "step": 69035, "token_acc": 0.7628369916287654, "train_speed(iter/s)": 0.139473 }, { "epoch": 0.8958409700618616, "grad_norm": 0.7004328370094299, "learning_rate": 6.260725005610735e-05, "loss": 0.8374542236328125, "memory(GiB)": 91.52, "step": 69040, "token_acc": 0.7632878877692155, "train_speed(iter/s)": 0.139472 }, { "epoch": 0.8959058484635173, "grad_norm": 0.691048800945282, "learning_rate": 6.260205953695282e-05, "loss": 0.788090705871582, "memory(GiB)": 91.52, "step": 69045, "token_acc": 0.7834579472686295, "train_speed(iter/s)": 0.139471 }, { "epoch": 0.895970726865173, "grad_norm": 0.7377545833587646, "learning_rate": 6.259686887277461e-05, "loss": 0.8714936256408692, "memory(GiB)": 91.52, "step": 69050, "token_acc": 0.7661101892926078, "train_speed(iter/s)": 0.139469 }, { "epoch": 0.8960356052668287, "grad_norm": 0.699735701084137, "learning_rate": 6.259167806363245e-05, "loss": 0.8138599395751953, "memory(GiB)": 91.52, "step": 69055, "token_acc": 0.7731141882085278, "train_speed(iter/s)": 0.139468 }, { "epoch": 0.8961004836684844, "grad_norm": 0.7357448935508728, "learning_rate": 6.258648710958605e-05, "loss": 0.8116198539733886, "memory(GiB)": 91.52, "step": 69060, "token_acc": 0.7845773165417144, "train_speed(iter/s)": 0.139467 }, { "epoch": 0.8961653620701401, "grad_norm": 0.7335065603256226, "learning_rate": 6.258129601069518e-05, "loss": 0.8278568267822266, "memory(GiB)": 91.52, "step": 69065, "token_acc": 0.7585850145238948, "train_speed(iter/s)": 0.139466 }, { "epoch": 0.8962302404717958, "grad_norm": 0.7196454405784607, "learning_rate": 6.257610476701956e-05, "loss": 0.8438584327697753, "memory(GiB)": 91.52, "step": 69070, "token_acc": 0.7615346194668166, "train_speed(iter/s)": 0.139465 }, { "epoch": 0.8962951188734515, "grad_norm": 0.7703216075897217, "learning_rate": 6.257091337861895e-05, "loss": 0.8715825080871582, "memory(GiB)": 91.52, "step": 69075, "token_acc": 0.7605676101809445, "train_speed(iter/s)": 0.139464 }, { "epoch": 0.8963599972751072, "grad_norm": 0.7569186687469482, "learning_rate": 6.256572184555307e-05, "loss": 0.8346922874450684, "memory(GiB)": 91.52, "step": 69080, "token_acc": 0.7462476894639556, "train_speed(iter/s)": 0.139463 }, { "epoch": 0.8964248756767629, "grad_norm": 0.7610247135162354, "learning_rate": 6.256053016788165e-05, "loss": 0.8437294960021973, "memory(GiB)": 91.52, "step": 69085, "token_acc": 0.7722390063779792, "train_speed(iter/s)": 0.13946 }, { "epoch": 0.8964897540784186, "grad_norm": 0.7193120718002319, "learning_rate": 6.255533834566449e-05, "loss": 0.8281048774719239, "memory(GiB)": 91.52, "step": 69090, "token_acc": 0.7747966413014957, "train_speed(iter/s)": 0.139459 }, { "epoch": 0.8965546324800743, "grad_norm": 0.843245267868042, "learning_rate": 6.25501463789613e-05, "loss": 0.891635513305664, "memory(GiB)": 91.52, "step": 69095, "token_acc": 0.752254126785962, "train_speed(iter/s)": 0.139457 }, { "epoch": 0.8966195108817299, "grad_norm": 0.6977311372756958, "learning_rate": 6.25449542678318e-05, "loss": 0.8725813865661621, "memory(GiB)": 91.52, "step": 69100, "token_acc": 0.77414743008578, "train_speed(iter/s)": 0.139456 }, { "epoch": 0.8966843892833856, "grad_norm": 0.7093015909194946, "learning_rate": 6.25397620123358e-05, "loss": 0.8371231079101562, "memory(GiB)": 91.52, "step": 69105, "token_acc": 0.7675933280381255, "train_speed(iter/s)": 0.139454 }, { "epoch": 0.8967492676850413, "grad_norm": 0.70229172706604, "learning_rate": 6.253456961253303e-05, "loss": 0.8321803092956543, "memory(GiB)": 91.52, "step": 69110, "token_acc": 0.763407862048809, "train_speed(iter/s)": 0.139453 }, { "epoch": 0.896814146086697, "grad_norm": 0.7469480037689209, "learning_rate": 6.252937706848323e-05, "loss": 0.8235244750976562, "memory(GiB)": 91.52, "step": 69115, "token_acc": 0.7788124887204476, "train_speed(iter/s)": 0.139451 }, { "epoch": 0.8968790244883527, "grad_norm": 0.7014889717102051, "learning_rate": 6.252418438024616e-05, "loss": 0.8976520538330078, "memory(GiB)": 91.52, "step": 69120, "token_acc": 0.749905051272313, "train_speed(iter/s)": 0.13945 }, { "epoch": 0.8969439028900084, "grad_norm": 0.6740008592605591, "learning_rate": 6.25189915478816e-05, "loss": 0.8318859100341797, "memory(GiB)": 91.52, "step": 69125, "token_acc": 0.7741374513077351, "train_speed(iter/s)": 0.139448 }, { "epoch": 0.8970087812916641, "grad_norm": 0.7310723066329956, "learning_rate": 6.251379857144927e-05, "loss": 0.8788355827331543, "memory(GiB)": 91.52, "step": 69130, "token_acc": 0.7574332151507706, "train_speed(iter/s)": 0.139446 }, { "epoch": 0.8970736596933198, "grad_norm": 0.7836048603057861, "learning_rate": 6.250860545100897e-05, "loss": 0.8276819229125977, "memory(GiB)": 91.52, "step": 69135, "token_acc": 0.750594088816278, "train_speed(iter/s)": 0.139445 }, { "epoch": 0.8971385380949755, "grad_norm": 0.6823737025260925, "learning_rate": 6.250341218662044e-05, "loss": 0.8612518310546875, "memory(GiB)": 91.52, "step": 69140, "token_acc": 0.7524458837566047, "train_speed(iter/s)": 0.139443 }, { "epoch": 0.8972034164966312, "grad_norm": 0.7060895562171936, "learning_rate": 6.249821877834343e-05, "loss": 0.8783531188964844, "memory(GiB)": 91.52, "step": 69145, "token_acc": 0.7683045069277202, "train_speed(iter/s)": 0.139442 }, { "epoch": 0.8972682948982869, "grad_norm": 0.7777756452560425, "learning_rate": 6.249302522623775e-05, "loss": 0.8588338851928711, "memory(GiB)": 91.52, "step": 69150, "token_acc": 0.7744939996417697, "train_speed(iter/s)": 0.139441 }, { "epoch": 0.8973331732999426, "grad_norm": 0.7266378402709961, "learning_rate": 6.248783153036312e-05, "loss": 0.8362907409667969, "memory(GiB)": 91.52, "step": 69155, "token_acc": 0.7858560143760998, "train_speed(iter/s)": 0.13944 }, { "epoch": 0.8973980517015983, "grad_norm": 0.7597447633743286, "learning_rate": 6.248263769077932e-05, "loss": 0.8713847160339355, "memory(GiB)": 91.52, "step": 69160, "token_acc": 0.759175788795879, "train_speed(iter/s)": 0.139438 }, { "epoch": 0.897462930103254, "grad_norm": 0.7313187718391418, "learning_rate": 6.247744370754616e-05, "loss": 0.8380922317504883, "memory(GiB)": 91.52, "step": 69165, "token_acc": 0.7781027466937945, "train_speed(iter/s)": 0.139437 }, { "epoch": 0.8975278085049097, "grad_norm": 0.7196516990661621, "learning_rate": 6.247224958072335e-05, "loss": 0.8368045806884765, "memory(GiB)": 91.52, "step": 69170, "token_acc": 0.7770550897265105, "train_speed(iter/s)": 0.139435 }, { "epoch": 0.8975926869065654, "grad_norm": 0.7063431143760681, "learning_rate": 6.24670553103707e-05, "loss": 0.8330942153930664, "memory(GiB)": 91.52, "step": 69175, "token_acc": 0.7811233352634627, "train_speed(iter/s)": 0.139434 }, { "epoch": 0.897657565308221, "grad_norm": 0.7778574824333191, "learning_rate": 6.2461860896548e-05, "loss": 0.9031044006347656, "memory(GiB)": 91.52, "step": 69180, "token_acc": 0.7423660822353633, "train_speed(iter/s)": 0.139432 }, { "epoch": 0.8977224437098767, "grad_norm": 0.731581449508667, "learning_rate": 6.2456666339315e-05, "loss": 0.9008378982543945, "memory(GiB)": 91.52, "step": 69185, "token_acc": 0.7523531432497423, "train_speed(iter/s)": 0.139431 }, { "epoch": 0.8977873221115324, "grad_norm": 0.6946268081665039, "learning_rate": 6.245147163873148e-05, "loss": 0.8275304794311523, "memory(GiB)": 91.52, "step": 69190, "token_acc": 0.7818796577422026, "train_speed(iter/s)": 0.139429 }, { "epoch": 0.8978522005131881, "grad_norm": 0.778652548789978, "learning_rate": 6.244627679485721e-05, "loss": 0.8180072784423829, "memory(GiB)": 91.52, "step": 69195, "token_acc": 0.7857794303353445, "train_speed(iter/s)": 0.139428 }, { "epoch": 0.8979170789148438, "grad_norm": 0.6993717551231384, "learning_rate": 6.244108180775201e-05, "loss": 0.881710147857666, "memory(GiB)": 91.52, "step": 69200, "token_acc": 0.7464410975861357, "train_speed(iter/s)": 0.139426 }, { "epoch": 0.8979819573164995, "grad_norm": 0.6911603212356567, "learning_rate": 6.243588667747563e-05, "loss": 0.8441364288330078, "memory(GiB)": 91.52, "step": 69205, "token_acc": 0.7596010338174802, "train_speed(iter/s)": 0.139424 }, { "epoch": 0.8980468357181552, "grad_norm": 0.7129101753234863, "learning_rate": 6.243069140408787e-05, "loss": 0.8532316207885742, "memory(GiB)": 91.52, "step": 69210, "token_acc": 0.7639138861968711, "train_speed(iter/s)": 0.139423 }, { "epoch": 0.8981117141198109, "grad_norm": 0.7075359225273132, "learning_rate": 6.242549598764852e-05, "loss": 0.8428264617919922, "memory(GiB)": 91.52, "step": 69215, "token_acc": 0.7728575354296706, "train_speed(iter/s)": 0.139422 }, { "epoch": 0.8981765925214666, "grad_norm": 0.7201089262962341, "learning_rate": 6.242030042821735e-05, "loss": 0.8108636856079101, "memory(GiB)": 91.52, "step": 69220, "token_acc": 0.7641436719593032, "train_speed(iter/s)": 0.13942 }, { "epoch": 0.8982414709231223, "grad_norm": 0.7942290902137756, "learning_rate": 6.241510472585417e-05, "loss": 0.8781584739685059, "memory(GiB)": 91.52, "step": 69225, "token_acc": 0.7703737212053727, "train_speed(iter/s)": 0.139418 }, { "epoch": 0.898306349324778, "grad_norm": 0.8141723275184631, "learning_rate": 6.240990888061876e-05, "loss": 0.8713947296142578, "memory(GiB)": 91.52, "step": 69230, "token_acc": 0.7618578243898593, "train_speed(iter/s)": 0.139417 }, { "epoch": 0.8983712277264337, "grad_norm": 0.6880877614021301, "learning_rate": 6.240471289257092e-05, "loss": 0.8453421592712402, "memory(GiB)": 91.52, "step": 69235, "token_acc": 0.7694767544013775, "train_speed(iter/s)": 0.139416 }, { "epoch": 0.8984361061280894, "grad_norm": 0.756389319896698, "learning_rate": 6.239951676177044e-05, "loss": 0.8689075469970703, "memory(GiB)": 91.52, "step": 69240, "token_acc": 0.7434204696499779, "train_speed(iter/s)": 0.139415 }, { "epoch": 0.8985009845297451, "grad_norm": 0.7147747278213501, "learning_rate": 6.239432048827712e-05, "loss": 0.8738800048828125, "memory(GiB)": 91.52, "step": 69245, "token_acc": 0.7335344576911893, "train_speed(iter/s)": 0.139414 }, { "epoch": 0.8985658629314008, "grad_norm": 0.7684034705162048, "learning_rate": 6.238912407215076e-05, "loss": 0.8896952629089355, "memory(GiB)": 91.52, "step": 69250, "token_acc": 0.7544939844302901, "train_speed(iter/s)": 0.139412 }, { "epoch": 0.8986307413330565, "grad_norm": 0.7622184753417969, "learning_rate": 6.238392751345116e-05, "loss": 0.8971166610717773, "memory(GiB)": 91.52, "step": 69255, "token_acc": 0.7446991756551696, "train_speed(iter/s)": 0.139411 }, { "epoch": 0.8986956197347122, "grad_norm": 0.7076020836830139, "learning_rate": 6.237873081223812e-05, "loss": 0.8588953018188477, "memory(GiB)": 91.52, "step": 69260, "token_acc": 0.7520868790015398, "train_speed(iter/s)": 0.139409 }, { "epoch": 0.8987604981363679, "grad_norm": 0.6909242272377014, "learning_rate": 6.237353396857143e-05, "loss": 0.8659885406494141, "memory(GiB)": 91.52, "step": 69265, "token_acc": 0.7529984640020916, "train_speed(iter/s)": 0.139408 }, { "epoch": 0.8988253765380236, "grad_norm": 0.7031715512275696, "learning_rate": 6.236833698251092e-05, "loss": 0.8999476432800293, "memory(GiB)": 91.52, "step": 69270, "token_acc": 0.7434899421328189, "train_speed(iter/s)": 0.139406 }, { "epoch": 0.8988902549396793, "grad_norm": 0.7321791648864746, "learning_rate": 6.236313985411636e-05, "loss": 0.8592348098754883, "memory(GiB)": 91.52, "step": 69275, "token_acc": 0.7724604789481923, "train_speed(iter/s)": 0.139405 }, { "epoch": 0.898955133341335, "grad_norm": 0.8093756437301636, "learning_rate": 6.235794258344761e-05, "loss": 0.8528629302978515, "memory(GiB)": 91.52, "step": 69280, "token_acc": 0.7752444098643483, "train_speed(iter/s)": 0.139403 }, { "epoch": 0.8990200117429907, "grad_norm": 0.7233153581619263, "learning_rate": 6.235274517056446e-05, "loss": 0.8303319931030273, "memory(GiB)": 91.52, "step": 69285, "token_acc": 0.7704289201441358, "train_speed(iter/s)": 0.139402 }, { "epoch": 0.8990848901446464, "grad_norm": 0.6838181614875793, "learning_rate": 6.234754761552669e-05, "loss": 0.8806811332702636, "memory(GiB)": 91.52, "step": 69290, "token_acc": 0.7627276451528807, "train_speed(iter/s)": 0.1394 }, { "epoch": 0.8991497685463021, "grad_norm": 0.6653163433074951, "learning_rate": 6.234234991839415e-05, "loss": 0.8507081985473632, "memory(GiB)": 91.52, "step": 69295, "token_acc": 0.7506966183907255, "train_speed(iter/s)": 0.139399 }, { "epoch": 0.8992146469479578, "grad_norm": 0.6763527393341064, "learning_rate": 6.233715207922662e-05, "loss": 0.8384085655212402, "memory(GiB)": 91.52, "step": 69300, "token_acc": 0.7623623623623623, "train_speed(iter/s)": 0.139398 }, { "epoch": 0.8992795253496135, "grad_norm": 0.7321504354476929, "learning_rate": 6.233195409808397e-05, "loss": 0.8492674827575684, "memory(GiB)": 91.52, "step": 69305, "token_acc": 0.7875251509054326, "train_speed(iter/s)": 0.139396 }, { "epoch": 0.8993444037512692, "grad_norm": 0.7129414081573486, "learning_rate": 6.232675597502595e-05, "loss": 0.8654350280761719, "memory(GiB)": 91.52, "step": 69310, "token_acc": 0.7719557195571956, "train_speed(iter/s)": 0.139395 }, { "epoch": 0.8994092821529249, "grad_norm": 0.7766408324241638, "learning_rate": 6.232155771011244e-05, "loss": 0.9365720748901367, "memory(GiB)": 91.52, "step": 69315, "token_acc": 0.7555228818231943, "train_speed(iter/s)": 0.139393 }, { "epoch": 0.8994741605545806, "grad_norm": 0.7686539888381958, "learning_rate": 6.231635930340324e-05, "loss": 0.850830078125, "memory(GiB)": 91.52, "step": 69320, "token_acc": 0.7786939313984169, "train_speed(iter/s)": 0.139392 }, { "epoch": 0.8995390389562363, "grad_norm": 0.719025731086731, "learning_rate": 6.231116075495815e-05, "loss": 0.8096143722534179, "memory(GiB)": 91.52, "step": 69325, "token_acc": 0.7889411038315096, "train_speed(iter/s)": 0.139391 }, { "epoch": 0.899603917357892, "grad_norm": 0.778102457523346, "learning_rate": 6.230596206483704e-05, "loss": 0.8379470825195312, "memory(GiB)": 91.52, "step": 69330, "token_acc": 0.770149157148866, "train_speed(iter/s)": 0.139389 }, { "epoch": 0.8996687957595476, "grad_norm": 0.7189074158668518, "learning_rate": 6.230076323309968e-05, "loss": 0.839511489868164, "memory(GiB)": 91.52, "step": 69335, "token_acc": 0.75960557633458, "train_speed(iter/s)": 0.139388 }, { "epoch": 0.8997336741612033, "grad_norm": 0.7157048583030701, "learning_rate": 6.229556425980595e-05, "loss": 0.9214282989501953, "memory(GiB)": 91.52, "step": 69340, "token_acc": 0.7478257782826311, "train_speed(iter/s)": 0.139387 }, { "epoch": 0.899798552562859, "grad_norm": 0.720573365688324, "learning_rate": 6.229036514501565e-05, "loss": 0.8665946960449219, "memory(GiB)": 91.52, "step": 69345, "token_acc": 0.7400269922475754, "train_speed(iter/s)": 0.139385 }, { "epoch": 0.8998634309645147, "grad_norm": 0.7649234533309937, "learning_rate": 6.228516588878862e-05, "loss": 0.8915252685546875, "memory(GiB)": 91.52, "step": 69350, "token_acc": 0.7373962986598596, "train_speed(iter/s)": 0.139384 }, { "epoch": 0.8999283093661704, "grad_norm": 0.7421495914459229, "learning_rate": 6.227996649118471e-05, "loss": 0.8318836212158203, "memory(GiB)": 91.52, "step": 69355, "token_acc": 0.7801008817527242, "train_speed(iter/s)": 0.139382 }, { "epoch": 0.8999931877678261, "grad_norm": 0.8340052366256714, "learning_rate": 6.227476695226371e-05, "loss": 0.841145133972168, "memory(GiB)": 91.52, "step": 69360, "token_acc": 0.7628183036569022, "train_speed(iter/s)": 0.139381 }, { "epoch": 0.9000580661694818, "grad_norm": 0.8092459440231323, "learning_rate": 6.226956727208549e-05, "loss": 0.812800407409668, "memory(GiB)": 91.52, "step": 69365, "token_acc": 0.7726208218011497, "train_speed(iter/s)": 0.139379 }, { "epoch": 0.9001229445711375, "grad_norm": 0.7699810862541199, "learning_rate": 6.226436745070987e-05, "loss": 0.8480798721313476, "memory(GiB)": 91.52, "step": 69370, "token_acc": 0.7464485122623506, "train_speed(iter/s)": 0.139378 }, { "epoch": 0.9001878229727932, "grad_norm": 0.7842661142349243, "learning_rate": 6.225916748819671e-05, "loss": 0.8383567810058594, "memory(GiB)": 91.52, "step": 69375, "token_acc": 0.7612036309055822, "train_speed(iter/s)": 0.139376 }, { "epoch": 0.9002527013744489, "grad_norm": 0.8142126798629761, "learning_rate": 6.225396738460585e-05, "loss": 0.8339334487915039, "memory(GiB)": 91.52, "step": 69380, "token_acc": 0.7925557872321547, "train_speed(iter/s)": 0.139375 }, { "epoch": 0.9003175797761046, "grad_norm": 0.740554928779602, "learning_rate": 6.22487671399971e-05, "loss": 0.8345830917358399, "memory(GiB)": 91.52, "step": 69385, "token_acc": 0.758300495493738, "train_speed(iter/s)": 0.139373 }, { "epoch": 0.9003824581777603, "grad_norm": 0.6868993639945984, "learning_rate": 6.224356675443034e-05, "loss": 0.8632238388061524, "memory(GiB)": 91.52, "step": 69390, "token_acc": 0.7540496466751223, "train_speed(iter/s)": 0.139372 }, { "epoch": 0.900447336579416, "grad_norm": 0.7157355546951294, "learning_rate": 6.22383662279654e-05, "loss": 0.8191057205200195, "memory(GiB)": 91.52, "step": 69395, "token_acc": 0.7617029862792575, "train_speed(iter/s)": 0.139371 }, { "epoch": 0.9005122149810717, "grad_norm": 0.705551028251648, "learning_rate": 6.223316556066213e-05, "loss": 0.8779601097106934, "memory(GiB)": 91.52, "step": 69400, "token_acc": 0.7917576503152391, "train_speed(iter/s)": 0.13937 }, { "epoch": 0.9005770933827274, "grad_norm": 0.7696800827980042, "learning_rate": 6.222796475258037e-05, "loss": 0.8856945037841797, "memory(GiB)": 91.52, "step": 69405, "token_acc": 0.7464213924590364, "train_speed(iter/s)": 0.139368 }, { "epoch": 0.9006419717843831, "grad_norm": 0.7797039151191711, "learning_rate": 6.222276380377998e-05, "loss": 0.8047315597534179, "memory(GiB)": 91.52, "step": 69410, "token_acc": 0.79174097533977, "train_speed(iter/s)": 0.139367 }, { "epoch": 0.9007068501860388, "grad_norm": 0.7338687777519226, "learning_rate": 6.221756271432082e-05, "loss": 0.8635638236999512, "memory(GiB)": 91.52, "step": 69415, "token_acc": 0.7693441136466677, "train_speed(iter/s)": 0.139365 }, { "epoch": 0.9007717285876945, "grad_norm": 0.7439531087875366, "learning_rate": 6.221236148426274e-05, "loss": 0.8787639617919922, "memory(GiB)": 91.52, "step": 69420, "token_acc": 0.771617308691457, "train_speed(iter/s)": 0.139364 }, { "epoch": 0.9008366069893502, "grad_norm": 0.6945292949676514, "learning_rate": 6.220716011366557e-05, "loss": 0.8113475799560547, "memory(GiB)": 91.52, "step": 69425, "token_acc": 0.7645740761839657, "train_speed(iter/s)": 0.139362 }, { "epoch": 0.9009014853910059, "grad_norm": 0.7823854684829712, "learning_rate": 6.220195860258919e-05, "loss": 0.8611774444580078, "memory(GiB)": 91.52, "step": 69430, "token_acc": 0.756272693892538, "train_speed(iter/s)": 0.139361 }, { "epoch": 0.9009663637926616, "grad_norm": 0.7175644636154175, "learning_rate": 6.219675695109348e-05, "loss": 0.8338140487670899, "memory(GiB)": 91.52, "step": 69435, "token_acc": 0.7697753818508536, "train_speed(iter/s)": 0.13936 }, { "epoch": 0.9010312421943173, "grad_norm": 0.7656043767929077, "learning_rate": 6.219155515923824e-05, "loss": 0.8572406768798828, "memory(GiB)": 91.52, "step": 69440, "token_acc": 0.7503827938610548, "train_speed(iter/s)": 0.139359 }, { "epoch": 0.901096120595973, "grad_norm": 0.6852350831031799, "learning_rate": 6.21863532270834e-05, "loss": 0.8420273780822753, "memory(GiB)": 91.52, "step": 69445, "token_acc": 0.7628632492264507, "train_speed(iter/s)": 0.139358 }, { "epoch": 0.9011609989976287, "grad_norm": 0.6564459800720215, "learning_rate": 6.218115115468877e-05, "loss": 0.8869838714599609, "memory(GiB)": 91.52, "step": 69450, "token_acc": 0.7422596615680189, "train_speed(iter/s)": 0.139356 }, { "epoch": 0.9012258773992844, "grad_norm": 0.6947202682495117, "learning_rate": 6.217594894211425e-05, "loss": 0.8704635620117187, "memory(GiB)": 91.52, "step": 69455, "token_acc": 0.7613556506739922, "train_speed(iter/s)": 0.139355 }, { "epoch": 0.9012907558009401, "grad_norm": 0.7047388553619385, "learning_rate": 6.217074658941968e-05, "loss": 0.8351509094238281, "memory(GiB)": 91.52, "step": 69460, "token_acc": 0.7758716104039846, "train_speed(iter/s)": 0.139354 }, { "epoch": 0.9013556342025958, "grad_norm": 0.7071559429168701, "learning_rate": 6.216554409666495e-05, "loss": 0.8602914810180664, "memory(GiB)": 91.52, "step": 69465, "token_acc": 0.7505406436840096, "train_speed(iter/s)": 0.139352 }, { "epoch": 0.9014205126042515, "grad_norm": 0.7752223014831543, "learning_rate": 6.216034146390992e-05, "loss": 0.82850341796875, "memory(GiB)": 91.52, "step": 69470, "token_acc": 0.77713400464756, "train_speed(iter/s)": 0.139351 }, { "epoch": 0.9014853910059072, "grad_norm": 0.813957154750824, "learning_rate": 6.215513869121447e-05, "loss": 0.8482522964477539, "memory(GiB)": 91.52, "step": 69475, "token_acc": 0.76281403760305, "train_speed(iter/s)": 0.13935 }, { "epoch": 0.9015502694075629, "grad_norm": 0.7465235590934753, "learning_rate": 6.214993577863846e-05, "loss": 0.9046283721923828, "memory(GiB)": 91.52, "step": 69480, "token_acc": 0.7542479433847038, "train_speed(iter/s)": 0.139348 }, { "epoch": 0.9016151478092186, "grad_norm": 0.6768680214881897, "learning_rate": 6.214473272624178e-05, "loss": 0.8438870429992675, "memory(GiB)": 91.52, "step": 69485, "token_acc": 0.7502844141069397, "train_speed(iter/s)": 0.139346 }, { "epoch": 0.9016800262108743, "grad_norm": 0.7792448997497559, "learning_rate": 6.213952953408427e-05, "loss": 0.8569101333618164, "memory(GiB)": 91.52, "step": 69490, "token_acc": 0.7611264012888501, "train_speed(iter/s)": 0.139345 }, { "epoch": 0.90174490461253, "grad_norm": 0.7063866853713989, "learning_rate": 6.213432620222586e-05, "loss": 0.8653087615966797, "memory(GiB)": 91.52, "step": 69495, "token_acc": 0.7640019617459539, "train_speed(iter/s)": 0.139343 }, { "epoch": 0.9018097830141857, "grad_norm": 0.725359320640564, "learning_rate": 6.21291227307264e-05, "loss": 0.827613639831543, "memory(GiB)": 91.52, "step": 69500, "token_acc": 0.776021031584404, "train_speed(iter/s)": 0.139342 }, { "epoch": 0.9018746614158414, "grad_norm": 0.6921961307525635, "learning_rate": 6.212391911964576e-05, "loss": 0.8659772872924805, "memory(GiB)": 91.52, "step": 69505, "token_acc": 0.7616641694832572, "train_speed(iter/s)": 0.13934 }, { "epoch": 0.9019395398174971, "grad_norm": 0.7756285667419434, "learning_rate": 6.211871536904386e-05, "loss": 0.889122200012207, "memory(GiB)": 91.52, "step": 69510, "token_acc": 0.7643767275669117, "train_speed(iter/s)": 0.139339 }, { "epoch": 0.9020044182191528, "grad_norm": 0.691685140132904, "learning_rate": 6.211351147898055e-05, "loss": 0.8979685783386231, "memory(GiB)": 91.52, "step": 69515, "token_acc": 0.7436803179158847, "train_speed(iter/s)": 0.139338 }, { "epoch": 0.9020692966208085, "grad_norm": 0.7379982471466064, "learning_rate": 6.210830744951574e-05, "loss": 0.868898582458496, "memory(GiB)": 91.52, "step": 69520, "token_acc": 0.7678351362453786, "train_speed(iter/s)": 0.139337 }, { "epoch": 0.9021341750224642, "grad_norm": 0.6845948696136475, "learning_rate": 6.210310328070931e-05, "loss": 0.8398476600646972, "memory(GiB)": 91.52, "step": 69525, "token_acc": 0.7604346752284514, "train_speed(iter/s)": 0.139335 }, { "epoch": 0.9021990534241199, "grad_norm": 0.7283672094345093, "learning_rate": 6.209789897262114e-05, "loss": 0.8758867263793946, "memory(GiB)": 91.52, "step": 69530, "token_acc": 0.7548605594347118, "train_speed(iter/s)": 0.139334 }, { "epoch": 0.9022639318257756, "grad_norm": 0.6719245910644531, "learning_rate": 6.209269452531112e-05, "loss": 0.8255426406860351, "memory(GiB)": 91.52, "step": 69535, "token_acc": 0.7718719049191152, "train_speed(iter/s)": 0.139333 }, { "epoch": 0.9023288102274313, "grad_norm": 0.7006338834762573, "learning_rate": 6.208748993883916e-05, "loss": 0.8663812637329101, "memory(GiB)": 91.52, "step": 69540, "token_acc": 0.7625807683738104, "train_speed(iter/s)": 0.139332 }, { "epoch": 0.902393688629087, "grad_norm": 0.7502502799034119, "learning_rate": 6.208228521326515e-05, "loss": 0.8643547058105469, "memory(GiB)": 91.52, "step": 69545, "token_acc": 0.7640127062198996, "train_speed(iter/s)": 0.13933 }, { "epoch": 0.9024585670307427, "grad_norm": 0.676100492477417, "learning_rate": 6.207708034864897e-05, "loss": 0.8108463287353516, "memory(GiB)": 91.52, "step": 69550, "token_acc": 0.7863688023867027, "train_speed(iter/s)": 0.139329 }, { "epoch": 0.9025234454323984, "grad_norm": 0.7170712947845459, "learning_rate": 6.207187534505051e-05, "loss": 0.8331436157226563, "memory(GiB)": 91.52, "step": 69555, "token_acc": 0.7699043357506558, "train_speed(iter/s)": 0.139328 }, { "epoch": 0.9025883238340541, "grad_norm": 0.7137006521224976, "learning_rate": 6.206667020252972e-05, "loss": 0.8762699127197265, "memory(GiB)": 91.52, "step": 69560, "token_acc": 0.7522784030097622, "train_speed(iter/s)": 0.139326 }, { "epoch": 0.9026532022357098, "grad_norm": 0.7004729509353638, "learning_rate": 6.206146492114644e-05, "loss": 0.8680552482604981, "memory(GiB)": 91.52, "step": 69565, "token_acc": 0.7447823494335122, "train_speed(iter/s)": 0.139324 }, { "epoch": 0.9027180806373655, "grad_norm": 0.856269896030426, "learning_rate": 6.205625950096061e-05, "loss": 0.8905999183654785, "memory(GiB)": 91.52, "step": 69570, "token_acc": 0.7620061683066529, "train_speed(iter/s)": 0.139323 }, { "epoch": 0.9027829590390211, "grad_norm": 0.707470178604126, "learning_rate": 6.205105394203212e-05, "loss": 0.8040073394775391, "memory(GiB)": 91.52, "step": 69575, "token_acc": 0.768234210776998, "train_speed(iter/s)": 0.139321 }, { "epoch": 0.9028478374406768, "grad_norm": 0.7787830829620361, "learning_rate": 6.204584824442088e-05, "loss": 0.8524394989013672, "memory(GiB)": 91.52, "step": 69580, "token_acc": 0.7575899581589958, "train_speed(iter/s)": 0.13932 }, { "epoch": 0.9029127158423325, "grad_norm": 0.7678560614585876, "learning_rate": 6.20406424081868e-05, "loss": 0.8606414794921875, "memory(GiB)": 91.52, "step": 69585, "token_acc": 0.7673180570710577, "train_speed(iter/s)": 0.139319 }, { "epoch": 0.9029775942439882, "grad_norm": 0.6536654829978943, "learning_rate": 6.203543643338975e-05, "loss": 0.857307243347168, "memory(GiB)": 91.52, "step": 69590, "token_acc": 0.7602611940298507, "train_speed(iter/s)": 0.139316 }, { "epoch": 0.9030424726456439, "grad_norm": 0.7438831925392151, "learning_rate": 6.20302303200897e-05, "loss": 0.8356273651123047, "memory(GiB)": 91.52, "step": 69595, "token_acc": 0.7594504090862015, "train_speed(iter/s)": 0.139315 }, { "epoch": 0.9031073510472996, "grad_norm": 0.7308538556098938, "learning_rate": 6.202502406834652e-05, "loss": 0.8754240036010742, "memory(GiB)": 91.52, "step": 69600, "token_acc": 0.7582702625874995, "train_speed(iter/s)": 0.139314 }, { "epoch": 0.9031722294489553, "grad_norm": 0.6536192297935486, "learning_rate": 6.201981767822014e-05, "loss": 0.8356637954711914, "memory(GiB)": 91.52, "step": 69605, "token_acc": 0.7745995755937327, "train_speed(iter/s)": 0.139312 }, { "epoch": 0.903237107850611, "grad_norm": 0.6500085592269897, "learning_rate": 6.201461114977047e-05, "loss": 0.8173526763916016, "memory(GiB)": 91.52, "step": 69610, "token_acc": 0.7616939026533123, "train_speed(iter/s)": 0.139311 }, { "epoch": 0.9033019862522667, "grad_norm": 0.717820942401886, "learning_rate": 6.200940448305745e-05, "loss": 0.9018754959106445, "memory(GiB)": 91.52, "step": 69615, "token_acc": 0.7503774905167017, "train_speed(iter/s)": 0.13931 }, { "epoch": 0.9033668646539224, "grad_norm": 0.6899024248123169, "learning_rate": 6.200419767814096e-05, "loss": 0.8544519424438477, "memory(GiB)": 91.52, "step": 69620, "token_acc": 0.7508316699933466, "train_speed(iter/s)": 0.139308 }, { "epoch": 0.903431743055578, "grad_norm": 0.8367379307746887, "learning_rate": 6.199899073508091e-05, "loss": 0.8812152862548828, "memory(GiB)": 91.52, "step": 69625, "token_acc": 0.7742354469240761, "train_speed(iter/s)": 0.139307 }, { "epoch": 0.9034966214572338, "grad_norm": 0.7969684600830078, "learning_rate": 6.19937836539373e-05, "loss": 0.8827194213867188, "memory(GiB)": 91.52, "step": 69630, "token_acc": 0.7452949518954348, "train_speed(iter/s)": 0.139306 }, { "epoch": 0.9035614998588894, "grad_norm": 0.7011486291885376, "learning_rate": 6.198857643476996e-05, "loss": 0.8443670272827148, "memory(GiB)": 91.52, "step": 69635, "token_acc": 0.7645888817719044, "train_speed(iter/s)": 0.139304 }, { "epoch": 0.9036263782605451, "grad_norm": 0.7080633044242859, "learning_rate": 6.198336907763886e-05, "loss": 0.9033907890319824, "memory(GiB)": 91.52, "step": 69640, "token_acc": 0.746080575219599, "train_speed(iter/s)": 0.139303 }, { "epoch": 0.9036912566622008, "grad_norm": 0.6613288521766663, "learning_rate": 6.197816158260393e-05, "loss": 0.8656440734863281, "memory(GiB)": 91.52, "step": 69645, "token_acc": 0.7604010723113881, "train_speed(iter/s)": 0.139302 }, { "epoch": 0.9037561350638565, "grad_norm": 0.8504424691200256, "learning_rate": 6.197295394972509e-05, "loss": 0.8333540916442871, "memory(GiB)": 91.52, "step": 69650, "token_acc": 0.7604058572581575, "train_speed(iter/s)": 0.1393 }, { "epoch": 0.9038210134655122, "grad_norm": 0.7316403985023499, "learning_rate": 6.196774617906226e-05, "loss": 0.8369348526000977, "memory(GiB)": 91.52, "step": 69655, "token_acc": 0.744012086373351, "train_speed(iter/s)": 0.139298 }, { "epoch": 0.903885891867168, "grad_norm": 0.7536184787750244, "learning_rate": 6.196253827067536e-05, "loss": 0.829162311553955, "memory(GiB)": 91.52, "step": 69660, "token_acc": 0.7874306839186691, "train_speed(iter/s)": 0.139297 }, { "epoch": 0.9039507702688236, "grad_norm": 0.7706505060195923, "learning_rate": 6.195733022462437e-05, "loss": 0.8019786834716797, "memory(GiB)": 91.52, "step": 69665, "token_acc": 0.7895276605719643, "train_speed(iter/s)": 0.139296 }, { "epoch": 0.9040156486704793, "grad_norm": 0.8104730248451233, "learning_rate": 6.195212204096917e-05, "loss": 0.8849891662597656, "memory(GiB)": 91.52, "step": 69670, "token_acc": 0.7653889913295931, "train_speed(iter/s)": 0.139295 }, { "epoch": 0.904080527072135, "grad_norm": 0.714257538318634, "learning_rate": 6.194691371976973e-05, "loss": 0.8390314102172851, "memory(GiB)": 91.52, "step": 69675, "token_acc": 0.7755615152531009, "train_speed(iter/s)": 0.139294 }, { "epoch": 0.9041454054737907, "grad_norm": 0.7054515480995178, "learning_rate": 6.194170526108598e-05, "loss": 0.8194229125976562, "memory(GiB)": 91.52, "step": 69680, "token_acc": 0.7808722560680785, "train_speed(iter/s)": 0.139292 }, { "epoch": 0.9042102838754464, "grad_norm": 0.6363995671272278, "learning_rate": 6.193649666497785e-05, "loss": 0.8950754165649414, "memory(GiB)": 91.52, "step": 69685, "token_acc": 0.7648840275056116, "train_speed(iter/s)": 0.139291 }, { "epoch": 0.9042751622771021, "grad_norm": 0.8072306513786316, "learning_rate": 6.193128793150529e-05, "loss": 0.8808174133300781, "memory(GiB)": 91.52, "step": 69690, "token_acc": 0.7639106896883109, "train_speed(iter/s)": 0.13929 }, { "epoch": 0.9043400406787578, "grad_norm": 0.7519422769546509, "learning_rate": 6.192607906072821e-05, "loss": 0.8249699592590332, "memory(GiB)": 91.52, "step": 69695, "token_acc": 0.7608051418235, "train_speed(iter/s)": 0.139288 }, { "epoch": 0.9044049190804135, "grad_norm": 0.8105151653289795, "learning_rate": 6.19208700527066e-05, "loss": 0.8377030372619629, "memory(GiB)": 91.52, "step": 69700, "token_acc": 0.7736693178342, "train_speed(iter/s)": 0.139287 }, { "epoch": 0.9044697974820692, "grad_norm": 0.6709137558937073, "learning_rate": 6.191566090750038e-05, "loss": 0.8332878112792969, "memory(GiB)": 91.52, "step": 69705, "token_acc": 0.7598579168311148, "train_speed(iter/s)": 0.139285 }, { "epoch": 0.9045346758837249, "grad_norm": 0.7087364792823792, "learning_rate": 6.191045162516952e-05, "loss": 0.8481752395629882, "memory(GiB)": 91.52, "step": 69710, "token_acc": 0.7617512437810945, "train_speed(iter/s)": 0.139284 }, { "epoch": 0.9045995542853806, "grad_norm": 0.7296774983406067, "learning_rate": 6.190524220577392e-05, "loss": 0.900248908996582, "memory(GiB)": 91.52, "step": 69715, "token_acc": 0.7561466249441215, "train_speed(iter/s)": 0.139283 }, { "epoch": 0.9046644326870363, "grad_norm": 0.8576337695121765, "learning_rate": 6.190003264937359e-05, "loss": 0.882940673828125, "memory(GiB)": 91.52, "step": 69720, "token_acc": 0.7637559587598389, "train_speed(iter/s)": 0.139282 }, { "epoch": 0.904729311088692, "grad_norm": 0.8611754775047302, "learning_rate": 6.189482295602842e-05, "loss": 0.8643610000610351, "memory(GiB)": 91.52, "step": 69725, "token_acc": 0.7735617686216804, "train_speed(iter/s)": 0.139281 }, { "epoch": 0.9047941894903477, "grad_norm": 0.7640620470046997, "learning_rate": 6.188961312579841e-05, "loss": 0.8336667060852051, "memory(GiB)": 91.52, "step": 69730, "token_acc": 0.7709235090037998, "train_speed(iter/s)": 0.139279 }, { "epoch": 0.9048590678920034, "grad_norm": 0.7142581939697266, "learning_rate": 6.188440315874348e-05, "loss": 0.8709665298461914, "memory(GiB)": 91.52, "step": 69735, "token_acc": 0.7510521204273227, "train_speed(iter/s)": 0.139278 }, { "epoch": 0.9049239462936591, "grad_norm": 0.630490243434906, "learning_rate": 6.187919305492363e-05, "loss": 0.8452693939208984, "memory(GiB)": 91.52, "step": 69740, "token_acc": 0.7740719356043881, "train_speed(iter/s)": 0.139276 }, { "epoch": 0.9049888246953148, "grad_norm": 0.5847580432891846, "learning_rate": 6.187398281439878e-05, "loss": 0.831148910522461, "memory(GiB)": 91.52, "step": 69745, "token_acc": 0.7660077352814783, "train_speed(iter/s)": 0.139275 }, { "epoch": 0.9050537030969705, "grad_norm": 0.6963379383087158, "learning_rate": 6.18687724372289e-05, "loss": 0.8438922882080078, "memory(GiB)": 91.52, "step": 69750, "token_acc": 0.7446738232865401, "train_speed(iter/s)": 0.139273 }, { "epoch": 0.9051185814986262, "grad_norm": 0.7579087018966675, "learning_rate": 6.186356192347394e-05, "loss": 0.8627372741699219, "memory(GiB)": 91.52, "step": 69755, "token_acc": 0.7679972178213996, "train_speed(iter/s)": 0.139271 }, { "epoch": 0.9051834599002819, "grad_norm": 0.7418227791786194, "learning_rate": 6.18583512731939e-05, "loss": 0.8705196380615234, "memory(GiB)": 91.52, "step": 69760, "token_acc": 0.7696256515558364, "train_speed(iter/s)": 0.139271 }, { "epoch": 0.9052483383019376, "grad_norm": 0.6781882643699646, "learning_rate": 6.185314048644868e-05, "loss": 0.8625868797302246, "memory(GiB)": 91.52, "step": 69765, "token_acc": 0.7789182459537972, "train_speed(iter/s)": 0.139269 }, { "epoch": 0.9053132167035933, "grad_norm": 0.6648185849189758, "learning_rate": 6.18479295632983e-05, "loss": 0.8350255966186524, "memory(GiB)": 91.52, "step": 69770, "token_acc": 0.7814650608787719, "train_speed(iter/s)": 0.139268 }, { "epoch": 0.905378095105249, "grad_norm": 0.7997493147850037, "learning_rate": 6.184271850380271e-05, "loss": 0.8842401504516602, "memory(GiB)": 91.52, "step": 69775, "token_acc": 0.7429510465317419, "train_speed(iter/s)": 0.139267 }, { "epoch": 0.9054429735069047, "grad_norm": 0.7668933868408203, "learning_rate": 6.183750730802188e-05, "loss": 0.8398136138916016, "memory(GiB)": 91.52, "step": 69780, "token_acc": 0.7552795031055901, "train_speed(iter/s)": 0.139266 }, { "epoch": 0.9055078519085604, "grad_norm": 0.7232652306556702, "learning_rate": 6.183229597601576e-05, "loss": 0.8871635437011719, "memory(GiB)": 91.52, "step": 69785, "token_acc": 0.7685629013294746, "train_speed(iter/s)": 0.139264 }, { "epoch": 0.9055727303102161, "grad_norm": 0.7316810488700867, "learning_rate": 6.182708450784436e-05, "loss": 0.8746123313903809, "memory(GiB)": 91.52, "step": 69790, "token_acc": 0.7682244463708724, "train_speed(iter/s)": 0.139263 }, { "epoch": 0.9056376087118718, "grad_norm": 0.7126661539077759, "learning_rate": 6.182187290356764e-05, "loss": 0.8818841934204101, "memory(GiB)": 91.52, "step": 69795, "token_acc": 0.7448213749624737, "train_speed(iter/s)": 0.139262 }, { "epoch": 0.9057024871135275, "grad_norm": 0.7549588084220886, "learning_rate": 6.181666116324555e-05, "loss": 0.8583763122558594, "memory(GiB)": 91.52, "step": 69800, "token_acc": 0.7480281389895544, "train_speed(iter/s)": 0.139261 }, { "epoch": 0.9057673655151832, "grad_norm": 0.7984362840652466, "learning_rate": 6.181144928693808e-05, "loss": 0.800874137878418, "memory(GiB)": 91.52, "step": 69805, "token_acc": 0.7758033330264248, "train_speed(iter/s)": 0.13926 }, { "epoch": 0.9058322439168389, "grad_norm": 0.7262463569641113, "learning_rate": 6.180623727470523e-05, "loss": 0.8461206436157227, "memory(GiB)": 91.52, "step": 69810, "token_acc": 0.7777419932009304, "train_speed(iter/s)": 0.139258 }, { "epoch": 0.9058971223184945, "grad_norm": 0.7567569613456726, "learning_rate": 6.180102512660694e-05, "loss": 0.8256158828735352, "memory(GiB)": 91.52, "step": 69815, "token_acc": 0.7800874537504204, "train_speed(iter/s)": 0.139257 }, { "epoch": 0.9059620007201502, "grad_norm": 0.8197337985038757, "learning_rate": 6.179581284270322e-05, "loss": 0.8417640686035156, "memory(GiB)": 91.52, "step": 69820, "token_acc": 0.759279874093125, "train_speed(iter/s)": 0.139255 }, { "epoch": 0.9060268791218059, "grad_norm": 0.7514495849609375, "learning_rate": 6.179060042305405e-05, "loss": 0.8707865715026856, "memory(GiB)": 91.52, "step": 69825, "token_acc": 0.7678009404969082, "train_speed(iter/s)": 0.139254 }, { "epoch": 0.9060917575234616, "grad_norm": 0.6794930696487427, "learning_rate": 6.178538786771942e-05, "loss": 0.8953927993774414, "memory(GiB)": 91.52, "step": 69830, "token_acc": 0.7614390921163829, "train_speed(iter/s)": 0.139253 }, { "epoch": 0.9061566359251173, "grad_norm": 0.7463109493255615, "learning_rate": 6.17801751767593e-05, "loss": 0.8118209838867188, "memory(GiB)": 91.52, "step": 69835, "token_acc": 0.7837044839000903, "train_speed(iter/s)": 0.139252 }, { "epoch": 0.906221514326773, "grad_norm": 0.7579851746559143, "learning_rate": 6.177496235023366e-05, "loss": 0.8526542663574219, "memory(GiB)": 91.52, "step": 69840, "token_acc": 0.7626825769431403, "train_speed(iter/s)": 0.139251 }, { "epoch": 0.9062863927284287, "grad_norm": 0.725229799747467, "learning_rate": 6.176974938820252e-05, "loss": 0.8307476043701172, "memory(GiB)": 91.52, "step": 69845, "token_acc": 0.7736531505806206, "train_speed(iter/s)": 0.139249 }, { "epoch": 0.9063512711300844, "grad_norm": 0.7701364159584045, "learning_rate": 6.176453629072588e-05, "loss": 0.8658640861511231, "memory(GiB)": 91.52, "step": 69850, "token_acc": 0.7556488374381005, "train_speed(iter/s)": 0.139248 }, { "epoch": 0.9064161495317401, "grad_norm": 0.7596153020858765, "learning_rate": 6.17593230578637e-05, "loss": 0.9038763046264648, "memory(GiB)": 91.52, "step": 69855, "token_acc": 0.7553859964093357, "train_speed(iter/s)": 0.139247 }, { "epoch": 0.9064810279333958, "grad_norm": 0.725424587726593, "learning_rate": 6.175410968967598e-05, "loss": 0.8766164779663086, "memory(GiB)": 91.52, "step": 69860, "token_acc": 0.7372329441664681, "train_speed(iter/s)": 0.139246 }, { "epoch": 0.9065459063350515, "grad_norm": 0.6936312913894653, "learning_rate": 6.174889618622273e-05, "loss": 0.8242454528808594, "memory(GiB)": 91.52, "step": 69865, "token_acc": 0.7895028440570289, "train_speed(iter/s)": 0.139244 }, { "epoch": 0.9066107847367072, "grad_norm": 0.7429123520851135, "learning_rate": 6.174368254756395e-05, "loss": 0.848212718963623, "memory(GiB)": 91.52, "step": 69870, "token_acc": 0.7727633218777145, "train_speed(iter/s)": 0.139243 }, { "epoch": 0.9066756631383629, "grad_norm": 0.7686523795127869, "learning_rate": 6.173846877375962e-05, "loss": 0.8427179336547852, "memory(GiB)": 91.52, "step": 69875, "token_acc": 0.7566328521264144, "train_speed(iter/s)": 0.139241 }, { "epoch": 0.9067405415400186, "grad_norm": 0.7503640651702881, "learning_rate": 6.173325486486973e-05, "loss": 0.9069686889648437, "memory(GiB)": 91.52, "step": 69880, "token_acc": 0.755948902062286, "train_speed(iter/s)": 0.13924 }, { "epoch": 0.9068054199416743, "grad_norm": 0.7212452292442322, "learning_rate": 6.172804082095433e-05, "loss": 0.8928600311279297, "memory(GiB)": 91.52, "step": 69885, "token_acc": 0.7664710547184774, "train_speed(iter/s)": 0.139239 }, { "epoch": 0.90687029834333, "grad_norm": 0.6515539884567261, "learning_rate": 6.172282664207334e-05, "loss": 0.8338552474975586, "memory(GiB)": 91.52, "step": 69890, "token_acc": 0.7755050045495905, "train_speed(iter/s)": 0.139238 }, { "epoch": 0.9069351767449857, "grad_norm": 0.7619446516036987, "learning_rate": 6.171761232828686e-05, "loss": 0.8801738739013671, "memory(GiB)": 91.52, "step": 69895, "token_acc": 0.7763232783153102, "train_speed(iter/s)": 0.139237 }, { "epoch": 0.9070000551466414, "grad_norm": 0.6990941166877747, "learning_rate": 6.171239787965485e-05, "loss": 0.8553642272949219, "memory(GiB)": 91.52, "step": 69900, "token_acc": 0.7659126775381379, "train_speed(iter/s)": 0.139236 }, { "epoch": 0.9070649335482971, "grad_norm": 0.7251709699630737, "learning_rate": 6.17071832962373e-05, "loss": 0.8456319808959961, "memory(GiB)": 91.52, "step": 69905, "token_acc": 0.7685669826000848, "train_speed(iter/s)": 0.139234 }, { "epoch": 0.9071298119499528, "grad_norm": 0.6508288979530334, "learning_rate": 6.170196857809425e-05, "loss": 0.8433233261108398, "memory(GiB)": 91.52, "step": 69910, "token_acc": 0.7638519658598013, "train_speed(iter/s)": 0.139233 }, { "epoch": 0.9071946903516085, "grad_norm": 0.669716477394104, "learning_rate": 6.169675372528569e-05, "loss": 0.8121282577514648, "memory(GiB)": 91.52, "step": 69915, "token_acc": 0.7639988884257329, "train_speed(iter/s)": 0.139231 }, { "epoch": 0.9072595687532642, "grad_norm": 0.738756000995636, "learning_rate": 6.169153873787166e-05, "loss": 0.849112606048584, "memory(GiB)": 91.52, "step": 69920, "token_acc": 0.7738511060886241, "train_speed(iter/s)": 0.13923 }, { "epoch": 0.9073244471549199, "grad_norm": 0.7441701292991638, "learning_rate": 6.168632361591212e-05, "loss": 0.8271172523498536, "memory(GiB)": 91.52, "step": 69925, "token_acc": 0.7558584948174853, "train_speed(iter/s)": 0.139229 }, { "epoch": 0.9073893255565756, "grad_norm": 0.6462600827217102, "learning_rate": 6.168110835946715e-05, "loss": 0.8360027313232422, "memory(GiB)": 91.52, "step": 69930, "token_acc": 0.7735640385301463, "train_speed(iter/s)": 0.139228 }, { "epoch": 0.9074542039582313, "grad_norm": 0.7587388753890991, "learning_rate": 6.167589296859673e-05, "loss": 0.8591070175170898, "memory(GiB)": 91.52, "step": 69935, "token_acc": 0.7653869812208272, "train_speed(iter/s)": 0.139227 }, { "epoch": 0.907519082359887, "grad_norm": 0.7051774859428406, "learning_rate": 6.167067744336089e-05, "loss": 0.8742124557495117, "memory(GiB)": 91.52, "step": 69940, "token_acc": 0.7768172321878827, "train_speed(iter/s)": 0.139225 }, { "epoch": 0.9075839607615427, "grad_norm": 0.8231613636016846, "learning_rate": 6.166546178381964e-05, "loss": 0.8644981384277344, "memory(GiB)": 91.52, "step": 69945, "token_acc": 0.776129699896516, "train_speed(iter/s)": 0.139224 }, { "epoch": 0.9076488391631984, "grad_norm": 0.6835699081420898, "learning_rate": 6.166024599003298e-05, "loss": 0.8792840957641601, "memory(GiB)": 91.52, "step": 69950, "token_acc": 0.7677075693827923, "train_speed(iter/s)": 0.139222 }, { "epoch": 0.9077137175648541, "grad_norm": 0.7554329037666321, "learning_rate": 6.1655030062061e-05, "loss": 0.8722074508666993, "memory(GiB)": 91.52, "step": 69955, "token_acc": 0.7424032140248357, "train_speed(iter/s)": 0.139222 }, { "epoch": 0.9077785959665098, "grad_norm": 0.7468889355659485, "learning_rate": 6.164981399996366e-05, "loss": 0.8545145988464355, "memory(GiB)": 91.52, "step": 69960, "token_acc": 0.7678714247629204, "train_speed(iter/s)": 0.13922 }, { "epoch": 0.9078434743681655, "grad_norm": 0.7786298990249634, "learning_rate": 6.164459780380102e-05, "loss": 0.8702436447143554, "memory(GiB)": 91.52, "step": 69965, "token_acc": 0.7590404711459044, "train_speed(iter/s)": 0.139219 }, { "epoch": 0.9079083527698212, "grad_norm": 0.7699339985847473, "learning_rate": 6.16393814736331e-05, "loss": 0.8747980117797851, "memory(GiB)": 91.52, "step": 69970, "token_acc": 0.7627772768259693, "train_speed(iter/s)": 0.139218 }, { "epoch": 0.9079732311714769, "grad_norm": 0.6659730672836304, "learning_rate": 6.163416500951992e-05, "loss": 0.8046365737915039, "memory(GiB)": 91.52, "step": 69975, "token_acc": 0.761932660164731, "train_speed(iter/s)": 0.139217 }, { "epoch": 0.9080381095731326, "grad_norm": 0.8069307208061218, "learning_rate": 6.162894841152153e-05, "loss": 0.8564000129699707, "memory(GiB)": 91.52, "step": 69980, "token_acc": 0.7597705748966254, "train_speed(iter/s)": 0.139216 }, { "epoch": 0.9081029879747883, "grad_norm": 0.7299882173538208, "learning_rate": 6.162373167969792e-05, "loss": 0.8950634956359863, "memory(GiB)": 91.52, "step": 69985, "token_acc": 0.7553627987330838, "train_speed(iter/s)": 0.139215 }, { "epoch": 0.908167866376444, "grad_norm": 0.681795597076416, "learning_rate": 6.16185148141092e-05, "loss": 0.8213112831115723, "memory(GiB)": 91.52, "step": 69990, "token_acc": 0.7692799031802595, "train_speed(iter/s)": 0.139213 }, { "epoch": 0.9082327447780997, "grad_norm": 0.7750905752182007, "learning_rate": 6.161329781481532e-05, "loss": 0.8258428573608398, "memory(GiB)": 91.52, "step": 69995, "token_acc": 0.7529374065370648, "train_speed(iter/s)": 0.139212 }, { "epoch": 0.9082976231797554, "grad_norm": 0.692266583442688, "learning_rate": 6.160808068187638e-05, "loss": 0.891107177734375, "memory(GiB)": 91.52, "step": 70000, "token_acc": 0.7603662484200142, "train_speed(iter/s)": 0.13921 }, { "epoch": 0.9082976231797554, "eval_loss": 0.8503258228302002, "eval_runtime": 1702.0997, "eval_samples_per_second": 29.27, "eval_steps_per_second": 1.83, "eval_token_acc": 0.7652481929780048, "step": 70000 }, { "epoch": 0.9083625015814111, "grad_norm": 0.6277385950088501, "learning_rate": 6.160286341535237e-05, "loss": 0.8338653564453125, "memory(GiB)": 91.52, "step": 70005, "token_acc": 0.7666439693355399, "train_speed(iter/s)": 0.138704 }, { "epoch": 0.9084273799830668, "grad_norm": 0.6926554441452026, "learning_rate": 6.159764601530337e-05, "loss": 0.8427633285522461, "memory(GiB)": 91.52, "step": 70010, "token_acc": 0.7686993188452805, "train_speed(iter/s)": 0.138703 }, { "epoch": 0.9084922583847225, "grad_norm": 0.8399341702461243, "learning_rate": 6.159242848178943e-05, "loss": 0.8600588798522949, "memory(GiB)": 91.52, "step": 70015, "token_acc": 0.7632460468969492, "train_speed(iter/s)": 0.138701 }, { "epoch": 0.9085571367863782, "grad_norm": 0.6460912227630615, "learning_rate": 6.158721081487052e-05, "loss": 0.8628847122192382, "memory(GiB)": 91.52, "step": 70020, "token_acc": 0.760841533763149, "train_speed(iter/s)": 0.138699 }, { "epoch": 0.9086220151880339, "grad_norm": 0.7988932132720947, "learning_rate": 6.158199301460676e-05, "loss": 0.8668216705322266, "memory(GiB)": 91.52, "step": 70025, "token_acc": 0.7546445880452343, "train_speed(iter/s)": 0.138698 }, { "epoch": 0.9086868935896896, "grad_norm": 0.7751309275627136, "learning_rate": 6.157677508105817e-05, "loss": 0.831269645690918, "memory(GiB)": 91.52, "step": 70030, "token_acc": 0.7592796504812986, "train_speed(iter/s)": 0.138696 }, { "epoch": 0.9087517719913453, "grad_norm": 0.7470778226852417, "learning_rate": 6.157155701428479e-05, "loss": 0.8598276138305664, "memory(GiB)": 91.52, "step": 70035, "token_acc": 0.7540682778200956, "train_speed(iter/s)": 0.138695 }, { "epoch": 0.908816650393001, "grad_norm": 0.7091312408447266, "learning_rate": 6.15663388143467e-05, "loss": 0.8723898887634277, "memory(GiB)": 91.52, "step": 70040, "token_acc": 0.7726382851644158, "train_speed(iter/s)": 0.138694 }, { "epoch": 0.9088815287946567, "grad_norm": 0.7304142713546753, "learning_rate": 6.15611204813039e-05, "loss": 0.8109058380126953, "memory(GiB)": 91.52, "step": 70045, "token_acc": 0.7730573097835264, "train_speed(iter/s)": 0.138693 }, { "epoch": 0.9089464071963124, "grad_norm": 0.6969614028930664, "learning_rate": 6.155590201521648e-05, "loss": 0.8733938217163086, "memory(GiB)": 91.52, "step": 70050, "token_acc": 0.7745978301533857, "train_speed(iter/s)": 0.138692 }, { "epoch": 0.909011285597968, "grad_norm": 0.7470064163208008, "learning_rate": 6.155068341614448e-05, "loss": 0.8142601013183594, "memory(GiB)": 91.52, "step": 70055, "token_acc": 0.7885353003161222, "train_speed(iter/s)": 0.13869 }, { "epoch": 0.9090761639996237, "grad_norm": 0.8095625638961792, "learning_rate": 6.154546468414797e-05, "loss": 0.8218210220336915, "memory(GiB)": 91.52, "step": 70060, "token_acc": 0.7774379754577775, "train_speed(iter/s)": 0.138689 }, { "epoch": 0.9091410424012794, "grad_norm": 0.7752968668937683, "learning_rate": 6.154024581928697e-05, "loss": 0.8575508117675781, "memory(GiB)": 91.52, "step": 70065, "token_acc": 0.7727814630269312, "train_speed(iter/s)": 0.138688 }, { "epoch": 0.909205920802935, "grad_norm": 0.6401729583740234, "learning_rate": 6.15350268216216e-05, "loss": 0.865544605255127, "memory(GiB)": 91.52, "step": 70070, "token_acc": 0.7621752912601375, "train_speed(iter/s)": 0.138686 }, { "epoch": 0.9092707992045908, "grad_norm": 0.7496227025985718, "learning_rate": 6.152980769121186e-05, "loss": 0.8364864349365234, "memory(GiB)": 91.52, "step": 70075, "token_acc": 0.7843383420298953, "train_speed(iter/s)": 0.138685 }, { "epoch": 0.9093356776062465, "grad_norm": 0.6869655847549438, "learning_rate": 6.152458842811785e-05, "loss": 0.8203821182250977, "memory(GiB)": 91.52, "step": 70080, "token_acc": 0.7757029028747938, "train_speed(iter/s)": 0.138684 }, { "epoch": 0.9094005560079021, "grad_norm": 0.6536240577697754, "learning_rate": 6.15193690323996e-05, "loss": 0.8497827529907227, "memory(GiB)": 91.52, "step": 70085, "token_acc": 0.7804181540030597, "train_speed(iter/s)": 0.138683 }, { "epoch": 0.9094654344095578, "grad_norm": 0.7099568843841553, "learning_rate": 6.15141495041172e-05, "loss": 0.8512172698974609, "memory(GiB)": 91.52, "step": 70090, "token_acc": 0.7582432952201126, "train_speed(iter/s)": 0.138681 }, { "epoch": 0.9095303128112135, "grad_norm": 0.7234222292900085, "learning_rate": 6.15089298433307e-05, "loss": 0.8675153732299805, "memory(GiB)": 91.52, "step": 70095, "token_acc": 0.7749384441338923, "train_speed(iter/s)": 0.13868 }, { "epoch": 0.9095951912128692, "grad_norm": 0.7248008847236633, "learning_rate": 6.150371005010019e-05, "loss": 0.8989582061767578, "memory(GiB)": 91.52, "step": 70100, "token_acc": 0.7275291933007177, "train_speed(iter/s)": 0.138679 }, { "epoch": 0.909660069614525, "grad_norm": 0.7556139230728149, "learning_rate": 6.149849012448572e-05, "loss": 0.8703657150268554, "memory(GiB)": 91.52, "step": 70105, "token_acc": 0.7703558910757616, "train_speed(iter/s)": 0.138678 }, { "epoch": 0.9097249480161806, "grad_norm": 0.7715654373168945, "learning_rate": 6.149327006654737e-05, "loss": 0.8147188186645508, "memory(GiB)": 91.52, "step": 70110, "token_acc": 0.7536690533644527, "train_speed(iter/s)": 0.138677 }, { "epoch": 0.9097898264178363, "grad_norm": 0.8211331367492676, "learning_rate": 6.148804987634519e-05, "loss": 0.834269905090332, "memory(GiB)": 91.52, "step": 70115, "token_acc": 0.7711843511136681, "train_speed(iter/s)": 0.138675 }, { "epoch": 0.909854704819492, "grad_norm": 0.6817640662193298, "learning_rate": 6.14828295539393e-05, "loss": 0.8548405647277832, "memory(GiB)": 91.52, "step": 70120, "token_acc": 0.7610270751239354, "train_speed(iter/s)": 0.138674 }, { "epoch": 0.9099195832211477, "grad_norm": 0.6874873042106628, "learning_rate": 6.147760909938972e-05, "loss": 0.8799006462097168, "memory(GiB)": 91.52, "step": 70125, "token_acc": 0.7708788351534062, "train_speed(iter/s)": 0.138672 }, { "epoch": 0.9099844616228034, "grad_norm": 0.7629898190498352, "learning_rate": 6.147238851275656e-05, "loss": 0.8807003021240234, "memory(GiB)": 91.52, "step": 70130, "token_acc": 0.7668052256532066, "train_speed(iter/s)": 0.138671 }, { "epoch": 0.9100493400244591, "grad_norm": 0.6938570737838745, "learning_rate": 6.14671677940999e-05, "loss": 0.841768741607666, "memory(GiB)": 91.52, "step": 70135, "token_acc": 0.7658978991745492, "train_speed(iter/s)": 0.138669 }, { "epoch": 0.9101142184261148, "grad_norm": 0.700292706489563, "learning_rate": 6.14619469434798e-05, "loss": 0.8257214546203613, "memory(GiB)": 91.52, "step": 70140, "token_acc": 0.7693268167386108, "train_speed(iter/s)": 0.138668 }, { "epoch": 0.9101790968277705, "grad_norm": 0.7262477278709412, "learning_rate": 6.145672596095638e-05, "loss": 0.834454345703125, "memory(GiB)": 91.52, "step": 70145, "token_acc": 0.7770105321894399, "train_speed(iter/s)": 0.138667 }, { "epoch": 0.9102439752294262, "grad_norm": 0.6950844526290894, "learning_rate": 6.145150484658965e-05, "loss": 0.8629266738891601, "memory(GiB)": 91.52, "step": 70150, "token_acc": 0.7593032462391133, "train_speed(iter/s)": 0.138665 }, { "epoch": 0.9103088536310819, "grad_norm": 0.699541449546814, "learning_rate": 6.144628360043978e-05, "loss": 0.8516787528991699, "memory(GiB)": 91.52, "step": 70155, "token_acc": 0.7725551932231343, "train_speed(iter/s)": 0.138664 }, { "epoch": 0.9103737320327376, "grad_norm": 0.7342844009399414, "learning_rate": 6.144106222256679e-05, "loss": 0.8355495452880859, "memory(GiB)": 91.52, "step": 70160, "token_acc": 0.7551924473493101, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.9104386104343933, "grad_norm": 0.6412194967269897, "learning_rate": 6.14358407130308e-05, "loss": 0.8573727607727051, "memory(GiB)": 91.52, "step": 70165, "token_acc": 0.757295501481548, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.910503488836049, "grad_norm": 0.6912797689437866, "learning_rate": 6.14306190718919e-05, "loss": 0.8322903633117675, "memory(GiB)": 91.52, "step": 70170, "token_acc": 0.7691241996480324, "train_speed(iter/s)": 0.13866 }, { "epoch": 0.9105683672377047, "grad_norm": 0.7229600548744202, "learning_rate": 6.142539729921016e-05, "loss": 0.8342727661132813, "memory(GiB)": 91.52, "step": 70175, "token_acc": 0.769694723364108, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.9106332456393604, "grad_norm": 0.6613095998764038, "learning_rate": 6.142017539504569e-05, "loss": 0.9227705955505371, "memory(GiB)": 91.52, "step": 70180, "token_acc": 0.7418644650842793, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.9106981240410161, "grad_norm": 0.7054991722106934, "learning_rate": 6.141495335945856e-05, "loss": 0.8197239875793457, "memory(GiB)": 91.52, "step": 70185, "token_acc": 0.7760639708947588, "train_speed(iter/s)": 0.138656 }, { "epoch": 0.9107630024426718, "grad_norm": 0.7365036010742188, "learning_rate": 6.140973119250892e-05, "loss": 0.815266227722168, "memory(GiB)": 91.52, "step": 70190, "token_acc": 0.7679737284491184, "train_speed(iter/s)": 0.138655 }, { "epoch": 0.9108278808443275, "grad_norm": 0.7051651477813721, "learning_rate": 6.140450889425678e-05, "loss": 0.8356533050537109, "memory(GiB)": 91.52, "step": 70195, "token_acc": 0.788670440636475, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.9108927592459832, "grad_norm": 0.721714973449707, "learning_rate": 6.139928646476231e-05, "loss": 0.8150405883789062, "memory(GiB)": 91.52, "step": 70200, "token_acc": 0.7755524147678307, "train_speed(iter/s)": 0.138652 }, { "epoch": 0.9109576376476389, "grad_norm": 0.7107817530632019, "learning_rate": 6.139406390408557e-05, "loss": 0.8491813659667968, "memory(GiB)": 91.52, "step": 70205, "token_acc": 0.7510449229040076, "train_speed(iter/s)": 0.13865 }, { "epoch": 0.9110225160492946, "grad_norm": 0.7019027471542358, "learning_rate": 6.138884121228671e-05, "loss": 0.8716033935546875, "memory(GiB)": 91.52, "step": 70210, "token_acc": 0.7589681411489255, "train_speed(iter/s)": 0.138649 }, { "epoch": 0.9110873944509503, "grad_norm": 0.7475009560585022, "learning_rate": 6.138361838942577e-05, "loss": 0.8482744216918945, "memory(GiB)": 91.52, "step": 70215, "token_acc": 0.7509005693922609, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.911152272852606, "grad_norm": 0.7825855612754822, "learning_rate": 6.137839543556288e-05, "loss": 0.9084024429321289, "memory(GiB)": 91.52, "step": 70220, "token_acc": 0.7425441476310606, "train_speed(iter/s)": 0.138646 }, { "epoch": 0.9112171512542617, "grad_norm": 0.8319322466850281, "learning_rate": 6.137317235075817e-05, "loss": 0.8490226745605469, "memory(GiB)": 91.52, "step": 70225, "token_acc": 0.7751001169507744, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.9112820296559174, "grad_norm": 0.7019975781440735, "learning_rate": 6.136794913507171e-05, "loss": 0.8549559593200684, "memory(GiB)": 91.52, "step": 70230, "token_acc": 0.7547425037843409, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.9113469080575731, "grad_norm": 0.6969828009605408, "learning_rate": 6.136272578856362e-05, "loss": 0.8914553642272949, "memory(GiB)": 91.52, "step": 70235, "token_acc": 0.762494674401075, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.9114117864592288, "grad_norm": 0.7367108464241028, "learning_rate": 6.135750231129402e-05, "loss": 0.8840127944946289, "memory(GiB)": 91.52, "step": 70240, "token_acc": 0.7725310307609282, "train_speed(iter/s)": 0.138641 }, { "epoch": 0.9114766648608845, "grad_norm": 0.7034518718719482, "learning_rate": 6.135227870332301e-05, "loss": 0.8611281394958497, "memory(GiB)": 91.52, "step": 70245, "token_acc": 0.7495286332211911, "train_speed(iter/s)": 0.138639 }, { "epoch": 0.9115415432625402, "grad_norm": 0.7643048167228699, "learning_rate": 6.13470549647107e-05, "loss": 0.8880550384521484, "memory(GiB)": 91.52, "step": 70250, "token_acc": 0.7593461480248221, "train_speed(iter/s)": 0.138638 }, { "epoch": 0.9116064216641959, "grad_norm": 0.7592902779579163, "learning_rate": 6.134183109551723e-05, "loss": 0.8398201942443848, "memory(GiB)": 91.52, "step": 70255, "token_acc": 0.7582494237534878, "train_speed(iter/s)": 0.138636 }, { "epoch": 0.9116713000658516, "grad_norm": 0.6994112730026245, "learning_rate": 6.13366070958027e-05, "loss": 0.8535432815551758, "memory(GiB)": 91.52, "step": 70260, "token_acc": 0.7664279349153438, "train_speed(iter/s)": 0.138635 }, { "epoch": 0.9117361784675073, "grad_norm": 0.6912292242050171, "learning_rate": 6.13313829656272e-05, "loss": 0.8792295455932617, "memory(GiB)": 91.52, "step": 70265, "token_acc": 0.7706014795754261, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.911801056869163, "grad_norm": 0.703516960144043, "learning_rate": 6.13261587050509e-05, "loss": 0.8212192535400391, "memory(GiB)": 91.52, "step": 70270, "token_acc": 0.7744213417026284, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.9118659352708187, "grad_norm": 0.6886546015739441, "learning_rate": 6.132093431413386e-05, "loss": 0.834623146057129, "memory(GiB)": 91.52, "step": 70275, "token_acc": 0.7837768018599846, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.9119308136724744, "grad_norm": 0.6884874105453491, "learning_rate": 6.131570979293628e-05, "loss": 0.8523218154907226, "memory(GiB)": 91.52, "step": 70280, "token_acc": 0.7606316229782487, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.9119956920741301, "grad_norm": 0.762336015701294, "learning_rate": 6.13104851415182e-05, "loss": 0.8641202926635743, "memory(GiB)": 91.52, "step": 70285, "token_acc": 0.7528800049816302, "train_speed(iter/s)": 0.138628 }, { "epoch": 0.9120605704757857, "grad_norm": 0.8450523614883423, "learning_rate": 6.130526035993981e-05, "loss": 0.8172248840332031, "memory(GiB)": 91.52, "step": 70290, "token_acc": 0.760945934367235, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.9121254488774414, "grad_norm": 0.7683306932449341, "learning_rate": 6.13000354482612e-05, "loss": 0.8868536949157715, "memory(GiB)": 91.52, "step": 70295, "token_acc": 0.7463083053363544, "train_speed(iter/s)": 0.138626 }, { "epoch": 0.9121903272790971, "grad_norm": 0.7435581088066101, "learning_rate": 6.129481040654252e-05, "loss": 0.8850818634033203, "memory(GiB)": 91.52, "step": 70300, "token_acc": 0.7532483451826428, "train_speed(iter/s)": 0.138625 }, { "epoch": 0.9122552056807528, "grad_norm": 0.7321770191192627, "learning_rate": 6.128958523484388e-05, "loss": 0.807182502746582, "memory(GiB)": 91.52, "step": 70305, "token_acc": 0.7719968612466309, "train_speed(iter/s)": 0.138623 }, { "epoch": 0.9123200840824085, "grad_norm": 0.7412548661231995, "learning_rate": 6.128435993322543e-05, "loss": 0.8809606552124023, "memory(GiB)": 91.52, "step": 70310, "token_acc": 0.7530251216625017, "train_speed(iter/s)": 0.138621 }, { "epoch": 0.9123849624840642, "grad_norm": 0.6853781342506409, "learning_rate": 6.12791345017473e-05, "loss": 0.8443038940429688, "memory(GiB)": 91.52, "step": 70315, "token_acc": 0.7533226611071814, "train_speed(iter/s)": 0.13862 }, { "epoch": 0.9124498408857199, "grad_norm": 0.7184979915618896, "learning_rate": 6.127390894046958e-05, "loss": 0.8506214141845703, "memory(GiB)": 91.52, "step": 70320, "token_acc": 0.766275500992959, "train_speed(iter/s)": 0.138619 }, { "epoch": 0.9125147192873756, "grad_norm": 0.6456124186515808, "learning_rate": 6.126868324945247e-05, "loss": 0.8937208175659179, "memory(GiB)": 91.52, "step": 70325, "token_acc": 0.7487017804154302, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.9125795976890313, "grad_norm": 0.8025760650634766, "learning_rate": 6.126345742875606e-05, "loss": 0.9096400260925293, "memory(GiB)": 91.52, "step": 70330, "token_acc": 0.753731343283582, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.912644476090687, "grad_norm": 0.7716947793960571, "learning_rate": 6.125823147844052e-05, "loss": 0.9018775939941406, "memory(GiB)": 91.52, "step": 70335, "token_acc": 0.7545994239699053, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.9127093544923427, "grad_norm": 0.691888689994812, "learning_rate": 6.125300539856597e-05, "loss": 0.867519474029541, "memory(GiB)": 91.52, "step": 70340, "token_acc": 0.7701695688540795, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.9127742328939984, "grad_norm": 0.7137902975082397, "learning_rate": 6.124777918919257e-05, "loss": 0.8774105072021484, "memory(GiB)": 91.52, "step": 70345, "token_acc": 0.7608334716918479, "train_speed(iter/s)": 0.138612 }, { "epoch": 0.9128391112956541, "grad_norm": 0.7040041089057922, "learning_rate": 6.124255285038045e-05, "loss": 0.8672063827514649, "memory(GiB)": 91.52, "step": 70350, "token_acc": 0.7649786234720298, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.9129039896973098, "grad_norm": 0.7328476309776306, "learning_rate": 6.123732638218974e-05, "loss": 0.8494096755981445, "memory(GiB)": 91.52, "step": 70355, "token_acc": 0.776397744444035, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.9129688680989655, "grad_norm": 0.6928780674934387, "learning_rate": 6.123209978468061e-05, "loss": 0.8038660049438476, "memory(GiB)": 91.52, "step": 70360, "token_acc": 0.7577450482478415, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.9130337465006212, "grad_norm": 0.7526678442955017, "learning_rate": 6.122687305791321e-05, "loss": 0.860066032409668, "memory(GiB)": 91.52, "step": 70365, "token_acc": 0.7462295331079284, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.9130986249022769, "grad_norm": 0.7618358135223389, "learning_rate": 6.122164620194765e-05, "loss": 0.8681159019470215, "memory(GiB)": 91.52, "step": 70370, "token_acc": 0.7453730628442407, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.9131635033039326, "grad_norm": 0.6989964842796326, "learning_rate": 6.121641921684413e-05, "loss": 0.8128108978271484, "memory(GiB)": 91.52, "step": 70375, "token_acc": 0.7852822580645161, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.9132283817055883, "grad_norm": 0.6755953431129456, "learning_rate": 6.121119210266278e-05, "loss": 0.8448332786560059, "memory(GiB)": 91.52, "step": 70380, "token_acc": 0.7558553778518912, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.913293260107244, "grad_norm": 0.7537696361541748, "learning_rate": 6.120596485946374e-05, "loss": 0.8007573127746582, "memory(GiB)": 91.52, "step": 70385, "token_acc": 0.7909169823232324, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.9133581385088997, "grad_norm": 0.7849984169006348, "learning_rate": 6.120073748730718e-05, "loss": 0.8652969360351562, "memory(GiB)": 91.52, "step": 70390, "token_acc": 0.7690880687365749, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.9134230169105554, "grad_norm": 0.7073960304260254, "learning_rate": 6.119550998625326e-05, "loss": 0.8809985160827637, "memory(GiB)": 91.52, "step": 70395, "token_acc": 0.762109375, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.9134878953122111, "grad_norm": 0.7303362488746643, "learning_rate": 6.119028235636213e-05, "loss": 0.8322208404541016, "memory(GiB)": 91.52, "step": 70400, "token_acc": 0.7805027967299772, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.9135527737138668, "grad_norm": 0.7099728584289551, "learning_rate": 6.118505459769396e-05, "loss": 0.8489057540893554, "memory(GiB)": 91.52, "step": 70405, "token_acc": 0.7515006821282401, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.9136176521155225, "grad_norm": 0.765952467918396, "learning_rate": 6.117982671030888e-05, "loss": 0.8338533401489258, "memory(GiB)": 91.52, "step": 70410, "token_acc": 0.7678003015580499, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.9136825305171782, "grad_norm": 0.6992976665496826, "learning_rate": 6.117459869426709e-05, "loss": 0.8476093292236329, "memory(GiB)": 91.52, "step": 70415, "token_acc": 0.7688730115934214, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.9137474089188339, "grad_norm": 0.6935964822769165, "learning_rate": 6.116937054962872e-05, "loss": 0.8567188262939454, "memory(GiB)": 91.52, "step": 70420, "token_acc": 0.7502227587594493, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.9138122873204896, "grad_norm": 0.6762041449546814, "learning_rate": 6.116414227645398e-05, "loss": 0.8119741439819336, "memory(GiB)": 91.52, "step": 70425, "token_acc": 0.7816605560647724, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.9138771657221453, "grad_norm": 0.7743059396743774, "learning_rate": 6.1158913874803e-05, "loss": 0.8884471893310547, "memory(GiB)": 91.52, "step": 70430, "token_acc": 0.7414632508477991, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.913942044123801, "grad_norm": 0.7254141569137573, "learning_rate": 6.115368534473595e-05, "loss": 0.8088607788085938, "memory(GiB)": 91.52, "step": 70435, "token_acc": 0.7928977272727272, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.9140069225254567, "grad_norm": 0.7037091255187988, "learning_rate": 6.114845668631301e-05, "loss": 0.8610383033752441, "memory(GiB)": 91.52, "step": 70440, "token_acc": 0.7761659807956104, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.9140718009271124, "grad_norm": 0.707504391670227, "learning_rate": 6.114322789959435e-05, "loss": 0.8505050659179687, "memory(GiB)": 91.52, "step": 70445, "token_acc": 0.7526866565660125, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.9141366793287681, "grad_norm": 0.7893027067184448, "learning_rate": 6.113799898464012e-05, "loss": 0.8450967788696289, "memory(GiB)": 91.52, "step": 70450, "token_acc": 0.746978212333513, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.9142015577304238, "grad_norm": 0.6962154507637024, "learning_rate": 6.113276994151054e-05, "loss": 0.8218727111816406, "memory(GiB)": 91.52, "step": 70455, "token_acc": 0.7671476990810301, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.9142664361320795, "grad_norm": 0.7427415251731873, "learning_rate": 6.112754077026575e-05, "loss": 0.8837147712707519, "memory(GiB)": 91.52, "step": 70460, "token_acc": 0.7649234514724472, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.9143313145337352, "grad_norm": 0.7046576142311096, "learning_rate": 6.112231147096594e-05, "loss": 0.8587104797363281, "memory(GiB)": 91.52, "step": 70465, "token_acc": 0.7514192024258665, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.9143961929353909, "grad_norm": 0.7325541973114014, "learning_rate": 6.111708204367129e-05, "loss": 0.8105691909790039, "memory(GiB)": 91.52, "step": 70470, "token_acc": 0.7806589294588827, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.9144610713370466, "grad_norm": 0.787950873374939, "learning_rate": 6.111185248844196e-05, "loss": 0.786283016204834, "memory(GiB)": 91.52, "step": 70475, "token_acc": 0.7662309570161614, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.9145259497387023, "grad_norm": 0.7175638675689697, "learning_rate": 6.110662280533817e-05, "loss": 0.8417142868041992, "memory(GiB)": 91.52, "step": 70480, "token_acc": 0.7669990933816863, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.914590828140358, "grad_norm": 0.7228912115097046, "learning_rate": 6.110139299442004e-05, "loss": 0.8056599617004394, "memory(GiB)": 91.52, "step": 70485, "token_acc": 0.75794746391102, "train_speed(iter/s)": 0.138574 }, { "epoch": 0.9146557065420137, "grad_norm": 0.778931200504303, "learning_rate": 6.109616305574783e-05, "loss": 0.8530633926391602, "memory(GiB)": 91.52, "step": 70490, "token_acc": 0.7416493227336313, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.9147205849436694, "grad_norm": 0.6808444261550903, "learning_rate": 6.109093298938169e-05, "loss": 0.8479632377624512, "memory(GiB)": 91.52, "step": 70495, "token_acc": 0.7561707035755478, "train_speed(iter/s)": 0.138571 }, { "epoch": 0.9147854633453251, "grad_norm": 0.7762015461921692, "learning_rate": 6.108570279538179e-05, "loss": 0.8218420028686524, "memory(GiB)": 91.52, "step": 70500, "token_acc": 0.7840268251790885, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.9148503417469808, "grad_norm": 0.7065361142158508, "learning_rate": 6.108047247380835e-05, "loss": 0.8934780120849609, "memory(GiB)": 91.52, "step": 70505, "token_acc": 0.7666186532229025, "train_speed(iter/s)": 0.138567 }, { "epoch": 0.9149152201486365, "grad_norm": 0.7554187774658203, "learning_rate": 6.107524202472152e-05, "loss": 0.8670061111450196, "memory(GiB)": 91.52, "step": 70510, "token_acc": 0.7613531522296259, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.9149800985502922, "grad_norm": 0.7149054408073425, "learning_rate": 6.107001144818155e-05, "loss": 0.8253884315490723, "memory(GiB)": 91.52, "step": 70515, "token_acc": 0.7744780417566595, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.9150449769519479, "grad_norm": 0.774957001209259, "learning_rate": 6.106478074424859e-05, "loss": 0.8657382011413575, "memory(GiB)": 91.52, "step": 70520, "token_acc": 0.7666678857477234, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.9151098553536036, "grad_norm": 0.768876850605011, "learning_rate": 6.105954991298284e-05, "loss": 0.8554253578186035, "memory(GiB)": 91.52, "step": 70525, "token_acc": 0.7506102790789734, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.9151747337552592, "grad_norm": 0.6933068633079529, "learning_rate": 6.10543189544445e-05, "loss": 0.8567211151123046, "memory(GiB)": 91.52, "step": 70530, "token_acc": 0.7548800691493193, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.9152396121569149, "grad_norm": 0.7437354326248169, "learning_rate": 6.104908786869377e-05, "loss": 0.8211839675903321, "memory(GiB)": 91.52, "step": 70535, "token_acc": 0.7644979709386045, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.9153044905585705, "grad_norm": 0.6684088706970215, "learning_rate": 6.104385665579085e-05, "loss": 0.8485998153686524, "memory(GiB)": 91.52, "step": 70540, "token_acc": 0.7537502798716322, "train_speed(iter/s)": 0.138558 }, { "epoch": 0.9153693689602262, "grad_norm": 0.6599823832511902, "learning_rate": 6.103862531579592e-05, "loss": 0.8644049644470215, "memory(GiB)": 91.52, "step": 70545, "token_acc": 0.7626743340604137, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.915434247361882, "grad_norm": 0.7131385207176208, "learning_rate": 6.103339384876924e-05, "loss": 0.8642951965332031, "memory(GiB)": 91.52, "step": 70550, "token_acc": 0.7723334546778303, "train_speed(iter/s)": 0.138555 }, { "epoch": 0.9154991257635376, "grad_norm": 0.7263808846473694, "learning_rate": 6.102816225477095e-05, "loss": 0.856791877746582, "memory(GiB)": 91.52, "step": 70555, "token_acc": 0.7674426251068587, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.9155640041651933, "grad_norm": 0.841991662979126, "learning_rate": 6.1022930533861275e-05, "loss": 0.854067611694336, "memory(GiB)": 91.52, "step": 70560, "token_acc": 0.7727059852009117, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.915628882566849, "grad_norm": 0.6191303133964539, "learning_rate": 6.101769868610042e-05, "loss": 0.8337909698486328, "memory(GiB)": 91.52, "step": 70565, "token_acc": 0.7619717388460029, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.9156937609685047, "grad_norm": 0.6917495131492615, "learning_rate": 6.1012466711548624e-05, "loss": 0.820586109161377, "memory(GiB)": 91.52, "step": 70570, "token_acc": 0.7700805785712251, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.9157586393701604, "grad_norm": 0.7560496926307678, "learning_rate": 6.100723461026605e-05, "loss": 0.8662574768066407, "memory(GiB)": 91.52, "step": 70575, "token_acc": 0.7640080045740423, "train_speed(iter/s)": 0.138548 }, { "epoch": 0.9158235177718161, "grad_norm": 0.809380829334259, "learning_rate": 6.100200238231293e-05, "loss": 0.8843698501586914, "memory(GiB)": 91.52, "step": 70580, "token_acc": 0.7620047801839647, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.9158883961734718, "grad_norm": 0.6107609272003174, "learning_rate": 6.0996770027749494e-05, "loss": 0.809115982055664, "memory(GiB)": 91.52, "step": 70585, "token_acc": 0.7580996415050089, "train_speed(iter/s)": 0.138545 }, { "epoch": 0.9159532745751275, "grad_norm": 0.7283051609992981, "learning_rate": 6.09915375466359e-05, "loss": 0.8332070350646973, "memory(GiB)": 91.52, "step": 70590, "token_acc": 0.7724802624833385, "train_speed(iter/s)": 0.138543 }, { "epoch": 0.9160181529767832, "grad_norm": 0.7134395837783813, "learning_rate": 6.098630493903242e-05, "loss": 0.8097076416015625, "memory(GiB)": 91.52, "step": 70595, "token_acc": 0.7688533208715462, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.9160830313784389, "grad_norm": 0.8033645749092102, "learning_rate": 6.098107220499925e-05, "loss": 0.8120020866394043, "memory(GiB)": 91.52, "step": 70600, "token_acc": 0.774018944519621, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.9161479097800946, "grad_norm": 0.7969058156013489, "learning_rate": 6.097583934459662e-05, "loss": 0.8371479988098145, "memory(GiB)": 91.52, "step": 70605, "token_acc": 0.7606078078892922, "train_speed(iter/s)": 0.138539 }, { "epoch": 0.9162127881817503, "grad_norm": 0.6491057872772217, "learning_rate": 6.097060635788472e-05, "loss": 0.8279546737670899, "memory(GiB)": 91.52, "step": 70610, "token_acc": 0.7811044577511643, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.916277666583406, "grad_norm": 0.7091308236122131, "learning_rate": 6.096537324492378e-05, "loss": 0.7960067749023437, "memory(GiB)": 91.52, "step": 70615, "token_acc": 0.7736414048059149, "train_speed(iter/s)": 0.138536 }, { "epoch": 0.9163425449850617, "grad_norm": 0.8191639184951782, "learning_rate": 6.0960140005774056e-05, "loss": 0.8693655014038086, "memory(GiB)": 91.52, "step": 70620, "token_acc": 0.7810186556114895, "train_speed(iter/s)": 0.138535 }, { "epoch": 0.9164074233867174, "grad_norm": 0.7759312391281128, "learning_rate": 6.0954906640495724e-05, "loss": 0.8942092895507813, "memory(GiB)": 91.52, "step": 70625, "token_acc": 0.7541965181877963, "train_speed(iter/s)": 0.138534 }, { "epoch": 0.9164723017883731, "grad_norm": 0.6971482038497925, "learning_rate": 6.094967314914905e-05, "loss": 0.8656571388244629, "memory(GiB)": 91.52, "step": 70630, "token_acc": 0.7503713633564001, "train_speed(iter/s)": 0.138533 }, { "epoch": 0.9165371801900288, "grad_norm": 0.6799042820930481, "learning_rate": 6.094443953179422e-05, "loss": 0.8669689178466797, "memory(GiB)": 91.52, "step": 70635, "token_acc": 0.7742226368159204, "train_speed(iter/s)": 0.138531 }, { "epoch": 0.9166020585916845, "grad_norm": 0.6974301934242249, "learning_rate": 6.0939205788491506e-05, "loss": 0.8294025421142578, "memory(GiB)": 91.52, "step": 70640, "token_acc": 0.7539763378516023, "train_speed(iter/s)": 0.138531 }, { "epoch": 0.9166669369933402, "grad_norm": 0.7012091875076294, "learning_rate": 6.093397191930109e-05, "loss": 0.9072494506835938, "memory(GiB)": 91.52, "step": 70645, "token_acc": 0.7491136757939202, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.9167318153949959, "grad_norm": 0.7074299454689026, "learning_rate": 6.092873792428325e-05, "loss": 0.8732309341430664, "memory(GiB)": 91.52, "step": 70650, "token_acc": 0.7675296475697344, "train_speed(iter/s)": 0.138528 }, { "epoch": 0.9167966937966516, "grad_norm": 0.7314314246177673, "learning_rate": 6.092350380349819e-05, "loss": 0.8867626190185547, "memory(GiB)": 91.52, "step": 70655, "token_acc": 0.7620578778135049, "train_speed(iter/s)": 0.138527 }, { "epoch": 0.9168615721983073, "grad_norm": 0.7366589903831482, "learning_rate": 6.091826955700616e-05, "loss": 0.8414024353027344, "memory(GiB)": 91.52, "step": 70660, "token_acc": 0.7458057766319172, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.916926450599963, "grad_norm": 0.7011892795562744, "learning_rate": 6.091303518486739e-05, "loss": 0.8607949256896973, "memory(GiB)": 91.52, "step": 70665, "token_acc": 0.7581662095251666, "train_speed(iter/s)": 0.138524 }, { "epoch": 0.9169913290016187, "grad_norm": 0.7345300316810608, "learning_rate": 6.090780068714208e-05, "loss": 0.844246768951416, "memory(GiB)": 91.52, "step": 70670, "token_acc": 0.7637778412335808, "train_speed(iter/s)": 0.138522 }, { "epoch": 0.9170562074032744, "grad_norm": 0.9626737236976624, "learning_rate": 6.0902566063890546e-05, "loss": 0.8660898208618164, "memory(GiB)": 91.52, "step": 70675, "token_acc": 0.7543988553308326, "train_speed(iter/s)": 0.138521 }, { "epoch": 0.9171210858049301, "grad_norm": 0.5952730774879456, "learning_rate": 6.089733131517296e-05, "loss": 0.833595085144043, "memory(GiB)": 91.52, "step": 70680, "token_acc": 0.7727572831558966, "train_speed(iter/s)": 0.138519 }, { "epoch": 0.9171859642065858, "grad_norm": 0.6582485437393188, "learning_rate": 6.089209644104958e-05, "loss": 0.8475896835327148, "memory(GiB)": 91.52, "step": 70685, "token_acc": 0.760190940622961, "train_speed(iter/s)": 0.138517 }, { "epoch": 0.9172508426082415, "grad_norm": 0.7966558933258057, "learning_rate": 6.0886861441580666e-05, "loss": 0.8884407043457031, "memory(GiB)": 91.52, "step": 70690, "token_acc": 0.7561584922366632, "train_speed(iter/s)": 0.138516 }, { "epoch": 0.9173157210098972, "grad_norm": 0.695160448551178, "learning_rate": 6.0881626316826454e-05, "loss": 0.8276718139648438, "memory(GiB)": 91.52, "step": 70695, "token_acc": 0.7751300614836104, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.9173805994115529, "grad_norm": 0.700812578201294, "learning_rate": 6.0876391066847184e-05, "loss": 0.8351899147033691, "memory(GiB)": 91.52, "step": 70700, "token_acc": 0.7612326182656698, "train_speed(iter/s)": 0.138513 }, { "epoch": 0.9174454778132086, "grad_norm": 0.7632697820663452, "learning_rate": 6.0871155691703094e-05, "loss": 0.859923267364502, "memory(GiB)": 91.52, "step": 70705, "token_acc": 0.7551793835270338, "train_speed(iter/s)": 0.138512 }, { "epoch": 0.9175103562148643, "grad_norm": 0.7345230579376221, "learning_rate": 6.0865920191454475e-05, "loss": 0.9017560005187988, "memory(GiB)": 91.52, "step": 70710, "token_acc": 0.7520807567603703, "train_speed(iter/s)": 0.13851 }, { "epoch": 0.91757523461652, "grad_norm": 0.7724908590316772, "learning_rate": 6.0860684566161505e-05, "loss": 0.8552303314208984, "memory(GiB)": 91.52, "step": 70715, "token_acc": 0.7691645408163266, "train_speed(iter/s)": 0.138509 }, { "epoch": 0.9176401130181757, "grad_norm": 0.6716448664665222, "learning_rate": 6.0855448815884495e-05, "loss": 0.8319606781005859, "memory(GiB)": 91.52, "step": 70720, "token_acc": 0.7724771285358758, "train_speed(iter/s)": 0.138508 }, { "epoch": 0.9177049914198314, "grad_norm": 0.6942352056503296, "learning_rate": 6.085021294068368e-05, "loss": 0.8098738670349122, "memory(GiB)": 91.52, "step": 70725, "token_acc": 0.7897645922818233, "train_speed(iter/s)": 0.138506 }, { "epoch": 0.9177698698214871, "grad_norm": 0.7115210294723511, "learning_rate": 6.0844976940619326e-05, "loss": 0.8084661483764648, "memory(GiB)": 91.52, "step": 70730, "token_acc": 0.771265289054605, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.9178347482231428, "grad_norm": 0.7245924472808838, "learning_rate": 6.083974081575166e-05, "loss": 0.8308080673217774, "memory(GiB)": 91.52, "step": 70735, "token_acc": 0.748306958138882, "train_speed(iter/s)": 0.138504 }, { "epoch": 0.9178996266247985, "grad_norm": 0.7401642799377441, "learning_rate": 6.083450456614095e-05, "loss": 0.8359159469604492, "memory(GiB)": 91.52, "step": 70740, "token_acc": 0.7551810432470478, "train_speed(iter/s)": 0.138503 }, { "epoch": 0.9179645050264542, "grad_norm": 0.6527836918830872, "learning_rate": 6.0829268191847456e-05, "loss": 0.826622200012207, "memory(GiB)": 91.52, "step": 70745, "token_acc": 0.7764171354392038, "train_speed(iter/s)": 0.138502 }, { "epoch": 0.9180293834281099, "grad_norm": 0.7238606214523315, "learning_rate": 6.082403169293145e-05, "loss": 0.8710742950439453, "memory(GiB)": 91.52, "step": 70750, "token_acc": 0.7579588628324381, "train_speed(iter/s)": 0.138501 }, { "epoch": 0.9180942618297656, "grad_norm": 0.689405620098114, "learning_rate": 6.081879506945319e-05, "loss": 0.8565481185913086, "memory(GiB)": 91.52, "step": 70755, "token_acc": 0.7819808615746019, "train_speed(iter/s)": 0.1385 }, { "epoch": 0.9181591402314213, "grad_norm": 0.8007175922393799, "learning_rate": 6.081355832147293e-05, "loss": 0.8924362182617187, "memory(GiB)": 91.52, "step": 70760, "token_acc": 0.7460236820508812, "train_speed(iter/s)": 0.138499 }, { "epoch": 0.918224018633077, "grad_norm": 0.759489893913269, "learning_rate": 6.080832144905093e-05, "loss": 0.8425697326660156, "memory(GiB)": 91.52, "step": 70765, "token_acc": 0.765094375023453, "train_speed(iter/s)": 0.138498 }, { "epoch": 0.9182888970347326, "grad_norm": 0.7379432916641235, "learning_rate": 6.080308445224747e-05, "loss": 0.8677888870239258, "memory(GiB)": 91.52, "step": 70770, "token_acc": 0.7732235894025052, "train_speed(iter/s)": 0.138497 }, { "epoch": 0.9183537754363883, "grad_norm": 0.6935157179832458, "learning_rate": 6.0797847331122795e-05, "loss": 0.8575725555419922, "memory(GiB)": 91.52, "step": 70775, "token_acc": 0.7660231078995898, "train_speed(iter/s)": 0.138496 }, { "epoch": 0.918418653838044, "grad_norm": 0.6466492414474487, "learning_rate": 6.07926100857372e-05, "loss": 0.8098854064941406, "memory(GiB)": 91.52, "step": 70780, "token_acc": 0.7748906182158245, "train_speed(iter/s)": 0.138495 }, { "epoch": 0.9184835322396997, "grad_norm": 0.6813456416130066, "learning_rate": 6.078737271615094e-05, "loss": 0.8566837310791016, "memory(GiB)": 91.52, "step": 70785, "token_acc": 0.7597849549953163, "train_speed(iter/s)": 0.138493 }, { "epoch": 0.9185484106413554, "grad_norm": 0.7616624236106873, "learning_rate": 6.0782135222424283e-05, "loss": 0.9094722747802735, "memory(GiB)": 91.52, "step": 70790, "token_acc": 0.7532190367788291, "train_speed(iter/s)": 0.138492 }, { "epoch": 0.9186132890430111, "grad_norm": 0.7416688203811646, "learning_rate": 6.077689760461752e-05, "loss": 0.862267017364502, "memory(GiB)": 91.52, "step": 70795, "token_acc": 0.7631921497947332, "train_speed(iter/s)": 0.138491 }, { "epoch": 0.9186781674446668, "grad_norm": 0.7060362696647644, "learning_rate": 6.0771659862790905e-05, "loss": 0.8538544654846192, "memory(GiB)": 91.52, "step": 70800, "token_acc": 0.7422684145723184, "train_speed(iter/s)": 0.138489 }, { "epoch": 0.9187430458463225, "grad_norm": 0.6912962794303894, "learning_rate": 6.076642199700473e-05, "loss": 0.8472768783569335, "memory(GiB)": 91.52, "step": 70805, "token_acc": 0.7535374868004224, "train_speed(iter/s)": 0.138488 }, { "epoch": 0.9188079242479782, "grad_norm": 0.6689486503601074, "learning_rate": 6.076118400731925e-05, "loss": 0.8652551651000977, "memory(GiB)": 91.52, "step": 70810, "token_acc": 0.7416711342574864, "train_speed(iter/s)": 0.138486 }, { "epoch": 0.9188728026496339, "grad_norm": 0.664498507976532, "learning_rate": 6.0755945893794765e-05, "loss": 0.8677644729614258, "memory(GiB)": 91.52, "step": 70815, "token_acc": 0.759301502399119, "train_speed(iter/s)": 0.138485 }, { "epoch": 0.9189376810512896, "grad_norm": 0.6434130072593689, "learning_rate": 6.0750707656491544e-05, "loss": 0.8758370399475097, "memory(GiB)": 91.52, "step": 70820, "token_acc": 0.7588499252118996, "train_speed(iter/s)": 0.138482 }, { "epoch": 0.9190025594529453, "grad_norm": 0.7723864912986755, "learning_rate": 6.074546929546987e-05, "loss": 0.8431594848632813, "memory(GiB)": 91.52, "step": 70825, "token_acc": 0.7616375041267746, "train_speed(iter/s)": 0.138481 }, { "epoch": 0.919067437854601, "grad_norm": 0.7548149228096008, "learning_rate": 6.0740230810790034e-05, "loss": 0.8816940307617187, "memory(GiB)": 91.52, "step": 70830, "token_acc": 0.7666745786850225, "train_speed(iter/s)": 0.13848 }, { "epoch": 0.9191323162562567, "grad_norm": 0.659684419631958, "learning_rate": 6.073499220251231e-05, "loss": 0.8783185958862305, "memory(GiB)": 91.52, "step": 70835, "token_acc": 0.7606162700919492, "train_speed(iter/s)": 0.138478 }, { "epoch": 0.9191971946579124, "grad_norm": 0.6894384026527405, "learning_rate": 6.0729753470696995e-05, "loss": 0.8354202270507812, "memory(GiB)": 91.52, "step": 70840, "token_acc": 0.7752661412968339, "train_speed(iter/s)": 0.138477 }, { "epoch": 0.9192620730595681, "grad_norm": 0.6940687894821167, "learning_rate": 6.072451461540435e-05, "loss": 0.8350656509399415, "memory(GiB)": 91.52, "step": 70845, "token_acc": 0.746843273534442, "train_speed(iter/s)": 0.138476 }, { "epoch": 0.9193269514612238, "grad_norm": 0.6245447397232056, "learning_rate": 6.071927563669468e-05, "loss": 0.8535028457641601, "memory(GiB)": 91.52, "step": 70850, "token_acc": 0.7835151672116287, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.9193918298628795, "grad_norm": 0.6449307799339294, "learning_rate": 6.071403653462829e-05, "loss": 0.8486034393310546, "memory(GiB)": 91.52, "step": 70855, "token_acc": 0.771623591311458, "train_speed(iter/s)": 0.138473 }, { "epoch": 0.9194567082645352, "grad_norm": 0.6665225625038147, "learning_rate": 6.070879730926545e-05, "loss": 0.8470022201538085, "memory(GiB)": 91.52, "step": 70860, "token_acc": 0.7690501600853789, "train_speed(iter/s)": 0.138472 }, { "epoch": 0.9195215866661909, "grad_norm": 0.7168011665344238, "learning_rate": 6.070355796066647e-05, "loss": 0.860880184173584, "memory(GiB)": 91.52, "step": 70865, "token_acc": 0.762008870010619, "train_speed(iter/s)": 0.138471 }, { "epoch": 0.9195864650678466, "grad_norm": 0.7279468178749084, "learning_rate": 6.069831848889161e-05, "loss": 0.8597329139709473, "memory(GiB)": 91.52, "step": 70870, "token_acc": 0.766823676669192, "train_speed(iter/s)": 0.138469 }, { "epoch": 0.9196513434695023, "grad_norm": 0.7059509754180908, "learning_rate": 6.069307889400122e-05, "loss": 0.9105655670166015, "memory(GiB)": 91.52, "step": 70875, "token_acc": 0.7618336356200434, "train_speed(iter/s)": 0.138468 }, { "epoch": 0.919716221871158, "grad_norm": 0.7291401624679565, "learning_rate": 6.068783917605555e-05, "loss": 0.8526329040527344, "memory(GiB)": 91.52, "step": 70880, "token_acc": 0.7644004189212777, "train_speed(iter/s)": 0.138467 }, { "epoch": 0.9197811002728137, "grad_norm": 0.7198129892349243, "learning_rate": 6.068259933511491e-05, "loss": 0.8526886940002442, "memory(GiB)": 91.52, "step": 70885, "token_acc": 0.7868302363341151, "train_speed(iter/s)": 0.138465 }, { "epoch": 0.9198459786744694, "grad_norm": 0.7242595553398132, "learning_rate": 6.067735937123962e-05, "loss": 0.8387908935546875, "memory(GiB)": 91.52, "step": 70890, "token_acc": 0.7638102759861348, "train_speed(iter/s)": 0.138464 }, { "epoch": 0.9199108570761251, "grad_norm": 0.7525068521499634, "learning_rate": 6.067211928448996e-05, "loss": 0.8828344345092773, "memory(GiB)": 91.52, "step": 70895, "token_acc": 0.7624270782693242, "train_speed(iter/s)": 0.138462 }, { "epoch": 0.9199757354777808, "grad_norm": 0.7019964456558228, "learning_rate": 6.066687907492623e-05, "loss": 0.8145263671875, "memory(GiB)": 91.52, "step": 70900, "token_acc": 0.7665246408506466, "train_speed(iter/s)": 0.138461 }, { "epoch": 0.9200406138794365, "grad_norm": 0.7267746925354004, "learning_rate": 6.066163874260874e-05, "loss": 0.8485010147094727, "memory(GiB)": 91.52, "step": 70905, "token_acc": 0.7684331408112947, "train_speed(iter/s)": 0.13846 }, { "epoch": 0.9201054922810922, "grad_norm": 0.749591588973999, "learning_rate": 6.065639828759782e-05, "loss": 0.8394533157348633, "memory(GiB)": 91.52, "step": 70910, "token_acc": 0.77869429241595, "train_speed(iter/s)": 0.138458 }, { "epoch": 0.9201703706827479, "grad_norm": 0.787621021270752, "learning_rate": 6.065115770995372e-05, "loss": 0.9042747497558594, "memory(GiB)": 91.52, "step": 70915, "token_acc": 0.7574118222385231, "train_speed(iter/s)": 0.138457 }, { "epoch": 0.9202352490844036, "grad_norm": 0.6464123129844666, "learning_rate": 6.064591700973681e-05, "loss": 0.8193988800048828, "memory(GiB)": 91.52, "step": 70920, "token_acc": 0.7711042539122769, "train_speed(iter/s)": 0.138455 }, { "epoch": 0.9203001274860593, "grad_norm": 0.7755818367004395, "learning_rate": 6.064067618700736e-05, "loss": 0.8729055404663086, "memory(GiB)": 91.52, "step": 70925, "token_acc": 0.7452993874154598, "train_speed(iter/s)": 0.138454 }, { "epoch": 0.920365005887715, "grad_norm": 0.6815967559814453, "learning_rate": 6.06354352418257e-05, "loss": 0.8605981826782226, "memory(GiB)": 91.52, "step": 70930, "token_acc": 0.7752817814083179, "train_speed(iter/s)": 0.138452 }, { "epoch": 0.9204298842893707, "grad_norm": 0.7745524644851685, "learning_rate": 6.063019417425213e-05, "loss": 0.8718292236328125, "memory(GiB)": 91.52, "step": 70935, "token_acc": 0.7666567342073898, "train_speed(iter/s)": 0.138451 }, { "epoch": 0.9204947626910264, "grad_norm": 0.709029495716095, "learning_rate": 6.062495298434696e-05, "loss": 0.8569538116455078, "memory(GiB)": 91.52, "step": 70940, "token_acc": 0.7444401092469762, "train_speed(iter/s)": 0.13845 }, { "epoch": 0.9205596410926821, "grad_norm": 0.695615291595459, "learning_rate": 6.0619711672170546e-05, "loss": 0.8488873481750489, "memory(GiB)": 91.52, "step": 70945, "token_acc": 0.7676543991001441, "train_speed(iter/s)": 0.138449 }, { "epoch": 0.9206245194943378, "grad_norm": 0.6838041543960571, "learning_rate": 6.061447023778314e-05, "loss": 0.880772590637207, "memory(GiB)": 91.52, "step": 70950, "token_acc": 0.7514647451006802, "train_speed(iter/s)": 0.138448 }, { "epoch": 0.9206893978959935, "grad_norm": 0.7357181310653687, "learning_rate": 6.060922868124511e-05, "loss": 0.9007863998413086, "memory(GiB)": 91.52, "step": 70955, "token_acc": 0.7417987706243934, "train_speed(iter/s)": 0.138446 }, { "epoch": 0.9207542762976492, "grad_norm": 0.7232347130775452, "learning_rate": 6.0603987002616745e-05, "loss": 0.8466567039489746, "memory(GiB)": 91.52, "step": 70960, "token_acc": 0.7721449424695175, "train_speed(iter/s)": 0.138445 }, { "epoch": 0.9208191546993049, "grad_norm": 0.7353493571281433, "learning_rate": 6.059874520195841e-05, "loss": 0.855069351196289, "memory(GiB)": 91.52, "step": 70965, "token_acc": 0.7514245570213098, "train_speed(iter/s)": 0.138444 }, { "epoch": 0.9208840331009606, "grad_norm": 0.7087655663490295, "learning_rate": 6.0593503279330375e-05, "loss": 0.8364511489868164, "memory(GiB)": 91.52, "step": 70970, "token_acc": 0.7705253166867448, "train_speed(iter/s)": 0.138443 }, { "epoch": 0.9209489115026163, "grad_norm": 0.7763378024101257, "learning_rate": 6.058826123479299e-05, "loss": 0.8833907127380372, "memory(GiB)": 91.52, "step": 70975, "token_acc": 0.7688657154453776, "train_speed(iter/s)": 0.138441 }, { "epoch": 0.921013789904272, "grad_norm": 0.649917483329773, "learning_rate": 6.0583019068406575e-05, "loss": 0.8729021072387695, "memory(GiB)": 91.52, "step": 70980, "token_acc": 0.7622113517013165, "train_speed(iter/s)": 0.13844 }, { "epoch": 0.9210786683059277, "grad_norm": 0.720427930355072, "learning_rate": 6.057777678023145e-05, "loss": 0.8692965507507324, "memory(GiB)": 91.52, "step": 70985, "token_acc": 0.7740727403673029, "train_speed(iter/s)": 0.138438 }, { "epoch": 0.9211435467075834, "grad_norm": 0.7341895699501038, "learning_rate": 6.057253437032797e-05, "loss": 0.8546430587768554, "memory(GiB)": 91.52, "step": 70990, "token_acc": 0.7747911232089524, "train_speed(iter/s)": 0.138437 }, { "epoch": 0.9212084251092391, "grad_norm": 0.7389283776283264, "learning_rate": 6.0567291838756426e-05, "loss": 0.8731228828430175, "memory(GiB)": 91.52, "step": 70995, "token_acc": 0.754726368159204, "train_speed(iter/s)": 0.138435 }, { "epoch": 0.9212733035108948, "grad_norm": 0.7183566689491272, "learning_rate": 6.056204918557719e-05, "loss": 0.8646602630615234, "memory(GiB)": 91.52, "step": 71000, "token_acc": 0.7648593697051846, "train_speed(iter/s)": 0.138434 }, { "epoch": 0.9213381819125503, "grad_norm": 0.6408079862594604, "learning_rate": 6.055680641085055e-05, "loss": 0.8702609062194824, "memory(GiB)": 91.52, "step": 71005, "token_acc": 0.7574622911441591, "train_speed(iter/s)": 0.138433 }, { "epoch": 0.921403060314206, "grad_norm": 0.7834725379943848, "learning_rate": 6.0551563514636866e-05, "loss": 0.8484785079956054, "memory(GiB)": 91.52, "step": 71010, "token_acc": 0.7587833643699565, "train_speed(iter/s)": 0.138431 }, { "epoch": 0.9214679387158617, "grad_norm": 0.7275198698043823, "learning_rate": 6.0546320496996475e-05, "loss": 0.8285722732543945, "memory(GiB)": 91.52, "step": 71015, "token_acc": 0.7544930955520088, "train_speed(iter/s)": 0.138431 }, { "epoch": 0.9215328171175174, "grad_norm": 0.738427996635437, "learning_rate": 6.0541077357989706e-05, "loss": 0.8983650207519531, "memory(GiB)": 91.52, "step": 71020, "token_acc": 0.7681846711765116, "train_speed(iter/s)": 0.138429 }, { "epoch": 0.9215976955191731, "grad_norm": 0.7507554888725281, "learning_rate": 6.053583409767689e-05, "loss": 0.8377803802490235, "memory(GiB)": 91.52, "step": 71025, "token_acc": 0.7658197175297702, "train_speed(iter/s)": 0.138428 }, { "epoch": 0.9216625739208288, "grad_norm": 0.6985677480697632, "learning_rate": 6.053059071611837e-05, "loss": 0.867741584777832, "memory(GiB)": 91.52, "step": 71030, "token_acc": 0.7714144411473788, "train_speed(iter/s)": 0.138427 }, { "epoch": 0.9217274523224845, "grad_norm": 0.7087802886962891, "learning_rate": 6.052534721337451e-05, "loss": 0.8142443656921386, "memory(GiB)": 91.52, "step": 71035, "token_acc": 0.777097698923065, "train_speed(iter/s)": 0.138425 }, { "epoch": 0.9217923307241402, "grad_norm": 0.669389009475708, "learning_rate": 6.052010358950561e-05, "loss": 0.8242501258850098, "memory(GiB)": 91.52, "step": 71040, "token_acc": 0.7800339950221574, "train_speed(iter/s)": 0.138424 }, { "epoch": 0.9218572091257959, "grad_norm": 0.7576184272766113, "learning_rate": 6.0514859844572046e-05, "loss": 0.8766820907592774, "memory(GiB)": 91.52, "step": 71045, "token_acc": 0.7717992749475291, "train_speed(iter/s)": 0.138423 }, { "epoch": 0.9219220875274516, "grad_norm": 0.7394291758537292, "learning_rate": 6.050961597863415e-05, "loss": 0.852420711517334, "memory(GiB)": 91.52, "step": 71050, "token_acc": 0.7703492145917175, "train_speed(iter/s)": 0.138422 }, { "epoch": 0.9219869659291073, "grad_norm": 0.7798939347267151, "learning_rate": 6.050437199175227e-05, "loss": 0.8722843170166016, "memory(GiB)": 91.52, "step": 71055, "token_acc": 0.7646400447695567, "train_speed(iter/s)": 0.138421 }, { "epoch": 0.922051844330763, "grad_norm": 0.6921105980873108, "learning_rate": 6.049912788398675e-05, "loss": 0.8149906158447265, "memory(GiB)": 91.52, "step": 71060, "token_acc": 0.7727404077701076, "train_speed(iter/s)": 0.138419 }, { "epoch": 0.9221167227324187, "grad_norm": 0.6771863698959351, "learning_rate": 6.0493883655397944e-05, "loss": 0.8771469116210937, "memory(GiB)": 91.52, "step": 71065, "token_acc": 0.7758679606342264, "train_speed(iter/s)": 0.138417 }, { "epoch": 0.9221816011340744, "grad_norm": 0.6898292899131775, "learning_rate": 6.048863930604621e-05, "loss": 0.8332710266113281, "memory(GiB)": 91.52, "step": 71070, "token_acc": 0.7998063603187607, "train_speed(iter/s)": 0.138416 }, { "epoch": 0.9222464795357301, "grad_norm": 0.7553849220275879, "learning_rate": 6.048339483599187e-05, "loss": 0.8470208168029785, "memory(GiB)": 91.52, "step": 71075, "token_acc": 0.7625639703987335, "train_speed(iter/s)": 0.138415 }, { "epoch": 0.9223113579373858, "grad_norm": 0.829831063747406, "learning_rate": 6.04781502452953e-05, "loss": 0.8661880493164062, "memory(GiB)": 91.52, "step": 71080, "token_acc": 0.7629844097995546, "train_speed(iter/s)": 0.138414 }, { "epoch": 0.9223762363390415, "grad_norm": 0.7231758236885071, "learning_rate": 6.0472905534016864e-05, "loss": 0.8212755203247071, "memory(GiB)": 91.52, "step": 71085, "token_acc": 0.7723449385605617, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.9224411147406972, "grad_norm": 0.660781741142273, "learning_rate": 6.046766070221691e-05, "loss": 0.8244606018066406, "memory(GiB)": 91.52, "step": 71090, "token_acc": 0.7933136835305044, "train_speed(iter/s)": 0.138412 }, { "epoch": 0.9225059931423529, "grad_norm": 0.7524054050445557, "learning_rate": 6.046241574995578e-05, "loss": 0.8386343002319336, "memory(GiB)": 91.52, "step": 71095, "token_acc": 0.782186697116757, "train_speed(iter/s)": 0.138411 }, { "epoch": 0.9225708715440086, "grad_norm": 0.7642073631286621, "learning_rate": 6.0457170677293825e-05, "loss": 0.9230081558227539, "memory(GiB)": 91.52, "step": 71100, "token_acc": 0.753201536737634, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.9226357499456643, "grad_norm": 0.8101956844329834, "learning_rate": 6.045192548429145e-05, "loss": 0.859919548034668, "memory(GiB)": 91.52, "step": 71105, "token_acc": 0.7469043407266295, "train_speed(iter/s)": 0.138408 }, { "epoch": 0.92270062834732, "grad_norm": 0.7977537512779236, "learning_rate": 6.044668017100898e-05, "loss": 0.8556754112243652, "memory(GiB)": 91.52, "step": 71110, "token_acc": 0.7624521072796935, "train_speed(iter/s)": 0.138407 }, { "epoch": 0.9227655067489757, "grad_norm": 0.7424387335777283, "learning_rate": 6.044143473750678e-05, "loss": 0.8464897155761719, "memory(GiB)": 91.52, "step": 71115, "token_acc": 0.7421896641940349, "train_speed(iter/s)": 0.138406 }, { "epoch": 0.9228303851506314, "grad_norm": 0.779412031173706, "learning_rate": 6.043618918384524e-05, "loss": 0.8258762359619141, "memory(GiB)": 91.52, "step": 71120, "token_acc": 0.7886572952011633, "train_speed(iter/s)": 0.138405 }, { "epoch": 0.9228952635522871, "grad_norm": 0.7402225136756897, "learning_rate": 6.043094351008469e-05, "loss": 0.8508189201354981, "memory(GiB)": 91.52, "step": 71125, "token_acc": 0.7756709181310028, "train_speed(iter/s)": 0.138403 }, { "epoch": 0.9229601419539428, "grad_norm": 0.6054394841194153, "learning_rate": 6.042569771628552e-05, "loss": 0.8524438858032226, "memory(GiB)": 91.52, "step": 71130, "token_acc": 0.7687372967479674, "train_speed(iter/s)": 0.138402 }, { "epoch": 0.9230250203555985, "grad_norm": 0.6646188497543335, "learning_rate": 6.0420451802508086e-05, "loss": 0.8732712745666504, "memory(GiB)": 91.52, "step": 71135, "token_acc": 0.7546464145326871, "train_speed(iter/s)": 0.138401 }, { "epoch": 0.9230898987572542, "grad_norm": 0.6534129977226257, "learning_rate": 6.0415205768812766e-05, "loss": 0.8546504020690918, "memory(GiB)": 91.52, "step": 71140, "token_acc": 0.7577060696582739, "train_speed(iter/s)": 0.1384 }, { "epoch": 0.9231547771589099, "grad_norm": 0.6857154369354248, "learning_rate": 6.040995961525994e-05, "loss": 0.8830436706542969, "memory(GiB)": 91.52, "step": 71145, "token_acc": 0.7710008554319931, "train_speed(iter/s)": 0.138398 }, { "epoch": 0.9232196555605656, "grad_norm": 0.7006729245185852, "learning_rate": 6.0404713341909955e-05, "loss": 0.8690764427185058, "memory(GiB)": 91.52, "step": 71150, "token_acc": 0.7722405853603412, "train_speed(iter/s)": 0.138397 }, { "epoch": 0.9232845339622213, "grad_norm": 0.6762166023254395, "learning_rate": 6.0399466948823204e-05, "loss": 0.8833452224731445, "memory(GiB)": 91.52, "step": 71155, "token_acc": 0.7413668499607227, "train_speed(iter/s)": 0.138395 }, { "epoch": 0.923349412363877, "grad_norm": 0.6799482703208923, "learning_rate": 6.039422043606004e-05, "loss": 0.8207826614379883, "memory(GiB)": 91.52, "step": 71160, "token_acc": 0.7825183997668148, "train_speed(iter/s)": 0.138394 }, { "epoch": 0.9234142907655327, "grad_norm": 0.7255700826644897, "learning_rate": 6.038897380368089e-05, "loss": 0.8543231964111329, "memory(GiB)": 91.52, "step": 71165, "token_acc": 0.7584762364003496, "train_speed(iter/s)": 0.138393 }, { "epoch": 0.9234791691671884, "grad_norm": 0.6967780590057373, "learning_rate": 6.0383727051746066e-05, "loss": 0.8490265846252442, "memory(GiB)": 91.52, "step": 71170, "token_acc": 0.7839091837050326, "train_speed(iter/s)": 0.138391 }, { "epoch": 0.9235440475688441, "grad_norm": 0.5745630860328674, "learning_rate": 6.0378480180316e-05, "loss": 0.8107476234436035, "memory(GiB)": 91.52, "step": 71175, "token_acc": 0.7926763232706105, "train_speed(iter/s)": 0.13839 }, { "epoch": 0.9236089259704998, "grad_norm": 0.7937750816345215, "learning_rate": 6.037323318945104e-05, "loss": 0.8338220596313477, "memory(GiB)": 91.52, "step": 71180, "token_acc": 0.771343797856049, "train_speed(iter/s)": 0.138388 }, { "epoch": 0.9236738043721555, "grad_norm": 0.6686110496520996, "learning_rate": 6.036798607921159e-05, "loss": 0.8510639190673828, "memory(GiB)": 91.52, "step": 71185, "token_acc": 0.7636612527285508, "train_speed(iter/s)": 0.138387 }, { "epoch": 0.9237386827738112, "grad_norm": 0.7248358130455017, "learning_rate": 6.036273884965802e-05, "loss": 0.7986763477325439, "memory(GiB)": 91.52, "step": 71190, "token_acc": 0.7783166904422254, "train_speed(iter/s)": 0.138385 }, { "epoch": 0.9238035611754669, "grad_norm": 0.7078704833984375, "learning_rate": 6.0357491500850705e-05, "loss": 0.8485191345214844, "memory(GiB)": 91.52, "step": 71195, "token_acc": 0.7846759756534193, "train_speed(iter/s)": 0.138384 }, { "epoch": 0.9238684395771226, "grad_norm": 0.7565039992332458, "learning_rate": 6.035224403285007e-05, "loss": 0.837211799621582, "memory(GiB)": 91.52, "step": 71200, "token_acc": 0.7683394319567869, "train_speed(iter/s)": 0.138382 }, { "epoch": 0.9239333179787783, "grad_norm": 0.7770920991897583, "learning_rate": 6.0346996445716454e-05, "loss": 0.8641195297241211, "memory(GiB)": 91.52, "step": 71205, "token_acc": 0.7731175468483816, "train_speed(iter/s)": 0.138381 }, { "epoch": 0.923998196380434, "grad_norm": 0.7691748142242432, "learning_rate": 6.0341748739510277e-05, "loss": 0.858538818359375, "memory(GiB)": 91.52, "step": 71210, "token_acc": 0.7499801161218485, "train_speed(iter/s)": 0.13838 }, { "epoch": 0.9240630747820897, "grad_norm": 0.7361534833908081, "learning_rate": 6.033650091429192e-05, "loss": 0.8902053833007812, "memory(GiB)": 91.52, "step": 71215, "token_acc": 0.7598393574297189, "train_speed(iter/s)": 0.138379 }, { "epoch": 0.9241279531837454, "grad_norm": 0.6655129790306091, "learning_rate": 6.0331252970121774e-05, "loss": 0.837038230895996, "memory(GiB)": 91.52, "step": 71220, "token_acc": 0.7622527178756014, "train_speed(iter/s)": 0.138377 }, { "epoch": 0.9241928315854011, "grad_norm": 0.7200034856796265, "learning_rate": 6.0326004907060243e-05, "loss": 0.8198734283447265, "memory(GiB)": 91.52, "step": 71225, "token_acc": 0.7731190123060889, "train_speed(iter/s)": 0.138376 }, { "epoch": 0.9242577099870568, "grad_norm": 0.7671160697937012, "learning_rate": 6.0320756725167695e-05, "loss": 0.8063575744628906, "memory(GiB)": 91.52, "step": 71230, "token_acc": 0.7780871548861082, "train_speed(iter/s)": 0.138374 }, { "epoch": 0.9243225883887125, "grad_norm": 0.7580261826515198, "learning_rate": 6.0315508424504565e-05, "loss": 0.8701646804809571, "memory(GiB)": 91.52, "step": 71235, "token_acc": 0.7536270026647367, "train_speed(iter/s)": 0.138373 }, { "epoch": 0.9243874667903682, "grad_norm": 0.6822572946548462, "learning_rate": 6.0310260005131205e-05, "loss": 0.904690170288086, "memory(GiB)": 91.52, "step": 71240, "token_acc": 0.7492957746478873, "train_speed(iter/s)": 0.138371 }, { "epoch": 0.9244523451920238, "grad_norm": 0.7711939215660095, "learning_rate": 6.030501146710805e-05, "loss": 0.8594965934753418, "memory(GiB)": 91.52, "step": 71245, "token_acc": 0.7674702252833979, "train_speed(iter/s)": 0.13837 }, { "epoch": 0.9245172235936795, "grad_norm": 0.7221664786338806, "learning_rate": 6.029976281049549e-05, "loss": 0.885257339477539, "memory(GiB)": 91.52, "step": 71250, "token_acc": 0.7605611651096222, "train_speed(iter/s)": 0.138369 }, { "epoch": 0.9245821019953352, "grad_norm": 0.7250567674636841, "learning_rate": 6.0294514035353924e-05, "loss": 0.8285625457763672, "memory(GiB)": 91.52, "step": 71255, "token_acc": 0.7780510197389993, "train_speed(iter/s)": 0.138368 }, { "epoch": 0.9246469803969909, "grad_norm": 0.7173495292663574, "learning_rate": 6.0289265141743743e-05, "loss": 0.8235406875610352, "memory(GiB)": 91.52, "step": 71260, "token_acc": 0.7858474216016481, "train_speed(iter/s)": 0.138366 }, { "epoch": 0.9247118587986466, "grad_norm": 0.6999727487564087, "learning_rate": 6.028401612972535e-05, "loss": 0.8059365272521972, "memory(GiB)": 91.52, "step": 71265, "token_acc": 0.7595907928388747, "train_speed(iter/s)": 0.138365 }, { "epoch": 0.9247767372003023, "grad_norm": 0.7134426236152649, "learning_rate": 6.0278766999359195e-05, "loss": 0.8351614952087403, "memory(GiB)": 91.52, "step": 71270, "token_acc": 0.7493872331355902, "train_speed(iter/s)": 0.138363 }, { "epoch": 0.924841615601958, "grad_norm": 0.7300340533256531, "learning_rate": 6.027351775070562e-05, "loss": 0.8695193290710449, "memory(GiB)": 91.52, "step": 71275, "token_acc": 0.7521631871571118, "train_speed(iter/s)": 0.138362 }, { "epoch": 0.9249064940036137, "grad_norm": 0.6700130701065063, "learning_rate": 6.0268268383825065e-05, "loss": 0.834627914428711, "memory(GiB)": 91.52, "step": 71280, "token_acc": 0.7566088071233067, "train_speed(iter/s)": 0.13836 }, { "epoch": 0.9249713724052694, "grad_norm": 0.7058625817298889, "learning_rate": 6.026301889877796e-05, "loss": 0.8813419342041016, "memory(GiB)": 91.52, "step": 71285, "token_acc": 0.751782014817936, "train_speed(iter/s)": 0.138359 }, { "epoch": 0.9250362508069251, "grad_norm": 0.6343062520027161, "learning_rate": 6.0257769295624676e-05, "loss": 0.8819210052490234, "memory(GiB)": 91.52, "step": 71290, "token_acc": 0.7718488966463034, "train_speed(iter/s)": 0.138358 }, { "epoch": 0.9251011292085808, "grad_norm": 0.6482227444648743, "learning_rate": 6.025251957442566e-05, "loss": 0.8316693305969238, "memory(GiB)": 91.52, "step": 71295, "token_acc": 0.7767068015883182, "train_speed(iter/s)": 0.138356 }, { "epoch": 0.9251660076102365, "grad_norm": 0.7084221243858337, "learning_rate": 6.0247269735241285e-05, "loss": 0.8236455917358398, "memory(GiB)": 91.52, "step": 71300, "token_acc": 0.7653644756786551, "train_speed(iter/s)": 0.138355 }, { "epoch": 0.9252308860118922, "grad_norm": 0.7346278429031372, "learning_rate": 6.024201977813202e-05, "loss": 0.8763134002685546, "memory(GiB)": 91.52, "step": 71305, "token_acc": 0.7557839798873437, "train_speed(iter/s)": 0.138353 }, { "epoch": 0.9252957644135479, "grad_norm": 0.832023024559021, "learning_rate": 6.0236769703158215e-05, "loss": 0.9011063575744629, "memory(GiB)": 91.52, "step": 71310, "token_acc": 0.7512259256627105, "train_speed(iter/s)": 0.138352 }, { "epoch": 0.9253606428152036, "grad_norm": 0.7805453538894653, "learning_rate": 6.023151951038034e-05, "loss": 0.856136703491211, "memory(GiB)": 91.52, "step": 71315, "token_acc": 0.7553683577150163, "train_speed(iter/s)": 0.138351 }, { "epoch": 0.9254255212168593, "grad_norm": 0.6759916543960571, "learning_rate": 6.022626919985881e-05, "loss": 0.8562568664550781, "memory(GiB)": 91.52, "step": 71320, "token_acc": 0.7754119973632169, "train_speed(iter/s)": 0.138349 }, { "epoch": 0.925490399618515, "grad_norm": 0.6541647911071777, "learning_rate": 6.0221018771654016e-05, "loss": 0.8526294708251954, "memory(GiB)": 91.52, "step": 71325, "token_acc": 0.7806415993491022, "train_speed(iter/s)": 0.138348 }, { "epoch": 0.9255552780201707, "grad_norm": 0.7175742387771606, "learning_rate": 6.0215768225826396e-05, "loss": 0.8507390975952148, "memory(GiB)": 91.52, "step": 71330, "token_acc": 0.7754633779084358, "train_speed(iter/s)": 0.138346 }, { "epoch": 0.9256201564218264, "grad_norm": 0.7437961101531982, "learning_rate": 6.021051756243638e-05, "loss": 0.8311615943908691, "memory(GiB)": 91.52, "step": 71335, "token_acc": 0.7751070250543898, "train_speed(iter/s)": 0.138345 }, { "epoch": 0.9256850348234821, "grad_norm": 0.737492024898529, "learning_rate": 6.0205266781544376e-05, "loss": 0.8527727127075195, "memory(GiB)": 91.52, "step": 71340, "token_acc": 0.7735558587873063, "train_speed(iter/s)": 0.138344 }, { "epoch": 0.9257499132251378, "grad_norm": 0.7583640813827515, "learning_rate": 6.020001588321082e-05, "loss": 0.8597976684570312, "memory(GiB)": 91.52, "step": 71345, "token_acc": 0.759665068791633, "train_speed(iter/s)": 0.138343 }, { "epoch": 0.9258147916267935, "grad_norm": 0.7473886013031006, "learning_rate": 6.019476486749614e-05, "loss": 0.8553120613098144, "memory(GiB)": 91.52, "step": 71350, "token_acc": 0.768250087320992, "train_speed(iter/s)": 0.138342 }, { "epoch": 0.9258796700284492, "grad_norm": 0.7345724105834961, "learning_rate": 6.018951373446078e-05, "loss": 0.8446931838989258, "memory(GiB)": 91.52, "step": 71355, "token_acc": 0.7586368662667963, "train_speed(iter/s)": 0.13834 }, { "epoch": 0.9259445484301049, "grad_norm": 0.7353050112724304, "learning_rate": 6.018426248416513e-05, "loss": 0.8527478218078614, "memory(GiB)": 91.52, "step": 71360, "token_acc": 0.7539238618258246, "train_speed(iter/s)": 0.138339 }, { "epoch": 0.9260094268317606, "grad_norm": 0.7353869676589966, "learning_rate": 6.0179011116669645e-05, "loss": 0.8786430358886719, "memory(GiB)": 91.52, "step": 71365, "token_acc": 0.7669080053529149, "train_speed(iter/s)": 0.138338 }, { "epoch": 0.9260743052334163, "grad_norm": 0.7380164861679077, "learning_rate": 6.017375963203475e-05, "loss": 0.8700435638427735, "memory(GiB)": 91.52, "step": 71370, "token_acc": 0.7656351158876085, "train_speed(iter/s)": 0.138337 }, { "epoch": 0.926139183635072, "grad_norm": 0.7038853168487549, "learning_rate": 6.01685080303209e-05, "loss": 0.7949440479278564, "memory(GiB)": 91.52, "step": 71375, "token_acc": 0.7821677428769472, "train_speed(iter/s)": 0.138336 }, { "epoch": 0.9262040620367277, "grad_norm": 0.6479490995407104, "learning_rate": 6.016325631158851e-05, "loss": 0.856482982635498, "memory(GiB)": 91.52, "step": 71380, "token_acc": 0.7555353602115004, "train_speed(iter/s)": 0.138335 }, { "epoch": 0.9262689404383834, "grad_norm": 0.7008846402168274, "learning_rate": 6.015800447589802e-05, "loss": 0.8155274391174316, "memory(GiB)": 91.52, "step": 71385, "token_acc": 0.753411727691037, "train_speed(iter/s)": 0.138333 }, { "epoch": 0.9263338188400391, "grad_norm": 0.7253506183624268, "learning_rate": 6.015275252330985e-05, "loss": 0.8567755699157715, "memory(GiB)": 91.52, "step": 71390, "token_acc": 0.746170063246662, "train_speed(iter/s)": 0.138331 }, { "epoch": 0.9263986972416948, "grad_norm": 0.7276660799980164, "learning_rate": 6.014750045388449e-05, "loss": 0.8493905067443848, "memory(GiB)": 91.52, "step": 71395, "token_acc": 0.776048139185035, "train_speed(iter/s)": 0.138329 }, { "epoch": 0.9264635756433505, "grad_norm": 0.6895363330841064, "learning_rate": 6.014224826768233e-05, "loss": 0.8859667778015137, "memory(GiB)": 91.52, "step": 71400, "token_acc": 0.7729023383768914, "train_speed(iter/s)": 0.138328 }, { "epoch": 0.9265284540450062, "grad_norm": 0.7921222448348999, "learning_rate": 6.013699596476382e-05, "loss": 0.920191764831543, "memory(GiB)": 91.52, "step": 71405, "token_acc": 0.7446445044244764, "train_speed(iter/s)": 0.138327 }, { "epoch": 0.9265933324466619, "grad_norm": 0.7968646287918091, "learning_rate": 6.0131743545189426e-05, "loss": 0.8401737213134766, "memory(GiB)": 91.52, "step": 71410, "token_acc": 0.7764363734740117, "train_speed(iter/s)": 0.138325 }, { "epoch": 0.9266582108483176, "grad_norm": 0.744569718837738, "learning_rate": 6.012649100901958e-05, "loss": 0.8218320846557617, "memory(GiB)": 91.52, "step": 71415, "token_acc": 0.7750110269056498, "train_speed(iter/s)": 0.138324 }, { "epoch": 0.9267230892499733, "grad_norm": 0.7266913652420044, "learning_rate": 6.012123835631472e-05, "loss": 0.8347698211669922, "memory(GiB)": 91.52, "step": 71420, "token_acc": 0.748078387923274, "train_speed(iter/s)": 0.138323 }, { "epoch": 0.926787967651629, "grad_norm": 0.7965584397315979, "learning_rate": 6.0115985587135305e-05, "loss": 0.8636545181274414, "memory(GiB)": 91.52, "step": 71425, "token_acc": 0.7532521307063248, "train_speed(iter/s)": 0.138321 }, { "epoch": 0.9268528460532847, "grad_norm": 0.6668707132339478, "learning_rate": 6.011073270154177e-05, "loss": 0.8651451110839844, "memory(GiB)": 91.52, "step": 71430, "token_acc": 0.768163695673663, "train_speed(iter/s)": 0.13832 }, { "epoch": 0.9269177244549404, "grad_norm": 0.6791182160377502, "learning_rate": 6.01054796995946e-05, "loss": 0.8622640609741211, "memory(GiB)": 91.52, "step": 71435, "token_acc": 0.759831711056441, "train_speed(iter/s)": 0.138318 }, { "epoch": 0.9269826028565961, "grad_norm": 0.8884703516960144, "learning_rate": 6.010022658135419e-05, "loss": 0.8593103408813476, "memory(GiB)": 91.52, "step": 71440, "token_acc": 0.7571020220912588, "train_speed(iter/s)": 0.138317 }, { "epoch": 0.9270474812582518, "grad_norm": 0.6341551542282104, "learning_rate": 6.009497334688103e-05, "loss": 0.8561876296997071, "memory(GiB)": 91.52, "step": 71445, "token_acc": 0.7687242291833344, "train_speed(iter/s)": 0.138316 }, { "epoch": 0.9271123596599075, "grad_norm": 0.793799102306366, "learning_rate": 6.0089719996235585e-05, "loss": 0.8973383903503418, "memory(GiB)": 91.52, "step": 71450, "token_acc": 0.7549239419130075, "train_speed(iter/s)": 0.138314 }, { "epoch": 0.9271772380615632, "grad_norm": 0.638627290725708, "learning_rate": 6.008446652947828e-05, "loss": 0.8425102233886719, "memory(GiB)": 91.52, "step": 71455, "token_acc": 0.7527919332104521, "train_speed(iter/s)": 0.138313 }, { "epoch": 0.9272421164632189, "grad_norm": 0.6941645741462708, "learning_rate": 6.007921294666956e-05, "loss": 0.8821895599365235, "memory(GiB)": 91.52, "step": 71460, "token_acc": 0.749799490881194, "train_speed(iter/s)": 0.138311 }, { "epoch": 0.9273069948648746, "grad_norm": 0.623773992061615, "learning_rate": 6.007395924786994e-05, "loss": 0.8215556144714355, "memory(GiB)": 91.52, "step": 71465, "token_acc": 0.7625339824239742, "train_speed(iter/s)": 0.138309 }, { "epoch": 0.9273718732665303, "grad_norm": 0.7316675186157227, "learning_rate": 6.006870543313983e-05, "loss": 0.8405012130737305, "memory(GiB)": 91.52, "step": 71470, "token_acc": 0.7592272889629718, "train_speed(iter/s)": 0.138308 }, { "epoch": 0.927436751668186, "grad_norm": 0.714491605758667, "learning_rate": 6.0063451502539705e-05, "loss": 0.8558759689331055, "memory(GiB)": 91.52, "step": 71475, "token_acc": 0.7706904201221801, "train_speed(iter/s)": 0.138307 }, { "epoch": 0.9275016300698417, "grad_norm": 0.7617092132568359, "learning_rate": 6.005819745613004e-05, "loss": 0.8531077384948731, "memory(GiB)": 91.52, "step": 71480, "token_acc": 0.7710237223372074, "train_speed(iter/s)": 0.138306 }, { "epoch": 0.9275665084714972, "grad_norm": 0.7342143058776855, "learning_rate": 6.0052943293971275e-05, "loss": 0.8837972640991211, "memory(GiB)": 91.52, "step": 71485, "token_acc": 0.7539794632973241, "train_speed(iter/s)": 0.138305 }, { "epoch": 0.9276313868731529, "grad_norm": 0.6788016557693481, "learning_rate": 6.004768901612389e-05, "loss": 0.8913719177246093, "memory(GiB)": 91.52, "step": 71490, "token_acc": 0.7613154445215514, "train_speed(iter/s)": 0.138304 }, { "epoch": 0.9276962652748086, "grad_norm": 0.7720679044723511, "learning_rate": 6.0042434622648335e-05, "loss": 0.8683380126953125, "memory(GiB)": 91.52, "step": 71495, "token_acc": 0.7816100204572372, "train_speed(iter/s)": 0.138302 }, { "epoch": 0.9277611436764643, "grad_norm": 0.7042711973190308, "learning_rate": 6.0037180113605096e-05, "loss": 0.792636489868164, "memory(GiB)": 91.52, "step": 71500, "token_acc": 0.7926496324816241, "train_speed(iter/s)": 0.1383 }, { "epoch": 0.92782602207812, "grad_norm": 0.6520592570304871, "learning_rate": 6.0031925489054644e-05, "loss": 0.7909307479858398, "memory(GiB)": 91.52, "step": 71505, "token_acc": 0.7758543880862979, "train_speed(iter/s)": 0.138298 }, { "epoch": 0.9278909004797757, "grad_norm": 0.7171379923820496, "learning_rate": 6.002667074905742e-05, "loss": 0.810588264465332, "memory(GiB)": 91.52, "step": 71510, "token_acc": 0.7888116747741487, "train_speed(iter/s)": 0.138297 }, { "epoch": 0.9279557788814314, "grad_norm": 0.7463667392730713, "learning_rate": 6.002141589367394e-05, "loss": 0.8094963073730469, "memory(GiB)": 91.52, "step": 71515, "token_acc": 0.785963411424059, "train_speed(iter/s)": 0.138296 }, { "epoch": 0.9280206572830871, "grad_norm": 0.7733613848686218, "learning_rate": 6.001616092296461e-05, "loss": 0.8598566055297852, "memory(GiB)": 91.52, "step": 71520, "token_acc": 0.7607534642032333, "train_speed(iter/s)": 0.138295 }, { "epoch": 0.9280855356847428, "grad_norm": 0.7293283343315125, "learning_rate": 6.0010905836989995e-05, "loss": 0.856103515625, "memory(GiB)": 91.52, "step": 71525, "token_acc": 0.7676590579587983, "train_speed(iter/s)": 0.138294 }, { "epoch": 0.9281504140863985, "grad_norm": 0.7176207304000854, "learning_rate": 6.0005650635810486e-05, "loss": 0.8773721694946289, "memory(GiB)": 91.52, "step": 71530, "token_acc": 0.776085409252669, "train_speed(iter/s)": 0.138293 }, { "epoch": 0.9282152924880542, "grad_norm": 0.7210019826889038, "learning_rate": 6.00003953194866e-05, "loss": 0.8244415283203125, "memory(GiB)": 91.52, "step": 71535, "token_acc": 0.7803719704629715, "train_speed(iter/s)": 0.138291 }, { "epoch": 0.9282801708897099, "grad_norm": 0.650670051574707, "learning_rate": 5.999513988807881e-05, "loss": 0.7958621025085449, "memory(GiB)": 91.52, "step": 71540, "token_acc": 0.7872133763485621, "train_speed(iter/s)": 0.138289 }, { "epoch": 0.9283450492913656, "grad_norm": 0.7192659378051758, "learning_rate": 5.9989884341647605e-05, "loss": 0.8540594100952148, "memory(GiB)": 91.52, "step": 71545, "token_acc": 0.7723779457253102, "train_speed(iter/s)": 0.138288 }, { "epoch": 0.9284099276930213, "grad_norm": 0.7634053826332092, "learning_rate": 5.998462868025344e-05, "loss": 0.8406813621520997, "memory(GiB)": 91.52, "step": 71550, "token_acc": 0.7667696206541538, "train_speed(iter/s)": 0.138287 }, { "epoch": 0.928474806094677, "grad_norm": 0.7355902194976807, "learning_rate": 5.997937290395681e-05, "loss": 0.8571417808532715, "memory(GiB)": 91.52, "step": 71555, "token_acc": 0.7698517449891034, "train_speed(iter/s)": 0.138285 }, { "epoch": 0.9285396844963327, "grad_norm": 0.6955447793006897, "learning_rate": 5.997411701281821e-05, "loss": 0.8824156761169434, "memory(GiB)": 91.52, "step": 71560, "token_acc": 0.7429033958286274, "train_speed(iter/s)": 0.138284 }, { "epoch": 0.9286045628979884, "grad_norm": 0.7157596349716187, "learning_rate": 5.99688610068981e-05, "loss": 0.8388916969299316, "memory(GiB)": 91.52, "step": 71565, "token_acc": 0.7694740137758297, "train_speed(iter/s)": 0.138283 }, { "epoch": 0.9286694412996441, "grad_norm": 0.7617591023445129, "learning_rate": 5.9963604886256984e-05, "loss": 0.8566040992736816, "memory(GiB)": 91.52, "step": 71570, "token_acc": 0.7542797564995123, "train_speed(iter/s)": 0.138282 }, { "epoch": 0.9287343197012998, "grad_norm": 0.7242287397384644, "learning_rate": 5.995834865095535e-05, "loss": 0.8441910743713379, "memory(GiB)": 91.52, "step": 71575, "token_acc": 0.7552422529163386, "train_speed(iter/s)": 0.13828 }, { "epoch": 0.9287991981029555, "grad_norm": 0.7302148342132568, "learning_rate": 5.995309230105368e-05, "loss": 0.8515067100524902, "memory(GiB)": 91.52, "step": 71580, "token_acc": 0.7662531440508163, "train_speed(iter/s)": 0.138279 }, { "epoch": 0.9288640765046112, "grad_norm": 0.689223051071167, "learning_rate": 5.994783583661247e-05, "loss": 0.8578771591186524, "memory(GiB)": 91.52, "step": 71585, "token_acc": 0.7649290719535122, "train_speed(iter/s)": 0.138278 }, { "epoch": 0.9289289549062669, "grad_norm": 0.6968035101890564, "learning_rate": 5.9942579257692176e-05, "loss": 0.8319831848144531, "memory(GiB)": 91.52, "step": 71590, "token_acc": 0.7553849434847515, "train_speed(iter/s)": 0.138276 }, { "epoch": 0.9289938333079226, "grad_norm": 0.7037874460220337, "learning_rate": 5.993732256435336e-05, "loss": 0.8454278945922852, "memory(GiB)": 91.52, "step": 71595, "token_acc": 0.7654142990416419, "train_speed(iter/s)": 0.138275 }, { "epoch": 0.9290587117095783, "grad_norm": 0.7050111889839172, "learning_rate": 5.993206575665644e-05, "loss": 0.8226518630981445, "memory(GiB)": 91.52, "step": 71600, "token_acc": 0.7730238564534765, "train_speed(iter/s)": 0.138274 }, { "epoch": 0.929123590111234, "grad_norm": 0.7460736632347107, "learning_rate": 5.9926808834661976e-05, "loss": 0.8644102096557618, "memory(GiB)": 91.52, "step": 71605, "token_acc": 0.756919201007738, "train_speed(iter/s)": 0.138273 }, { "epoch": 0.9291884685128897, "grad_norm": 0.8086996078491211, "learning_rate": 5.992155179843042e-05, "loss": 0.852964210510254, "memory(GiB)": 91.52, "step": 71610, "token_acc": 0.7581008254314755, "train_speed(iter/s)": 0.138272 }, { "epoch": 0.9292533469145454, "grad_norm": 0.7554856538772583, "learning_rate": 5.991629464802228e-05, "loss": 0.8591835021972656, "memory(GiB)": 91.52, "step": 71615, "token_acc": 0.7628900664479771, "train_speed(iter/s)": 0.13827 }, { "epoch": 0.9293182253162011, "grad_norm": 0.683417797088623, "learning_rate": 5.991103738349807e-05, "loss": 0.8183914184570312, "memory(GiB)": 91.52, "step": 71620, "token_acc": 0.7890203872677447, "train_speed(iter/s)": 0.138268 }, { "epoch": 0.9293831037178568, "grad_norm": 0.8109111785888672, "learning_rate": 5.990578000491826e-05, "loss": 0.8888690948486329, "memory(GiB)": 91.52, "step": 71625, "token_acc": 0.7554955221671231, "train_speed(iter/s)": 0.138267 }, { "epoch": 0.9294479821195125, "grad_norm": 0.7407388091087341, "learning_rate": 5.99005225123434e-05, "loss": 0.850580883026123, "memory(GiB)": 91.52, "step": 71630, "token_acc": 0.777542608889384, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.9295128605211682, "grad_norm": 0.6982021927833557, "learning_rate": 5.9895264905833944e-05, "loss": 0.8505752563476563, "memory(GiB)": 91.52, "step": 71635, "token_acc": 0.7668016774794796, "train_speed(iter/s)": 0.138264 }, { "epoch": 0.9295777389228239, "grad_norm": 0.6645917892456055, "learning_rate": 5.989000718545043e-05, "loss": 0.8794928550720215, "memory(GiB)": 91.52, "step": 71640, "token_acc": 0.7524103645676409, "train_speed(iter/s)": 0.138262 }, { "epoch": 0.9296426173244796, "grad_norm": 0.657608151435852, "learning_rate": 5.988474935125334e-05, "loss": 0.8670208930969239, "memory(GiB)": 91.52, "step": 71645, "token_acc": 0.7575493726995686, "train_speed(iter/s)": 0.138261 }, { "epoch": 0.9297074957261353, "grad_norm": 0.6910579204559326, "learning_rate": 5.98794914033032e-05, "loss": 0.8635068893432617, "memory(GiB)": 91.52, "step": 71650, "token_acc": 0.7623022139151171, "train_speed(iter/s)": 0.13826 }, { "epoch": 0.929772374127791, "grad_norm": 0.7283229231834412, "learning_rate": 5.9874233341660504e-05, "loss": 0.8061504364013672, "memory(GiB)": 91.52, "step": 71655, "token_acc": 0.7622350263195333, "train_speed(iter/s)": 0.138258 }, { "epoch": 0.9298372525294467, "grad_norm": 0.6125963926315308, "learning_rate": 5.986897516638574e-05, "loss": 0.8218945503234864, "memory(GiB)": 91.52, "step": 71660, "token_acc": 0.7875374798185593, "train_speed(iter/s)": 0.138257 }, { "epoch": 0.9299021309311024, "grad_norm": 0.6442288756370544, "learning_rate": 5.986371687753949e-05, "loss": 0.8090122222900391, "memory(GiB)": 91.52, "step": 71665, "token_acc": 0.7680817127783571, "train_speed(iter/s)": 0.138255 }, { "epoch": 0.9299670093327581, "grad_norm": 0.6812033653259277, "learning_rate": 5.985845847518219e-05, "loss": 0.8475112915039062, "memory(GiB)": 91.52, "step": 71670, "token_acc": 0.7927657167102751, "train_speed(iter/s)": 0.138253 }, { "epoch": 0.9300318877344138, "grad_norm": 0.648144006729126, "learning_rate": 5.9853199959374394e-05, "loss": 0.8251527786254883, "memory(GiB)": 91.52, "step": 71675, "token_acc": 0.7567193302700321, "train_speed(iter/s)": 0.138251 }, { "epoch": 0.9300967661360695, "grad_norm": 0.6732907295227051, "learning_rate": 5.984794133017661e-05, "loss": 0.8399602890014648, "memory(GiB)": 91.52, "step": 71680, "token_acc": 0.7744230100127576, "train_speed(iter/s)": 0.13825 }, { "epoch": 0.9301616445377252, "grad_norm": 0.6668565273284912, "learning_rate": 5.9842682587649354e-05, "loss": 0.8054010391235351, "memory(GiB)": 91.52, "step": 71685, "token_acc": 0.7592194648551227, "train_speed(iter/s)": 0.138248 }, { "epoch": 0.9302265229393809, "grad_norm": 0.7015047073364258, "learning_rate": 5.9837423731853136e-05, "loss": 0.8416273117065429, "memory(GiB)": 91.52, "step": 71690, "token_acc": 0.7594694960212202, "train_speed(iter/s)": 0.138247 }, { "epoch": 0.9302914013410366, "grad_norm": 0.7630422711372375, "learning_rate": 5.9832164762848465e-05, "loss": 0.8665042877197265, "memory(GiB)": 91.52, "step": 71695, "token_acc": 0.7585459629500175, "train_speed(iter/s)": 0.138246 }, { "epoch": 0.9303562797426923, "grad_norm": 0.7214812636375427, "learning_rate": 5.9826905680695885e-05, "loss": 0.8277902603149414, "memory(GiB)": 91.52, "step": 71700, "token_acc": 0.7807556378323797, "train_speed(iter/s)": 0.138246 }, { "epoch": 0.930421158144348, "grad_norm": 0.7436507344245911, "learning_rate": 5.9821646485455916e-05, "loss": 0.8226799011230469, "memory(GiB)": 91.52, "step": 71705, "token_acc": 0.7691982636087742, "train_speed(iter/s)": 0.138245 }, { "epoch": 0.9304860365460037, "grad_norm": 0.7567209005355835, "learning_rate": 5.981638717718906e-05, "loss": 0.8563107490539551, "memory(GiB)": 91.52, "step": 71710, "token_acc": 0.7593875571567134, "train_speed(iter/s)": 0.138243 }, { "epoch": 0.9305509149476594, "grad_norm": 0.7642106413841248, "learning_rate": 5.9811127755955866e-05, "loss": 0.8272886276245117, "memory(GiB)": 91.52, "step": 71715, "token_acc": 0.7827430473021155, "train_speed(iter/s)": 0.138242 }, { "epoch": 0.9306157933493151, "grad_norm": 0.723591685295105, "learning_rate": 5.980586822181683e-05, "loss": 0.8533003807067872, "memory(GiB)": 91.52, "step": 71720, "token_acc": 0.7705092180111027, "train_speed(iter/s)": 0.138241 }, { "epoch": 0.9306806717509707, "grad_norm": 0.7389458417892456, "learning_rate": 5.9800608574832506e-05, "loss": 0.8300365447998047, "memory(GiB)": 91.52, "step": 71725, "token_acc": 0.7886100386100386, "train_speed(iter/s)": 0.13824 }, { "epoch": 0.9307455501526264, "grad_norm": 1.0043257474899292, "learning_rate": 5.979534881506339e-05, "loss": 0.8982722282409668, "memory(GiB)": 91.52, "step": 71730, "token_acc": 0.7595702419646082, "train_speed(iter/s)": 0.138239 }, { "epoch": 0.9308104285542821, "grad_norm": 0.7161476612091064, "learning_rate": 5.979008894257005e-05, "loss": 0.8756365776062012, "memory(GiB)": 91.52, "step": 71735, "token_acc": 0.7542437975266918, "train_speed(iter/s)": 0.138238 }, { "epoch": 0.9308753069559378, "grad_norm": 0.7294369339942932, "learning_rate": 5.978482895741299e-05, "loss": 0.8594440460205078, "memory(GiB)": 91.52, "step": 71740, "token_acc": 0.7711995628017123, "train_speed(iter/s)": 0.138236 }, { "epoch": 0.9309401853575935, "grad_norm": 0.6972799897193909, "learning_rate": 5.977956885965275e-05, "loss": 0.8205194473266602, "memory(GiB)": 91.52, "step": 71745, "token_acc": 0.7893098967238903, "train_speed(iter/s)": 0.138235 }, { "epoch": 0.9310050637592492, "grad_norm": 0.7020057439804077, "learning_rate": 5.977430864934985e-05, "loss": 0.8458812713623047, "memory(GiB)": 91.52, "step": 71750, "token_acc": 0.7712383955158522, "train_speed(iter/s)": 0.138233 }, { "epoch": 0.9310699421609049, "grad_norm": 0.819059431552887, "learning_rate": 5.9769048326564846e-05, "loss": 0.8491980552673339, "memory(GiB)": 91.52, "step": 71755, "token_acc": 0.7753570483998942, "train_speed(iter/s)": 0.138232 }, { "epoch": 0.9311348205625606, "grad_norm": 0.7786859273910522, "learning_rate": 5.976378789135827e-05, "loss": 0.8513853073120117, "memory(GiB)": 91.52, "step": 71760, "token_acc": 0.7608962553713935, "train_speed(iter/s)": 0.138231 }, { "epoch": 0.9311996989642163, "grad_norm": 0.7212005257606506, "learning_rate": 5.975852734379063e-05, "loss": 0.8784875869750977, "memory(GiB)": 91.52, "step": 71765, "token_acc": 0.7606056377337427, "train_speed(iter/s)": 0.13823 }, { "epoch": 0.931264577365872, "grad_norm": 0.6676339507102966, "learning_rate": 5.975326668392249e-05, "loss": 0.85780029296875, "memory(GiB)": 91.52, "step": 71770, "token_acc": 0.7678584484901659, "train_speed(iter/s)": 0.138229 }, { "epoch": 0.9313294557675277, "grad_norm": 0.7222048044204712, "learning_rate": 5.97480059118144e-05, "loss": 0.8417369842529296, "memory(GiB)": 91.52, "step": 71775, "token_acc": 0.7703918041761164, "train_speed(iter/s)": 0.138227 }, { "epoch": 0.9313943341691834, "grad_norm": 0.7051928043365479, "learning_rate": 5.974274502752688e-05, "loss": 0.8494789123535156, "memory(GiB)": 91.52, "step": 71780, "token_acc": 0.7416907128353105, "train_speed(iter/s)": 0.138226 }, { "epoch": 0.9314592125708391, "grad_norm": 0.6457183361053467, "learning_rate": 5.973748403112047e-05, "loss": 0.8300496101379394, "memory(GiB)": 91.52, "step": 71785, "token_acc": 0.7702807107906258, "train_speed(iter/s)": 0.138225 }, { "epoch": 0.9315240909724948, "grad_norm": 0.6925725340843201, "learning_rate": 5.973222292265572e-05, "loss": 0.8765684127807617, "memory(GiB)": 91.52, "step": 71790, "token_acc": 0.7621835715515757, "train_speed(iter/s)": 0.138224 }, { "epoch": 0.9315889693741505, "grad_norm": 0.7627551555633545, "learning_rate": 5.97269617021932e-05, "loss": 0.8732732772827149, "memory(GiB)": 91.52, "step": 71795, "token_acc": 0.7551545541516841, "train_speed(iter/s)": 0.138223 }, { "epoch": 0.9316538477758062, "grad_norm": 0.7346044778823853, "learning_rate": 5.972170036979339e-05, "loss": 0.7973661899566651, "memory(GiB)": 91.52, "step": 71800, "token_acc": 0.7839730731309285, "train_speed(iter/s)": 0.138222 }, { "epoch": 0.9317187261774619, "grad_norm": 0.6906795501708984, "learning_rate": 5.9716438925516906e-05, "loss": 0.8417911529541016, "memory(GiB)": 91.52, "step": 71805, "token_acc": 0.7613157266725002, "train_speed(iter/s)": 0.138221 }, { "epoch": 0.9317836045791176, "grad_norm": 0.8256847262382507, "learning_rate": 5.9711177369424254e-05, "loss": 0.8812941551208496, "memory(GiB)": 91.52, "step": 71810, "token_acc": 0.7397006576460806, "train_speed(iter/s)": 0.13822 }, { "epoch": 0.9318484829807733, "grad_norm": 0.6777195334434509, "learning_rate": 5.970591570157601e-05, "loss": 0.8355422973632812, "memory(GiB)": 91.52, "step": 71815, "token_acc": 0.7740569668976135, "train_speed(iter/s)": 0.138219 }, { "epoch": 0.931913361382429, "grad_norm": 0.6673433780670166, "learning_rate": 5.97006539220327e-05, "loss": 0.7787882328033447, "memory(GiB)": 91.52, "step": 71820, "token_acc": 0.7745790934320074, "train_speed(iter/s)": 0.138218 }, { "epoch": 0.9319782397840847, "grad_norm": 0.8155144453048706, "learning_rate": 5.969539203085489e-05, "loss": 0.8401153564453125, "memory(GiB)": 91.52, "step": 71825, "token_acc": 0.7592256269562193, "train_speed(iter/s)": 0.138217 }, { "epoch": 0.9320431181857404, "grad_norm": 0.7336229681968689, "learning_rate": 5.969013002810315e-05, "loss": 0.8364687919616699, "memory(GiB)": 91.52, "step": 71830, "token_acc": 0.7753107365470439, "train_speed(iter/s)": 0.138215 }, { "epoch": 0.9321079965873961, "grad_norm": 0.7786077260971069, "learning_rate": 5.968486791383801e-05, "loss": 0.8769001007080078, "memory(GiB)": 91.52, "step": 71835, "token_acc": 0.7544133313042155, "train_speed(iter/s)": 0.138214 }, { "epoch": 0.9321728749890518, "grad_norm": 0.7529370784759521, "learning_rate": 5.967960568812002e-05, "loss": 0.8886937141418457, "memory(GiB)": 91.52, "step": 71840, "token_acc": 0.7550193969917648, "train_speed(iter/s)": 0.138212 }, { "epoch": 0.9322377533907075, "grad_norm": 0.6501801609992981, "learning_rate": 5.9674343351009765e-05, "loss": 0.8274084091186523, "memory(GiB)": 91.52, "step": 71845, "token_acc": 0.7934698020639486, "train_speed(iter/s)": 0.138211 }, { "epoch": 0.9323026317923632, "grad_norm": 0.7484086751937866, "learning_rate": 5.966908090256779e-05, "loss": 0.8222555160522461, "memory(GiB)": 91.52, "step": 71850, "token_acc": 0.7817637055632035, "train_speed(iter/s)": 0.13821 }, { "epoch": 0.9323675101940189, "grad_norm": 0.6143427491188049, "learning_rate": 5.9663818342854636e-05, "loss": 0.8377583503723145, "memory(GiB)": 91.52, "step": 71855, "token_acc": 0.7644368625458241, "train_speed(iter/s)": 0.138209 }, { "epoch": 0.9324323885956746, "grad_norm": 0.7107955813407898, "learning_rate": 5.96585556719309e-05, "loss": 0.8368682861328125, "memory(GiB)": 91.52, "step": 71860, "token_acc": 0.762682961489271, "train_speed(iter/s)": 0.138208 }, { "epoch": 0.9324972669973303, "grad_norm": 0.7453829050064087, "learning_rate": 5.965329288985713e-05, "loss": 0.8491985321044921, "memory(GiB)": 91.52, "step": 71865, "token_acc": 0.7750288035898368, "train_speed(iter/s)": 0.138206 }, { "epoch": 0.932562145398986, "grad_norm": 0.7073935866355896, "learning_rate": 5.9648029996693865e-05, "loss": 0.8249039649963379, "memory(GiB)": 91.52, "step": 71870, "token_acc": 0.7624078911458536, "train_speed(iter/s)": 0.138205 }, { "epoch": 0.9326270238006417, "grad_norm": 0.6807515621185303, "learning_rate": 5.9642766992501706e-05, "loss": 0.8392508506774903, "memory(GiB)": 91.52, "step": 71875, "token_acc": 0.7613347002215681, "train_speed(iter/s)": 0.138204 }, { "epoch": 0.9326919022022974, "grad_norm": 0.7066439390182495, "learning_rate": 5.96375038773412e-05, "loss": 0.86422119140625, "memory(GiB)": 91.52, "step": 71880, "token_acc": 0.7604649478563151, "train_speed(iter/s)": 0.138203 }, { "epoch": 0.9327567806039531, "grad_norm": 0.6519762873649597, "learning_rate": 5.963224065127293e-05, "loss": 0.7965458393096924, "memory(GiB)": 91.52, "step": 71885, "token_acc": 0.7867040229009958, "train_speed(iter/s)": 0.138202 }, { "epoch": 0.9328216590056088, "grad_norm": 0.6939429640769958, "learning_rate": 5.962697731435745e-05, "loss": 0.8172630310058594, "memory(GiB)": 91.52, "step": 71890, "token_acc": 0.7724382946896036, "train_speed(iter/s)": 0.138201 }, { "epoch": 0.9328865374072645, "grad_norm": 0.7375147342681885, "learning_rate": 5.962171386665533e-05, "loss": 0.8359163284301758, "memory(GiB)": 91.52, "step": 71895, "token_acc": 0.7736508630107057, "train_speed(iter/s)": 0.138199 }, { "epoch": 0.9329514158089202, "grad_norm": 0.6721404194831848, "learning_rate": 5.9616450308227155e-05, "loss": 0.8776390075683593, "memory(GiB)": 91.52, "step": 71900, "token_acc": 0.7541897530610849, "train_speed(iter/s)": 0.138198 }, { "epoch": 0.9330162942105759, "grad_norm": 0.8180306553840637, "learning_rate": 5.961118663913348e-05, "loss": 0.8709512710571289, "memory(GiB)": 91.52, "step": 71905, "token_acc": 0.7617922913671528, "train_speed(iter/s)": 0.138197 }, { "epoch": 0.9330811726122316, "grad_norm": 0.7114003300666809, "learning_rate": 5.960592285943489e-05, "loss": 0.8415656089782715, "memory(GiB)": 91.52, "step": 71910, "token_acc": 0.780193741724162, "train_speed(iter/s)": 0.138196 }, { "epoch": 0.9331460510138873, "grad_norm": 0.7908647060394287, "learning_rate": 5.960065896919195e-05, "loss": 0.8984376907348632, "memory(GiB)": 91.52, "step": 71915, "token_acc": 0.7458505415725084, "train_speed(iter/s)": 0.138194 }, { "epoch": 0.933210929415543, "grad_norm": 0.6508895754814148, "learning_rate": 5.959539496846527e-05, "loss": 0.8166882514953613, "memory(GiB)": 91.52, "step": 71920, "token_acc": 0.768983489903561, "train_speed(iter/s)": 0.138192 }, { "epoch": 0.9332758078171987, "grad_norm": 0.7620565891265869, "learning_rate": 5.959013085731537e-05, "loss": 0.8571077346801758, "memory(GiB)": 91.52, "step": 71925, "token_acc": 0.7717660506763193, "train_speed(iter/s)": 0.138191 }, { "epoch": 0.9333406862188544, "grad_norm": 0.6933053731918335, "learning_rate": 5.9584866635802886e-05, "loss": 0.8166207313537598, "memory(GiB)": 91.52, "step": 71930, "token_acc": 0.7991642924976259, "train_speed(iter/s)": 0.138189 }, { "epoch": 0.93340556462051, "grad_norm": 0.6258636713027954, "learning_rate": 5.9579602303988354e-05, "loss": 0.8067361831665039, "memory(GiB)": 91.52, "step": 71935, "token_acc": 0.7841137884003377, "train_speed(iter/s)": 0.138188 }, { "epoch": 0.9334704430221658, "grad_norm": 0.7154983282089233, "learning_rate": 5.95743378619324e-05, "loss": 0.870643424987793, "memory(GiB)": 91.52, "step": 71940, "token_acc": 0.7534935304990757, "train_speed(iter/s)": 0.138187 }, { "epoch": 0.9335353214238215, "grad_norm": 0.6547574400901794, "learning_rate": 5.9569073309695566e-05, "loss": 0.8362199783325195, "memory(GiB)": 91.52, "step": 71945, "token_acc": 0.7420846220068233, "train_speed(iter/s)": 0.138185 }, { "epoch": 0.9336001998254772, "grad_norm": 0.7833141684532166, "learning_rate": 5.9563808647338434e-05, "loss": 0.7968051910400391, "memory(GiB)": 91.52, "step": 71950, "token_acc": 0.7650304671631686, "train_speed(iter/s)": 0.138183 }, { "epoch": 0.9336650782271328, "grad_norm": 0.6874158978462219, "learning_rate": 5.955854387492165e-05, "loss": 0.914668083190918, "memory(GiB)": 91.52, "step": 71955, "token_acc": 0.7269323937458709, "train_speed(iter/s)": 0.138182 }, { "epoch": 0.9337299566287884, "grad_norm": 0.5919481515884399, "learning_rate": 5.9553278992505724e-05, "loss": 0.7929769515991211, "memory(GiB)": 91.52, "step": 71960, "token_acc": 0.7912164645350974, "train_speed(iter/s)": 0.138181 }, { "epoch": 0.9337948350304441, "grad_norm": 0.7990551590919495, "learning_rate": 5.95480140001513e-05, "loss": 0.8706586837768555, "memory(GiB)": 91.52, "step": 71965, "token_acc": 0.7617135207496654, "train_speed(iter/s)": 0.138179 }, { "epoch": 0.9338597134320998, "grad_norm": 0.7592129111289978, "learning_rate": 5.954274889791893e-05, "loss": 0.8462516784667968, "memory(GiB)": 91.52, "step": 71970, "token_acc": 0.7642005539721408, "train_speed(iter/s)": 0.138178 }, { "epoch": 0.9339245918337555, "grad_norm": 0.7187172174453735, "learning_rate": 5.953748368586922e-05, "loss": 0.9058017730712891, "memory(GiB)": 91.52, "step": 71975, "token_acc": 0.757995306738183, "train_speed(iter/s)": 0.138177 }, { "epoch": 0.9339894702354112, "grad_norm": 0.7344481945037842, "learning_rate": 5.953221836406277e-05, "loss": 0.8090316772460937, "memory(GiB)": 91.52, "step": 71980, "token_acc": 0.7568216546605545, "train_speed(iter/s)": 0.138176 }, { "epoch": 0.9340543486370669, "grad_norm": 0.6927450895309448, "learning_rate": 5.952695293256015e-05, "loss": 0.8074647903442382, "memory(GiB)": 91.52, "step": 71985, "token_acc": 0.7851846130078789, "train_speed(iter/s)": 0.138175 }, { "epoch": 0.9341192270387226, "grad_norm": 0.6866949796676636, "learning_rate": 5.9521687391421996e-05, "loss": 0.8481704711914062, "memory(GiB)": 91.52, "step": 71990, "token_acc": 0.7866956925163271, "train_speed(iter/s)": 0.138174 }, { "epoch": 0.9341841054403783, "grad_norm": 0.842936635017395, "learning_rate": 5.951642174070886e-05, "loss": 0.8320266723632812, "memory(GiB)": 91.52, "step": 71995, "token_acc": 0.7708093909046133, "train_speed(iter/s)": 0.138173 }, { "epoch": 0.934248983842034, "grad_norm": 0.700329065322876, "learning_rate": 5.9511155980481345e-05, "loss": 0.8620624542236328, "memory(GiB)": 91.52, "step": 72000, "token_acc": 0.7703613174140579, "train_speed(iter/s)": 0.138171 }, { "epoch": 0.9343138622436897, "grad_norm": 0.7336350083351135, "learning_rate": 5.9505890110800077e-05, "loss": 0.8389872550964356, "memory(GiB)": 91.52, "step": 72005, "token_acc": 0.7683318216702714, "train_speed(iter/s)": 0.13817 }, { "epoch": 0.9343787406453454, "grad_norm": 0.6777831315994263, "learning_rate": 5.9500624131725634e-05, "loss": 0.8214012145996094, "memory(GiB)": 91.52, "step": 72010, "token_acc": 0.7823742836797667, "train_speed(iter/s)": 0.138168 }, { "epoch": 0.9344436190470011, "grad_norm": 0.6937123537063599, "learning_rate": 5.949535804331862e-05, "loss": 0.8573092460632324, "memory(GiB)": 91.52, "step": 72015, "token_acc": 0.7496082504281913, "train_speed(iter/s)": 0.138167 }, { "epoch": 0.9345084974486568, "grad_norm": 0.7640988230705261, "learning_rate": 5.9490091845639616e-05, "loss": 0.8060905456542968, "memory(GiB)": 91.52, "step": 72020, "token_acc": 0.7743080771403211, "train_speed(iter/s)": 0.138166 }, { "epoch": 0.9345733758503125, "grad_norm": 0.8592856526374817, "learning_rate": 5.948482553874928e-05, "loss": 0.8562499046325683, "memory(GiB)": 91.52, "step": 72025, "token_acc": 0.7493855606758832, "train_speed(iter/s)": 0.138165 }, { "epoch": 0.9346382542519682, "grad_norm": 0.701219379901886, "learning_rate": 5.947955912270815e-05, "loss": 0.8740832328796386, "memory(GiB)": 91.52, "step": 72030, "token_acc": 0.7749032162001191, "train_speed(iter/s)": 0.138164 }, { "epoch": 0.9347031326536239, "grad_norm": 0.7880163788795471, "learning_rate": 5.9474292597576885e-05, "loss": 0.8135068893432618, "memory(GiB)": 91.52, "step": 72035, "token_acc": 0.7574115396875997, "train_speed(iter/s)": 0.138162 }, { "epoch": 0.9347680110552796, "grad_norm": 0.685045599937439, "learning_rate": 5.946902596341606e-05, "loss": 0.8494199752807617, "memory(GiB)": 91.52, "step": 72040, "token_acc": 0.7726360135602146, "train_speed(iter/s)": 0.138161 }, { "epoch": 0.9348328894569353, "grad_norm": 0.7197635769844055, "learning_rate": 5.9463759220286306e-05, "loss": 0.8124029159545898, "memory(GiB)": 91.52, "step": 72045, "token_acc": 0.772760356644629, "train_speed(iter/s)": 0.138159 }, { "epoch": 0.934897767858591, "grad_norm": 0.6935360431671143, "learning_rate": 5.945849236824821e-05, "loss": 0.8246758460998536, "memory(GiB)": 91.52, "step": 72050, "token_acc": 0.7724454119664885, "train_speed(iter/s)": 0.138159 }, { "epoch": 0.9349626462602467, "grad_norm": 0.7583639025688171, "learning_rate": 5.945322540736239e-05, "loss": 0.8830734252929687, "memory(GiB)": 91.52, "step": 72055, "token_acc": 0.7485169817120024, "train_speed(iter/s)": 0.138158 }, { "epoch": 0.9350275246619024, "grad_norm": 0.7777541875839233, "learning_rate": 5.944795833768946e-05, "loss": 0.8433677673339843, "memory(GiB)": 91.52, "step": 72060, "token_acc": 0.7619240369695743, "train_speed(iter/s)": 0.138157 }, { "epoch": 0.9350924030635581, "grad_norm": 0.7125686407089233, "learning_rate": 5.944269115929004e-05, "loss": 0.8787535667419434, "memory(GiB)": 91.52, "step": 72065, "token_acc": 0.7674335245046924, "train_speed(iter/s)": 0.138156 }, { "epoch": 0.9351572814652138, "grad_norm": 0.7380174398422241, "learning_rate": 5.943742387222474e-05, "loss": 0.8765707015991211, "memory(GiB)": 91.52, "step": 72070, "token_acc": 0.7515796083356227, "train_speed(iter/s)": 0.138154 }, { "epoch": 0.9352221598668695, "grad_norm": 0.6921352744102478, "learning_rate": 5.943215647655417e-05, "loss": 0.873044776916504, "memory(GiB)": 91.52, "step": 72075, "token_acc": 0.7544660367266892, "train_speed(iter/s)": 0.138153 }, { "epoch": 0.9352870382685252, "grad_norm": 0.5971954464912415, "learning_rate": 5.942688897233895e-05, "loss": 0.8497484207153321, "memory(GiB)": 91.52, "step": 72080, "token_acc": 0.7780488706517319, "train_speed(iter/s)": 0.138151 }, { "epoch": 0.9353519166701809, "grad_norm": 0.7416242361068726, "learning_rate": 5.94216213596397e-05, "loss": 0.8177047729492187, "memory(GiB)": 91.52, "step": 72085, "token_acc": 0.7780417566594673, "train_speed(iter/s)": 0.13815 }, { "epoch": 0.9354167950718366, "grad_norm": 0.845615029335022, "learning_rate": 5.941635363851703e-05, "loss": 0.8836843490600585, "memory(GiB)": 91.52, "step": 72090, "token_acc": 0.7521269576692017, "train_speed(iter/s)": 0.138149 }, { "epoch": 0.9354816734734923, "grad_norm": 0.845308244228363, "learning_rate": 5.9411085809031576e-05, "loss": 0.8103215217590332, "memory(GiB)": 91.52, "step": 72095, "token_acc": 0.764296978083867, "train_speed(iter/s)": 0.138147 }, { "epoch": 0.935546551875148, "grad_norm": 0.7740452885627747, "learning_rate": 5.9405817871243954e-05, "loss": 0.8932565689086914, "memory(GiB)": 91.52, "step": 72100, "token_acc": 0.7691646022570878, "train_speed(iter/s)": 0.138146 }, { "epoch": 0.9356114302768037, "grad_norm": 0.6956016421318054, "learning_rate": 5.940054982521478e-05, "loss": 0.8476414680480957, "memory(GiB)": 91.52, "step": 72105, "token_acc": 0.7618901149602061, "train_speed(iter/s)": 0.138144 }, { "epoch": 0.9356763086784594, "grad_norm": 0.7226853370666504, "learning_rate": 5.9395281671004686e-05, "loss": 0.8408037185668945, "memory(GiB)": 91.52, "step": 72110, "token_acc": 0.7911392405063291, "train_speed(iter/s)": 0.138143 }, { "epoch": 0.9357411870801151, "grad_norm": 0.6957815885543823, "learning_rate": 5.9390013408674296e-05, "loss": 0.8271939277648925, "memory(GiB)": 91.52, "step": 72115, "token_acc": 0.7634056487924683, "train_speed(iter/s)": 0.138143 }, { "epoch": 0.9358060654817708, "grad_norm": 0.8250951170921326, "learning_rate": 5.938474503828424e-05, "loss": 0.8608936309814453, "memory(GiB)": 91.52, "step": 72120, "token_acc": 0.7566448560127167, "train_speed(iter/s)": 0.138141 }, { "epoch": 0.9358709438834265, "grad_norm": 0.7921530604362488, "learning_rate": 5.937947655989512e-05, "loss": 0.9016265869140625, "memory(GiB)": 91.52, "step": 72125, "token_acc": 0.7439602703409134, "train_speed(iter/s)": 0.13814 }, { "epoch": 0.9359358222850822, "grad_norm": 0.7279161810874939, "learning_rate": 5.9374207973567605e-05, "loss": 0.8379890441894531, "memory(GiB)": 91.52, "step": 72130, "token_acc": 0.7739416947210085, "train_speed(iter/s)": 0.138139 }, { "epoch": 0.9360007006867379, "grad_norm": 0.6776863932609558, "learning_rate": 5.9368939279362315e-05, "loss": 0.8598546981811523, "memory(GiB)": 91.52, "step": 72135, "token_acc": 0.7551349715937742, "train_speed(iter/s)": 0.138137 }, { "epoch": 0.9360655790883936, "grad_norm": 0.6805723905563354, "learning_rate": 5.936367047733986e-05, "loss": 0.8488550186157227, "memory(GiB)": 91.52, "step": 72140, "token_acc": 0.7593423019431988, "train_speed(iter/s)": 0.138136 }, { "epoch": 0.9361304574900493, "grad_norm": 0.6290897130966187, "learning_rate": 5.9358401567560905e-05, "loss": 0.844134521484375, "memory(GiB)": 91.52, "step": 72145, "token_acc": 0.7529262401684523, "train_speed(iter/s)": 0.138134 }, { "epoch": 0.936195335891705, "grad_norm": 0.7889754772186279, "learning_rate": 5.9353132550086046e-05, "loss": 0.8362715721130372, "memory(GiB)": 91.52, "step": 72150, "token_acc": 0.7843778256938165, "train_speed(iter/s)": 0.138133 }, { "epoch": 0.9362602142933607, "grad_norm": 0.8260429501533508, "learning_rate": 5.934786342497596e-05, "loss": 0.8618780136108398, "memory(GiB)": 91.52, "step": 72155, "token_acc": 0.7572434607645875, "train_speed(iter/s)": 0.138132 }, { "epoch": 0.9363250926950164, "grad_norm": 0.7384077310562134, "learning_rate": 5.9342594192291245e-05, "loss": 0.8556699752807617, "memory(GiB)": 91.52, "step": 72160, "token_acc": 0.7560156278526308, "train_speed(iter/s)": 0.138131 }, { "epoch": 0.9363899710966721, "grad_norm": 0.6575726270675659, "learning_rate": 5.933732485209257e-05, "loss": 0.8283618927001953, "memory(GiB)": 91.52, "step": 72165, "token_acc": 0.7732015376166941, "train_speed(iter/s)": 0.138129 }, { "epoch": 0.9364548494983278, "grad_norm": 0.6913120150566101, "learning_rate": 5.933205540444056e-05, "loss": 0.8423828125, "memory(GiB)": 91.52, "step": 72170, "token_acc": 0.7627218009849136, "train_speed(iter/s)": 0.138128 }, { "epoch": 0.9365197278999835, "grad_norm": 0.7850247621536255, "learning_rate": 5.932678584939585e-05, "loss": 0.8677689552307128, "memory(GiB)": 91.52, "step": 72175, "token_acc": 0.7671555788370875, "train_speed(iter/s)": 0.138127 }, { "epoch": 0.9365846063016392, "grad_norm": 0.7486584782600403, "learning_rate": 5.9321516187019096e-05, "loss": 0.8396449089050293, "memory(GiB)": 91.52, "step": 72180, "token_acc": 0.7625665186591194, "train_speed(iter/s)": 0.138125 }, { "epoch": 0.9366494847032949, "grad_norm": 0.708745002746582, "learning_rate": 5.931624641737092e-05, "loss": 0.8084196090698242, "memory(GiB)": 91.52, "step": 72185, "token_acc": 0.7699932497246598, "train_speed(iter/s)": 0.138124 }, { "epoch": 0.9367143631049506, "grad_norm": 0.7047679424285889, "learning_rate": 5.931097654051201e-05, "loss": 0.8573516845703125, "memory(GiB)": 91.52, "step": 72190, "token_acc": 0.750171139101862, "train_speed(iter/s)": 0.138123 }, { "epoch": 0.9367792415066063, "grad_norm": 0.7137047052383423, "learning_rate": 5.930570655650295e-05, "loss": 0.8810419082641602, "memory(GiB)": 91.52, "step": 72195, "token_acc": 0.7676280741890678, "train_speed(iter/s)": 0.138122 }, { "epoch": 0.9368441199082619, "grad_norm": 0.6946898698806763, "learning_rate": 5.930043646540443e-05, "loss": 0.8757179260253907, "memory(GiB)": 91.52, "step": 72200, "token_acc": 0.7443903855929126, "train_speed(iter/s)": 0.138121 }, { "epoch": 0.9369089983099176, "grad_norm": 0.6809142827987671, "learning_rate": 5.9295166267277094e-05, "loss": 0.853840446472168, "memory(GiB)": 91.52, "step": 72205, "token_acc": 0.7563153977496607, "train_speed(iter/s)": 0.138119 }, { "epoch": 0.9369738767115733, "grad_norm": 0.7269337177276611, "learning_rate": 5.928989596218158e-05, "loss": 0.831302261352539, "memory(GiB)": 91.52, "step": 72210, "token_acc": 0.7691533567069212, "train_speed(iter/s)": 0.138118 }, { "epoch": 0.937038755113229, "grad_norm": 0.7309617400169373, "learning_rate": 5.928462555017852e-05, "loss": 0.8805547714233398, "memory(GiB)": 91.52, "step": 72215, "token_acc": 0.7488939740655988, "train_speed(iter/s)": 0.138116 }, { "epoch": 0.9371036335148847, "grad_norm": 0.748752772808075, "learning_rate": 5.92793550313286e-05, "loss": 0.7914044380187988, "memory(GiB)": 91.52, "step": 72220, "token_acc": 0.758475944462383, "train_speed(iter/s)": 0.138115 }, { "epoch": 0.9371685119165404, "grad_norm": 0.7740876078605652, "learning_rate": 5.927408440569247e-05, "loss": 0.8723075866699219, "memory(GiB)": 91.52, "step": 72225, "token_acc": 0.7832400855685371, "train_speed(iter/s)": 0.138114 }, { "epoch": 0.9372333903181961, "grad_norm": 0.7020790576934814, "learning_rate": 5.9268813673330755e-05, "loss": 0.8763684272766114, "memory(GiB)": 91.52, "step": 72230, "token_acc": 0.7602885718970323, "train_speed(iter/s)": 0.138112 }, { "epoch": 0.9372982687198518, "grad_norm": 0.6952505707740784, "learning_rate": 5.9263542834304144e-05, "loss": 0.8468410491943359, "memory(GiB)": 91.52, "step": 72235, "token_acc": 0.7748658035996211, "train_speed(iter/s)": 0.138111 }, { "epoch": 0.9373631471215075, "grad_norm": 0.7099246382713318, "learning_rate": 5.9258271888673265e-05, "loss": 0.8515022277832032, "memory(GiB)": 91.52, "step": 72240, "token_acc": 0.7606348557610436, "train_speed(iter/s)": 0.13811 }, { "epoch": 0.9374280255231632, "grad_norm": 0.684683084487915, "learning_rate": 5.925300083649881e-05, "loss": 0.8673391342163086, "memory(GiB)": 91.52, "step": 72245, "token_acc": 0.7680374764812041, "train_speed(iter/s)": 0.138108 }, { "epoch": 0.9374929039248189, "grad_norm": 0.7314845323562622, "learning_rate": 5.9247729677841394e-05, "loss": 0.8174867630004883, "memory(GiB)": 91.52, "step": 72250, "token_acc": 0.7817182188273807, "train_speed(iter/s)": 0.138108 }, { "epoch": 0.9375577823264746, "grad_norm": 0.7968274354934692, "learning_rate": 5.9242458412761716e-05, "loss": 0.8333283424377441, "memory(GiB)": 91.52, "step": 72255, "token_acc": 0.7838840188806473, "train_speed(iter/s)": 0.138106 }, { "epoch": 0.9376226607281303, "grad_norm": 0.7362838983535767, "learning_rate": 5.9237187041320416e-05, "loss": 0.8718693733215332, "memory(GiB)": 91.52, "step": 72260, "token_acc": 0.7627106935617574, "train_speed(iter/s)": 0.138105 }, { "epoch": 0.937687539129786, "grad_norm": 0.6432992815971375, "learning_rate": 5.923191556357817e-05, "loss": 0.8121635437011718, "memory(GiB)": 91.52, "step": 72265, "token_acc": 0.7800503675469643, "train_speed(iter/s)": 0.138103 }, { "epoch": 0.9377524175314417, "grad_norm": 0.6751867532730103, "learning_rate": 5.922664397959562e-05, "loss": 0.8655847549438477, "memory(GiB)": 91.52, "step": 72270, "token_acc": 0.7762280828865382, "train_speed(iter/s)": 0.138102 }, { "epoch": 0.9378172959330974, "grad_norm": 0.6629014611244202, "learning_rate": 5.922137228943344e-05, "loss": 0.8440389633178711, "memory(GiB)": 91.52, "step": 72275, "token_acc": 0.7580264599309008, "train_speed(iter/s)": 0.1381 }, { "epoch": 0.9378821743347531, "grad_norm": 0.7464938759803772, "learning_rate": 5.921610049315233e-05, "loss": 0.7993349075317383, "memory(GiB)": 91.52, "step": 72280, "token_acc": 0.7745784695201038, "train_speed(iter/s)": 0.138099 }, { "epoch": 0.9379470527364088, "grad_norm": 0.7128240466117859, "learning_rate": 5.9210828590812896e-05, "loss": 0.8442848205566407, "memory(GiB)": 91.52, "step": 72285, "token_acc": 0.7859245295498892, "train_speed(iter/s)": 0.138097 }, { "epoch": 0.9380119311380645, "grad_norm": 0.687951922416687, "learning_rate": 5.9205556582475854e-05, "loss": 0.8664110183715821, "memory(GiB)": 91.52, "step": 72290, "token_acc": 0.7702607470049331, "train_speed(iter/s)": 0.138095 }, { "epoch": 0.9380768095397202, "grad_norm": 0.6875705122947693, "learning_rate": 5.920028446820186e-05, "loss": 0.8782877922058105, "memory(GiB)": 91.52, "step": 72295, "token_acc": 0.7598769813106222, "train_speed(iter/s)": 0.138095 }, { "epoch": 0.9381416879413759, "grad_norm": 0.7194284200668335, "learning_rate": 5.9195012248051584e-05, "loss": 0.8888042449951172, "memory(GiB)": 91.52, "step": 72300, "token_acc": 0.7493676420073072, "train_speed(iter/s)": 0.138093 }, { "epoch": 0.9382065663430316, "grad_norm": 0.6759243011474609, "learning_rate": 5.91897399220857e-05, "loss": 0.8383056640625, "memory(GiB)": 91.52, "step": 72305, "token_acc": 0.7741172881872951, "train_speed(iter/s)": 0.138092 }, { "epoch": 0.9382714447446873, "grad_norm": 0.7393335700035095, "learning_rate": 5.918446749036487e-05, "loss": 0.8856300354003906, "memory(GiB)": 91.52, "step": 72310, "token_acc": 0.7704911829259063, "train_speed(iter/s)": 0.138091 }, { "epoch": 0.938336323146343, "grad_norm": 0.7503483295440674, "learning_rate": 5.9179194952949804e-05, "loss": 0.859228515625, "memory(GiB)": 91.52, "step": 72315, "token_acc": 0.7576840600428878, "train_speed(iter/s)": 0.138089 }, { "epoch": 0.9384012015479987, "grad_norm": 0.6788370013237, "learning_rate": 5.917392230990112e-05, "loss": 0.8686291694641113, "memory(GiB)": 91.52, "step": 72320, "token_acc": 0.7777567200480117, "train_speed(iter/s)": 0.138088 }, { "epoch": 0.9384660799496544, "grad_norm": 0.7604748010635376, "learning_rate": 5.9168649561279554e-05, "loss": 0.8568140983581543, "memory(GiB)": 91.52, "step": 72325, "token_acc": 0.7639822978011639, "train_speed(iter/s)": 0.138087 }, { "epoch": 0.9385309583513101, "grad_norm": 0.6926873922348022, "learning_rate": 5.9163376707145745e-05, "loss": 0.8370530128479003, "memory(GiB)": 91.52, "step": 72330, "token_acc": 0.7771927440722876, "train_speed(iter/s)": 0.138086 }, { "epoch": 0.9385958367529658, "grad_norm": 0.6918683052062988, "learning_rate": 5.91581037475604e-05, "loss": 0.8610259056091308, "memory(GiB)": 91.52, "step": 72335, "token_acc": 0.7860788545199192, "train_speed(iter/s)": 0.138085 }, { "epoch": 0.9386607151546215, "grad_norm": 0.7640447616577148, "learning_rate": 5.915283068258418e-05, "loss": 0.8519098281860351, "memory(GiB)": 91.52, "step": 72340, "token_acc": 0.7523251808473992, "train_speed(iter/s)": 0.138084 }, { "epoch": 0.9387255935562772, "grad_norm": 0.674798309803009, "learning_rate": 5.914755751227776e-05, "loss": 0.8565457344055176, "memory(GiB)": 91.52, "step": 72345, "token_acc": 0.7840734568506356, "train_speed(iter/s)": 0.138082 }, { "epoch": 0.9387904719579329, "grad_norm": 0.7245751023292542, "learning_rate": 5.914228423670186e-05, "loss": 0.8351632118225097, "memory(GiB)": 91.52, "step": 72350, "token_acc": 0.776131625967838, "train_speed(iter/s)": 0.13808 }, { "epoch": 0.9388553503595886, "grad_norm": 0.7071266174316406, "learning_rate": 5.913701085591712e-05, "loss": 0.8657102584838867, "memory(GiB)": 91.52, "step": 72355, "token_acc": 0.7545636910732196, "train_speed(iter/s)": 0.138079 }, { "epoch": 0.9389202287612443, "grad_norm": 0.7668712735176086, "learning_rate": 5.9131737369984265e-05, "loss": 0.8378213882446289, "memory(GiB)": 91.52, "step": 72360, "token_acc": 0.7766209346054739, "train_speed(iter/s)": 0.138078 }, { "epoch": 0.9389851071629, "grad_norm": 0.6208828687667847, "learning_rate": 5.912646377896396e-05, "loss": 0.8358643531799317, "memory(GiB)": 91.52, "step": 72365, "token_acc": 0.7783352496428696, "train_speed(iter/s)": 0.138076 }, { "epoch": 0.9390499855645557, "grad_norm": 0.7536062598228455, "learning_rate": 5.912119008291689e-05, "loss": 0.8417261123657227, "memory(GiB)": 91.52, "step": 72370, "token_acc": 0.7498007679489966, "train_speed(iter/s)": 0.138075 }, { "epoch": 0.9391148639662114, "grad_norm": 0.7300651669502258, "learning_rate": 5.911591628190376e-05, "loss": 0.8907516479492188, "memory(GiB)": 91.52, "step": 72375, "token_acc": 0.7611329842131417, "train_speed(iter/s)": 0.138074 }, { "epoch": 0.939179742367867, "grad_norm": 0.7718824148178101, "learning_rate": 5.911064237598524e-05, "loss": 0.8806023597717285, "memory(GiB)": 91.52, "step": 72380, "token_acc": 0.7503402623113091, "train_speed(iter/s)": 0.138072 }, { "epoch": 0.9392446207695228, "grad_norm": 0.7301470637321472, "learning_rate": 5.9105368365222044e-05, "loss": 0.8396547317504883, "memory(GiB)": 91.52, "step": 72385, "token_acc": 0.7626380586675112, "train_speed(iter/s)": 0.138071 }, { "epoch": 0.9393094991711785, "grad_norm": 0.7546541690826416, "learning_rate": 5.910009424967485e-05, "loss": 0.9031003952026367, "memory(GiB)": 91.52, "step": 72390, "token_acc": 0.7501740543049431, "train_speed(iter/s)": 0.13807 }, { "epoch": 0.9393743775728342, "grad_norm": 0.6802457571029663, "learning_rate": 5.9094820029404366e-05, "loss": 0.8609850883483887, "memory(GiB)": 91.52, "step": 72395, "token_acc": 0.7486145720980399, "train_speed(iter/s)": 0.138069 }, { "epoch": 0.9394392559744899, "grad_norm": 0.6445992588996887, "learning_rate": 5.908954570447127e-05, "loss": 0.8532915115356445, "memory(GiB)": 91.52, "step": 72400, "token_acc": 0.760463174706886, "train_speed(iter/s)": 0.138068 }, { "epoch": 0.9395041343761455, "grad_norm": 0.7297563552856445, "learning_rate": 5.908427127493626e-05, "loss": 0.8608333587646484, "memory(GiB)": 91.52, "step": 72405, "token_acc": 0.7699739426738825, "train_speed(iter/s)": 0.138067 }, { "epoch": 0.9395690127778012, "grad_norm": 0.6924133896827698, "learning_rate": 5.907899674086006e-05, "loss": 0.8725994110107422, "memory(GiB)": 91.52, "step": 72410, "token_acc": 0.7642384105960265, "train_speed(iter/s)": 0.138066 }, { "epoch": 0.939633891179457, "grad_norm": 0.6514278650283813, "learning_rate": 5.907372210230333e-05, "loss": 0.8210596084594727, "memory(GiB)": 91.52, "step": 72415, "token_acc": 0.7732999310169553, "train_speed(iter/s)": 0.138064 }, { "epoch": 0.9396987695811126, "grad_norm": 0.7841244339942932, "learning_rate": 5.9068447359326806e-05, "loss": 0.8393388748168945, "memory(GiB)": 91.52, "step": 72420, "token_acc": 0.7696834049312299, "train_speed(iter/s)": 0.138063 }, { "epoch": 0.9397636479827683, "grad_norm": 0.659920871257782, "learning_rate": 5.906317251199116e-05, "loss": 0.9004960060119629, "memory(GiB)": 91.52, "step": 72425, "token_acc": 0.7700561447286338, "train_speed(iter/s)": 0.138062 }, { "epoch": 0.939828526384424, "grad_norm": 0.792058527469635, "learning_rate": 5.905789756035712e-05, "loss": 0.8632234573364258, "memory(GiB)": 91.52, "step": 72430, "token_acc": 0.7632817537072856, "train_speed(iter/s)": 0.13806 }, { "epoch": 0.9398934047860797, "grad_norm": 0.6943027973175049, "learning_rate": 5.905262250448538e-05, "loss": 0.8535731315612793, "memory(GiB)": 91.52, "step": 72435, "token_acc": 0.7806055456862594, "train_speed(iter/s)": 0.138059 }, { "epoch": 0.9399582831877353, "grad_norm": 0.7022404074668884, "learning_rate": 5.904734734443663e-05, "loss": 0.8488956451416015, "memory(GiB)": 91.52, "step": 72440, "token_acc": 0.7671252456673218, "train_speed(iter/s)": 0.138058 }, { "epoch": 0.940023161589391, "grad_norm": 0.6953614950180054, "learning_rate": 5.9042072080271596e-05, "loss": 0.8481634140014649, "memory(GiB)": 91.52, "step": 72445, "token_acc": 0.7727873620199617, "train_speed(iter/s)": 0.138057 }, { "epoch": 0.9400880399910467, "grad_norm": 0.6640408039093018, "learning_rate": 5.903679671205097e-05, "loss": 0.8654926300048829, "memory(GiB)": 91.52, "step": 72450, "token_acc": 0.7632133043074837, "train_speed(iter/s)": 0.138056 }, { "epoch": 0.9401529183927024, "grad_norm": 0.7855748534202576, "learning_rate": 5.903152123983547e-05, "loss": 0.8565866470336914, "memory(GiB)": 91.52, "step": 72455, "token_acc": 0.7569703534320215, "train_speed(iter/s)": 0.138055 }, { "epoch": 0.9402177967943581, "grad_norm": 0.7184349894523621, "learning_rate": 5.902624566368582e-05, "loss": 0.8527013778686523, "memory(GiB)": 91.52, "step": 72460, "token_acc": 0.7578323749782192, "train_speed(iter/s)": 0.138054 }, { "epoch": 0.9402826751960138, "grad_norm": 0.7978522777557373, "learning_rate": 5.9020969983662714e-05, "loss": 0.8253383636474609, "memory(GiB)": 91.52, "step": 72465, "token_acc": 0.7632656543252004, "train_speed(iter/s)": 0.138052 }, { "epoch": 0.9403475535976695, "grad_norm": 0.7635722160339355, "learning_rate": 5.901569419982685e-05, "loss": 0.879728889465332, "memory(GiB)": 91.52, "step": 72470, "token_acc": 0.7627521613832853, "train_speed(iter/s)": 0.138051 }, { "epoch": 0.9404124319993252, "grad_norm": 0.7619060277938843, "learning_rate": 5.9010418312238966e-05, "loss": 0.8500371932983398, "memory(GiB)": 91.52, "step": 72475, "token_acc": 0.7657109382627656, "train_speed(iter/s)": 0.13805 }, { "epoch": 0.9404773104009809, "grad_norm": 0.8282998204231262, "learning_rate": 5.9005142320959774e-05, "loss": 0.8734756469726562, "memory(GiB)": 91.52, "step": 72480, "token_acc": 0.7578640705092746, "train_speed(iter/s)": 0.138049 }, { "epoch": 0.9405421888026366, "grad_norm": 0.723902702331543, "learning_rate": 5.899986622604997e-05, "loss": 0.8248062133789062, "memory(GiB)": 91.52, "step": 72485, "token_acc": 0.7720131400414197, "train_speed(iter/s)": 0.138048 }, { "epoch": 0.9406070672042923, "grad_norm": 0.6703231334686279, "learning_rate": 5.89945900275703e-05, "loss": 0.8614574432373047, "memory(GiB)": 91.52, "step": 72490, "token_acc": 0.7579340926875924, "train_speed(iter/s)": 0.138046 }, { "epoch": 0.940671945605948, "grad_norm": 0.7661575675010681, "learning_rate": 5.898931372558145e-05, "loss": 0.8849372863769531, "memory(GiB)": 91.52, "step": 72495, "token_acc": 0.7660642191450024, "train_speed(iter/s)": 0.138045 }, { "epoch": 0.9407368240076037, "grad_norm": 0.7524946331977844, "learning_rate": 5.898403732014417e-05, "loss": 0.8562942504882812, "memory(GiB)": 91.52, "step": 72500, "token_acc": 0.76927777565596, "train_speed(iter/s)": 0.138044 }, { "epoch": 0.9408017024092594, "grad_norm": 0.7184634804725647, "learning_rate": 5.897876081131917e-05, "loss": 0.7893376350402832, "memory(GiB)": 91.52, "step": 72505, "token_acc": 0.7692576487032148, "train_speed(iter/s)": 0.138042 }, { "epoch": 0.9408665808109151, "grad_norm": 0.6569507122039795, "learning_rate": 5.897348419916715e-05, "loss": 0.8173463821411133, "memory(GiB)": 91.52, "step": 72510, "token_acc": 0.7669969763194923, "train_speed(iter/s)": 0.138041 }, { "epoch": 0.9409314592125708, "grad_norm": 0.7095922231674194, "learning_rate": 5.896820748374887e-05, "loss": 0.8487480163574219, "memory(GiB)": 91.52, "step": 72515, "token_acc": 0.7755596358948315, "train_speed(iter/s)": 0.13804 }, { "epoch": 0.9409963376142265, "grad_norm": 0.7664234638214111, "learning_rate": 5.8962930665125025e-05, "loss": 0.8356271743774414, "memory(GiB)": 91.52, "step": 72520, "token_acc": 0.7792672943102743, "train_speed(iter/s)": 0.138039 }, { "epoch": 0.9410612160158822, "grad_norm": 0.7104492783546448, "learning_rate": 5.8957653743356356e-05, "loss": 0.8772176742553711, "memory(GiB)": 91.52, "step": 72525, "token_acc": 0.7527713420067724, "train_speed(iter/s)": 0.138037 }, { "epoch": 0.9411260944175379, "grad_norm": 0.8167775869369507, "learning_rate": 5.8952376718503574e-05, "loss": 0.8872747421264648, "memory(GiB)": 91.52, "step": 72530, "token_acc": 0.7609035549703752, "train_speed(iter/s)": 0.138036 }, { "epoch": 0.9411909728191936, "grad_norm": 0.6996518969535828, "learning_rate": 5.894709959062743e-05, "loss": 0.8821189880371094, "memory(GiB)": 91.52, "step": 72535, "token_acc": 0.7629478303635607, "train_speed(iter/s)": 0.138036 }, { "epoch": 0.9412558512208493, "grad_norm": 0.6232720017433167, "learning_rate": 5.8941822359788634e-05, "loss": 0.8657538414001464, "memory(GiB)": 91.52, "step": 72540, "token_acc": 0.760006052810774, "train_speed(iter/s)": 0.138034 }, { "epoch": 0.941320729622505, "grad_norm": 0.7623403668403625, "learning_rate": 5.893654502604792e-05, "loss": 0.882136344909668, "memory(GiB)": 91.52, "step": 72545, "token_acc": 0.7412926702477907, "train_speed(iter/s)": 0.138033 }, { "epoch": 0.9413856080241607, "grad_norm": 0.7656404376029968, "learning_rate": 5.893126758946603e-05, "loss": 0.872926902770996, "memory(GiB)": 91.52, "step": 72550, "token_acc": 0.7553657823814008, "train_speed(iter/s)": 0.138032 }, { "epoch": 0.9414504864258164, "grad_norm": 0.6825488805770874, "learning_rate": 5.8925990050103677e-05, "loss": 0.7904274940490723, "memory(GiB)": 91.52, "step": 72555, "token_acc": 0.7755115823596948, "train_speed(iter/s)": 0.13803 }, { "epoch": 0.9415153648274721, "grad_norm": 0.6460607051849365, "learning_rate": 5.8920712408021604e-05, "loss": 0.8124428749084472, "memory(GiB)": 91.52, "step": 72560, "token_acc": 0.7957507587930727, "train_speed(iter/s)": 0.138029 }, { "epoch": 0.9415802432291278, "grad_norm": 0.7577589154243469, "learning_rate": 5.8915434663280555e-05, "loss": 0.8584440231323243, "memory(GiB)": 91.52, "step": 72565, "token_acc": 0.7543397860926931, "train_speed(iter/s)": 0.138028 }, { "epoch": 0.9416451216307835, "grad_norm": 0.7067638039588928, "learning_rate": 5.891015681594125e-05, "loss": 0.8506933212280273, "memory(GiB)": 91.52, "step": 72570, "token_acc": 0.7484060238607471, "train_speed(iter/s)": 0.138026 }, { "epoch": 0.9417100000324392, "grad_norm": 0.7376146912574768, "learning_rate": 5.890487886606445e-05, "loss": 0.8821369171142578, "memory(GiB)": 91.52, "step": 72575, "token_acc": 0.7536837705089351, "train_speed(iter/s)": 0.138025 }, { "epoch": 0.9417748784340949, "grad_norm": 0.773241400718689, "learning_rate": 5.8899600813710855e-05, "loss": 0.8728492736816407, "memory(GiB)": 91.52, "step": 72580, "token_acc": 0.7677551460286669, "train_speed(iter/s)": 0.138025 }, { "epoch": 0.9418397568357506, "grad_norm": 0.7431491017341614, "learning_rate": 5.889432265894125e-05, "loss": 0.8644199371337891, "memory(GiB)": 91.52, "step": 72585, "token_acc": 0.7757281553398059, "train_speed(iter/s)": 0.138023 }, { "epoch": 0.9419046352374063, "grad_norm": 0.7156950235366821, "learning_rate": 5.888904440181633e-05, "loss": 0.860682201385498, "memory(GiB)": 91.52, "step": 72590, "token_acc": 0.7690898345153664, "train_speed(iter/s)": 0.138022 }, { "epoch": 0.941969513639062, "grad_norm": 0.7940352559089661, "learning_rate": 5.888376604239687e-05, "loss": 0.8914192199707032, "memory(GiB)": 91.52, "step": 72595, "token_acc": 0.7467032453159584, "train_speed(iter/s)": 0.13802 }, { "epoch": 0.9420343920407177, "grad_norm": 0.7562698721885681, "learning_rate": 5.887848758074359e-05, "loss": 0.8759580612182617, "memory(GiB)": 91.52, "step": 72600, "token_acc": 0.7417808421862516, "train_speed(iter/s)": 0.13802 }, { "epoch": 0.9420992704423734, "grad_norm": 0.6948648691177368, "learning_rate": 5.887320901691726e-05, "loss": 0.8506891250610351, "memory(GiB)": 91.52, "step": 72605, "token_acc": 0.7442197007071205, "train_speed(iter/s)": 0.138019 }, { "epoch": 0.9421641488440291, "grad_norm": 0.7974737882614136, "learning_rate": 5.886793035097862e-05, "loss": 0.8910709381103515, "memory(GiB)": 91.52, "step": 72610, "token_acc": 0.7609325887271506, "train_speed(iter/s)": 0.138018 }, { "epoch": 0.9422290272456848, "grad_norm": 0.6703189015388489, "learning_rate": 5.8862651582988384e-05, "loss": 0.8536802291870117, "memory(GiB)": 91.52, "step": 72615, "token_acc": 0.7724560003041016, "train_speed(iter/s)": 0.138017 }, { "epoch": 0.9422939056473405, "grad_norm": 0.5909208059310913, "learning_rate": 5.8857372713007344e-05, "loss": 0.8052533149719239, "memory(GiB)": 91.52, "step": 72620, "token_acc": 0.7875371528133648, "train_speed(iter/s)": 0.138015 }, { "epoch": 0.9423587840489962, "grad_norm": 0.7331749796867371, "learning_rate": 5.8852093741096216e-05, "loss": 0.8620843887329102, "memory(GiB)": 91.52, "step": 72625, "token_acc": 0.7451096621221103, "train_speed(iter/s)": 0.138013 }, { "epoch": 0.9424236624506519, "grad_norm": 0.7239560484886169, "learning_rate": 5.884681466731576e-05, "loss": 0.8550826072692871, "memory(GiB)": 91.52, "step": 72630, "token_acc": 0.7669069009773917, "train_speed(iter/s)": 0.138012 }, { "epoch": 0.9424885408523076, "grad_norm": 0.7491987943649292, "learning_rate": 5.884153549172674e-05, "loss": 0.8411898612976074, "memory(GiB)": 91.52, "step": 72635, "token_acc": 0.763346552776083, "train_speed(iter/s)": 0.138011 }, { "epoch": 0.9425534192539633, "grad_norm": 0.7768036723136902, "learning_rate": 5.8836256214389905e-05, "loss": 0.8405853271484375, "memory(GiB)": 91.52, "step": 72640, "token_acc": 0.758092119949401, "train_speed(iter/s)": 0.13801 }, { "epoch": 0.942618297655619, "grad_norm": 0.6837063431739807, "learning_rate": 5.8830976835365984e-05, "loss": 0.8608893394470215, "memory(GiB)": 91.52, "step": 72645, "token_acc": 0.7381225149942718, "train_speed(iter/s)": 0.138009 }, { "epoch": 0.9426831760572747, "grad_norm": 0.7595828771591187, "learning_rate": 5.882569735471576e-05, "loss": 0.8818962097167968, "memory(GiB)": 91.52, "step": 72650, "token_acc": 0.7672144147319265, "train_speed(iter/s)": 0.138008 }, { "epoch": 0.9427480544589304, "grad_norm": 0.7449809312820435, "learning_rate": 5.8820417772499994e-05, "loss": 0.8568965911865234, "memory(GiB)": 91.52, "step": 72655, "token_acc": 0.7542021734031741, "train_speed(iter/s)": 0.138006 }, { "epoch": 0.9428129328605861, "grad_norm": 0.7888363003730774, "learning_rate": 5.881513808877941e-05, "loss": 0.8543298721313477, "memory(GiB)": 91.52, "step": 72660, "token_acc": 0.7540895139802354, "train_speed(iter/s)": 0.138005 }, { "epoch": 0.9428778112622418, "grad_norm": 0.7787288427352905, "learning_rate": 5.8809858303614785e-05, "loss": 0.8921606063842773, "memory(GiB)": 91.52, "step": 72665, "token_acc": 0.7551339657180896, "train_speed(iter/s)": 0.138004 }, { "epoch": 0.9429426896638975, "grad_norm": 0.6922255158424377, "learning_rate": 5.880457841706687e-05, "loss": 0.8611026763916015, "memory(GiB)": 91.52, "step": 72670, "token_acc": 0.7597036344825284, "train_speed(iter/s)": 0.138002 }, { "epoch": 0.9430075680655532, "grad_norm": 0.6556398868560791, "learning_rate": 5.8799298429196445e-05, "loss": 0.8500768661499023, "memory(GiB)": 91.52, "step": 72675, "token_acc": 0.7521408486707567, "train_speed(iter/s)": 0.138 }, { "epoch": 0.9430724464672088, "grad_norm": 0.7369718551635742, "learning_rate": 5.879401834006426e-05, "loss": 0.8554052352905274, "memory(GiB)": 91.52, "step": 72680, "token_acc": 0.7594302294908994, "train_speed(iter/s)": 0.137999 }, { "epoch": 0.9431373248688645, "grad_norm": 0.7890505790710449, "learning_rate": 5.878873814973107e-05, "loss": 0.8631251335144043, "memory(GiB)": 91.52, "step": 72685, "token_acc": 0.7781618918024347, "train_speed(iter/s)": 0.137998 }, { "epoch": 0.9432022032705202, "grad_norm": 0.6829859018325806, "learning_rate": 5.8783457858257665e-05, "loss": 0.872618579864502, "memory(GiB)": 91.52, "step": 72690, "token_acc": 0.7524164352746828, "train_speed(iter/s)": 0.137997 }, { "epoch": 0.9432670816721759, "grad_norm": 0.7009853720664978, "learning_rate": 5.8778177465704775e-05, "loss": 0.8360281944274902, "memory(GiB)": 91.52, "step": 72695, "token_acc": 0.7579223547739106, "train_speed(iter/s)": 0.137996 }, { "epoch": 0.9433319600738316, "grad_norm": 0.6987848281860352, "learning_rate": 5.87728969721332e-05, "loss": 0.8169890403747558, "memory(GiB)": 91.52, "step": 72700, "token_acc": 0.7665129397415863, "train_speed(iter/s)": 0.137995 }, { "epoch": 0.9433968384754873, "grad_norm": 0.752547562122345, "learning_rate": 5.876761637760366e-05, "loss": 0.8653467178344727, "memory(GiB)": 91.52, "step": 72705, "token_acc": 0.7570813587526679, "train_speed(iter/s)": 0.137994 }, { "epoch": 0.943461716877143, "grad_norm": 0.7073636054992676, "learning_rate": 5.876233568217699e-05, "loss": 0.8389930725097656, "memory(GiB)": 91.52, "step": 72710, "token_acc": 0.7779482501243987, "train_speed(iter/s)": 0.137993 }, { "epoch": 0.9435265952787987, "grad_norm": 0.8127533793449402, "learning_rate": 5.8757054885913895e-05, "loss": 0.8762578964233398, "memory(GiB)": 91.52, "step": 72715, "token_acc": 0.7749036049863363, "train_speed(iter/s)": 0.137992 }, { "epoch": 0.9435914736804544, "grad_norm": 0.6956433653831482, "learning_rate": 5.875177398887519e-05, "loss": 0.8135251998901367, "memory(GiB)": 91.52, "step": 72720, "token_acc": 0.7807961592318464, "train_speed(iter/s)": 0.137991 }, { "epoch": 0.9436563520821101, "grad_norm": 0.7179995775222778, "learning_rate": 5.8746492991121636e-05, "loss": 0.8679300308227539, "memory(GiB)": 91.52, "step": 72725, "token_acc": 0.7491503823279524, "train_speed(iter/s)": 0.13799 }, { "epoch": 0.9437212304837658, "grad_norm": 0.7941993474960327, "learning_rate": 5.874121189271401e-05, "loss": 0.8318608283996582, "memory(GiB)": 91.52, "step": 72730, "token_acc": 0.7589970319069997, "train_speed(iter/s)": 0.137989 }, { "epoch": 0.9437861088854215, "grad_norm": 0.7073065638542175, "learning_rate": 5.873593069371307e-05, "loss": 0.8451433181762695, "memory(GiB)": 91.52, "step": 72735, "token_acc": 0.772157979617747, "train_speed(iter/s)": 0.137987 }, { "epoch": 0.9438509872870772, "grad_norm": 0.7333809733390808, "learning_rate": 5.873064939417959e-05, "loss": 0.8342760086059571, "memory(GiB)": 91.52, "step": 72740, "token_acc": 0.7766209851623641, "train_speed(iter/s)": 0.137986 }, { "epoch": 0.9439158656887329, "grad_norm": 0.7227102518081665, "learning_rate": 5.872536799417438e-05, "loss": 0.8362598419189453, "memory(GiB)": 91.52, "step": 72745, "token_acc": 0.7512292506192465, "train_speed(iter/s)": 0.137985 }, { "epoch": 0.9439807440903886, "grad_norm": 0.6856714487075806, "learning_rate": 5.8720086493758194e-05, "loss": 0.8584502220153809, "memory(GiB)": 91.52, "step": 72750, "token_acc": 0.7616109566750288, "train_speed(iter/s)": 0.137984 }, { "epoch": 0.9440456224920443, "grad_norm": 0.7471880912780762, "learning_rate": 5.87148048929918e-05, "loss": 0.8101869583129883, "memory(GiB)": 91.52, "step": 72755, "token_acc": 0.7547725300607413, "train_speed(iter/s)": 0.137983 }, { "epoch": 0.9441105008937, "grad_norm": 0.727984607219696, "learning_rate": 5.870952319193601e-05, "loss": 0.8310519218444824, "memory(GiB)": 91.52, "step": 72760, "token_acc": 0.7899585143566459, "train_speed(iter/s)": 0.137982 }, { "epoch": 0.9441753792953557, "grad_norm": 0.7461758852005005, "learning_rate": 5.870424139065158e-05, "loss": 0.8770934104919433, "memory(GiB)": 91.52, "step": 72765, "token_acc": 0.7588547617447409, "train_speed(iter/s)": 0.137981 }, { "epoch": 0.9442402576970114, "grad_norm": 0.7034784555435181, "learning_rate": 5.86989594891993e-05, "loss": 0.8283901214599609, "memory(GiB)": 91.52, "step": 72770, "token_acc": 0.7629203147889483, "train_speed(iter/s)": 0.13798 }, { "epoch": 0.9443051360986671, "grad_norm": 1.455082893371582, "learning_rate": 5.8693677487639945e-05, "loss": 0.8405192375183106, "memory(GiB)": 91.52, "step": 72775, "token_acc": 0.7731932093775262, "train_speed(iter/s)": 0.137977 }, { "epoch": 0.9443700145003228, "grad_norm": 0.7580165266990662, "learning_rate": 5.868839538603432e-05, "loss": 0.8281591415405274, "memory(GiB)": 91.52, "step": 72780, "token_acc": 0.7713273172335684, "train_speed(iter/s)": 0.137976 }, { "epoch": 0.9444348929019785, "grad_norm": 0.6254951357841492, "learning_rate": 5.8683113184443196e-05, "loss": 0.9007944107055664, "memory(GiB)": 91.52, "step": 72785, "token_acc": 0.7512518272011943, "train_speed(iter/s)": 0.137974 }, { "epoch": 0.9444997713036342, "grad_norm": 0.7213383316993713, "learning_rate": 5.867783088292736e-05, "loss": 0.8658132553100586, "memory(GiB)": 91.52, "step": 72790, "token_acc": 0.7623980163173892, "train_speed(iter/s)": 0.137974 }, { "epoch": 0.9445646497052899, "grad_norm": 0.6942673921585083, "learning_rate": 5.867254848154762e-05, "loss": 0.8217355728149414, "memory(GiB)": 91.52, "step": 72795, "token_acc": 0.7891952385797131, "train_speed(iter/s)": 0.137973 }, { "epoch": 0.9446295281069456, "grad_norm": 0.6506268382072449, "learning_rate": 5.866726598036474e-05, "loss": 0.8463171005249024, "memory(GiB)": 91.52, "step": 72800, "token_acc": 0.7775044722719141, "train_speed(iter/s)": 0.137971 }, { "epoch": 0.9446944065086013, "grad_norm": 0.6859850883483887, "learning_rate": 5.8661983379439524e-05, "loss": 0.803118896484375, "memory(GiB)": 91.52, "step": 72805, "token_acc": 0.7786434136230433, "train_speed(iter/s)": 0.13797 }, { "epoch": 0.944759284910257, "grad_norm": 0.7100105285644531, "learning_rate": 5.865670067883275e-05, "loss": 0.8396108627319336, "memory(GiB)": 91.52, "step": 72810, "token_acc": 0.7489700036140224, "train_speed(iter/s)": 0.137969 }, { "epoch": 0.9448241633119127, "grad_norm": 0.6342796087265015, "learning_rate": 5.865141787860523e-05, "loss": 0.8573494911193847, "memory(GiB)": 91.52, "step": 72815, "token_acc": 0.7757169966701278, "train_speed(iter/s)": 0.137967 }, { "epoch": 0.9448890417135684, "grad_norm": 0.6595110297203064, "learning_rate": 5.864613497881775e-05, "loss": 0.8306384086608887, "memory(GiB)": 91.52, "step": 72820, "token_acc": 0.7590122757986344, "train_speed(iter/s)": 0.137965 }, { "epoch": 0.944953920115224, "grad_norm": 0.6884434819221497, "learning_rate": 5.864085197953111e-05, "loss": 0.8097543716430664, "memory(GiB)": 91.52, "step": 72825, "token_acc": 0.7564820497084995, "train_speed(iter/s)": 0.137964 }, { "epoch": 0.9450187985168798, "grad_norm": 0.7742207646369934, "learning_rate": 5.86355688808061e-05, "loss": 0.8693404197692871, "memory(GiB)": 91.52, "step": 72830, "token_acc": 0.7273103552077061, "train_speed(iter/s)": 0.137963 }, { "epoch": 0.9450836769185355, "grad_norm": 0.7391070127487183, "learning_rate": 5.86302856827035e-05, "loss": 0.818966293334961, "memory(GiB)": 91.52, "step": 72835, "token_acc": 0.7720875684128226, "train_speed(iter/s)": 0.137962 }, { "epoch": 0.9451485553201912, "grad_norm": 0.7839586734771729, "learning_rate": 5.862500238528416e-05, "loss": 0.837562370300293, "memory(GiB)": 91.52, "step": 72840, "token_acc": 0.7729938995776631, "train_speed(iter/s)": 0.137961 }, { "epoch": 0.9452134337218469, "grad_norm": 0.740543782711029, "learning_rate": 5.861971898860882e-05, "loss": 0.8088598251342773, "memory(GiB)": 91.52, "step": 72845, "token_acc": 0.7661410198410133, "train_speed(iter/s)": 0.13796 }, { "epoch": 0.9452783121235026, "grad_norm": 0.7007105350494385, "learning_rate": 5.861443549273832e-05, "loss": 0.8523126602172851, "memory(GiB)": 91.52, "step": 72850, "token_acc": 0.7596798212956068, "train_speed(iter/s)": 0.137958 }, { "epoch": 0.9453431905251583, "grad_norm": 0.6826072335243225, "learning_rate": 5.8609151897733447e-05, "loss": 0.8190240859985352, "memory(GiB)": 91.52, "step": 72855, "token_acc": 0.7889184947256999, "train_speed(iter/s)": 0.137957 }, { "epoch": 0.945408068926814, "grad_norm": 0.710407555103302, "learning_rate": 5.860386820365502e-05, "loss": 0.8356119155883789, "memory(GiB)": 91.52, "step": 72860, "token_acc": 0.7706684753937749, "train_speed(iter/s)": 0.137956 }, { "epoch": 0.9454729473284696, "grad_norm": 0.7177035212516785, "learning_rate": 5.859858441056383e-05, "loss": 0.8810947418212891, "memory(GiB)": 91.52, "step": 72865, "token_acc": 0.7550362610797744, "train_speed(iter/s)": 0.137955 }, { "epoch": 0.9455378257301253, "grad_norm": 0.7530991435050964, "learning_rate": 5.859330051852066e-05, "loss": 0.8398212432861328, "memory(GiB)": 91.52, "step": 72870, "token_acc": 0.7639631558974713, "train_speed(iter/s)": 0.137953 }, { "epoch": 0.945602704131781, "grad_norm": 0.6850519776344299, "learning_rate": 5.858801652758636e-05, "loss": 0.8058443069458008, "memory(GiB)": 91.52, "step": 72875, "token_acc": 0.7760435906908016, "train_speed(iter/s)": 0.137952 }, { "epoch": 0.9456675825334367, "grad_norm": 0.836557149887085, "learning_rate": 5.858273243782172e-05, "loss": 0.8691946029663086, "memory(GiB)": 91.52, "step": 72880, "token_acc": 0.7547887515283249, "train_speed(iter/s)": 0.13795 }, { "epoch": 0.9457324609350924, "grad_norm": 0.8213895559310913, "learning_rate": 5.857744824928754e-05, "loss": 0.8547879219055176, "memory(GiB)": 91.52, "step": 72885, "token_acc": 0.7591973244147158, "train_speed(iter/s)": 0.137949 }, { "epoch": 0.9457973393367481, "grad_norm": 0.661551296710968, "learning_rate": 5.8572163962044646e-05, "loss": 0.8872101783752442, "memory(GiB)": 91.52, "step": 72890, "token_acc": 0.7769368082827561, "train_speed(iter/s)": 0.137948 }, { "epoch": 0.9458622177384038, "grad_norm": 0.711311936378479, "learning_rate": 5.856687957615383e-05, "loss": 0.8055265426635743, "memory(GiB)": 91.52, "step": 72895, "token_acc": 0.7824795842613215, "train_speed(iter/s)": 0.137947 }, { "epoch": 0.9459270961400595, "grad_norm": 0.7046164274215698, "learning_rate": 5.8561595091675926e-05, "loss": 0.8421518325805664, "memory(GiB)": 91.52, "step": 72900, "token_acc": 0.7546987951807229, "train_speed(iter/s)": 0.137945 }, { "epoch": 0.9459919745417152, "grad_norm": 0.6931328773498535, "learning_rate": 5.855631050867172e-05, "loss": 0.8501694679260254, "memory(GiB)": 91.52, "step": 72905, "token_acc": 0.748904593639576, "train_speed(iter/s)": 0.137944 }, { "epoch": 0.9460568529433709, "grad_norm": 0.6920241117477417, "learning_rate": 5.855102582720207e-05, "loss": 0.8350456237792969, "memory(GiB)": 91.52, "step": 72910, "token_acc": 0.7674030213315114, "train_speed(iter/s)": 0.137942 }, { "epoch": 0.9461217313450265, "grad_norm": 0.6388944983482361, "learning_rate": 5.854574104732774e-05, "loss": 0.839846420288086, "memory(GiB)": 91.52, "step": 72915, "token_acc": 0.7767669172932331, "train_speed(iter/s)": 0.137941 }, { "epoch": 0.9461866097466822, "grad_norm": 0.6720243096351624, "learning_rate": 5.854045616910959e-05, "loss": 0.8273222923278809, "memory(GiB)": 91.52, "step": 72920, "token_acc": 0.7543003079696537, "train_speed(iter/s)": 0.13794 }, { "epoch": 0.9462514881483379, "grad_norm": 0.7315597534179688, "learning_rate": 5.853517119260841e-05, "loss": 0.8009773254394531, "memory(GiB)": 91.52, "step": 72925, "token_acc": 0.7692587209302325, "train_speed(iter/s)": 0.137939 }, { "epoch": 0.9463163665499936, "grad_norm": 0.6797655820846558, "learning_rate": 5.8529886117885036e-05, "loss": 0.8232698440551758, "memory(GiB)": 91.52, "step": 72930, "token_acc": 0.7887050016513155, "train_speed(iter/s)": 0.137938 }, { "epoch": 0.9463812449516493, "grad_norm": 0.7238015532493591, "learning_rate": 5.8524600945000275e-05, "loss": 0.8671398162841797, "memory(GiB)": 91.52, "step": 72935, "token_acc": 0.752593224789916, "train_speed(iter/s)": 0.137938 }, { "epoch": 0.946446123353305, "grad_norm": 0.7234598398208618, "learning_rate": 5.851931567401495e-05, "loss": 0.8205926895141602, "memory(GiB)": 91.52, "step": 72940, "token_acc": 0.7599181464123045, "train_speed(iter/s)": 0.137936 }, { "epoch": 0.9465110017549607, "grad_norm": 0.7626316547393799, "learning_rate": 5.851403030498991e-05, "loss": 0.8759008407592773, "memory(GiB)": 91.52, "step": 72945, "token_acc": 0.7486793545119039, "train_speed(iter/s)": 0.137935 }, { "epoch": 0.9465758801566164, "grad_norm": 0.7460187673568726, "learning_rate": 5.850874483798593e-05, "loss": 0.8447658538818359, "memory(GiB)": 91.52, "step": 72950, "token_acc": 0.7777019503609784, "train_speed(iter/s)": 0.137934 }, { "epoch": 0.9466407585582721, "grad_norm": 0.721824586391449, "learning_rate": 5.8503459273063874e-05, "loss": 0.8737371444702149, "memory(GiB)": 91.52, "step": 72955, "token_acc": 0.7709158089205596, "train_speed(iter/s)": 0.137932 }, { "epoch": 0.9467056369599278, "grad_norm": 0.7793892025947571, "learning_rate": 5.8498173610284556e-05, "loss": 0.8584270477294922, "memory(GiB)": 91.52, "step": 72960, "token_acc": 0.757537688442211, "train_speed(iter/s)": 0.137931 }, { "epoch": 0.9467705153615835, "grad_norm": 0.7496585845947266, "learning_rate": 5.8492887849708814e-05, "loss": 0.8390953063964843, "memory(GiB)": 91.52, "step": 72965, "token_acc": 0.7800734097703107, "train_speed(iter/s)": 0.13793 }, { "epoch": 0.9468353937632392, "grad_norm": 0.6367460489273071, "learning_rate": 5.8487601991397456e-05, "loss": 0.8070056915283204, "memory(GiB)": 91.52, "step": 72970, "token_acc": 0.7737221472533983, "train_speed(iter/s)": 0.137929 }, { "epoch": 0.9469002721648949, "grad_norm": 0.7432242035865784, "learning_rate": 5.848231603541131e-05, "loss": 0.8426952362060547, "memory(GiB)": 91.52, "step": 72975, "token_acc": 0.7586293146573286, "train_speed(iter/s)": 0.137927 }, { "epoch": 0.9469651505665506, "grad_norm": 0.6835929751396179, "learning_rate": 5.847702998181124e-05, "loss": 0.8624131202697753, "memory(GiB)": 91.52, "step": 72980, "token_acc": 0.7508612750281407, "train_speed(iter/s)": 0.137926 }, { "epoch": 0.9470300289682063, "grad_norm": 0.7273412942886353, "learning_rate": 5.8471743830658034e-05, "loss": 0.8553705215454102, "memory(GiB)": 91.52, "step": 72985, "token_acc": 0.7555854138674499, "train_speed(iter/s)": 0.137925 }, { "epoch": 0.947094907369862, "grad_norm": 0.7757776379585266, "learning_rate": 5.846645758201255e-05, "loss": 0.8599655151367187, "memory(GiB)": 91.52, "step": 72990, "token_acc": 0.7676605264077738, "train_speed(iter/s)": 0.137923 }, { "epoch": 0.9471597857715177, "grad_norm": 0.6488290429115295, "learning_rate": 5.846117123593561e-05, "loss": 0.8004536628723145, "memory(GiB)": 91.52, "step": 72995, "token_acc": 0.7791181509843002, "train_speed(iter/s)": 0.137922 }, { "epoch": 0.9472246641731734, "grad_norm": 0.7706604599952698, "learning_rate": 5.845588479248807e-05, "loss": 0.8674046516418457, "memory(GiB)": 91.52, "step": 73000, "token_acc": 0.7548500881834215, "train_speed(iter/s)": 0.137921 }, { "epoch": 0.9472895425748291, "grad_norm": 0.7745923399925232, "learning_rate": 5.845059825173076e-05, "loss": 0.8438253402709961, "memory(GiB)": 91.52, "step": 73005, "token_acc": 0.7750495882671155, "train_speed(iter/s)": 0.137919 }, { "epoch": 0.9473544209764848, "grad_norm": 0.7451147437095642, "learning_rate": 5.844531161372448e-05, "loss": 0.8334356307983398, "memory(GiB)": 91.52, "step": 73010, "token_acc": 0.7726600597766242, "train_speed(iter/s)": 0.137918 }, { "epoch": 0.9474192993781405, "grad_norm": 0.7724778652191162, "learning_rate": 5.844002487853012e-05, "loss": 0.8608345031738281, "memory(GiB)": 91.52, "step": 73015, "token_acc": 0.7630002572489067, "train_speed(iter/s)": 0.137917 }, { "epoch": 0.9474841777797962, "grad_norm": 0.6687450408935547, "learning_rate": 5.8434738046208495e-05, "loss": 0.8547245025634765, "memory(GiB)": 91.52, "step": 73020, "token_acc": 0.7685671021586787, "train_speed(iter/s)": 0.137916 }, { "epoch": 0.9475490561814519, "grad_norm": 0.7993388772010803, "learning_rate": 5.8429451116820446e-05, "loss": 0.8451305389404297, "memory(GiB)": 91.52, "step": 73025, "token_acc": 0.7631646765741775, "train_speed(iter/s)": 0.137915 }, { "epoch": 0.9476139345831076, "grad_norm": 0.7957258224487305, "learning_rate": 5.8424164090426795e-05, "loss": 0.8790852546691894, "memory(GiB)": 91.52, "step": 73030, "token_acc": 0.7659533510860149, "train_speed(iter/s)": 0.137914 }, { "epoch": 0.9476788129847633, "grad_norm": 0.667934238910675, "learning_rate": 5.841887696708844e-05, "loss": 0.8710899353027344, "memory(GiB)": 91.52, "step": 73035, "token_acc": 0.7588410737111205, "train_speed(iter/s)": 0.137913 }, { "epoch": 0.947743691386419, "grad_norm": 0.6700770854949951, "learning_rate": 5.8413589746866174e-05, "loss": 0.8326766967773438, "memory(GiB)": 91.52, "step": 73040, "token_acc": 0.7675276752767528, "train_speed(iter/s)": 0.137912 }, { "epoch": 0.9478085697880747, "grad_norm": 0.7685117125511169, "learning_rate": 5.840830242982085e-05, "loss": 0.8583526611328125, "memory(GiB)": 91.52, "step": 73045, "token_acc": 0.7597845206684257, "train_speed(iter/s)": 0.137911 }, { "epoch": 0.9478734481897304, "grad_norm": 0.6537221670150757, "learning_rate": 5.840301501601334e-05, "loss": 0.8186607360839844, "memory(GiB)": 91.52, "step": 73050, "token_acc": 0.7602561468392576, "train_speed(iter/s)": 0.13791 }, { "epoch": 0.9479383265913861, "grad_norm": 0.7540070414543152, "learning_rate": 5.8397727505504464e-05, "loss": 0.8864892005920411, "memory(GiB)": 91.52, "step": 73055, "token_acc": 0.7654681511224676, "train_speed(iter/s)": 0.137908 }, { "epoch": 0.9480032049930418, "grad_norm": 0.7111716270446777, "learning_rate": 5.839243989835508e-05, "loss": 0.8368627548217773, "memory(GiB)": 91.52, "step": 73060, "token_acc": 0.7604221810481736, "train_speed(iter/s)": 0.137907 }, { "epoch": 0.9480680833946975, "grad_norm": 0.6999214291572571, "learning_rate": 5.838715219462604e-05, "loss": 0.8785394668579102, "memory(GiB)": 91.52, "step": 73065, "token_acc": 0.7637132132605747, "train_speed(iter/s)": 0.137906 }, { "epoch": 0.9481329617963532, "grad_norm": 0.6826648712158203, "learning_rate": 5.838186439437821e-05, "loss": 0.8100296020507812, "memory(GiB)": 91.52, "step": 73070, "token_acc": 0.7588579683698297, "train_speed(iter/s)": 0.137905 }, { "epoch": 0.9481978401980089, "grad_norm": 0.7650649547576904, "learning_rate": 5.8376576497672386e-05, "loss": 0.8718189239501953, "memory(GiB)": 91.52, "step": 73075, "token_acc": 0.7476028574702983, "train_speed(iter/s)": 0.137903 }, { "epoch": 0.9482627185996646, "grad_norm": 0.6560620665550232, "learning_rate": 5.8371288504569485e-05, "loss": 0.8637275695800781, "memory(GiB)": 91.52, "step": 73080, "token_acc": 0.7585159305813998, "train_speed(iter/s)": 0.137902 }, { "epoch": 0.9483275970013203, "grad_norm": 0.7093400955200195, "learning_rate": 5.836600041513033e-05, "loss": 0.8172271728515625, "memory(GiB)": 91.52, "step": 73085, "token_acc": 0.773270013568521, "train_speed(iter/s)": 0.137901 }, { "epoch": 0.948392475402976, "grad_norm": 0.6929423809051514, "learning_rate": 5.836071222941578e-05, "loss": 0.8977165222167969, "memory(GiB)": 91.52, "step": 73090, "token_acc": 0.7368506493506494, "train_speed(iter/s)": 0.1379 }, { "epoch": 0.9484573538046317, "grad_norm": 0.6946055293083191, "learning_rate": 5.83554239474867e-05, "loss": 0.8748724937438965, "memory(GiB)": 91.52, "step": 73095, "token_acc": 0.7491427265107063, "train_speed(iter/s)": 0.137898 }, { "epoch": 0.9485222322062874, "grad_norm": 0.7244396805763245, "learning_rate": 5.835013556940392e-05, "loss": 0.8595210075378418, "memory(GiB)": 91.52, "step": 73100, "token_acc": 0.7401264141068481, "train_speed(iter/s)": 0.137897 }, { "epoch": 0.9485871106079431, "grad_norm": 0.7149503231048584, "learning_rate": 5.834484709522833e-05, "loss": 0.8636001586914063, "memory(GiB)": 91.52, "step": 73105, "token_acc": 0.7542828008157716, "train_speed(iter/s)": 0.137896 }, { "epoch": 0.9486519890095988, "grad_norm": 0.8547353744506836, "learning_rate": 5.833955852502077e-05, "loss": 0.8355772972106934, "memory(GiB)": 91.52, "step": 73110, "token_acc": 0.7787201765273756, "train_speed(iter/s)": 0.137895 }, { "epoch": 0.9487168674112545, "grad_norm": 0.7539089322090149, "learning_rate": 5.8334269858842115e-05, "loss": 0.8204631805419922, "memory(GiB)": 91.52, "step": 73115, "token_acc": 0.7679818594104308, "train_speed(iter/s)": 0.137893 }, { "epoch": 0.9487817458129102, "grad_norm": 0.7906562089920044, "learning_rate": 5.8328981096753224e-05, "loss": 0.8655825614929199, "memory(GiB)": 91.52, "step": 73120, "token_acc": 0.7638638638638638, "train_speed(iter/s)": 0.137892 }, { "epoch": 0.9488466242145659, "grad_norm": 0.7537640333175659, "learning_rate": 5.832369223881494e-05, "loss": 0.8378013610839844, "memory(GiB)": 91.52, "step": 73125, "token_acc": 0.7575694732476151, "train_speed(iter/s)": 0.137891 }, { "epoch": 0.9489115026162216, "grad_norm": 0.9233375191688538, "learning_rate": 5.831840328508815e-05, "loss": 0.881655216217041, "memory(GiB)": 91.52, "step": 73130, "token_acc": 0.759128020224626, "train_speed(iter/s)": 0.13789 }, { "epoch": 0.9489763810178773, "grad_norm": 0.8006504774093628, "learning_rate": 5.8313114235633704e-05, "loss": 0.9035166740417481, "memory(GiB)": 91.52, "step": 73135, "token_acc": 0.7282977386934674, "train_speed(iter/s)": 0.137889 }, { "epoch": 0.949041259419533, "grad_norm": 0.7562177777290344, "learning_rate": 5.830782509051247e-05, "loss": 0.80679349899292, "memory(GiB)": 91.52, "step": 73140, "token_acc": 0.8084930088037287, "train_speed(iter/s)": 0.137888 }, { "epoch": 0.9491061378211887, "grad_norm": 0.686305820941925, "learning_rate": 5.830253584978533e-05, "loss": 0.8159136772155762, "memory(GiB)": 91.52, "step": 73145, "token_acc": 0.7660870231346714, "train_speed(iter/s)": 0.137886 }, { "epoch": 0.9491710162228444, "grad_norm": 0.7692148089408875, "learning_rate": 5.829724651351315e-05, "loss": 0.8673927307128906, "memory(GiB)": 91.52, "step": 73150, "token_acc": 0.7512173099867789, "train_speed(iter/s)": 0.137884 }, { "epoch": 0.9492358946245, "grad_norm": 0.6828406453132629, "learning_rate": 5.829195708175678e-05, "loss": 0.8405611991882325, "memory(GiB)": 91.52, "step": 73155, "token_acc": 0.7614147658359091, "train_speed(iter/s)": 0.137883 }, { "epoch": 0.9493007730261557, "grad_norm": 0.7833287715911865, "learning_rate": 5.8286667554577104e-05, "loss": 0.8573968887329102, "memory(GiB)": 91.52, "step": 73160, "token_acc": 0.7389635316698656, "train_speed(iter/s)": 0.137882 }, { "epoch": 0.9493656514278114, "grad_norm": 0.6957299113273621, "learning_rate": 5.8281377932034985e-05, "loss": 0.8640766143798828, "memory(GiB)": 91.52, "step": 73165, "token_acc": 0.7761734320344912, "train_speed(iter/s)": 0.137881 }, { "epoch": 0.9494305298294671, "grad_norm": 0.7499120831489563, "learning_rate": 5.8276088214191305e-05, "loss": 0.7933804035186768, "memory(GiB)": 91.52, "step": 73170, "token_acc": 0.7713938348105733, "train_speed(iter/s)": 0.13788 }, { "epoch": 0.9494954082311228, "grad_norm": 0.7571925520896912, "learning_rate": 5.827079840110693e-05, "loss": 0.8132871627807617, "memory(GiB)": 91.52, "step": 73175, "token_acc": 0.7688275862068965, "train_speed(iter/s)": 0.137879 }, { "epoch": 0.9495602866327785, "grad_norm": 0.7087757587432861, "learning_rate": 5.826550849284275e-05, "loss": 0.8327371597290039, "memory(GiB)": 91.52, "step": 73180, "token_acc": 0.7735219154970026, "train_speed(iter/s)": 0.137878 }, { "epoch": 0.9496251650344342, "grad_norm": 0.7027418613433838, "learning_rate": 5.8260218489459626e-05, "loss": 0.792939567565918, "memory(GiB)": 91.52, "step": 73185, "token_acc": 0.775813869600505, "train_speed(iter/s)": 0.137876 }, { "epoch": 0.9496900434360899, "grad_norm": 0.833844006061554, "learning_rate": 5.825492839101845e-05, "loss": 0.8528675079345703, "memory(GiB)": 91.52, "step": 73190, "token_acc": 0.7583000798084597, "train_speed(iter/s)": 0.137875 }, { "epoch": 0.9497549218377456, "grad_norm": 0.707425057888031, "learning_rate": 5.824963819758006e-05, "loss": 0.8277080535888672, "memory(GiB)": 91.52, "step": 73195, "token_acc": 0.7549757063163578, "train_speed(iter/s)": 0.137873 }, { "epoch": 0.9498198002394013, "grad_norm": 0.7631058692932129, "learning_rate": 5.82443479092054e-05, "loss": 0.8416669845581055, "memory(GiB)": 91.52, "step": 73200, "token_acc": 0.7595802139951457, "train_speed(iter/s)": 0.137872 }, { "epoch": 0.949884678641057, "grad_norm": 0.7190422415733337, "learning_rate": 5.823905752595529e-05, "loss": 0.8538885116577148, "memory(GiB)": 91.52, "step": 73205, "token_acc": 0.7507597722429874, "train_speed(iter/s)": 0.13787 }, { "epoch": 0.9499495570427127, "grad_norm": 0.6804998517036438, "learning_rate": 5.823376704789064e-05, "loss": 0.8309386253356934, "memory(GiB)": 91.52, "step": 73210, "token_acc": 0.766963049340427, "train_speed(iter/s)": 0.137869 }, { "epoch": 0.9500144354443684, "grad_norm": 0.6279652118682861, "learning_rate": 5.8228476475072334e-05, "loss": 0.8469268798828125, "memory(GiB)": 91.52, "step": 73215, "token_acc": 0.7684773236764959, "train_speed(iter/s)": 0.137867 }, { "epoch": 0.9500793138460241, "grad_norm": 0.7154844403266907, "learning_rate": 5.822318580756125e-05, "loss": 0.820529842376709, "memory(GiB)": 91.52, "step": 73220, "token_acc": 0.7534073243917428, "train_speed(iter/s)": 0.137866 }, { "epoch": 0.9501441922476798, "grad_norm": 0.6709368824958801, "learning_rate": 5.821789504541827e-05, "loss": 0.8382222175598144, "memory(GiB)": 91.52, "step": 73225, "token_acc": 0.7772820226807957, "train_speed(iter/s)": 0.137864 }, { "epoch": 0.9502090706493355, "grad_norm": 0.7511888146400452, "learning_rate": 5.821260418870428e-05, "loss": 0.8553171157836914, "memory(GiB)": 91.52, "step": 73230, "token_acc": 0.7563507920650778, "train_speed(iter/s)": 0.137863 }, { "epoch": 0.9502739490509912, "grad_norm": 0.6628409028053284, "learning_rate": 5.8207313237480184e-05, "loss": 0.82568359375, "memory(GiB)": 91.52, "step": 73235, "token_acc": 0.7586967347229139, "train_speed(iter/s)": 0.137862 }, { "epoch": 0.9503388274526469, "grad_norm": 0.6818907856941223, "learning_rate": 5.820202219180684e-05, "loss": 0.7685755729675293, "memory(GiB)": 91.52, "step": 73240, "token_acc": 0.7911418800567187, "train_speed(iter/s)": 0.13786 }, { "epoch": 0.9504037058543026, "grad_norm": 0.6835947036743164, "learning_rate": 5.819673105174516e-05, "loss": 0.8267712593078613, "memory(GiB)": 91.52, "step": 73245, "token_acc": 0.7733961608280869, "train_speed(iter/s)": 0.137859 }, { "epoch": 0.9504685842559583, "grad_norm": 0.7252064943313599, "learning_rate": 5.8191439817356016e-05, "loss": 0.8507648468017578, "memory(GiB)": 91.52, "step": 73250, "token_acc": 0.7704866008462623, "train_speed(iter/s)": 0.137858 }, { "epoch": 0.950533462657614, "grad_norm": 0.6985076665878296, "learning_rate": 5.818614848870032e-05, "loss": 0.8852998733520507, "memory(GiB)": 91.52, "step": 73255, "token_acc": 0.7448858892290565, "train_speed(iter/s)": 0.137857 }, { "epoch": 0.9505983410592697, "grad_norm": 0.7396331429481506, "learning_rate": 5.8180857065838956e-05, "loss": 0.8739980697631836, "memory(GiB)": 91.52, "step": 73260, "token_acc": 0.7568891381566648, "train_speed(iter/s)": 0.137856 }, { "epoch": 0.9506632194609254, "grad_norm": 0.7498807907104492, "learning_rate": 5.817556554883281e-05, "loss": 0.8458061218261719, "memory(GiB)": 91.52, "step": 73265, "token_acc": 0.7752043596730245, "train_speed(iter/s)": 0.137855 }, { "epoch": 0.9507280978625811, "grad_norm": 0.7690597772598267, "learning_rate": 5.817027393774279e-05, "loss": 0.8118733406066895, "memory(GiB)": 91.52, "step": 73270, "token_acc": 0.7743704151856613, "train_speed(iter/s)": 0.137853 }, { "epoch": 0.9507929762642368, "grad_norm": 0.7499186396598816, "learning_rate": 5.816498223262977e-05, "loss": 0.827587890625, "memory(GiB)": 91.52, "step": 73275, "token_acc": 0.7720771247469551, "train_speed(iter/s)": 0.137852 }, { "epoch": 0.9508578546658925, "grad_norm": 0.6600866913795471, "learning_rate": 5.815969043355467e-05, "loss": 0.852021598815918, "memory(GiB)": 91.52, "step": 73280, "token_acc": 0.7602038945322528, "train_speed(iter/s)": 0.137851 }, { "epoch": 0.9509227330675482, "grad_norm": 0.8378646373748779, "learning_rate": 5.815439854057838e-05, "loss": 0.87098388671875, "memory(GiB)": 91.52, "step": 73285, "token_acc": 0.7616482803374433, "train_speed(iter/s)": 0.13785 }, { "epoch": 0.9509876114692039, "grad_norm": 0.7588594555854797, "learning_rate": 5.814910655376179e-05, "loss": 0.8806919097900391, "memory(GiB)": 91.52, "step": 73290, "token_acc": 0.7640734123761572, "train_speed(iter/s)": 0.137849 }, { "epoch": 0.9510524898708596, "grad_norm": 0.8100644946098328, "learning_rate": 5.81438144731658e-05, "loss": 0.8422821044921875, "memory(GiB)": 91.52, "step": 73295, "token_acc": 0.7609129312367989, "train_speed(iter/s)": 0.137848 }, { "epoch": 0.9511173682725153, "grad_norm": 0.7074512839317322, "learning_rate": 5.8138522298851316e-05, "loss": 0.8092176437377929, "memory(GiB)": 91.52, "step": 73300, "token_acc": 0.7709663218137794, "train_speed(iter/s)": 0.137846 }, { "epoch": 0.951182246674171, "grad_norm": 0.7847768068313599, "learning_rate": 5.8133230030879274e-05, "loss": 0.8327008247375488, "memory(GiB)": 91.52, "step": 73305, "token_acc": 0.7580817473192736, "train_speed(iter/s)": 0.137845 }, { "epoch": 0.9512471250758266, "grad_norm": 0.6734256148338318, "learning_rate": 5.8127937669310504e-05, "loss": 0.8199447631835938, "memory(GiB)": 91.52, "step": 73310, "token_acc": 0.7625567658005258, "train_speed(iter/s)": 0.137844 }, { "epoch": 0.9513120034774823, "grad_norm": 0.6814106106758118, "learning_rate": 5.812264521420596e-05, "loss": 0.8667509078979492, "memory(GiB)": 91.52, "step": 73315, "token_acc": 0.7573529411764706, "train_speed(iter/s)": 0.137842 }, { "epoch": 0.951376881879138, "grad_norm": 0.704149067401886, "learning_rate": 5.811735266562656e-05, "loss": 0.8951284408569335, "memory(GiB)": 91.52, "step": 73320, "token_acc": 0.7464290867118741, "train_speed(iter/s)": 0.137841 }, { "epoch": 0.9514417602807937, "grad_norm": 0.6359079480171204, "learning_rate": 5.811206002363316e-05, "loss": 0.8096217155456543, "memory(GiB)": 91.52, "step": 73325, "token_acc": 0.7617447431461273, "train_speed(iter/s)": 0.137839 }, { "epoch": 0.9515066386824494, "grad_norm": 0.7427884340286255, "learning_rate": 5.8106767288286715e-05, "loss": 0.8398382186889648, "memory(GiB)": 91.52, "step": 73330, "token_acc": 0.7875074915096224, "train_speed(iter/s)": 0.137838 }, { "epoch": 0.9515715170841051, "grad_norm": 0.7113170623779297, "learning_rate": 5.8101474459648095e-05, "loss": 0.884544849395752, "memory(GiB)": 91.52, "step": 73335, "token_acc": 0.7466486505707073, "train_speed(iter/s)": 0.137837 }, { "epoch": 0.9516363954857608, "grad_norm": 0.679914653301239, "learning_rate": 5.8096181537778246e-05, "loss": 0.825497055053711, "memory(GiB)": 91.52, "step": 73340, "token_acc": 0.7943412162162162, "train_speed(iter/s)": 0.137835 }, { "epoch": 0.9517012738874165, "grad_norm": 0.7151426076889038, "learning_rate": 5.809088852273805e-05, "loss": 0.8486597061157226, "memory(GiB)": 91.52, "step": 73345, "token_acc": 0.7377536207420411, "train_speed(iter/s)": 0.137834 }, { "epoch": 0.9517661522890722, "grad_norm": 0.7340355515480042, "learning_rate": 5.808559541458843e-05, "loss": 0.8700370788574219, "memory(GiB)": 91.52, "step": 73350, "token_acc": 0.754504132231405, "train_speed(iter/s)": 0.137833 }, { "epoch": 0.9518310306907279, "grad_norm": 0.8334697484970093, "learning_rate": 5.808030221339031e-05, "loss": 0.8596277236938477, "memory(GiB)": 91.52, "step": 73355, "token_acc": 0.7609088173337346, "train_speed(iter/s)": 0.137832 }, { "epoch": 0.9518959090923836, "grad_norm": 0.7323723435401917, "learning_rate": 5.807500891920459e-05, "loss": 0.8298049926757812, "memory(GiB)": 91.52, "step": 73360, "token_acc": 0.7608403941264426, "train_speed(iter/s)": 0.137831 }, { "epoch": 0.9519607874940393, "grad_norm": 0.7610617876052856, "learning_rate": 5.8069715532092184e-05, "loss": 0.8730973243713379, "memory(GiB)": 91.52, "step": 73365, "token_acc": 0.7737647058823529, "train_speed(iter/s)": 0.13783 }, { "epoch": 0.952025665895695, "grad_norm": 0.7172161340713501, "learning_rate": 5.8064422052114e-05, "loss": 0.805727767944336, "memory(GiB)": 91.52, "step": 73370, "token_acc": 0.7682415106068847, "train_speed(iter/s)": 0.137829 }, { "epoch": 0.9520905442973507, "grad_norm": 0.8696466684341431, "learning_rate": 5.805912847933097e-05, "loss": 0.8584739685058593, "memory(GiB)": 91.52, "step": 73375, "token_acc": 0.7702020202020202, "train_speed(iter/s)": 0.137828 }, { "epoch": 0.9521554226990064, "grad_norm": 0.7388513088226318, "learning_rate": 5.805383481380402e-05, "loss": 0.8200247764587403, "memory(GiB)": 91.52, "step": 73380, "token_acc": 0.7727562275790392, "train_speed(iter/s)": 0.137827 }, { "epoch": 0.9522203011006621, "grad_norm": 0.8279734253883362, "learning_rate": 5.804854105559405e-05, "loss": 0.8677579879760742, "memory(GiB)": 91.52, "step": 73385, "token_acc": 0.7490820493462819, "train_speed(iter/s)": 0.137825 }, { "epoch": 0.9522851795023178, "grad_norm": 0.7151660919189453, "learning_rate": 5.804324720476198e-05, "loss": 0.9203866958618164, "memory(GiB)": 91.52, "step": 73390, "token_acc": 0.7570769977353607, "train_speed(iter/s)": 0.137824 }, { "epoch": 0.9523500579039734, "grad_norm": 0.7834680676460266, "learning_rate": 5.803795326136876e-05, "loss": 0.8176114082336425, "memory(GiB)": 91.52, "step": 73395, "token_acc": 0.7745776165098084, "train_speed(iter/s)": 0.137824 }, { "epoch": 0.9524149363056291, "grad_norm": 0.6945429444313049, "learning_rate": 5.803265922547527e-05, "loss": 0.829608154296875, "memory(GiB)": 91.52, "step": 73400, "token_acc": 0.7569626682468467, "train_speed(iter/s)": 0.137823 }, { "epoch": 0.9524798147072848, "grad_norm": 0.737699031829834, "learning_rate": 5.802736509714246e-05, "loss": 0.9053963661193848, "memory(GiB)": 91.52, "step": 73405, "token_acc": 0.7469375560203168, "train_speed(iter/s)": 0.137821 }, { "epoch": 0.9525446931089405, "grad_norm": 0.6894633769989014, "learning_rate": 5.8022070876431244e-05, "loss": 0.8318004608154297, "memory(GiB)": 91.52, "step": 73410, "token_acc": 0.7767445064407247, "train_speed(iter/s)": 0.137819 }, { "epoch": 0.9526095715105962, "grad_norm": 0.7306087017059326, "learning_rate": 5.8016776563402567e-05, "loss": 0.830047607421875, "memory(GiB)": 91.52, "step": 73415, "token_acc": 0.756539360872954, "train_speed(iter/s)": 0.137818 }, { "epoch": 0.9526744499122519, "grad_norm": 0.7797287106513977, "learning_rate": 5.801148215811734e-05, "loss": 0.8696192741394043, "memory(GiB)": 91.52, "step": 73420, "token_acc": 0.7618466770163377, "train_speed(iter/s)": 0.137817 }, { "epoch": 0.9527393283139076, "grad_norm": 0.7791616916656494, "learning_rate": 5.800618766063647e-05, "loss": 0.8496103286743164, "memory(GiB)": 91.52, "step": 73425, "token_acc": 0.7641115593755482, "train_speed(iter/s)": 0.137816 }, { "epoch": 0.9528042067155633, "grad_norm": 0.6421327590942383, "learning_rate": 5.800089307102094e-05, "loss": 0.8544141769409179, "memory(GiB)": 91.52, "step": 73430, "token_acc": 0.7745991019884542, "train_speed(iter/s)": 0.137815 }, { "epoch": 0.952869085117219, "grad_norm": 0.6763702630996704, "learning_rate": 5.799559838933163e-05, "loss": 0.8640617370605469, "memory(GiB)": 91.52, "step": 73435, "token_acc": 0.7661440727357501, "train_speed(iter/s)": 0.137813 }, { "epoch": 0.9529339635188747, "grad_norm": 0.7289764285087585, "learning_rate": 5.799030361562948e-05, "loss": 0.8822286605834961, "memory(GiB)": 91.52, "step": 73440, "token_acc": 0.7705376344086021, "train_speed(iter/s)": 0.137812 }, { "epoch": 0.9529988419205304, "grad_norm": 0.7601962685585022, "learning_rate": 5.798500874997544e-05, "loss": 0.8519794464111328, "memory(GiB)": 91.52, "step": 73445, "token_acc": 0.7475449560005102, "train_speed(iter/s)": 0.137811 }, { "epoch": 0.9530637203221861, "grad_norm": 0.7541000247001648, "learning_rate": 5.797971379243044e-05, "loss": 0.8561118125915528, "memory(GiB)": 91.52, "step": 73450, "token_acc": 0.7843689752335755, "train_speed(iter/s)": 0.137809 }, { "epoch": 0.9531285987238418, "grad_norm": 0.8339028358459473, "learning_rate": 5.7974418743055405e-05, "loss": 0.8718000411987304, "memory(GiB)": 91.52, "step": 73455, "token_acc": 0.7646140028896091, "train_speed(iter/s)": 0.137808 }, { "epoch": 0.9531934771254975, "grad_norm": 0.6861048340797424, "learning_rate": 5.7969123601911245e-05, "loss": 0.8455877304077148, "memory(GiB)": 91.52, "step": 73460, "token_acc": 0.7672192843006845, "train_speed(iter/s)": 0.137806 }, { "epoch": 0.9532583555271532, "grad_norm": 0.6882511973381042, "learning_rate": 5.7963828369058956e-05, "loss": 0.831972312927246, "memory(GiB)": 91.52, "step": 73465, "token_acc": 0.7748847639217641, "train_speed(iter/s)": 0.137805 }, { "epoch": 0.9533232339288089, "grad_norm": 0.7061059474945068, "learning_rate": 5.7958533044559436e-05, "loss": 0.8578017234802247, "memory(GiB)": 91.52, "step": 73470, "token_acc": 0.7559369937614173, "train_speed(iter/s)": 0.137804 }, { "epoch": 0.9533881123304646, "grad_norm": 0.7123342156410217, "learning_rate": 5.795323762847362e-05, "loss": 0.85257568359375, "memory(GiB)": 91.52, "step": 73475, "token_acc": 0.7547071311732154, "train_speed(iter/s)": 0.137803 }, { "epoch": 0.9534529907321203, "grad_norm": 0.7346358895301819, "learning_rate": 5.794794212086248e-05, "loss": 0.8356710433959961, "memory(GiB)": 91.52, "step": 73480, "token_acc": 0.7734107538430557, "train_speed(iter/s)": 0.137801 }, { "epoch": 0.953517869133776, "grad_norm": 0.7922106981277466, "learning_rate": 5.7942646521786914e-05, "loss": 0.8129449844360351, "memory(GiB)": 91.52, "step": 73485, "token_acc": 0.7678693811731506, "train_speed(iter/s)": 0.1378 }, { "epoch": 0.9535827475354317, "grad_norm": 0.7215369343757629, "learning_rate": 5.793735083130788e-05, "loss": 0.8343949317932129, "memory(GiB)": 91.52, "step": 73490, "token_acc": 0.7734317094535409, "train_speed(iter/s)": 0.137799 }, { "epoch": 0.9536476259370874, "grad_norm": 0.7183101773262024, "learning_rate": 5.7932055049486325e-05, "loss": 0.8802660942077637, "memory(GiB)": 91.52, "step": 73495, "token_acc": 0.7676209809833716, "train_speed(iter/s)": 0.137797 }, { "epoch": 0.9537125043387431, "grad_norm": 0.772046685218811, "learning_rate": 5.79267591763832e-05, "loss": 0.8752782821655274, "memory(GiB)": 91.52, "step": 73500, "token_acc": 0.7586853203025637, "train_speed(iter/s)": 0.137796 }, { "epoch": 0.9537773827403988, "grad_norm": 0.7006763219833374, "learning_rate": 5.792146321205943e-05, "loss": 0.8021515846252442, "memory(GiB)": 91.52, "step": 73505, "token_acc": 0.7791047497492934, "train_speed(iter/s)": 0.137795 }, { "epoch": 0.9538422611420545, "grad_norm": 0.6535695791244507, "learning_rate": 5.791616715657599e-05, "loss": 0.8503403663635254, "memory(GiB)": 91.52, "step": 73510, "token_acc": 0.7611744412779361, "train_speed(iter/s)": 0.137794 }, { "epoch": 0.9539071395437102, "grad_norm": 0.7845249176025391, "learning_rate": 5.791087100999379e-05, "loss": 0.8701522827148438, "memory(GiB)": 91.52, "step": 73515, "token_acc": 0.7576888771431403, "train_speed(iter/s)": 0.137793 }, { "epoch": 0.9539720179453659, "grad_norm": 0.7878091335296631, "learning_rate": 5.79055747723738e-05, "loss": 0.9321477890014649, "memory(GiB)": 91.52, "step": 73520, "token_acc": 0.7507459937373365, "train_speed(iter/s)": 0.137792 }, { "epoch": 0.9540368963470216, "grad_norm": 0.7074980735778809, "learning_rate": 5.7900278443776965e-05, "loss": 0.8268765449523926, "memory(GiB)": 91.52, "step": 73525, "token_acc": 0.7547009494255421, "train_speed(iter/s)": 0.13779 }, { "epoch": 0.9541017747486773, "grad_norm": 0.6396201252937317, "learning_rate": 5.789498202426423e-05, "loss": 0.8650020599365235, "memory(GiB)": 91.52, "step": 73530, "token_acc": 0.7564621300785822, "train_speed(iter/s)": 0.137789 }, { "epoch": 0.954166653150333, "grad_norm": 0.7458380460739136, "learning_rate": 5.788968551389655e-05, "loss": 0.8626018524169922, "memory(GiB)": 91.52, "step": 73535, "token_acc": 0.7328275862068966, "train_speed(iter/s)": 0.137788 }, { "epoch": 0.9542315315519887, "grad_norm": 0.6731317639350891, "learning_rate": 5.7884388912734885e-05, "loss": 0.8067849159240723, "memory(GiB)": 91.52, "step": 73540, "token_acc": 0.7493800278231416, "train_speed(iter/s)": 0.137786 }, { "epoch": 0.9542964099536444, "grad_norm": 0.7119393944740295, "learning_rate": 5.787909222084018e-05, "loss": 0.9033678054809571, "memory(GiB)": 91.52, "step": 73545, "token_acc": 0.7388856978747846, "train_speed(iter/s)": 0.137785 }, { "epoch": 0.9543612883553001, "grad_norm": 0.7486111521720886, "learning_rate": 5.7873795438273395e-05, "loss": 0.855949878692627, "memory(GiB)": 91.52, "step": 73550, "token_acc": 0.7549096065225097, "train_speed(iter/s)": 0.137784 }, { "epoch": 0.9544261667569558, "grad_norm": 0.6991732120513916, "learning_rate": 5.7868498565095454e-05, "loss": 0.818477725982666, "memory(GiB)": 91.52, "step": 73555, "token_acc": 0.7766183362624487, "train_speed(iter/s)": 0.137783 }, { "epoch": 0.9544910451586115, "grad_norm": 0.7521180510520935, "learning_rate": 5.7863201601367366e-05, "loss": 0.8408734321594238, "memory(GiB)": 91.52, "step": 73560, "token_acc": 0.76650390625, "train_speed(iter/s)": 0.137781 }, { "epoch": 0.9545559235602672, "grad_norm": 0.7735978364944458, "learning_rate": 5.785790454715003e-05, "loss": 0.8589760780334472, "memory(GiB)": 91.52, "step": 73565, "token_acc": 0.7530530958886072, "train_speed(iter/s)": 0.13778 }, { "epoch": 0.9546208019619229, "grad_norm": 0.7009447813034058, "learning_rate": 5.785260740250447e-05, "loss": 0.8302520751953125, "memory(GiB)": 91.52, "step": 73570, "token_acc": 0.7874404415041305, "train_speed(iter/s)": 0.137778 }, { "epoch": 0.9546856803635786, "grad_norm": 0.6435365676879883, "learning_rate": 5.784731016749159e-05, "loss": 0.794615364074707, "memory(GiB)": 91.52, "step": 73575, "token_acc": 0.7778337169159953, "train_speed(iter/s)": 0.137776 }, { "epoch": 0.9547505587652343, "grad_norm": 0.7532041072845459, "learning_rate": 5.7842012842172366e-05, "loss": 0.8136236190795898, "memory(GiB)": 91.52, "step": 73580, "token_acc": 0.7626331003695708, "train_speed(iter/s)": 0.137775 }, { "epoch": 0.95481543716689, "grad_norm": 0.7846174240112305, "learning_rate": 5.783671542660777e-05, "loss": 0.8589750289916992, "memory(GiB)": 91.52, "step": 73585, "token_acc": 0.7561629930394431, "train_speed(iter/s)": 0.137773 }, { "epoch": 0.9548803155685457, "grad_norm": 0.7727460861206055, "learning_rate": 5.783141792085874e-05, "loss": 0.8879575729370117, "memory(GiB)": 91.52, "step": 73590, "token_acc": 0.7604372188461842, "train_speed(iter/s)": 0.137772 }, { "epoch": 0.9549451939702014, "grad_norm": 0.7083730697631836, "learning_rate": 5.782612032498629e-05, "loss": 0.8352319717407226, "memory(GiB)": 91.52, "step": 73595, "token_acc": 0.7844654340836013, "train_speed(iter/s)": 0.137771 }, { "epoch": 0.9550100723718571, "grad_norm": 0.657984733581543, "learning_rate": 5.782082263905132e-05, "loss": 0.8500648498535156, "memory(GiB)": 91.52, "step": 73600, "token_acc": 0.774854758203093, "train_speed(iter/s)": 0.13777 }, { "epoch": 0.9550749507735128, "grad_norm": 0.738715648651123, "learning_rate": 5.781552486311483e-05, "loss": 0.84071044921875, "memory(GiB)": 91.52, "step": 73605, "token_acc": 0.765770545926188, "train_speed(iter/s)": 0.137768 }, { "epoch": 0.9551398291751685, "grad_norm": 0.6700513362884521, "learning_rate": 5.78102269972378e-05, "loss": 0.8542108535766602, "memory(GiB)": 91.52, "step": 73610, "token_acc": 0.7600077429345722, "train_speed(iter/s)": 0.137767 }, { "epoch": 0.9552047075768242, "grad_norm": 0.7587414383888245, "learning_rate": 5.780492904148117e-05, "loss": 0.816622257232666, "memory(GiB)": 91.52, "step": 73615, "token_acc": 0.7628363719407757, "train_speed(iter/s)": 0.137766 }, { "epoch": 0.9552695859784799, "grad_norm": 0.75958651304245, "learning_rate": 5.779963099590593e-05, "loss": 0.8360688209533691, "memory(GiB)": 91.52, "step": 73620, "token_acc": 0.7712425853170017, "train_speed(iter/s)": 0.137765 }, { "epoch": 0.9553344643801356, "grad_norm": 0.7043180465698242, "learning_rate": 5.779433286057302e-05, "loss": 0.8895124435424805, "memory(GiB)": 91.52, "step": 73625, "token_acc": 0.743264605966836, "train_speed(iter/s)": 0.137764 }, { "epoch": 0.9553993427817913, "grad_norm": 0.6994507312774658, "learning_rate": 5.778903463554345e-05, "loss": 0.8296473503112793, "memory(GiB)": 91.52, "step": 73630, "token_acc": 0.7634082007833107, "train_speed(iter/s)": 0.137762 }, { "epoch": 0.9554642211834469, "grad_norm": 0.7532933950424194, "learning_rate": 5.778373632087814e-05, "loss": 0.8481263160705567, "memory(GiB)": 91.52, "step": 73635, "token_acc": 0.7557033454252318, "train_speed(iter/s)": 0.137761 }, { "epoch": 0.9555290995851026, "grad_norm": 0.7010743021965027, "learning_rate": 5.777843791663812e-05, "loss": 0.8341658592224122, "memory(GiB)": 91.52, "step": 73640, "token_acc": 0.7667543173903774, "train_speed(iter/s)": 0.13776 }, { "epoch": 0.9555939779867583, "grad_norm": 0.7951314449310303, "learning_rate": 5.777313942288433e-05, "loss": 0.8353057861328125, "memory(GiB)": 91.52, "step": 73645, "token_acc": 0.7464545140089934, "train_speed(iter/s)": 0.137758 }, { "epoch": 0.955658856388414, "grad_norm": 0.7827897667884827, "learning_rate": 5.7767840839677756e-05, "loss": 0.8338241577148438, "memory(GiB)": 91.52, "step": 73650, "token_acc": 0.762465248943929, "train_speed(iter/s)": 0.137757 }, { "epoch": 0.9557237347900697, "grad_norm": 0.6888256669044495, "learning_rate": 5.776254216707936e-05, "loss": 0.8649642944335938, "memory(GiB)": 91.52, "step": 73655, "token_acc": 0.7823732754137478, "train_speed(iter/s)": 0.137756 }, { "epoch": 0.9557886131917254, "grad_norm": 0.6712013483047485, "learning_rate": 5.775724340515013e-05, "loss": 0.801418399810791, "memory(GiB)": 91.52, "step": 73660, "token_acc": 0.7769594328475777, "train_speed(iter/s)": 0.137755 }, { "epoch": 0.9558534915933811, "grad_norm": 0.7177898287773132, "learning_rate": 5.7751944553951056e-05, "loss": 0.8616643905639648, "memory(GiB)": 91.52, "step": 73665, "token_acc": 0.7751880535072442, "train_speed(iter/s)": 0.137753 }, { "epoch": 0.9559183699950368, "grad_norm": 0.7827591896057129, "learning_rate": 5.774664561354308e-05, "loss": 0.8820051193237305, "memory(GiB)": 91.52, "step": 73670, "token_acc": 0.7526368083154998, "train_speed(iter/s)": 0.137753 }, { "epoch": 0.9559832483966925, "grad_norm": 0.7319234013557434, "learning_rate": 5.774134658398722e-05, "loss": 0.8437487602233886, "memory(GiB)": 91.52, "step": 73675, "token_acc": 0.7802129719264279, "train_speed(iter/s)": 0.137751 }, { "epoch": 0.9560481267983482, "grad_norm": 0.7792706489562988, "learning_rate": 5.773604746534444e-05, "loss": 0.8460330963134766, "memory(GiB)": 91.52, "step": 73680, "token_acc": 0.7750625864066061, "train_speed(iter/s)": 0.13775 }, { "epoch": 0.9561130052000039, "grad_norm": 0.6193641424179077, "learning_rate": 5.773074825767573e-05, "loss": 0.7839105606079102, "memory(GiB)": 91.52, "step": 73685, "token_acc": 0.7819692693634368, "train_speed(iter/s)": 0.137749 }, { "epoch": 0.9561778836016596, "grad_norm": 0.6395155191421509, "learning_rate": 5.772544896104205e-05, "loss": 0.8192200660705566, "memory(GiB)": 91.52, "step": 73690, "token_acc": 0.7618799318663747, "train_speed(iter/s)": 0.137748 }, { "epoch": 0.9562427620033153, "grad_norm": 0.7314130067825317, "learning_rate": 5.7720149575504404e-05, "loss": 0.8287089347839356, "memory(GiB)": 91.52, "step": 73695, "token_acc": 0.7609816457001444, "train_speed(iter/s)": 0.137746 }, { "epoch": 0.956307640404971, "grad_norm": 0.7370043992996216, "learning_rate": 5.771485010112378e-05, "loss": 0.8562421798706055, "memory(GiB)": 91.52, "step": 73700, "token_acc": 0.7529814373120398, "train_speed(iter/s)": 0.137745 }, { "epoch": 0.9563725188066267, "grad_norm": 0.8086774945259094, "learning_rate": 5.770955053796115e-05, "loss": 0.8100248336791992, "memory(GiB)": 91.52, "step": 73705, "token_acc": 0.7639777468706537, "train_speed(iter/s)": 0.137744 }, { "epoch": 0.9564373972082824, "grad_norm": 0.7460051774978638, "learning_rate": 5.7704250886077526e-05, "loss": 0.8444608688354492, "memory(GiB)": 91.52, "step": 73710, "token_acc": 0.7517619420516837, "train_speed(iter/s)": 0.137743 }, { "epoch": 0.9565022756099381, "grad_norm": 0.7683544754981995, "learning_rate": 5.7698951145533864e-05, "loss": 0.8102716445922852, "memory(GiB)": 91.52, "step": 73715, "token_acc": 0.7682963978371792, "train_speed(iter/s)": 0.137742 }, { "epoch": 0.9565671540115938, "grad_norm": 0.7499260902404785, "learning_rate": 5.769365131639117e-05, "loss": 0.834664249420166, "memory(GiB)": 91.52, "step": 73720, "token_acc": 0.7682065842989794, "train_speed(iter/s)": 0.137741 }, { "epoch": 0.9566320324132495, "grad_norm": 0.7034429311752319, "learning_rate": 5.768835139871044e-05, "loss": 0.8995408058166504, "memory(GiB)": 91.52, "step": 73725, "token_acc": 0.7480153298658637, "train_speed(iter/s)": 0.13774 }, { "epoch": 0.9566969108149052, "grad_norm": 0.7240361571311951, "learning_rate": 5.7683051392552645e-05, "loss": 0.852564525604248, "memory(GiB)": 91.52, "step": 73730, "token_acc": 0.7750119511029161, "train_speed(iter/s)": 0.137738 }, { "epoch": 0.9567617892165609, "grad_norm": 0.7859472632408142, "learning_rate": 5.767775129797879e-05, "loss": 0.8664539337158204, "memory(GiB)": 91.52, "step": 73735, "token_acc": 0.7549149034859243, "train_speed(iter/s)": 0.137737 }, { "epoch": 0.9568266676182166, "grad_norm": 0.7176886796951294, "learning_rate": 5.7672451115049876e-05, "loss": 0.8500844955444335, "memory(GiB)": 91.52, "step": 73740, "token_acc": 0.754487911175677, "train_speed(iter/s)": 0.137736 }, { "epoch": 0.9568915460198723, "grad_norm": 0.7358684539794922, "learning_rate": 5.766715084382689e-05, "loss": 0.8257819175720215, "memory(GiB)": 91.52, "step": 73745, "token_acc": 0.7936727570049372, "train_speed(iter/s)": 0.137735 }, { "epoch": 0.956956424421528, "grad_norm": 0.743215799331665, "learning_rate": 5.766185048437082e-05, "loss": 0.8723649978637695, "memory(GiB)": 91.52, "step": 73750, "token_acc": 0.7690367669401398, "train_speed(iter/s)": 0.137734 }, { "epoch": 0.9570213028231837, "grad_norm": 0.7238826155662537, "learning_rate": 5.7656550036742666e-05, "loss": 0.8570962905883789, "memory(GiB)": 91.52, "step": 73755, "token_acc": 0.784641008001738, "train_speed(iter/s)": 0.137733 }, { "epoch": 0.9570861812248393, "grad_norm": 0.6916747689247131, "learning_rate": 5.765124950100342e-05, "loss": 0.794449758529663, "memory(GiB)": 91.52, "step": 73760, "token_acc": 0.7796983895735716, "train_speed(iter/s)": 0.137732 }, { "epoch": 0.957151059626495, "grad_norm": 0.6999020576477051, "learning_rate": 5.7645948877214094e-05, "loss": 0.8362617492675781, "memory(GiB)": 91.52, "step": 73765, "token_acc": 0.7509740528889994, "train_speed(iter/s)": 0.13773 }, { "epoch": 0.9572159380281507, "grad_norm": 0.7025668025016785, "learning_rate": 5.764064816543568e-05, "loss": 0.8125121116638183, "memory(GiB)": 91.52, "step": 73770, "token_acc": 0.7754818763998864, "train_speed(iter/s)": 0.137729 }, { "epoch": 0.9572808164298064, "grad_norm": 0.7059847712516785, "learning_rate": 5.7635347365729184e-05, "loss": 0.8361503601074218, "memory(GiB)": 91.52, "step": 73775, "token_acc": 0.7651854326762446, "train_speed(iter/s)": 0.137728 }, { "epoch": 0.9573456948314621, "grad_norm": 0.7569947838783264, "learning_rate": 5.7630046478155596e-05, "loss": 0.8685829162597656, "memory(GiB)": 91.52, "step": 73780, "token_acc": 0.7553160815973473, "train_speed(iter/s)": 0.137727 }, { "epoch": 0.9574105732331178, "grad_norm": 0.7368376851081848, "learning_rate": 5.7624745502775936e-05, "loss": 0.8691749572753906, "memory(GiB)": 91.52, "step": 73785, "token_acc": 0.7668103308597167, "train_speed(iter/s)": 0.137725 }, { "epoch": 0.9574754516347735, "grad_norm": 0.8809506297111511, "learning_rate": 5.7619444439651184e-05, "loss": 0.8348456382751465, "memory(GiB)": 91.52, "step": 73790, "token_acc": 0.7592836543596682, "train_speed(iter/s)": 0.137724 }, { "epoch": 0.9575403300364292, "grad_norm": 0.7384183406829834, "learning_rate": 5.761414328884236e-05, "loss": 0.833613395690918, "memory(GiB)": 91.52, "step": 73795, "token_acc": 0.7576978417266187, "train_speed(iter/s)": 0.137723 }, { "epoch": 0.9576052084380849, "grad_norm": 0.8076102137565613, "learning_rate": 5.760884205041045e-05, "loss": 0.8099202156066895, "memory(GiB)": 91.52, "step": 73800, "token_acc": 0.7787055592666815, "train_speed(iter/s)": 0.137722 }, { "epoch": 0.9576700868397406, "grad_norm": 0.8027071952819824, "learning_rate": 5.760354072441649e-05, "loss": 0.8789849281311035, "memory(GiB)": 91.52, "step": 73805, "token_acc": 0.7475202582293803, "train_speed(iter/s)": 0.137721 }, { "epoch": 0.9577349652413963, "grad_norm": 0.7209692001342773, "learning_rate": 5.7598239310921475e-05, "loss": 0.881980037689209, "memory(GiB)": 91.52, "step": 73810, "token_acc": 0.7798926477858605, "train_speed(iter/s)": 0.13772 }, { "epoch": 0.957799843643052, "grad_norm": 0.7623945474624634, "learning_rate": 5.75929378099864e-05, "loss": 0.9128270149230957, "memory(GiB)": 91.52, "step": 73815, "token_acc": 0.7409207791750326, "train_speed(iter/s)": 0.137718 }, { "epoch": 0.9578647220447077, "grad_norm": 0.7672079205513, "learning_rate": 5.758763622167229e-05, "loss": 0.8562568664550781, "memory(GiB)": 91.52, "step": 73820, "token_acc": 0.754858326356145, "train_speed(iter/s)": 0.137717 }, { "epoch": 0.9579296004463634, "grad_norm": 0.828536868095398, "learning_rate": 5.758233454604015e-05, "loss": 0.860356330871582, "memory(GiB)": 91.52, "step": 73825, "token_acc": 0.7673312417771481, "train_speed(iter/s)": 0.137716 }, { "epoch": 0.9579944788480191, "grad_norm": 0.6582562923431396, "learning_rate": 5.7577032783151005e-05, "loss": 0.835703182220459, "memory(GiB)": 91.52, "step": 73830, "token_acc": 0.7531896796144032, "train_speed(iter/s)": 0.137715 }, { "epoch": 0.9580593572496748, "grad_norm": 0.7412394881248474, "learning_rate": 5.7571730933065845e-05, "loss": 0.8722868919372558, "memory(GiB)": 91.52, "step": 73835, "token_acc": 0.7528777091239846, "train_speed(iter/s)": 0.137714 }, { "epoch": 0.9581242356513305, "grad_norm": 0.7008219957351685, "learning_rate": 5.756642899584569e-05, "loss": 0.8611944198608399, "memory(GiB)": 91.52, "step": 73840, "token_acc": 0.7785504558800803, "train_speed(iter/s)": 0.137713 }, { "epoch": 0.9581891140529862, "grad_norm": 0.7224447727203369, "learning_rate": 5.756112697155156e-05, "loss": 0.8554189682006836, "memory(GiB)": 91.52, "step": 73845, "token_acc": 0.7827358909791142, "train_speed(iter/s)": 0.137712 }, { "epoch": 0.9582539924546419, "grad_norm": 0.7839717268943787, "learning_rate": 5.755582486024447e-05, "loss": 0.8660338401794434, "memory(GiB)": 91.52, "step": 73850, "token_acc": 0.7597278093161397, "train_speed(iter/s)": 0.137711 }, { "epoch": 0.9583188708562976, "grad_norm": 0.7362529039382935, "learning_rate": 5.755052266198542e-05, "loss": 0.8598552703857422, "memory(GiB)": 91.52, "step": 73855, "token_acc": 0.7691012287157737, "train_speed(iter/s)": 0.13771 }, { "epoch": 0.9583837492579533, "grad_norm": 0.7907160520553589, "learning_rate": 5.754522037683545e-05, "loss": 0.8947991371154785, "memory(GiB)": 91.52, "step": 73860, "token_acc": 0.769050329435303, "train_speed(iter/s)": 0.137708 }, { "epoch": 0.958448627659609, "grad_norm": 0.7579163312911987, "learning_rate": 5.753991800485559e-05, "loss": 0.8147472381591797, "memory(GiB)": 91.52, "step": 73865, "token_acc": 0.7620632888238522, "train_speed(iter/s)": 0.137707 }, { "epoch": 0.9585135060612646, "grad_norm": 0.8117289543151855, "learning_rate": 5.7534615546106804e-05, "loss": 0.8185652732849121, "memory(GiB)": 91.52, "step": 73870, "token_acc": 0.7587600114416476, "train_speed(iter/s)": 0.137707 }, { "epoch": 0.9585783844629203, "grad_norm": 0.6944610476493835, "learning_rate": 5.752931300065016e-05, "loss": 0.8370601654052734, "memory(GiB)": 91.52, "step": 73875, "token_acc": 0.7654059505799294, "train_speed(iter/s)": 0.137705 }, { "epoch": 0.958643262864576, "grad_norm": 0.7108429074287415, "learning_rate": 5.752401036854668e-05, "loss": 0.8703311920166016, "memory(GiB)": 91.52, "step": 73880, "token_acc": 0.7461712611532827, "train_speed(iter/s)": 0.137704 }, { "epoch": 0.9587081412662317, "grad_norm": 0.6974568367004395, "learning_rate": 5.751870764985736e-05, "loss": 0.879997730255127, "memory(GiB)": 91.52, "step": 73885, "token_acc": 0.748263819196339, "train_speed(iter/s)": 0.137703 }, { "epoch": 0.9587730196678874, "grad_norm": 0.7101849317550659, "learning_rate": 5.751340484464324e-05, "loss": 0.8555782318115235, "memory(GiB)": 91.52, "step": 73890, "token_acc": 0.752468471991397, "train_speed(iter/s)": 0.137702 }, { "epoch": 0.9588378980695431, "grad_norm": 0.7976945638656616, "learning_rate": 5.750810195296533e-05, "loss": 0.8213965415954589, "memory(GiB)": 91.52, "step": 73895, "token_acc": 0.7729456463717148, "train_speed(iter/s)": 0.1377 }, { "epoch": 0.9589027764711988, "grad_norm": 0.6946247220039368, "learning_rate": 5.7502798974884685e-05, "loss": 0.8537691116333008, "memory(GiB)": 91.52, "step": 73900, "token_acc": 0.782922116527943, "train_speed(iter/s)": 0.137699 }, { "epoch": 0.9589676548728545, "grad_norm": 0.7602744698524475, "learning_rate": 5.7497495910462294e-05, "loss": 0.800328254699707, "memory(GiB)": 91.52, "step": 73905, "token_acc": 0.7587907716785999, "train_speed(iter/s)": 0.137697 }, { "epoch": 0.9590325332745102, "grad_norm": 0.6392431855201721, "learning_rate": 5.749219275975922e-05, "loss": 0.8380626678466797, "memory(GiB)": 91.52, "step": 73910, "token_acc": 0.7713806799548154, "train_speed(iter/s)": 0.137696 }, { "epoch": 0.9590974116761659, "grad_norm": 0.7694475650787354, "learning_rate": 5.748688952283645e-05, "loss": 0.897998046875, "memory(GiB)": 91.52, "step": 73915, "token_acc": 0.7637860082304527, "train_speed(iter/s)": 0.137694 }, { "epoch": 0.9591622900778216, "grad_norm": 0.695093035697937, "learning_rate": 5.7481586199755065e-05, "loss": 0.8755880355834961, "memory(GiB)": 91.52, "step": 73920, "token_acc": 0.7717778527768403, "train_speed(iter/s)": 0.137693 }, { "epoch": 0.9592271684794773, "grad_norm": 0.6738801002502441, "learning_rate": 5.7476282790576055e-05, "loss": 0.8413724899291992, "memory(GiB)": 91.52, "step": 73925, "token_acc": 0.7622372372372372, "train_speed(iter/s)": 0.137693 }, { "epoch": 0.959292046881133, "grad_norm": 0.7935639023780823, "learning_rate": 5.747097929536046e-05, "loss": 0.8439983367919922, "memory(GiB)": 91.52, "step": 73930, "token_acc": 0.7469971785570334, "train_speed(iter/s)": 0.137691 }, { "epoch": 0.9593569252827887, "grad_norm": 0.6978920698165894, "learning_rate": 5.7465675714169324e-05, "loss": 0.8332988739013671, "memory(GiB)": 91.52, "step": 73935, "token_acc": 0.7672438988747625, "train_speed(iter/s)": 0.13769 }, { "epoch": 0.9594218036844444, "grad_norm": 0.739919126033783, "learning_rate": 5.746037204706367e-05, "loss": 0.8354084014892578, "memory(GiB)": 91.52, "step": 73940, "token_acc": 0.7684750814062586, "train_speed(iter/s)": 0.137688 }, { "epoch": 0.9594866820861001, "grad_norm": 0.6667203307151794, "learning_rate": 5.745506829410453e-05, "loss": 0.8219707489013672, "memory(GiB)": 91.52, "step": 73945, "token_acc": 0.794592, "train_speed(iter/s)": 0.137687 }, { "epoch": 0.9595515604877558, "grad_norm": 0.7030312418937683, "learning_rate": 5.744976445535294e-05, "loss": 0.7923248291015625, "memory(GiB)": 91.52, "step": 73950, "token_acc": 0.7677363457629378, "train_speed(iter/s)": 0.137686 }, { "epoch": 0.9596164388894115, "grad_norm": 0.7139490842819214, "learning_rate": 5.744446053086996e-05, "loss": 0.8446505546569825, "memory(GiB)": 91.52, "step": 73955, "token_acc": 0.7478070175438597, "train_speed(iter/s)": 0.137685 }, { "epoch": 0.9596813172910672, "grad_norm": 0.7087961435317993, "learning_rate": 5.7439156520716584e-05, "loss": 0.8603033065795899, "memory(GiB)": 91.52, "step": 73960, "token_acc": 0.7572416774751405, "train_speed(iter/s)": 0.137683 }, { "epoch": 0.9597461956927229, "grad_norm": 0.7859252095222473, "learning_rate": 5.743385242495388e-05, "loss": 0.8567302703857422, "memory(GiB)": 91.52, "step": 73965, "token_acc": 0.7697875012393007, "train_speed(iter/s)": 0.137681 }, { "epoch": 0.9598110740943786, "grad_norm": 0.7524347901344299, "learning_rate": 5.74285482436429e-05, "loss": 0.8108489990234375, "memory(GiB)": 91.52, "step": 73970, "token_acc": 0.7843160611475084, "train_speed(iter/s)": 0.137681 }, { "epoch": 0.9598759524960343, "grad_norm": 0.7020773887634277, "learning_rate": 5.742324397684464e-05, "loss": 0.8257826805114746, "memory(GiB)": 91.52, "step": 73975, "token_acc": 0.774500960821993, "train_speed(iter/s)": 0.137679 }, { "epoch": 0.95994083089769, "grad_norm": 0.6767012476921082, "learning_rate": 5.741793962462018e-05, "loss": 0.8270103454589843, "memory(GiB)": 91.52, "step": 73980, "token_acc": 0.7663652489940725, "train_speed(iter/s)": 0.137678 }, { "epoch": 0.9600057092993457, "grad_norm": 0.8020363450050354, "learning_rate": 5.741263518703054e-05, "loss": 0.8504196166992187, "memory(GiB)": 91.52, "step": 73985, "token_acc": 0.7664879251634115, "train_speed(iter/s)": 0.137677 }, { "epoch": 0.9600705877010014, "grad_norm": 0.7143902778625488, "learning_rate": 5.740733066413678e-05, "loss": 0.851194190979004, "memory(GiB)": 91.52, "step": 73990, "token_acc": 0.7515139598898938, "train_speed(iter/s)": 0.137676 }, { "epoch": 0.9601354661026571, "grad_norm": 0.6867411732673645, "learning_rate": 5.7402026055999925e-05, "loss": 0.8040542602539062, "memory(GiB)": 91.52, "step": 73995, "token_acc": 0.7634556050737059, "train_speed(iter/s)": 0.137675 }, { "epoch": 0.9602003445043128, "grad_norm": 0.6939929127693176, "learning_rate": 5.739672136268104e-05, "loss": 0.8236404418945312, "memory(GiB)": 91.52, "step": 74000, "token_acc": 0.7588946341129336, "train_speed(iter/s)": 0.137673 }, { "epoch": 0.9602652229059685, "grad_norm": 0.6691039800643921, "learning_rate": 5.7391416584241156e-05, "loss": 0.863614559173584, "memory(GiB)": 91.52, "step": 74005, "token_acc": 0.7763323844185795, "train_speed(iter/s)": 0.137672 }, { "epoch": 0.9603301013076242, "grad_norm": 0.7423203587532043, "learning_rate": 5.738611172074133e-05, "loss": 0.8338222503662109, "memory(GiB)": 91.52, "step": 74010, "token_acc": 0.774829242785712, "train_speed(iter/s)": 0.137671 }, { "epoch": 0.9603949797092799, "grad_norm": 0.7652748823165894, "learning_rate": 5.73808067722426e-05, "loss": 0.8720067977905274, "memory(GiB)": 91.52, "step": 74015, "token_acc": 0.7548213484825833, "train_speed(iter/s)": 0.13767 }, { "epoch": 0.9604598581109356, "grad_norm": 0.7091481685638428, "learning_rate": 5.737550173880601e-05, "loss": 0.8841240882873536, "memory(GiB)": 91.52, "step": 74020, "token_acc": 0.7658150020494603, "train_speed(iter/s)": 0.137669 }, { "epoch": 0.9605247365125913, "grad_norm": 0.6700178980827332, "learning_rate": 5.7370196620492655e-05, "loss": 0.8362401962280274, "memory(GiB)": 91.52, "step": 74025, "token_acc": 0.7613337330535292, "train_speed(iter/s)": 0.137667 }, { "epoch": 0.960589614914247, "grad_norm": 0.7423970103263855, "learning_rate": 5.736489141736352e-05, "loss": 0.8539560317993165, "memory(GiB)": 91.52, "step": 74030, "token_acc": 0.7627325969308325, "train_speed(iter/s)": 0.137666 }, { "epoch": 0.9606544933159027, "grad_norm": 0.7753958702087402, "learning_rate": 5.7359586129479707e-05, "loss": 0.8433376312255859, "memory(GiB)": 91.52, "step": 74035, "token_acc": 0.757978437623124, "train_speed(iter/s)": 0.137665 }, { "epoch": 0.9607193717175584, "grad_norm": 0.7411150932312012, "learning_rate": 5.735428075690224e-05, "loss": 0.8673131942749024, "memory(GiB)": 91.52, "step": 74040, "token_acc": 0.760863676817707, "train_speed(iter/s)": 0.137664 }, { "epoch": 0.9607842501192141, "grad_norm": 0.7844907641410828, "learning_rate": 5.7348975299692184e-05, "loss": 0.8043699264526367, "memory(GiB)": 91.52, "step": 74045, "token_acc": 0.7859170925610448, "train_speed(iter/s)": 0.137663 }, { "epoch": 0.9608491285208698, "grad_norm": 0.7236629128456116, "learning_rate": 5.73436697579106e-05, "loss": 0.8454559326171875, "memory(GiB)": 91.52, "step": 74050, "token_acc": 0.7710421692887968, "train_speed(iter/s)": 0.137662 }, { "epoch": 0.9609140069225255, "grad_norm": 0.7087186574935913, "learning_rate": 5.7338364131618527e-05, "loss": 0.8176664352416992, "memory(GiB)": 91.52, "step": 74055, "token_acc": 0.7834867555889536, "train_speed(iter/s)": 0.137661 }, { "epoch": 0.9609788853241812, "grad_norm": 0.8129547834396362, "learning_rate": 5.733305842087705e-05, "loss": 0.8910234451293946, "memory(GiB)": 91.52, "step": 74060, "token_acc": 0.7779530601075303, "train_speed(iter/s)": 0.137659 }, { "epoch": 0.9610437637258369, "grad_norm": 0.7398377656936646, "learning_rate": 5.732775262574719e-05, "loss": 0.8503387451171875, "memory(GiB)": 91.52, "step": 74065, "token_acc": 0.7447875551427908, "train_speed(iter/s)": 0.137658 }, { "epoch": 0.9611086421274926, "grad_norm": 0.6590423583984375, "learning_rate": 5.7322446746290024e-05, "loss": 0.8385724067687989, "memory(GiB)": 91.52, "step": 74070, "token_acc": 0.7619967680741938, "train_speed(iter/s)": 0.137657 }, { "epoch": 0.9611735205291483, "grad_norm": 0.67412269115448, "learning_rate": 5.731714078256663e-05, "loss": 0.8570759773254395, "memory(GiB)": 91.52, "step": 74075, "token_acc": 0.759079781347463, "train_speed(iter/s)": 0.137656 }, { "epoch": 0.961238398930804, "grad_norm": 0.7171369194984436, "learning_rate": 5.731183473463804e-05, "loss": 0.8652105331420898, "memory(GiB)": 91.52, "step": 74080, "token_acc": 0.7343482663500821, "train_speed(iter/s)": 0.137654 }, { "epoch": 0.9613032773324597, "grad_norm": 0.7990082502365112, "learning_rate": 5.730652860256532e-05, "loss": 0.8361066818237305, "memory(GiB)": 91.52, "step": 74085, "token_acc": 0.7764563839420465, "train_speed(iter/s)": 0.137653 }, { "epoch": 0.9613681557341154, "grad_norm": 0.7067784667015076, "learning_rate": 5.730122238640954e-05, "loss": 0.836367416381836, "memory(GiB)": 91.52, "step": 74090, "token_acc": 0.7545589812796291, "train_speed(iter/s)": 0.137652 }, { "epoch": 0.9614330341357711, "grad_norm": 0.7402742505073547, "learning_rate": 5.7295916086231774e-05, "loss": 0.8289758682250976, "memory(GiB)": 91.52, "step": 74095, "token_acc": 0.7814078237794961, "train_speed(iter/s)": 0.137651 }, { "epoch": 0.9614979125374268, "grad_norm": 0.7154051065444946, "learning_rate": 5.7290609702093055e-05, "loss": 0.8583213806152343, "memory(GiB)": 91.52, "step": 74100, "token_acc": 0.7677398397459034, "train_speed(iter/s)": 0.13765 }, { "epoch": 0.9615627909390825, "grad_norm": 0.7709404826164246, "learning_rate": 5.728530323405449e-05, "loss": 0.833344841003418, "memory(GiB)": 91.52, "step": 74105, "token_acc": 0.7730660498586481, "train_speed(iter/s)": 0.137649 }, { "epoch": 0.9616276693407381, "grad_norm": 0.801862359046936, "learning_rate": 5.7279996682177116e-05, "loss": 0.835883617401123, "memory(GiB)": 91.52, "step": 74110, "token_acc": 0.7815569564579974, "train_speed(iter/s)": 0.137647 }, { "epoch": 0.9616925477423938, "grad_norm": 0.8327333331108093, "learning_rate": 5.727469004652202e-05, "loss": 0.8772491455078125, "memory(GiB)": 91.52, "step": 74115, "token_acc": 0.7742432920650306, "train_speed(iter/s)": 0.137647 }, { "epoch": 0.9617574261440495, "grad_norm": 0.7530616521835327, "learning_rate": 5.726938332715025e-05, "loss": 0.8573257446289062, "memory(GiB)": 91.52, "step": 74120, "token_acc": 0.7739347809864936, "train_speed(iter/s)": 0.137646 }, { "epoch": 0.9618223045457052, "grad_norm": 0.7431091666221619, "learning_rate": 5.726407652412287e-05, "loss": 0.8297759056091308, "memory(GiB)": 91.52, "step": 74125, "token_acc": 0.7551570605775908, "train_speed(iter/s)": 0.137645 }, { "epoch": 0.9618871829473609, "grad_norm": 0.7344521284103394, "learning_rate": 5.7258769637500974e-05, "loss": 0.8207870483398437, "memory(GiB)": 91.52, "step": 74130, "token_acc": 0.780438713198922, "train_speed(iter/s)": 0.137644 }, { "epoch": 0.9619520613490166, "grad_norm": 0.7156974673271179, "learning_rate": 5.7253462667345634e-05, "loss": 0.8234710693359375, "memory(GiB)": 91.52, "step": 74135, "token_acc": 0.773217288102067, "train_speed(iter/s)": 0.137642 }, { "epoch": 0.9620169397506723, "grad_norm": 0.7481767535209656, "learning_rate": 5.7248155613717914e-05, "loss": 0.8737626075744629, "memory(GiB)": 91.52, "step": 74140, "token_acc": 0.7635827998413688, "train_speed(iter/s)": 0.13764 }, { "epoch": 0.962081818152328, "grad_norm": 0.6871814131736755, "learning_rate": 5.724284847667888e-05, "loss": 0.888219165802002, "memory(GiB)": 91.52, "step": 74145, "token_acc": 0.7734761474598872, "train_speed(iter/s)": 0.137639 }, { "epoch": 0.9621466965539837, "grad_norm": 0.7342802882194519, "learning_rate": 5.7237541256289606e-05, "loss": 0.8355379104614258, "memory(GiB)": 91.52, "step": 74150, "token_acc": 0.7695077504061828, "train_speed(iter/s)": 0.137638 }, { "epoch": 0.9622115749556394, "grad_norm": 0.7438652515411377, "learning_rate": 5.723223395261118e-05, "loss": 0.842805290222168, "memory(GiB)": 91.52, "step": 74155, "token_acc": 0.7721537721537721, "train_speed(iter/s)": 0.137637 }, { "epoch": 0.9622764533572951, "grad_norm": 0.7013435363769531, "learning_rate": 5.7226926565704666e-05, "loss": 0.8539204597473145, "memory(GiB)": 91.52, "step": 74160, "token_acc": 0.7582131955148548, "train_speed(iter/s)": 0.137636 }, { "epoch": 0.9623413317589508, "grad_norm": 0.7541162967681885, "learning_rate": 5.722161909563114e-05, "loss": 0.8346611022949219, "memory(GiB)": 91.52, "step": 74165, "token_acc": 0.7723100374457368, "train_speed(iter/s)": 0.137635 }, { "epoch": 0.9624062101606065, "grad_norm": 0.6514286398887634, "learning_rate": 5.7216311542451703e-05, "loss": 0.8190793991088867, "memory(GiB)": 91.52, "step": 74170, "token_acc": 0.7925887978142077, "train_speed(iter/s)": 0.137634 }, { "epoch": 0.9624710885622622, "grad_norm": 0.7612169981002808, "learning_rate": 5.721100390622741e-05, "loss": 0.8434415817260742, "memory(GiB)": 91.52, "step": 74175, "token_acc": 0.7569001276324187, "train_speed(iter/s)": 0.137633 }, { "epoch": 0.9625359669639179, "grad_norm": 0.7411245107650757, "learning_rate": 5.720569618701934e-05, "loss": 0.8688810348510743, "memory(GiB)": 91.52, "step": 74180, "token_acc": 0.7688232773731157, "train_speed(iter/s)": 0.137632 }, { "epoch": 0.9626008453655736, "grad_norm": 0.7818331718444824, "learning_rate": 5.7200388384888596e-05, "loss": 0.8408498764038086, "memory(GiB)": 91.52, "step": 74185, "token_acc": 0.7703408146300914, "train_speed(iter/s)": 0.13763 }, { "epoch": 0.9626657237672293, "grad_norm": 0.7572746872901917, "learning_rate": 5.719508049989625e-05, "loss": 0.8083955764770507, "memory(GiB)": 91.52, "step": 74190, "token_acc": 0.7934044936416946, "train_speed(iter/s)": 0.137629 }, { "epoch": 0.962730602168885, "grad_norm": 0.7315854430198669, "learning_rate": 5.7189772532103356e-05, "loss": 0.8502326965332031, "memory(GiB)": 91.52, "step": 74195, "token_acc": 0.7537119369180438, "train_speed(iter/s)": 0.137628 }, { "epoch": 0.9627954805705407, "grad_norm": 0.7848492860794067, "learning_rate": 5.718446448157103e-05, "loss": 0.8475182533264161, "memory(GiB)": 91.52, "step": 74200, "token_acc": 0.7734068497508217, "train_speed(iter/s)": 0.137626 }, { "epoch": 0.9628603589721964, "grad_norm": 0.6853637099266052, "learning_rate": 5.717915634836036e-05, "loss": 0.8412385940551758, "memory(GiB)": 91.52, "step": 74205, "token_acc": 0.7688350481247925, "train_speed(iter/s)": 0.137625 }, { "epoch": 0.962925237373852, "grad_norm": 0.7222158908843994, "learning_rate": 5.7173848132532415e-05, "loss": 0.8311758041381836, "memory(GiB)": 91.52, "step": 74210, "token_acc": 0.7745550638352064, "train_speed(iter/s)": 0.137624 }, { "epoch": 0.9629901157755077, "grad_norm": 0.7122728228569031, "learning_rate": 5.7168539834148295e-05, "loss": 0.8369526863098145, "memory(GiB)": 91.52, "step": 74215, "token_acc": 0.7612844096756244, "train_speed(iter/s)": 0.137623 }, { "epoch": 0.9630549941771634, "grad_norm": 0.7351152300834656, "learning_rate": 5.716323145326906e-05, "loss": 0.8620250701904297, "memory(GiB)": 91.52, "step": 74220, "token_acc": 0.7592677506966244, "train_speed(iter/s)": 0.137622 }, { "epoch": 0.9631198725788191, "grad_norm": 0.6699095964431763, "learning_rate": 5.715792298995585e-05, "loss": 0.8193746566772461, "memory(GiB)": 91.52, "step": 74225, "token_acc": 0.7501930372310068, "train_speed(iter/s)": 0.13762 }, { "epoch": 0.9631847509804748, "grad_norm": 0.7193103432655334, "learning_rate": 5.715261444426969e-05, "loss": 0.8427473068237304, "memory(GiB)": 91.52, "step": 74230, "token_acc": 0.7814442142645753, "train_speed(iter/s)": 0.137619 }, { "epoch": 0.9632496293821305, "grad_norm": 0.7212684750556946, "learning_rate": 5.714730581627172e-05, "loss": 0.8321004867553711, "memory(GiB)": 91.52, "step": 74235, "token_acc": 0.7791709586900153, "train_speed(iter/s)": 0.137618 }, { "epoch": 0.9633145077837862, "grad_norm": 0.7431164383888245, "learning_rate": 5.7141997106023014e-05, "loss": 0.8159212112426758, "memory(GiB)": 91.52, "step": 74240, "token_acc": 0.7736583217091256, "train_speed(iter/s)": 0.137617 }, { "epoch": 0.9633793861854419, "grad_norm": 0.671999454498291, "learning_rate": 5.7136688313584674e-05, "loss": 0.8429964065551758, "memory(GiB)": 91.52, "step": 74245, "token_acc": 0.7665268391206401, "train_speed(iter/s)": 0.137615 }, { "epoch": 0.9634442645870976, "grad_norm": 0.7577124238014221, "learning_rate": 5.713137943901776e-05, "loss": 0.8374926567077636, "memory(GiB)": 91.52, "step": 74250, "token_acc": 0.7653415803738968, "train_speed(iter/s)": 0.137614 }, { "epoch": 0.9635091429887533, "grad_norm": 0.6281298995018005, "learning_rate": 5.7126070482383406e-05, "loss": 0.834293270111084, "memory(GiB)": 91.52, "step": 74255, "token_acc": 0.7826124099439018, "train_speed(iter/s)": 0.137612 }, { "epoch": 0.963574021390409, "grad_norm": 0.6721548438072205, "learning_rate": 5.712076144374271e-05, "loss": 0.8400924682617188, "memory(GiB)": 91.52, "step": 74260, "token_acc": 0.7527787366240939, "train_speed(iter/s)": 0.137611 }, { "epoch": 0.9636388997920647, "grad_norm": 0.7173861265182495, "learning_rate": 5.711545232315672e-05, "loss": 0.8633119583129882, "memory(GiB)": 91.52, "step": 74265, "token_acc": 0.7510630758327428, "train_speed(iter/s)": 0.13761 }, { "epoch": 0.9637037781937204, "grad_norm": 0.688256025314331, "learning_rate": 5.711014312068657e-05, "loss": 0.8520097732543945, "memory(GiB)": 91.52, "step": 74270, "token_acc": 0.7549703706301062, "train_speed(iter/s)": 0.137608 }, { "epoch": 0.9637686565953761, "grad_norm": 0.7183630466461182, "learning_rate": 5.7104833836393346e-05, "loss": 0.8240285873413086, "memory(GiB)": 91.52, "step": 74275, "token_acc": 0.7669275728885395, "train_speed(iter/s)": 0.137607 }, { "epoch": 0.9638335349970318, "grad_norm": 0.6809591054916382, "learning_rate": 5.7099524470338175e-05, "loss": 0.8589340209960937, "memory(GiB)": 91.52, "step": 74280, "token_acc": 0.7870157138986724, "train_speed(iter/s)": 0.137606 }, { "epoch": 0.9638984133986875, "grad_norm": 0.8889156579971313, "learning_rate": 5.7094215022582096e-05, "loss": 0.8015963554382324, "memory(GiB)": 91.52, "step": 74285, "token_acc": 0.7776144907723855, "train_speed(iter/s)": 0.137605 }, { "epoch": 0.9639632918003432, "grad_norm": 0.8029480576515198, "learning_rate": 5.708890549318627e-05, "loss": 0.8677597999572754, "memory(GiB)": 91.52, "step": 74290, "token_acc": 0.7655982747081134, "train_speed(iter/s)": 0.137604 }, { "epoch": 0.9640281702019989, "grad_norm": 0.7252041697502136, "learning_rate": 5.7083595882211794e-05, "loss": 0.8544645309448242, "memory(GiB)": 91.52, "step": 74295, "token_acc": 0.7559771387534038, "train_speed(iter/s)": 0.137602 }, { "epoch": 0.9640930486036546, "grad_norm": 0.7371488213539124, "learning_rate": 5.707828618971972e-05, "loss": 0.7954586982727051, "memory(GiB)": 91.52, "step": 74300, "token_acc": 0.7688251285578592, "train_speed(iter/s)": 0.137601 }, { "epoch": 0.9641579270053103, "grad_norm": 0.6866380572319031, "learning_rate": 5.707297641577119e-05, "loss": 0.8326642036437988, "memory(GiB)": 91.52, "step": 74305, "token_acc": 0.7823501502451368, "train_speed(iter/s)": 0.1376 }, { "epoch": 0.964222805406966, "grad_norm": 0.7057980895042419, "learning_rate": 5.7067666560427304e-05, "loss": 0.8551604270935058, "memory(GiB)": 91.52, "step": 74310, "token_acc": 0.769850240180842, "train_speed(iter/s)": 0.137599 }, { "epoch": 0.9642876838086217, "grad_norm": 0.6850934028625488, "learning_rate": 5.706235662374918e-05, "loss": 0.8137928009033203, "memory(GiB)": 91.52, "step": 74315, "token_acc": 0.7934248228019907, "train_speed(iter/s)": 0.137597 }, { "epoch": 0.9643525622102774, "grad_norm": 0.7365819215774536, "learning_rate": 5.7057046605797895e-05, "loss": 0.8243298530578613, "memory(GiB)": 91.52, "step": 74320, "token_acc": 0.7902139461172741, "train_speed(iter/s)": 0.137596 }, { "epoch": 0.9644174406119331, "grad_norm": 0.7371101379394531, "learning_rate": 5.7051736506634576e-05, "loss": 0.8497812271118164, "memory(GiB)": 91.52, "step": 74325, "token_acc": 0.7731776744856412, "train_speed(iter/s)": 0.137595 }, { "epoch": 0.9644823190135888, "grad_norm": 0.731521487236023, "learning_rate": 5.704642632632033e-05, "loss": 0.8793880462646484, "memory(GiB)": 91.52, "step": 74330, "token_acc": 0.7468108108108108, "train_speed(iter/s)": 0.137594 }, { "epoch": 0.9645471974152445, "grad_norm": 0.725003182888031, "learning_rate": 5.704111606491627e-05, "loss": 0.797488784790039, "memory(GiB)": 91.52, "step": 74335, "token_acc": 0.8031004073239214, "train_speed(iter/s)": 0.137593 }, { "epoch": 0.9646120758169002, "grad_norm": 0.7223293781280518, "learning_rate": 5.7035805722483495e-05, "loss": 0.8332139015197754, "memory(GiB)": 91.52, "step": 74340, "token_acc": 0.7782396837836917, "train_speed(iter/s)": 0.137591 }, { "epoch": 0.9646769542185559, "grad_norm": 0.8075453639030457, "learning_rate": 5.703049529908311e-05, "loss": 0.8580242156982422, "memory(GiB)": 91.52, "step": 74345, "token_acc": 0.7640766200664246, "train_speed(iter/s)": 0.137589 }, { "epoch": 0.9647418326202115, "grad_norm": 0.7001959681510925, "learning_rate": 5.702518479477625e-05, "loss": 0.8426689147949219, "memory(GiB)": 91.52, "step": 74350, "token_acc": 0.7530067365005467, "train_speed(iter/s)": 0.137588 }, { "epoch": 0.9648067110218672, "grad_norm": 0.6881533265113831, "learning_rate": 5.7019874209624e-05, "loss": 0.8219380378723145, "memory(GiB)": 91.52, "step": 74355, "token_acc": 0.783134374043465, "train_speed(iter/s)": 0.137587 }, { "epoch": 0.9648715894235229, "grad_norm": 0.730562150478363, "learning_rate": 5.7014563543687504e-05, "loss": 0.8901018142700196, "memory(GiB)": 91.52, "step": 74360, "token_acc": 0.7559340320591862, "train_speed(iter/s)": 0.137586 }, { "epoch": 0.9649364678251786, "grad_norm": 0.7372187972068787, "learning_rate": 5.700925279702787e-05, "loss": 0.820534610748291, "memory(GiB)": 91.52, "step": 74365, "token_acc": 0.7767544267683816, "train_speed(iter/s)": 0.137585 }, { "epoch": 0.9650013462268343, "grad_norm": 0.7626118659973145, "learning_rate": 5.7003941969706185e-05, "loss": 0.8252824783325196, "memory(GiB)": 91.52, "step": 74370, "token_acc": 0.74379132265386, "train_speed(iter/s)": 0.137584 }, { "epoch": 0.96506622462849, "grad_norm": 0.7467566728591919, "learning_rate": 5.6998631061783605e-05, "loss": 0.8513009071350097, "memory(GiB)": 91.52, "step": 74375, "token_acc": 0.7616736454764623, "train_speed(iter/s)": 0.137582 }, { "epoch": 0.9651311030301457, "grad_norm": 0.7289013266563416, "learning_rate": 5.699332007332121e-05, "loss": 0.8242665290832519, "memory(GiB)": 91.52, "step": 74380, "token_acc": 0.7740806625645213, "train_speed(iter/s)": 0.137581 }, { "epoch": 0.9651959814318014, "grad_norm": 0.8042505979537964, "learning_rate": 5.698800900438016e-05, "loss": 0.8309592247009278, "memory(GiB)": 91.52, "step": 74385, "token_acc": 0.7591502431533146, "train_speed(iter/s)": 0.13758 }, { "epoch": 0.9652608598334571, "grad_norm": 0.7720139622688293, "learning_rate": 5.698269785502153e-05, "loss": 0.7975579261779785, "memory(GiB)": 91.52, "step": 74390, "token_acc": 0.7890067622060922, "train_speed(iter/s)": 0.137579 }, { "epoch": 0.9653257382351128, "grad_norm": 0.6893329620361328, "learning_rate": 5.697738662530647e-05, "loss": 0.8683751106262207, "memory(GiB)": 91.52, "step": 74395, "token_acc": 0.7669420193399366, "train_speed(iter/s)": 0.137577 }, { "epoch": 0.9653906166367685, "grad_norm": 0.7332873344421387, "learning_rate": 5.69720753152961e-05, "loss": 0.8717411041259766, "memory(GiB)": 91.52, "step": 74400, "token_acc": 0.7658850961849861, "train_speed(iter/s)": 0.137576 }, { "epoch": 0.9654554950384242, "grad_norm": 0.6931112408638, "learning_rate": 5.6966763925051536e-05, "loss": 0.8581413269042969, "memory(GiB)": 91.52, "step": 74405, "token_acc": 0.7606964582838127, "train_speed(iter/s)": 0.137575 }, { "epoch": 0.9655203734400799, "grad_norm": 0.7542691826820374, "learning_rate": 5.69614524546339e-05, "loss": 0.8401701927185059, "memory(GiB)": 91.52, "step": 74410, "token_acc": 0.7708555082490687, "train_speed(iter/s)": 0.137573 }, { "epoch": 0.9655852518417356, "grad_norm": 0.6734376549720764, "learning_rate": 5.6956140904104304e-05, "loss": 0.8161501884460449, "memory(GiB)": 91.52, "step": 74415, "token_acc": 0.759049964576094, "train_speed(iter/s)": 0.137572 }, { "epoch": 0.9656501302433913, "grad_norm": 0.740078866481781, "learning_rate": 5.6950829273523896e-05, "loss": 0.8981643676757812, "memory(GiB)": 91.52, "step": 74420, "token_acc": 0.7597128041483845, "train_speed(iter/s)": 0.13757 }, { "epoch": 0.965715008645047, "grad_norm": 0.7528336048126221, "learning_rate": 5.6945517562953796e-05, "loss": 0.8378435134887695, "memory(GiB)": 91.52, "step": 74425, "token_acc": 0.7800404166018965, "train_speed(iter/s)": 0.137569 }, { "epoch": 0.9657798870467027, "grad_norm": 0.7525323629379272, "learning_rate": 5.6940205772455126e-05, "loss": 0.844328498840332, "memory(GiB)": 91.52, "step": 74430, "token_acc": 0.7774832765366654, "train_speed(iter/s)": 0.137568 }, { "epoch": 0.9658447654483584, "grad_norm": 0.7149173617362976, "learning_rate": 5.6934893902089015e-05, "loss": 0.8384828567504883, "memory(GiB)": 91.52, "step": 74435, "token_acc": 0.7584442800788954, "train_speed(iter/s)": 0.137567 }, { "epoch": 0.9659096438500141, "grad_norm": 0.7371343970298767, "learning_rate": 5.692958195191659e-05, "loss": 0.8465866088867188, "memory(GiB)": 91.52, "step": 74440, "token_acc": 0.780168413816807, "train_speed(iter/s)": 0.137566 }, { "epoch": 0.9659745222516698, "grad_norm": 0.7742069363594055, "learning_rate": 5.6924269921998977e-05, "loss": 0.8768838882446289, "memory(GiB)": 91.52, "step": 74445, "token_acc": 0.7539450613676213, "train_speed(iter/s)": 0.137565 }, { "epoch": 0.9660394006533255, "grad_norm": 0.7926766276359558, "learning_rate": 5.691895781239731e-05, "loss": 0.7840144634246826, "memory(GiB)": 91.52, "step": 74450, "token_acc": 0.8036006546644845, "train_speed(iter/s)": 0.137564 }, { "epoch": 0.9661042790549812, "grad_norm": 0.7080514430999756, "learning_rate": 5.691364562317273e-05, "loss": 0.827549934387207, "memory(GiB)": 91.52, "step": 74455, "token_acc": 0.7760455597081332, "train_speed(iter/s)": 0.137564 }, { "epoch": 0.9661691574566369, "grad_norm": 0.7497239708900452, "learning_rate": 5.6908333354386356e-05, "loss": 0.8689794540405273, "memory(GiB)": 91.52, "step": 74460, "token_acc": 0.762614038354124, "train_speed(iter/s)": 0.137562 }, { "epoch": 0.9662340358582926, "grad_norm": 0.7622904777526855, "learning_rate": 5.690302100609933e-05, "loss": 0.8354673385620117, "memory(GiB)": 91.52, "step": 74465, "token_acc": 0.7716011622572259, "train_speed(iter/s)": 0.137561 }, { "epoch": 0.9662989142599483, "grad_norm": 0.6937574148178101, "learning_rate": 5.6897708578372776e-05, "loss": 0.8738059997558594, "memory(GiB)": 91.52, "step": 74470, "token_acc": 0.7747670741088982, "train_speed(iter/s)": 0.13756 }, { "epoch": 0.966363792661604, "grad_norm": 0.6814736127853394, "learning_rate": 5.689239607126785e-05, "loss": 0.8472724914550781, "memory(GiB)": 91.52, "step": 74475, "token_acc": 0.7565659068384539, "train_speed(iter/s)": 0.137559 }, { "epoch": 0.9664286710632597, "grad_norm": 0.6520979404449463, "learning_rate": 5.6887083484845663e-05, "loss": 0.8399484634399415, "memory(GiB)": 91.52, "step": 74480, "token_acc": 0.7631209818819404, "train_speed(iter/s)": 0.137557 }, { "epoch": 0.9664935494649154, "grad_norm": 0.6710931062698364, "learning_rate": 5.688177081916736e-05, "loss": 0.8497884750366211, "memory(GiB)": 91.52, "step": 74485, "token_acc": 0.7743201349919554, "train_speed(iter/s)": 0.137556 }, { "epoch": 0.9665584278665711, "grad_norm": 0.692986249923706, "learning_rate": 5.687645807429408e-05, "loss": 0.8195056915283203, "memory(GiB)": 91.52, "step": 74490, "token_acc": 0.7587554601885202, "train_speed(iter/s)": 0.137555 }, { "epoch": 0.9666233062682268, "grad_norm": 0.736050009727478, "learning_rate": 5.6871145250286975e-05, "loss": 0.8186094284057617, "memory(GiB)": 91.52, "step": 74495, "token_acc": 0.7702758802883345, "train_speed(iter/s)": 0.137554 }, { "epoch": 0.9666881846698825, "grad_norm": 0.7605337500572205, "learning_rate": 5.686583234720716e-05, "loss": 0.8315010070800781, "memory(GiB)": 91.52, "step": 74500, "token_acc": 0.7639362243898618, "train_speed(iter/s)": 0.137553 }, { "epoch": 0.9667530630715382, "grad_norm": 0.6606886386871338, "learning_rate": 5.6860519365115796e-05, "loss": 0.8394043922424317, "memory(GiB)": 91.52, "step": 74505, "token_acc": 0.7686695848255737, "train_speed(iter/s)": 0.137551 }, { "epoch": 0.9668179414731939, "grad_norm": 0.6512470841407776, "learning_rate": 5.685520630407401e-05, "loss": 0.8359296798706055, "memory(GiB)": 91.52, "step": 74510, "token_acc": 0.7828043581241118, "train_speed(iter/s)": 0.137549 }, { "epoch": 0.9668828198748496, "grad_norm": 0.7217135429382324, "learning_rate": 5.6849893164142964e-05, "loss": 0.868135643005371, "memory(GiB)": 91.52, "step": 74515, "token_acc": 0.7522681893018531, "train_speed(iter/s)": 0.137548 }, { "epoch": 0.9669476982765053, "grad_norm": 0.7524034976959229, "learning_rate": 5.684457994538377e-05, "loss": 0.8374613761901856, "memory(GiB)": 91.52, "step": 74520, "token_acc": 0.7724520489081604, "train_speed(iter/s)": 0.137547 }, { "epoch": 0.967012576678161, "grad_norm": 0.670815646648407, "learning_rate": 5.6839266647857605e-05, "loss": 0.8393245697021484, "memory(GiB)": 91.52, "step": 74525, "token_acc": 0.7555743243243244, "train_speed(iter/s)": 0.137546 }, { "epoch": 0.9670774550798167, "grad_norm": 0.7491099238395691, "learning_rate": 5.6833953271625594e-05, "loss": 0.8192634582519531, "memory(GiB)": 91.52, "step": 74530, "token_acc": 0.7632969997448142, "train_speed(iter/s)": 0.137544 }, { "epoch": 0.9671423334814724, "grad_norm": 0.7290495038032532, "learning_rate": 5.6828639816748883e-05, "loss": 0.8462538719177246, "memory(GiB)": 91.52, "step": 74535, "token_acc": 0.7684415837721864, "train_speed(iter/s)": 0.137543 }, { "epoch": 0.9672072118831281, "grad_norm": 0.7891630530357361, "learning_rate": 5.682332628328864e-05, "loss": 0.8761987686157227, "memory(GiB)": 91.52, "step": 74540, "token_acc": 0.7634599122135868, "train_speed(iter/s)": 0.137542 }, { "epoch": 0.9672720902847838, "grad_norm": 0.7288753390312195, "learning_rate": 5.681801267130598e-05, "loss": 0.8453407287597656, "memory(GiB)": 91.52, "step": 74545, "token_acc": 0.7651022864019253, "train_speed(iter/s)": 0.137541 }, { "epoch": 0.9673369686864395, "grad_norm": 0.7759631872177124, "learning_rate": 5.6812698980862086e-05, "loss": 0.8486703872680664, "memory(GiB)": 91.52, "step": 74550, "token_acc": 0.7636654778368149, "train_speed(iter/s)": 0.13754 }, { "epoch": 0.9674018470880952, "grad_norm": 0.7806294560432434, "learning_rate": 5.680738521201807e-05, "loss": 0.8337367057800293, "memory(GiB)": 91.52, "step": 74555, "token_acc": 0.7776752427005279, "train_speed(iter/s)": 0.137539 }, { "epoch": 0.9674667254897509, "grad_norm": 0.6854736804962158, "learning_rate": 5.680207136483512e-05, "loss": 0.8291316032409668, "memory(GiB)": 91.52, "step": 74560, "token_acc": 0.770311327161597, "train_speed(iter/s)": 0.137538 }, { "epoch": 0.9675316038914066, "grad_norm": 0.8112233281135559, "learning_rate": 5.679675743937437e-05, "loss": 0.8587610244750976, "memory(GiB)": 91.52, "step": 74565, "token_acc": 0.7576078543202835, "train_speed(iter/s)": 0.137537 }, { "epoch": 0.9675964822930623, "grad_norm": 0.6921703815460205, "learning_rate": 5.679144343569697e-05, "loss": 0.8234041213989258, "memory(GiB)": 91.52, "step": 74570, "token_acc": 0.7602427921092565, "train_speed(iter/s)": 0.137536 }, { "epoch": 0.967661360694718, "grad_norm": 0.6907355189323425, "learning_rate": 5.678612935386407e-05, "loss": 0.8218873977661133, "memory(GiB)": 91.52, "step": 74575, "token_acc": 0.7730033230205111, "train_speed(iter/s)": 0.137535 }, { "epoch": 0.9677262390963737, "grad_norm": 0.7219162583351135, "learning_rate": 5.678081519393683e-05, "loss": 0.8388669967651368, "memory(GiB)": 91.52, "step": 74580, "token_acc": 0.805702844006081, "train_speed(iter/s)": 0.137534 }, { "epoch": 0.9677911174980294, "grad_norm": 0.7197524905204773, "learning_rate": 5.6775500955976426e-05, "loss": 0.8567607879638672, "memory(GiB)": 91.52, "step": 74585, "token_acc": 0.7627294641201557, "train_speed(iter/s)": 0.137533 }, { "epoch": 0.967855995899685, "grad_norm": 0.7090789675712585, "learning_rate": 5.677018664004396e-05, "loss": 0.865234661102295, "memory(GiB)": 91.52, "step": 74590, "token_acc": 0.7567328266804397, "train_speed(iter/s)": 0.137531 }, { "epoch": 0.9679208743013407, "grad_norm": 0.6872603893280029, "learning_rate": 5.676487224620064e-05, "loss": 0.8064495086669922, "memory(GiB)": 91.52, "step": 74595, "token_acc": 0.7816982799646475, "train_speed(iter/s)": 0.13753 }, { "epoch": 0.9679857527029964, "grad_norm": 0.7931084632873535, "learning_rate": 5.67595577745076e-05, "loss": 0.8651853561401367, "memory(GiB)": 91.52, "step": 74600, "token_acc": 0.7828441958438064, "train_speed(iter/s)": 0.137529 }, { "epoch": 0.9680506311046521, "grad_norm": 0.6828485131263733, "learning_rate": 5.6754243225026016e-05, "loss": 0.8516129493713379, "memory(GiB)": 91.52, "step": 74605, "token_acc": 0.7696597131243912, "train_speed(iter/s)": 0.137528 }, { "epoch": 0.9681155095063078, "grad_norm": 0.719307541847229, "learning_rate": 5.674892859781703e-05, "loss": 0.8431838989257813, "memory(GiB)": 91.52, "step": 74610, "token_acc": 0.7765221402214022, "train_speed(iter/s)": 0.137527 }, { "epoch": 0.9681803879079635, "grad_norm": 0.751164436340332, "learning_rate": 5.674361389294179e-05, "loss": 0.7899543762207031, "memory(GiB)": 91.52, "step": 74615, "token_acc": 0.7699003067484662, "train_speed(iter/s)": 0.137526 }, { "epoch": 0.9682452663096192, "grad_norm": 0.7498560547828674, "learning_rate": 5.6738299110461515e-05, "loss": 0.8340622901916503, "memory(GiB)": 91.52, "step": 74620, "token_acc": 0.7728369879313276, "train_speed(iter/s)": 0.137524 }, { "epoch": 0.9683101447112749, "grad_norm": 0.7766994833946228, "learning_rate": 5.6732984250437295e-05, "loss": 0.8907211303710938, "memory(GiB)": 91.52, "step": 74625, "token_acc": 0.7598694073358431, "train_speed(iter/s)": 0.137523 }, { "epoch": 0.9683750231129306, "grad_norm": 0.7083754539489746, "learning_rate": 5.672766931293033e-05, "loss": 0.8501251220703125, "memory(GiB)": 91.52, "step": 74630, "token_acc": 0.7831812739957891, "train_speed(iter/s)": 0.137522 }, { "epoch": 0.9684399015145863, "grad_norm": 0.7103943228721619, "learning_rate": 5.672235429800178e-05, "loss": 0.8419001579284668, "memory(GiB)": 91.52, "step": 74635, "token_acc": 0.779886008457437, "train_speed(iter/s)": 0.137521 }, { "epoch": 0.968504779916242, "grad_norm": 0.7823664546012878, "learning_rate": 5.671703920571282e-05, "loss": 0.840882682800293, "memory(GiB)": 91.52, "step": 74640, "token_acc": 0.7648476950643427, "train_speed(iter/s)": 0.137519 }, { "epoch": 0.9685696583178977, "grad_norm": 0.7471721768379211, "learning_rate": 5.671172403612459e-05, "loss": 0.8749932289123535, "memory(GiB)": 91.52, "step": 74645, "token_acc": 0.7650912658094804, "train_speed(iter/s)": 0.137519 }, { "epoch": 0.9686345367195534, "grad_norm": 0.774030327796936, "learning_rate": 5.6706408789298285e-05, "loss": 0.8291872978210449, "memory(GiB)": 91.52, "step": 74650, "token_acc": 0.7812959719789843, "train_speed(iter/s)": 0.137518 }, { "epoch": 0.968699415121209, "grad_norm": 0.7233411073684692, "learning_rate": 5.6701093465295065e-05, "loss": 0.8210613250732421, "memory(GiB)": 91.52, "step": 74655, "token_acc": 0.7677632095469891, "train_speed(iter/s)": 0.137517 }, { "epoch": 0.9687642935228648, "grad_norm": 0.7764096856117249, "learning_rate": 5.6695778064176075e-05, "loss": 0.8356590270996094, "memory(GiB)": 91.52, "step": 74660, "token_acc": 0.7656960873521383, "train_speed(iter/s)": 0.137515 }, { "epoch": 0.9688291719245204, "grad_norm": 0.708742082118988, "learning_rate": 5.6690462586002504e-05, "loss": 0.8277383804321289, "memory(GiB)": 91.52, "step": 74665, "token_acc": 0.7594969957645205, "train_speed(iter/s)": 0.137514 }, { "epoch": 0.9688940503261761, "grad_norm": 0.7660173177719116, "learning_rate": 5.6685147030835515e-05, "loss": 0.8374848365783691, "memory(GiB)": 91.52, "step": 74670, "token_acc": 0.7800025385542934, "train_speed(iter/s)": 0.137512 }, { "epoch": 0.9689589287278318, "grad_norm": 0.740380585193634, "learning_rate": 5.6679831398736305e-05, "loss": 0.8069021224975585, "memory(GiB)": 91.52, "step": 74675, "token_acc": 0.7761637112096264, "train_speed(iter/s)": 0.137511 }, { "epoch": 0.9690238071294875, "grad_norm": 0.7113214731216431, "learning_rate": 5.667451568976601e-05, "loss": 0.8323312759399414, "memory(GiB)": 91.52, "step": 74680, "token_acc": 0.7686306259890332, "train_speed(iter/s)": 0.13751 }, { "epoch": 0.9690886855311432, "grad_norm": 0.7716314792633057, "learning_rate": 5.666919990398582e-05, "loss": 0.8314611434936523, "memory(GiB)": 91.52, "step": 74685, "token_acc": 0.7724482090679273, "train_speed(iter/s)": 0.137509 }, { "epoch": 0.969153563932799, "grad_norm": 0.71671062707901, "learning_rate": 5.666388404145692e-05, "loss": 0.834961223602295, "memory(GiB)": 91.52, "step": 74690, "token_acc": 0.7594121199919468, "train_speed(iter/s)": 0.137508 }, { "epoch": 0.9692184423344546, "grad_norm": 0.726046621799469, "learning_rate": 5.665856810224046e-05, "loss": 0.869113540649414, "memory(GiB)": 91.52, "step": 74695, "token_acc": 0.7758376592732421, "train_speed(iter/s)": 0.137507 }, { "epoch": 0.9692833207361103, "grad_norm": 0.7503023147583008, "learning_rate": 5.665325208639762e-05, "loss": 0.8790606498718262, "memory(GiB)": 91.52, "step": 74700, "token_acc": 0.7419953022623316, "train_speed(iter/s)": 0.137505 }, { "epoch": 0.969348199137766, "grad_norm": 0.774289071559906, "learning_rate": 5.6647935993989585e-05, "loss": 0.842841911315918, "memory(GiB)": 91.52, "step": 74705, "token_acc": 0.773881668826847, "train_speed(iter/s)": 0.137504 }, { "epoch": 0.9694130775394217, "grad_norm": 0.6655502319335938, "learning_rate": 5.6642619825077546e-05, "loss": 0.8377365112304688, "memory(GiB)": 91.52, "step": 74710, "token_acc": 0.7668405913439026, "train_speed(iter/s)": 0.137503 }, { "epoch": 0.9694779559410774, "grad_norm": 0.7953096628189087, "learning_rate": 5.663730357972265e-05, "loss": 0.8915085792541504, "memory(GiB)": 91.52, "step": 74715, "token_acc": 0.7472402278530741, "train_speed(iter/s)": 0.137501 }, { "epoch": 0.9695428343427331, "grad_norm": 0.7270669937133789, "learning_rate": 5.66319872579861e-05, "loss": 0.8865522384643555, "memory(GiB)": 91.52, "step": 74720, "token_acc": 0.7553696657127582, "train_speed(iter/s)": 0.1375 }, { "epoch": 0.9696077127443888, "grad_norm": 0.7621355652809143, "learning_rate": 5.662667085992907e-05, "loss": 0.8109657287597656, "memory(GiB)": 91.52, "step": 74725, "token_acc": 0.7573496848918413, "train_speed(iter/s)": 0.137499 }, { "epoch": 0.9696725911460445, "grad_norm": 0.7680351138114929, "learning_rate": 5.662135438561273e-05, "loss": 0.8430585861206055, "memory(GiB)": 91.52, "step": 74730, "token_acc": 0.7628895590980899, "train_speed(iter/s)": 0.137498 }, { "epoch": 0.9697374695477002, "grad_norm": 0.6911022067070007, "learning_rate": 5.6616037835098276e-05, "loss": 0.8891742706298829, "memory(GiB)": 91.52, "step": 74735, "token_acc": 0.7496519721577726, "train_speed(iter/s)": 0.137497 }, { "epoch": 0.9698023479493559, "grad_norm": 0.7153539061546326, "learning_rate": 5.6610721208446874e-05, "loss": 0.8205510139465332, "memory(GiB)": 91.52, "step": 74740, "token_acc": 0.7832585380695216, "train_speed(iter/s)": 0.137495 }, { "epoch": 0.9698672263510116, "grad_norm": 0.7576988935470581, "learning_rate": 5.6605404505719736e-05, "loss": 0.8029294013977051, "memory(GiB)": 91.52, "step": 74745, "token_acc": 0.767458785558333, "train_speed(iter/s)": 0.137494 }, { "epoch": 0.9699321047526673, "grad_norm": 0.8252048492431641, "learning_rate": 5.6600087726978e-05, "loss": 0.8302091598510742, "memory(GiB)": 91.52, "step": 74750, "token_acc": 0.7749030305613571, "train_speed(iter/s)": 0.137493 }, { "epoch": 0.969996983154323, "grad_norm": 0.6906274557113647, "learning_rate": 5.659477087228291e-05, "loss": 0.8015111923217774, "memory(GiB)": 91.52, "step": 74755, "token_acc": 0.7656279870005736, "train_speed(iter/s)": 0.137492 }, { "epoch": 0.9700618615559787, "grad_norm": 0.7145060300827026, "learning_rate": 5.6589453941695604e-05, "loss": 0.8207168579101562, "memory(GiB)": 91.52, "step": 74760, "token_acc": 0.8009683832488937, "train_speed(iter/s)": 0.13749 }, { "epoch": 0.9701267399576344, "grad_norm": 0.8053091168403625, "learning_rate": 5.6584136935277285e-05, "loss": 0.8277698516845703, "memory(GiB)": 91.52, "step": 74765, "token_acc": 0.7525991189427312, "train_speed(iter/s)": 0.137489 }, { "epoch": 0.9701916183592901, "grad_norm": 0.73594731092453, "learning_rate": 5.657881985308915e-05, "loss": 0.8308332443237305, "memory(GiB)": 91.52, "step": 74770, "token_acc": 0.7681475424729718, "train_speed(iter/s)": 0.137488 }, { "epoch": 0.9702564967609458, "grad_norm": 0.710320770740509, "learning_rate": 5.657350269519237e-05, "loss": 0.8053097724914551, "memory(GiB)": 91.52, "step": 74775, "token_acc": 0.7668793299150651, "train_speed(iter/s)": 0.137486 }, { "epoch": 0.9703213751626015, "grad_norm": 0.8699784278869629, "learning_rate": 5.6568185461648146e-05, "loss": 0.8612120628356934, "memory(GiB)": 91.52, "step": 74780, "token_acc": 0.7636938690393438, "train_speed(iter/s)": 0.137486 }, { "epoch": 0.9703862535642572, "grad_norm": 0.8226265907287598, "learning_rate": 5.656286815251768e-05, "loss": 0.8672676086425781, "memory(GiB)": 91.52, "step": 74785, "token_acc": 0.7605725439167209, "train_speed(iter/s)": 0.137485 }, { "epoch": 0.9704511319659129, "grad_norm": 0.8252704739570618, "learning_rate": 5.6557550767862145e-05, "loss": 0.8311319351196289, "memory(GiB)": 91.52, "step": 74790, "token_acc": 0.7543959854759088, "train_speed(iter/s)": 0.137484 }, { "epoch": 0.9705160103675686, "grad_norm": 0.73103928565979, "learning_rate": 5.655223330774273e-05, "loss": 0.9420988082885742, "memory(GiB)": 91.52, "step": 74795, "token_acc": 0.7589681451484328, "train_speed(iter/s)": 0.137482 }, { "epoch": 0.9705808887692243, "grad_norm": 0.6682544350624084, "learning_rate": 5.6546915772220646e-05, "loss": 0.839176082611084, "memory(GiB)": 91.52, "step": 74800, "token_acc": 0.7402547526083338, "train_speed(iter/s)": 0.13748 }, { "epoch": 0.97064576717088, "grad_norm": 0.6878613829612732, "learning_rate": 5.654159816135707e-05, "loss": 0.7925166130065918, "memory(GiB)": 91.52, "step": 74805, "token_acc": 0.7900079496116921, "train_speed(iter/s)": 0.137479 }, { "epoch": 0.9707106455725357, "grad_norm": 0.7115638852119446, "learning_rate": 5.6536280475213186e-05, "loss": 0.8466316223144531, "memory(GiB)": 91.52, "step": 74810, "token_acc": 0.762567042722397, "train_speed(iter/s)": 0.137477 }, { "epoch": 0.9707755239741914, "grad_norm": 0.6456381678581238, "learning_rate": 5.653096271385023e-05, "loss": 0.8617389678955079, "memory(GiB)": 91.52, "step": 74815, "token_acc": 0.7621390700690391, "train_speed(iter/s)": 0.137476 }, { "epoch": 0.9708404023758471, "grad_norm": 0.7566513419151306, "learning_rate": 5.6525644877329364e-05, "loss": 0.8502895355224609, "memory(GiB)": 91.52, "step": 74820, "token_acc": 0.7695351970360391, "train_speed(iter/s)": 0.137475 }, { "epoch": 0.9709052807775027, "grad_norm": 0.7811445593833923, "learning_rate": 5.652032696571181e-05, "loss": 0.8572532653808593, "memory(GiB)": 91.52, "step": 74825, "token_acc": 0.7707443076726832, "train_speed(iter/s)": 0.137474 }, { "epoch": 0.9709701591791584, "grad_norm": 0.696381151676178, "learning_rate": 5.6515008979058745e-05, "loss": 0.8259521484375, "memory(GiB)": 91.52, "step": 74830, "token_acc": 0.7825226823112081, "train_speed(iter/s)": 0.137472 }, { "epoch": 0.9710350375808141, "grad_norm": 0.7686933875083923, "learning_rate": 5.650969091743138e-05, "loss": 0.8468266487121582, "memory(GiB)": 91.52, "step": 74835, "token_acc": 0.7628669327775619, "train_speed(iter/s)": 0.137471 }, { "epoch": 0.9710999159824698, "grad_norm": 0.641784131526947, "learning_rate": 5.650437278089091e-05, "loss": 0.7989197731018066, "memory(GiB)": 91.52, "step": 74840, "token_acc": 0.7607209658221391, "train_speed(iter/s)": 0.13747 }, { "epoch": 0.9711647943841255, "grad_norm": 0.7285668253898621, "learning_rate": 5.649905456949852e-05, "loss": 0.8122289657592774, "memory(GiB)": 91.52, "step": 74845, "token_acc": 0.7790083842252354, "train_speed(iter/s)": 0.137468 }, { "epoch": 0.9712296727857812, "grad_norm": 0.7175189852714539, "learning_rate": 5.649373628331545e-05, "loss": 0.828127098083496, "memory(GiB)": 91.52, "step": 74850, "token_acc": 0.7728200153861592, "train_speed(iter/s)": 0.137467 }, { "epoch": 0.9712945511874369, "grad_norm": 0.6035562753677368, "learning_rate": 5.6488417922402867e-05, "loss": 0.807741928100586, "memory(GiB)": 91.52, "step": 74855, "token_acc": 0.7680065803590588, "train_speed(iter/s)": 0.137466 }, { "epoch": 0.9713594295890926, "grad_norm": 0.7455465793609619, "learning_rate": 5.6483099486822e-05, "loss": 0.8283103942871094, "memory(GiB)": 91.52, "step": 74860, "token_acc": 0.7692428674782171, "train_speed(iter/s)": 0.137465 }, { "epoch": 0.9714243079907483, "grad_norm": 0.7211979627609253, "learning_rate": 5.647778097663403e-05, "loss": 0.881754207611084, "memory(GiB)": 91.52, "step": 74865, "token_acc": 0.7511514052583862, "train_speed(iter/s)": 0.137464 }, { "epoch": 0.971489186392404, "grad_norm": 0.8099461793899536, "learning_rate": 5.647246239190017e-05, "loss": 0.8020334243774414, "memory(GiB)": 91.52, "step": 74870, "token_acc": 0.7834600132187707, "train_speed(iter/s)": 0.137463 }, { "epoch": 0.9715540647940597, "grad_norm": 0.7000651955604553, "learning_rate": 5.646714373268165e-05, "loss": 0.7967930793762207, "memory(GiB)": 91.52, "step": 74875, "token_acc": 0.7817241497414118, "train_speed(iter/s)": 0.137462 }, { "epoch": 0.9716189431957154, "grad_norm": 0.699580192565918, "learning_rate": 5.646182499903964e-05, "loss": 0.7766201972961426, "memory(GiB)": 91.52, "step": 74880, "token_acc": 0.7869662205226259, "train_speed(iter/s)": 0.13746 }, { "epoch": 0.9716838215973711, "grad_norm": 0.6863961219787598, "learning_rate": 5.6456506191035366e-05, "loss": 0.8905197143554687, "memory(GiB)": 91.52, "step": 74885, "token_acc": 0.7551228348239556, "train_speed(iter/s)": 0.137459 }, { "epoch": 0.9717486999990268, "grad_norm": 0.731590747833252, "learning_rate": 5.6451187308730035e-05, "loss": 0.8553516387939453, "memory(GiB)": 91.52, "step": 74890, "token_acc": 0.7586818196207541, "train_speed(iter/s)": 0.137458 }, { "epoch": 0.9718135784006825, "grad_norm": 0.7316629886627197, "learning_rate": 5.644586835218486e-05, "loss": 0.8458452224731445, "memory(GiB)": 91.52, "step": 74895, "token_acc": 0.7604755048547274, "train_speed(iter/s)": 0.137457 }, { "epoch": 0.9718784568023382, "grad_norm": 0.7453798651695251, "learning_rate": 5.644054932146104e-05, "loss": 0.8264436721801758, "memory(GiB)": 91.52, "step": 74900, "token_acc": 0.7444453454427505, "train_speed(iter/s)": 0.137455 }, { "epoch": 0.9719433352039939, "grad_norm": 0.6513360142707825, "learning_rate": 5.643523021661979e-05, "loss": 0.8305842399597168, "memory(GiB)": 91.52, "step": 74905, "token_acc": 0.7504393055457932, "train_speed(iter/s)": 0.137454 }, { "epoch": 0.9720082136056496, "grad_norm": 0.7380366921424866, "learning_rate": 5.642991103772235e-05, "loss": 0.8614068984985351, "memory(GiB)": 91.52, "step": 74910, "token_acc": 0.7486183980412732, "train_speed(iter/s)": 0.137452 }, { "epoch": 0.9720730920073053, "grad_norm": 0.7018104791641235, "learning_rate": 5.642459178482988e-05, "loss": 0.8245336532592773, "memory(GiB)": 91.52, "step": 74915, "token_acc": 0.7681950509461426, "train_speed(iter/s)": 0.137451 }, { "epoch": 0.972137970408961, "grad_norm": 0.6572501063346863, "learning_rate": 5.6419272458003624e-05, "loss": 0.8462533950805664, "memory(GiB)": 91.52, "step": 74920, "token_acc": 0.77068251395642, "train_speed(iter/s)": 0.137449 }, { "epoch": 0.9722028488106167, "grad_norm": 0.6590708494186401, "learning_rate": 5.6413953057304805e-05, "loss": 0.8116151809692382, "memory(GiB)": 91.52, "step": 74925, "token_acc": 0.75886624832546, "train_speed(iter/s)": 0.137448 }, { "epoch": 0.9722677272122724, "grad_norm": 0.7589411735534668, "learning_rate": 5.640863358279462e-05, "loss": 0.8475668907165528, "memory(GiB)": 91.52, "step": 74930, "token_acc": 0.7701695829540474, "train_speed(iter/s)": 0.137447 }, { "epoch": 0.9723326056139281, "grad_norm": 0.7631983757019043, "learning_rate": 5.6403314034534307e-05, "loss": 0.8243491172790527, "memory(GiB)": 91.52, "step": 74935, "token_acc": 0.7703747356744202, "train_speed(iter/s)": 0.137446 }, { "epoch": 0.9723974840155838, "grad_norm": 0.7621126770973206, "learning_rate": 5.639799441258504e-05, "loss": 0.8397440910339355, "memory(GiB)": 91.52, "step": 74940, "token_acc": 0.7884692428160396, "train_speed(iter/s)": 0.137445 }, { "epoch": 0.9724623624172395, "grad_norm": 0.646449089050293, "learning_rate": 5.6392674717008086e-05, "loss": 0.7740941047668457, "memory(GiB)": 91.52, "step": 74945, "token_acc": 0.7627250091840483, "train_speed(iter/s)": 0.137443 }, { "epoch": 0.9725272408188952, "grad_norm": 0.8065676093101501, "learning_rate": 5.638735494786463e-05, "loss": 0.8429422378540039, "memory(GiB)": 91.52, "step": 74950, "token_acc": 0.7744552437686157, "train_speed(iter/s)": 0.137442 }, { "epoch": 0.9725921192205509, "grad_norm": 0.8065354228019714, "learning_rate": 5.638203510521591e-05, "loss": 0.8713619232177734, "memory(GiB)": 91.52, "step": 74955, "token_acc": 0.7754613570140142, "train_speed(iter/s)": 0.137441 }, { "epoch": 0.9726569976222066, "grad_norm": 0.7042796015739441, "learning_rate": 5.637671518912315e-05, "loss": 0.8881832122802734, "memory(GiB)": 91.52, "step": 74960, "token_acc": 0.7692069392812887, "train_speed(iter/s)": 0.13744 }, { "epoch": 0.9727218760238623, "grad_norm": 0.6973334550857544, "learning_rate": 5.6371395199647556e-05, "loss": 0.8660093307495117, "memory(GiB)": 91.52, "step": 74965, "token_acc": 0.7461616823109947, "train_speed(iter/s)": 0.137439 }, { "epoch": 0.972786754425518, "grad_norm": 0.7529042363166809, "learning_rate": 5.6366075136850364e-05, "loss": 0.8562183380126953, "memory(GiB)": 91.52, "step": 74970, "token_acc": 0.768891446106636, "train_speed(iter/s)": 0.137437 }, { "epoch": 0.9728516328271737, "grad_norm": 0.7484025955200195, "learning_rate": 5.636075500079276e-05, "loss": 0.8394872665405273, "memory(GiB)": 91.52, "step": 74975, "token_acc": 0.7652896799238262, "train_speed(iter/s)": 0.137436 }, { "epoch": 0.9729165112288294, "grad_norm": 0.7626211047172546, "learning_rate": 5.635543479153603e-05, "loss": 0.8301149368286133, "memory(GiB)": 91.52, "step": 74980, "token_acc": 0.7484374013510954, "train_speed(iter/s)": 0.137435 }, { "epoch": 0.9729813896304851, "grad_norm": 0.7441506385803223, "learning_rate": 5.635011450914134e-05, "loss": 0.9027986526489258, "memory(GiB)": 91.52, "step": 74985, "token_acc": 0.7780161168763406, "train_speed(iter/s)": 0.137434 }, { "epoch": 0.9730462680321408, "grad_norm": 0.7382978200912476, "learning_rate": 5.634479415366996e-05, "loss": 0.8614774703979492, "memory(GiB)": 91.52, "step": 74990, "token_acc": 0.777161731744721, "train_speed(iter/s)": 0.137432 }, { "epoch": 0.9731111464337965, "grad_norm": 0.8166074156761169, "learning_rate": 5.633947372518308e-05, "loss": 0.8863780975341797, "memory(GiB)": 91.52, "step": 74995, "token_acc": 0.7602774274905423, "train_speed(iter/s)": 0.137432 }, { "epoch": 0.9731760248354522, "grad_norm": 0.7756046652793884, "learning_rate": 5.633415322374196e-05, "loss": 0.855548095703125, "memory(GiB)": 91.52, "step": 75000, "token_acc": 0.7557188409815611, "train_speed(iter/s)": 0.13743 }, { "epoch": 0.9732409032371079, "grad_norm": 0.7719335556030273, "learning_rate": 5.632883264940782e-05, "loss": 0.8225940704345703, "memory(GiB)": 91.52, "step": 75005, "token_acc": 0.7716522373702193, "train_speed(iter/s)": 0.137429 }, { "epoch": 0.9733057816387636, "grad_norm": 0.6801490783691406, "learning_rate": 5.632351200224185e-05, "loss": 0.8612653732299804, "memory(GiB)": 91.52, "step": 75010, "token_acc": 0.748650732459522, "train_speed(iter/s)": 0.137428 }, { "epoch": 0.9733706600404193, "grad_norm": 0.7827003598213196, "learning_rate": 5.631819128230534e-05, "loss": 0.864291763305664, "memory(GiB)": 91.52, "step": 75015, "token_acc": 0.7593699458547218, "train_speed(iter/s)": 0.137426 }, { "epoch": 0.973435538442075, "grad_norm": 0.7219066619873047, "learning_rate": 5.631287048965947e-05, "loss": 0.9103012084960938, "memory(GiB)": 91.52, "step": 75020, "token_acc": 0.7530995638286911, "train_speed(iter/s)": 0.137425 }, { "epoch": 0.9735004168437307, "grad_norm": 0.6647114157676697, "learning_rate": 5.6307549624365505e-05, "loss": 0.8068302154541016, "memory(GiB)": 91.52, "step": 75025, "token_acc": 0.7605304799939718, "train_speed(iter/s)": 0.137423 }, { "epoch": 0.9735652952453864, "grad_norm": 0.8187376260757446, "learning_rate": 5.6302228686484657e-05, "loss": 0.8527294158935547, "memory(GiB)": 91.52, "step": 75030, "token_acc": 0.761885502077435, "train_speed(iter/s)": 0.137422 }, { "epoch": 0.9736301736470421, "grad_norm": 0.6265847086906433, "learning_rate": 5.6296907676078184e-05, "loss": 0.8181112289428711, "memory(GiB)": 91.52, "step": 75035, "token_acc": 0.7712013828867762, "train_speed(iter/s)": 0.137421 }, { "epoch": 0.9736950520486978, "grad_norm": 0.7518296837806702, "learning_rate": 5.629158659320729e-05, "loss": 0.846309757232666, "memory(GiB)": 91.52, "step": 75040, "token_acc": 0.7607387192044562, "train_speed(iter/s)": 0.13742 }, { "epoch": 0.9737599304503535, "grad_norm": 0.7782810926437378, "learning_rate": 5.628626543793322e-05, "loss": 0.8448934555053711, "memory(GiB)": 91.52, "step": 75045, "token_acc": 0.7641696560455792, "train_speed(iter/s)": 0.137419 }, { "epoch": 0.9738248088520092, "grad_norm": 0.7699460983276367, "learning_rate": 5.628094421031721e-05, "loss": 0.8834146499633789, "memory(GiB)": 91.52, "step": 75050, "token_acc": 0.7560361638007446, "train_speed(iter/s)": 0.137418 }, { "epoch": 0.9738896872536649, "grad_norm": 0.7974493503570557, "learning_rate": 5.627562291042051e-05, "loss": 0.90172119140625, "memory(GiB)": 91.52, "step": 75055, "token_acc": 0.7451560628478827, "train_speed(iter/s)": 0.137417 }, { "epoch": 0.9739545656553206, "grad_norm": 0.7709702849388123, "learning_rate": 5.627030153830434e-05, "loss": 0.8708658218383789, "memory(GiB)": 91.52, "step": 75060, "token_acc": 0.7452421963618031, "train_speed(iter/s)": 0.137416 }, { "epoch": 0.9740194440569762, "grad_norm": 0.7399632334709167, "learning_rate": 5.626498009402993e-05, "loss": 0.8073520660400391, "memory(GiB)": 91.52, "step": 75065, "token_acc": 0.7818762312342912, "train_speed(iter/s)": 0.137414 }, { "epoch": 0.9740843224586319, "grad_norm": 0.786095142364502, "learning_rate": 5.625965857765856e-05, "loss": 0.88502197265625, "memory(GiB)": 91.52, "step": 75070, "token_acc": 0.7709747009217494, "train_speed(iter/s)": 0.137413 }, { "epoch": 0.9741492008602876, "grad_norm": 0.8601041436195374, "learning_rate": 5.625433698925142e-05, "loss": 0.869575309753418, "memory(GiB)": 91.52, "step": 75075, "token_acc": 0.7564264235134348, "train_speed(iter/s)": 0.137412 }, { "epoch": 0.9742140792619433, "grad_norm": 0.7913742661476135, "learning_rate": 5.624901532886978e-05, "loss": 0.8631834983825684, "memory(GiB)": 91.52, "step": 75080, "token_acc": 0.7735881542699724, "train_speed(iter/s)": 0.13741 }, { "epoch": 0.974278957663599, "grad_norm": 0.7548871636390686, "learning_rate": 5.624369359657488e-05, "loss": 0.8151149749755859, "memory(GiB)": 91.52, "step": 75085, "token_acc": 0.7777741353876414, "train_speed(iter/s)": 0.137409 }, { "epoch": 0.9743438360652547, "grad_norm": 0.8396301865577698, "learning_rate": 5.6238371792427946e-05, "loss": 0.8154279708862304, "memory(GiB)": 91.52, "step": 75090, "token_acc": 0.754895842619833, "train_speed(iter/s)": 0.137407 }, { "epoch": 0.9744087144669104, "grad_norm": 0.8350006341934204, "learning_rate": 5.6233049916490234e-05, "loss": 0.8500999450683594, "memory(GiB)": 91.52, "step": 75095, "token_acc": 0.7711499123319696, "train_speed(iter/s)": 0.137406 }, { "epoch": 0.974473592868566, "grad_norm": 0.78655606508255, "learning_rate": 5.6227727968822974e-05, "loss": 0.8255766868591309, "memory(GiB)": 91.52, "step": 75100, "token_acc": 0.7640642939150402, "train_speed(iter/s)": 0.137405 }, { "epoch": 0.9745384712702218, "grad_norm": 0.7244675755500793, "learning_rate": 5.622240594948744e-05, "loss": 0.8510059356689453, "memory(GiB)": 91.52, "step": 75105, "token_acc": 0.7609342560553634, "train_speed(iter/s)": 0.137404 }, { "epoch": 0.9746033496718775, "grad_norm": 0.7507063746452332, "learning_rate": 5.621708385854485e-05, "loss": 0.8386702537536621, "memory(GiB)": 91.52, "step": 75110, "token_acc": 0.7835825450106805, "train_speed(iter/s)": 0.137403 }, { "epoch": 0.9746682280735331, "grad_norm": 0.7056378722190857, "learning_rate": 5.6211761696056456e-05, "loss": 0.888874626159668, "memory(GiB)": 91.52, "step": 75115, "token_acc": 0.786902569366867, "train_speed(iter/s)": 0.137402 }, { "epoch": 0.9747331064751888, "grad_norm": 0.7106817960739136, "learning_rate": 5.6206439462083515e-05, "loss": 0.8014088630676269, "memory(GiB)": 91.52, "step": 75120, "token_acc": 0.7906696159260004, "train_speed(iter/s)": 0.1374 }, { "epoch": 0.9747979848768445, "grad_norm": 0.7752029895782471, "learning_rate": 5.6201117156687264e-05, "loss": 0.8631761550903321, "memory(GiB)": 91.52, "step": 75125, "token_acc": 0.7578923546939237, "train_speed(iter/s)": 0.137399 }, { "epoch": 0.9748628632785002, "grad_norm": 0.7289574146270752, "learning_rate": 5.6195794779928956e-05, "loss": 0.8393375396728515, "memory(GiB)": 91.52, "step": 75130, "token_acc": 0.7617236198574305, "train_speed(iter/s)": 0.137398 }, { "epoch": 0.974927741680156, "grad_norm": 0.7283686995506287, "learning_rate": 5.619047233186984e-05, "loss": 0.8096706390380859, "memory(GiB)": 91.52, "step": 75135, "token_acc": 0.780298931421615, "train_speed(iter/s)": 0.137397 }, { "epoch": 0.9749926200818116, "grad_norm": 0.67951899766922, "learning_rate": 5.618514981257117e-05, "loss": 0.8036949157714843, "memory(GiB)": 91.52, "step": 75140, "token_acc": 0.7615284585375612, "train_speed(iter/s)": 0.137395 }, { "epoch": 0.9750574984834673, "grad_norm": 0.7952263355255127, "learning_rate": 5.6179827222094195e-05, "loss": 0.8851236343383789, "memory(GiB)": 91.52, "step": 75145, "token_acc": 0.773334206047912, "train_speed(iter/s)": 0.137394 }, { "epoch": 0.975122376885123, "grad_norm": 0.7921167016029358, "learning_rate": 5.617450456050016e-05, "loss": 0.79574556350708, "memory(GiB)": 91.52, "step": 75150, "token_acc": 0.7692599489619973, "train_speed(iter/s)": 0.137393 }, { "epoch": 0.9751872552867787, "grad_norm": 0.6991003155708313, "learning_rate": 5.6169181827850326e-05, "loss": 0.8077776908874512, "memory(GiB)": 91.52, "step": 75155, "token_acc": 0.7838848514781627, "train_speed(iter/s)": 0.137392 }, { "epoch": 0.9752521336884344, "grad_norm": 0.7601674795150757, "learning_rate": 5.616385902420596e-05, "loss": 0.8383529663085938, "memory(GiB)": 91.52, "step": 75160, "token_acc": 0.7813309485629703, "train_speed(iter/s)": 0.137391 }, { "epoch": 0.9753170120900901, "grad_norm": 0.7532485127449036, "learning_rate": 5.615853614962828e-05, "loss": 0.8316457748413086, "memory(GiB)": 91.52, "step": 75165, "token_acc": 0.7680680398242573, "train_speed(iter/s)": 0.13739 }, { "epoch": 0.9753818904917458, "grad_norm": 0.7563793659210205, "learning_rate": 5.615321320417856e-05, "loss": 0.8564794540405274, "memory(GiB)": 91.52, "step": 75170, "token_acc": 0.774531058535206, "train_speed(iter/s)": 0.137388 }, { "epoch": 0.9754467688934015, "grad_norm": 0.7267370820045471, "learning_rate": 5.614789018791807e-05, "loss": 0.8141677856445313, "memory(GiB)": 91.52, "step": 75175, "token_acc": 0.7683314033842591, "train_speed(iter/s)": 0.137388 }, { "epoch": 0.9755116472950572, "grad_norm": 0.691849410533905, "learning_rate": 5.614256710090805e-05, "loss": 0.84811372756958, "memory(GiB)": 91.52, "step": 75180, "token_acc": 0.7786317456128777, "train_speed(iter/s)": 0.137387 }, { "epoch": 0.9755765256967129, "grad_norm": 0.68605637550354, "learning_rate": 5.613724394320977e-05, "loss": 0.8305779457092285, "memory(GiB)": 91.52, "step": 75185, "token_acc": 0.774248032486434, "train_speed(iter/s)": 0.137386 }, { "epoch": 0.9756414040983686, "grad_norm": 0.6568068861961365, "learning_rate": 5.6131920714884476e-05, "loss": 0.8603263854980469, "memory(GiB)": 91.52, "step": 75190, "token_acc": 0.7486180659426416, "train_speed(iter/s)": 0.137385 }, { "epoch": 0.9757062825000243, "grad_norm": 0.6788832545280457, "learning_rate": 5.612659741599344e-05, "loss": 0.8167210578918457, "memory(GiB)": 91.52, "step": 75195, "token_acc": 0.7718172846468349, "train_speed(iter/s)": 0.137384 }, { "epoch": 0.97577116090168, "grad_norm": 0.7559280395507812, "learning_rate": 5.6121274046597915e-05, "loss": 0.8313827514648438, "memory(GiB)": 91.52, "step": 75200, "token_acc": 0.7523707344085278, "train_speed(iter/s)": 0.137383 }, { "epoch": 0.9758360393033357, "grad_norm": 0.782611072063446, "learning_rate": 5.611595060675915e-05, "loss": 0.8516737937927246, "memory(GiB)": 91.52, "step": 75205, "token_acc": 0.7600190515071102, "train_speed(iter/s)": 0.137382 }, { "epoch": 0.9759009177049914, "grad_norm": 0.7226055860519409, "learning_rate": 5.6110627096538424e-05, "loss": 0.8635848999023438, "memory(GiB)": 91.52, "step": 75210, "token_acc": 0.7730500852845402, "train_speed(iter/s)": 0.137381 }, { "epoch": 0.9759657961066471, "grad_norm": 0.704887866973877, "learning_rate": 5.6105303515996996e-05, "loss": 0.8617971420288086, "memory(GiB)": 91.52, "step": 75215, "token_acc": 0.7723511276975499, "train_speed(iter/s)": 0.13738 }, { "epoch": 0.9760306745083028, "grad_norm": 0.7654839158058167, "learning_rate": 5.609997986519613e-05, "loss": 0.8398920059204101, "memory(GiB)": 91.52, "step": 75220, "token_acc": 0.7830068278986664, "train_speed(iter/s)": 0.137379 }, { "epoch": 0.9760955529099585, "grad_norm": 0.7121729254722595, "learning_rate": 5.609465614419709e-05, "loss": 0.8408506393432618, "memory(GiB)": 91.52, "step": 75225, "token_acc": 0.7501581488263692, "train_speed(iter/s)": 0.137377 }, { "epoch": 0.9761604313116142, "grad_norm": 0.7321926951408386, "learning_rate": 5.608933235306113e-05, "loss": 0.8352458953857422, "memory(GiB)": 91.52, "step": 75230, "token_acc": 0.7618812579325906, "train_speed(iter/s)": 0.137377 }, { "epoch": 0.9762253097132699, "grad_norm": 0.6843953728675842, "learning_rate": 5.608400849184955e-05, "loss": 0.8234739303588867, "memory(GiB)": 91.52, "step": 75235, "token_acc": 0.7810658810926621, "train_speed(iter/s)": 0.137376 }, { "epoch": 0.9762901881149256, "grad_norm": 0.6795206069946289, "learning_rate": 5.607868456062356e-05, "loss": 0.8414130210876465, "memory(GiB)": 91.52, "step": 75240, "token_acc": 0.7664182207862832, "train_speed(iter/s)": 0.137374 }, { "epoch": 0.9763550665165813, "grad_norm": 0.6551769375801086, "learning_rate": 5.6073360559444474e-05, "loss": 0.8379364967346191, "memory(GiB)": 91.52, "step": 75245, "token_acc": 0.7884192128430865, "train_speed(iter/s)": 0.137373 }, { "epoch": 0.976419944918237, "grad_norm": 0.7769646048545837, "learning_rate": 5.606803648837354e-05, "loss": 0.8266674041748047, "memory(GiB)": 91.52, "step": 75250, "token_acc": 0.7765885100972768, "train_speed(iter/s)": 0.137372 }, { "epoch": 0.9764848233198927, "grad_norm": 0.7980539798736572, "learning_rate": 5.6062712347472045e-05, "loss": 0.8530298233032226, "memory(GiB)": 91.52, "step": 75255, "token_acc": 0.7890477191507252, "train_speed(iter/s)": 0.137371 }, { "epoch": 0.9765497017215484, "grad_norm": 0.697979211807251, "learning_rate": 5.605738813680124e-05, "loss": 0.8210332870483399, "memory(GiB)": 91.52, "step": 75260, "token_acc": 0.7760786875168094, "train_speed(iter/s)": 0.13737 }, { "epoch": 0.9766145801232041, "grad_norm": 0.6483267545700073, "learning_rate": 5.6052063856422386e-05, "loss": 0.8190479278564453, "memory(GiB)": 91.52, "step": 75265, "token_acc": 0.7535258156285939, "train_speed(iter/s)": 0.137369 }, { "epoch": 0.9766794585248598, "grad_norm": 0.7110550999641418, "learning_rate": 5.6046739506396804e-05, "loss": 0.817469596862793, "memory(GiB)": 91.52, "step": 75270, "token_acc": 0.7942790620322634, "train_speed(iter/s)": 0.137367 }, { "epoch": 0.9767443369265155, "grad_norm": 0.6933779120445251, "learning_rate": 5.604141508678571e-05, "loss": 0.8449396133422852, "memory(GiB)": 91.52, "step": 75275, "token_acc": 0.7601763907734057, "train_speed(iter/s)": 0.137366 }, { "epoch": 0.9768092153281712, "grad_norm": 0.7320897579193115, "learning_rate": 5.6036090597650404e-05, "loss": 0.8765705108642579, "memory(GiB)": 91.52, "step": 75280, "token_acc": 0.7597415944795184, "train_speed(iter/s)": 0.137365 }, { "epoch": 0.9768740937298269, "grad_norm": 0.650600790977478, "learning_rate": 5.603076603905215e-05, "loss": 0.871196174621582, "memory(GiB)": 91.52, "step": 75285, "token_acc": 0.7794574684418525, "train_speed(iter/s)": 0.137364 }, { "epoch": 0.9769389721314826, "grad_norm": 0.7351893782615662, "learning_rate": 5.6025441411052234e-05, "loss": 0.8288151741027832, "memory(GiB)": 91.52, "step": 75290, "token_acc": 0.7903391572456321, "train_speed(iter/s)": 0.137363 }, { "epoch": 0.9770038505331383, "grad_norm": 0.7234529852867126, "learning_rate": 5.602011671371193e-05, "loss": 0.8427721977233886, "memory(GiB)": 91.52, "step": 75295, "token_acc": 0.7477893890675241, "train_speed(iter/s)": 0.137362 }, { "epoch": 0.977068728934794, "grad_norm": 0.6856206059455872, "learning_rate": 5.60147919470925e-05, "loss": 0.8547185897827149, "memory(GiB)": 91.52, "step": 75300, "token_acc": 0.7641740627897735, "train_speed(iter/s)": 0.137361 }, { "epoch": 0.9771336073364496, "grad_norm": 0.7601006031036377, "learning_rate": 5.600946711125525e-05, "loss": 0.8370277404785156, "memory(GiB)": 91.52, "step": 75305, "token_acc": 0.7535104986876641, "train_speed(iter/s)": 0.137359 }, { "epoch": 0.9771984857381053, "grad_norm": 0.6955156922340393, "learning_rate": 5.600414220626142e-05, "loss": 0.8235291481018067, "memory(GiB)": 91.52, "step": 75310, "token_acc": 0.7634888438133874, "train_speed(iter/s)": 0.137358 }, { "epoch": 0.977263364139761, "grad_norm": 0.7197437882423401, "learning_rate": 5.599881723217233e-05, "loss": 0.8195899963378906, "memory(GiB)": 91.52, "step": 75315, "token_acc": 0.7904103852596315, "train_speed(iter/s)": 0.137356 }, { "epoch": 0.9773282425414167, "grad_norm": 0.689494788646698, "learning_rate": 5.599349218904921e-05, "loss": 0.8742835044860839, "memory(GiB)": 91.52, "step": 75320, "token_acc": 0.7750322449256669, "train_speed(iter/s)": 0.137355 }, { "epoch": 0.9773931209430724, "grad_norm": 0.7064238786697388, "learning_rate": 5.598816707695339e-05, "loss": 0.8343423843383789, "memory(GiB)": 91.52, "step": 75325, "token_acc": 0.7565695515566981, "train_speed(iter/s)": 0.137353 }, { "epoch": 0.9774579993447281, "grad_norm": 0.767027735710144, "learning_rate": 5.5982841895946126e-05, "loss": 0.8708998680114746, "memory(GiB)": 91.52, "step": 75330, "token_acc": 0.7472511729522582, "train_speed(iter/s)": 0.137352 }, { "epoch": 0.9775228777463838, "grad_norm": 0.7370847463607788, "learning_rate": 5.597751664608868e-05, "loss": 0.8408771514892578, "memory(GiB)": 91.52, "step": 75335, "token_acc": 0.7871926073010539, "train_speed(iter/s)": 0.137351 }, { "epoch": 0.9775877561480395, "grad_norm": 0.8002889752388, "learning_rate": 5.597219132744239e-05, "loss": 0.8867074012756347, "memory(GiB)": 91.52, "step": 75340, "token_acc": 0.7535723564562224, "train_speed(iter/s)": 0.137351 }, { "epoch": 0.9776526345496952, "grad_norm": 0.7512273192405701, "learning_rate": 5.5966865940068483e-05, "loss": 0.8718490600585938, "memory(GiB)": 91.52, "step": 75345, "token_acc": 0.7601913548607552, "train_speed(iter/s)": 0.13735 }, { "epoch": 0.9777175129513509, "grad_norm": 0.7031661868095398, "learning_rate": 5.5961540484028276e-05, "loss": 0.826631736755371, "memory(GiB)": 91.52, "step": 75350, "token_acc": 0.7561263356591499, "train_speed(iter/s)": 0.137349 }, { "epoch": 0.9777823913530066, "grad_norm": 0.7400516271591187, "learning_rate": 5.5956214959383034e-05, "loss": 0.8092000961303711, "memory(GiB)": 91.52, "step": 75355, "token_acc": 0.778597050617776, "train_speed(iter/s)": 0.137347 }, { "epoch": 0.9778472697546623, "grad_norm": 0.8278023600578308, "learning_rate": 5.595088936619407e-05, "loss": 0.8678821563720703, "memory(GiB)": 91.52, "step": 75360, "token_acc": 0.7712696941612605, "train_speed(iter/s)": 0.137346 }, { "epoch": 0.977912148156318, "grad_norm": 0.742534339427948, "learning_rate": 5.594556370452265e-05, "loss": 0.8440858840942382, "memory(GiB)": 91.52, "step": 75365, "token_acc": 0.7678415269381392, "train_speed(iter/s)": 0.137345 }, { "epoch": 0.9779770265579737, "grad_norm": 0.691521942615509, "learning_rate": 5.5940237974430055e-05, "loss": 0.8413146018981934, "memory(GiB)": 91.52, "step": 75370, "token_acc": 0.7568284861218623, "train_speed(iter/s)": 0.137344 }, { "epoch": 0.9780419049596294, "grad_norm": 0.7237668037414551, "learning_rate": 5.593491217597759e-05, "loss": 0.8828061103820801, "memory(GiB)": 91.52, "step": 75375, "token_acc": 0.7690277671536755, "train_speed(iter/s)": 0.137343 }, { "epoch": 0.9781067833612851, "grad_norm": 0.6725212335586548, "learning_rate": 5.592958630922653e-05, "loss": 0.8260522842407226, "memory(GiB)": 91.52, "step": 75380, "token_acc": 0.7899472079708528, "train_speed(iter/s)": 0.137342 }, { "epoch": 0.9781716617629408, "grad_norm": 0.7100170850753784, "learning_rate": 5.592426037423819e-05, "loss": 0.8679750442504883, "memory(GiB)": 91.52, "step": 75385, "token_acc": 0.755754586362063, "train_speed(iter/s)": 0.137341 }, { "epoch": 0.9782365401645965, "grad_norm": 0.7590659260749817, "learning_rate": 5.591893437107383e-05, "loss": 0.8094484329223632, "memory(GiB)": 91.52, "step": 75390, "token_acc": 0.7577569637245629, "train_speed(iter/s)": 0.13734 }, { "epoch": 0.9783014185662522, "grad_norm": 0.7450286746025085, "learning_rate": 5.5913608299794763e-05, "loss": 0.8633431434631348, "memory(GiB)": 91.52, "step": 75395, "token_acc": 0.7618243813510684, "train_speed(iter/s)": 0.137338 }, { "epoch": 0.9783662969679079, "grad_norm": 0.6937126517295837, "learning_rate": 5.5908282160462266e-05, "loss": 0.8279956817626953, "memory(GiB)": 91.52, "step": 75400, "token_acc": 0.7492563212692117, "train_speed(iter/s)": 0.137337 }, { "epoch": 0.9784311753695636, "grad_norm": 0.7518959641456604, "learning_rate": 5.5902955953137634e-05, "loss": 0.8391512870788574, "memory(GiB)": 91.52, "step": 75405, "token_acc": 0.7864816635711425, "train_speed(iter/s)": 0.137336 }, { "epoch": 0.9784960537712193, "grad_norm": 0.7272306084632874, "learning_rate": 5.5897629677882166e-05, "loss": 0.8305145263671875, "memory(GiB)": 91.52, "step": 75410, "token_acc": 0.7863207823388183, "train_speed(iter/s)": 0.137335 }, { "epoch": 0.978560932172875, "grad_norm": 0.7057629823684692, "learning_rate": 5.589230333475717e-05, "loss": 0.7624073982238769, "memory(GiB)": 91.52, "step": 75415, "token_acc": 0.7962345951587652, "train_speed(iter/s)": 0.137334 }, { "epoch": 0.9786258105745307, "grad_norm": 0.6899158358573914, "learning_rate": 5.588697692382391e-05, "loss": 0.8132473945617675, "memory(GiB)": 91.52, "step": 75420, "token_acc": 0.778211523169233, "train_speed(iter/s)": 0.137332 }, { "epoch": 0.9786906889761864, "grad_norm": 0.7453901767730713, "learning_rate": 5.58816504451437e-05, "loss": 0.8225919723510742, "memory(GiB)": 91.52, "step": 75425, "token_acc": 0.781654513101427, "train_speed(iter/s)": 0.137331 }, { "epoch": 0.9787555673778421, "grad_norm": 0.6588926911354065, "learning_rate": 5.5876323898777846e-05, "loss": 0.8335184097290039, "memory(GiB)": 91.52, "step": 75430, "token_acc": 0.7646894756415025, "train_speed(iter/s)": 0.137329 }, { "epoch": 0.9788204457794978, "grad_norm": 0.7532756328582764, "learning_rate": 5.587099728478763e-05, "loss": 0.8768161773681641, "memory(GiB)": 91.52, "step": 75435, "token_acc": 0.7629658460640265, "train_speed(iter/s)": 0.137329 }, { "epoch": 0.9788853241811535, "grad_norm": 0.6610156297683716, "learning_rate": 5.586567060323434e-05, "loss": 0.8436624526977539, "memory(GiB)": 91.52, "step": 75440, "token_acc": 0.7664542920530178, "train_speed(iter/s)": 0.137327 }, { "epoch": 0.9789502025828092, "grad_norm": 0.7023612856864929, "learning_rate": 5.586034385417931e-05, "loss": 0.8355862617492675, "memory(GiB)": 91.52, "step": 75445, "token_acc": 0.7815988995169392, "train_speed(iter/s)": 0.137326 }, { "epoch": 0.9790150809844649, "grad_norm": 0.7813706994056702, "learning_rate": 5.585501703768381e-05, "loss": 0.8833171844482421, "memory(GiB)": 91.52, "step": 75450, "token_acc": 0.7528536891391499, "train_speed(iter/s)": 0.137325 }, { "epoch": 0.9790799593861206, "grad_norm": 0.7500724792480469, "learning_rate": 5.584969015380916e-05, "loss": 0.8514234542846679, "memory(GiB)": 91.52, "step": 75455, "token_acc": 0.7652833694642304, "train_speed(iter/s)": 0.137324 }, { "epoch": 0.9791448377877763, "grad_norm": 0.7154136300086975, "learning_rate": 5.584436320261663e-05, "loss": 0.8372254371643066, "memory(GiB)": 91.52, "step": 75460, "token_acc": 0.7613057268094509, "train_speed(iter/s)": 0.137323 }, { "epoch": 0.979209716189432, "grad_norm": 0.6666472554206848, "learning_rate": 5.583903618416758e-05, "loss": 0.8636957168579101, "memory(GiB)": 91.52, "step": 75465, "token_acc": 0.7564282001039709, "train_speed(iter/s)": 0.137321 }, { "epoch": 0.9792745945910877, "grad_norm": 0.6939681172370911, "learning_rate": 5.583370909852326e-05, "loss": 0.8189235687255859, "memory(GiB)": 91.52, "step": 75470, "token_acc": 0.7701191875104919, "train_speed(iter/s)": 0.13732 }, { "epoch": 0.9793394729927434, "grad_norm": 0.7490719556808472, "learning_rate": 5.582838194574498e-05, "loss": 0.8282439231872558, "memory(GiB)": 91.52, "step": 75475, "token_acc": 0.7872135360890983, "train_speed(iter/s)": 0.137319 }, { "epoch": 0.9794043513943991, "grad_norm": 0.8358582258224487, "learning_rate": 5.582305472589406e-05, "loss": 0.8616925239562988, "memory(GiB)": 91.52, "step": 75480, "token_acc": 0.7740752960152498, "train_speed(iter/s)": 0.137317 }, { "epoch": 0.9794692297960548, "grad_norm": 0.7278704047203064, "learning_rate": 5.5817727439031806e-05, "loss": 0.8552845001220704, "memory(GiB)": 91.52, "step": 75485, "token_acc": 0.760907084085892, "train_speed(iter/s)": 0.137316 }, { "epoch": 0.9795341081977105, "grad_norm": 0.6985148191452026, "learning_rate": 5.581240008521953e-05, "loss": 0.8674386978149414, "memory(GiB)": 91.52, "step": 75490, "token_acc": 0.7429700209156402, "train_speed(iter/s)": 0.137315 }, { "epoch": 0.9795989865993662, "grad_norm": 0.7744284868240356, "learning_rate": 5.58070726645185e-05, "loss": 0.8209877014160156, "memory(GiB)": 91.52, "step": 75495, "token_acc": 0.7612304530713162, "train_speed(iter/s)": 0.137314 }, { "epoch": 0.9796638650010219, "grad_norm": 0.7767477631568909, "learning_rate": 5.580174517699007e-05, "loss": 0.8381979942321778, "memory(GiB)": 91.52, "step": 75500, "token_acc": 0.7670932706659468, "train_speed(iter/s)": 0.137313 }, { "epoch": 0.9797287434026776, "grad_norm": 0.7658150792121887, "learning_rate": 5.579641762269553e-05, "loss": 0.8132844924926758, "memory(GiB)": 91.52, "step": 75505, "token_acc": 0.7891259423116105, "train_speed(iter/s)": 0.137311 }, { "epoch": 0.9797936218043333, "grad_norm": 0.732265055179596, "learning_rate": 5.579109000169619e-05, "loss": 0.859234619140625, "memory(GiB)": 91.52, "step": 75510, "token_acc": 0.7551650529846764, "train_speed(iter/s)": 0.13731 }, { "epoch": 0.979858500205989, "grad_norm": 0.6959055662155151, "learning_rate": 5.5785762314053355e-05, "loss": 0.816829776763916, "memory(GiB)": 91.52, "step": 75515, "token_acc": 0.7716958737694617, "train_speed(iter/s)": 0.137309 }, { "epoch": 0.9799233786076447, "grad_norm": 0.6467354893684387, "learning_rate": 5.578043455982835e-05, "loss": 0.8228238105773926, "memory(GiB)": 91.52, "step": 75520, "token_acc": 0.7735771587391826, "train_speed(iter/s)": 0.137307 }, { "epoch": 0.9799882570093004, "grad_norm": 0.6436893939971924, "learning_rate": 5.577510673908247e-05, "loss": 0.8019216537475586, "memory(GiB)": 91.52, "step": 75525, "token_acc": 0.7735450706811089, "train_speed(iter/s)": 0.137306 }, { "epoch": 0.9800531354109561, "grad_norm": 0.7894796133041382, "learning_rate": 5.5769778851877016e-05, "loss": 0.8562322616577148, "memory(GiB)": 91.52, "step": 75530, "token_acc": 0.7689512272872171, "train_speed(iter/s)": 0.137304 }, { "epoch": 0.9801180138126118, "grad_norm": 0.7241119742393494, "learning_rate": 5.576445089827333e-05, "loss": 0.8422548294067382, "memory(GiB)": 91.52, "step": 75535, "token_acc": 0.7454953446312546, "train_speed(iter/s)": 0.137304 }, { "epoch": 0.9801828922142674, "grad_norm": 0.7167072296142578, "learning_rate": 5.5759122878332725e-05, "loss": 0.8833179473876953, "memory(GiB)": 91.52, "step": 75540, "token_acc": 0.754248799072387, "train_speed(iter/s)": 0.137302 }, { "epoch": 0.980247770615923, "grad_norm": 0.7162345051765442, "learning_rate": 5.57537947921165e-05, "loss": 0.7776241302490234, "memory(GiB)": 91.52, "step": 75545, "token_acc": 0.7922144268101627, "train_speed(iter/s)": 0.137301 }, { "epoch": 0.9803126490175788, "grad_norm": 0.7818155288696289, "learning_rate": 5.574846663968597e-05, "loss": 0.840843391418457, "memory(GiB)": 91.52, "step": 75550, "token_acc": 0.7713766126818556, "train_speed(iter/s)": 0.137301 }, { "epoch": 0.9803775274192345, "grad_norm": 0.8063520789146423, "learning_rate": 5.574313842110246e-05, "loss": 0.8340893745422363, "memory(GiB)": 91.52, "step": 75555, "token_acc": 0.7836926773913624, "train_speed(iter/s)": 0.137299 }, { "epoch": 0.9804424058208902, "grad_norm": 0.6824370622634888, "learning_rate": 5.573781013642728e-05, "loss": 0.8407722473144531, "memory(GiB)": 91.52, "step": 75560, "token_acc": 0.7695639923927591, "train_speed(iter/s)": 0.137298 }, { "epoch": 0.9805072842225458, "grad_norm": 0.7264317870140076, "learning_rate": 5.5732481785721745e-05, "loss": 0.8749993324279786, "memory(GiB)": 91.52, "step": 75565, "token_acc": 0.7497043606799705, "train_speed(iter/s)": 0.137297 }, { "epoch": 0.9805721626242015, "grad_norm": 0.7024055123329163, "learning_rate": 5.572715336904718e-05, "loss": 0.8243955612182617, "memory(GiB)": 91.52, "step": 75570, "token_acc": 0.7809227809227809, "train_speed(iter/s)": 0.137296 }, { "epoch": 0.9806370410258572, "grad_norm": 0.7765678763389587, "learning_rate": 5.572182488646491e-05, "loss": 0.8342802047729492, "memory(GiB)": 91.52, "step": 75575, "token_acc": 0.759846263570806, "train_speed(iter/s)": 0.137294 }, { "epoch": 0.980701919427513, "grad_norm": 0.6577708125114441, "learning_rate": 5.5716496338036236e-05, "loss": 0.8456521987915039, "memory(GiB)": 91.52, "step": 75580, "token_acc": 0.776045494990135, "train_speed(iter/s)": 0.137293 }, { "epoch": 0.9807667978291686, "grad_norm": 0.7282321453094482, "learning_rate": 5.571116772382251e-05, "loss": 0.8200845718383789, "memory(GiB)": 91.52, "step": 75585, "token_acc": 0.7654103795926652, "train_speed(iter/s)": 0.137293 }, { "epoch": 0.9808316762308243, "grad_norm": 0.7840533256530762, "learning_rate": 5.570583904388501e-05, "loss": 0.8837985038757324, "memory(GiB)": 91.52, "step": 75590, "token_acc": 0.7467941751793088, "train_speed(iter/s)": 0.137291 }, { "epoch": 0.98089655463248, "grad_norm": 0.6497525572776794, "learning_rate": 5.57005102982851e-05, "loss": 0.8572355270385742, "memory(GiB)": 91.52, "step": 75595, "token_acc": 0.7612517024039528, "train_speed(iter/s)": 0.137291 }, { "epoch": 0.9809614330341357, "grad_norm": 0.7941902279853821, "learning_rate": 5.569518148708407e-05, "loss": 0.8915500640869141, "memory(GiB)": 91.52, "step": 75600, "token_acc": 0.7428069261814868, "train_speed(iter/s)": 0.137289 }, { "epoch": 0.9810263114357914, "grad_norm": 0.6469407081604004, "learning_rate": 5.568985261034328e-05, "loss": 0.8183534622192383, "memory(GiB)": 91.52, "step": 75605, "token_acc": 0.7722958959210608, "train_speed(iter/s)": 0.137288 }, { "epoch": 0.9810911898374471, "grad_norm": 0.7455586791038513, "learning_rate": 5.568452366812401e-05, "loss": 0.8354604721069336, "memory(GiB)": 91.52, "step": 75610, "token_acc": 0.7749762733312243, "train_speed(iter/s)": 0.137287 }, { "epoch": 0.9811560682391028, "grad_norm": 0.7411566376686096, "learning_rate": 5.567919466048761e-05, "loss": 0.8386474609375, "memory(GiB)": 91.52, "step": 75615, "token_acc": 0.7681121898597627, "train_speed(iter/s)": 0.137287 }, { "epoch": 0.9812209466407585, "grad_norm": 0.7787479758262634, "learning_rate": 5.567386558749541e-05, "loss": 0.8644520759582519, "memory(GiB)": 91.52, "step": 75620, "token_acc": 0.7446572506100717, "train_speed(iter/s)": 0.137285 }, { "epoch": 0.9812858250424142, "grad_norm": 0.6872161030769348, "learning_rate": 5.566853644920872e-05, "loss": 0.8519338607788086, "memory(GiB)": 91.52, "step": 75625, "token_acc": 0.7830782378991531, "train_speed(iter/s)": 0.137284 }, { "epoch": 0.9813507034440699, "grad_norm": 0.6507441401481628, "learning_rate": 5.56632072456889e-05, "loss": 0.8346343994140625, "memory(GiB)": 91.52, "step": 75630, "token_acc": 0.7712268135283982, "train_speed(iter/s)": 0.137282 }, { "epoch": 0.9814155818457256, "grad_norm": 0.734758198261261, "learning_rate": 5.565787797699723e-05, "loss": 0.8477384567260742, "memory(GiB)": 91.52, "step": 75635, "token_acc": 0.7846053220420669, "train_speed(iter/s)": 0.137281 }, { "epoch": 0.9814804602473813, "grad_norm": 0.6915717720985413, "learning_rate": 5.565254864319508e-05, "loss": 0.8458417892456055, "memory(GiB)": 91.52, "step": 75640, "token_acc": 0.7670779714843362, "train_speed(iter/s)": 0.13728 }, { "epoch": 0.981545338649037, "grad_norm": 0.7079700231552124, "learning_rate": 5.564721924434376e-05, "loss": 0.857297134399414, "memory(GiB)": 91.52, "step": 75645, "token_acc": 0.7576739288896088, "train_speed(iter/s)": 0.13728 }, { "epoch": 0.9816102170506927, "grad_norm": 0.6991454362869263, "learning_rate": 5.564188978050461e-05, "loss": 0.8414159774780273, "memory(GiB)": 91.52, "step": 75650, "token_acc": 0.7573738980579307, "train_speed(iter/s)": 0.137278 }, { "epoch": 0.9816750954523484, "grad_norm": 0.7016451954841614, "learning_rate": 5.563656025173894e-05, "loss": 0.8478336334228516, "memory(GiB)": 91.52, "step": 75655, "token_acc": 0.7533456866378423, "train_speed(iter/s)": 0.137277 }, { "epoch": 0.9817399738540041, "grad_norm": 0.6316813230514526, "learning_rate": 5.56312306581081e-05, "loss": 0.8562782287597657, "memory(GiB)": 91.52, "step": 75660, "token_acc": 0.7581530837741914, "train_speed(iter/s)": 0.137275 }, { "epoch": 0.9818048522556598, "grad_norm": 0.7087423205375671, "learning_rate": 5.562590099967344e-05, "loss": 0.84295654296875, "memory(GiB)": 91.52, "step": 75665, "token_acc": 0.7750342935528121, "train_speed(iter/s)": 0.137274 }, { "epoch": 0.9818697306573155, "grad_norm": 0.7941346168518066, "learning_rate": 5.562057127649625e-05, "loss": 0.8473081588745117, "memory(GiB)": 91.52, "step": 75670, "token_acc": 0.7810805614304941, "train_speed(iter/s)": 0.137274 }, { "epoch": 0.9819346090589712, "grad_norm": 0.6996146440505981, "learning_rate": 5.5615241488637906e-05, "loss": 0.8213043212890625, "memory(GiB)": 91.52, "step": 75675, "token_acc": 0.7668647326109752, "train_speed(iter/s)": 0.137273 }, { "epoch": 0.9819994874606269, "grad_norm": 0.7882722020149231, "learning_rate": 5.560991163615972e-05, "loss": 0.8743563652038574, "memory(GiB)": 91.52, "step": 75680, "token_acc": 0.7530174101897675, "train_speed(iter/s)": 0.137272 }, { "epoch": 0.9820643658622826, "grad_norm": 0.7482962608337402, "learning_rate": 5.560458171912302e-05, "loss": 0.8578882217407227, "memory(GiB)": 91.52, "step": 75685, "token_acc": 0.7490849547293392, "train_speed(iter/s)": 0.13727 }, { "epoch": 0.9821292442639383, "grad_norm": 0.7442695498466492, "learning_rate": 5.559925173758916e-05, "loss": 0.8520979881286621, "memory(GiB)": 91.52, "step": 75690, "token_acc": 0.7445097037793666, "train_speed(iter/s)": 0.137269 }, { "epoch": 0.982194122665594, "grad_norm": 0.7465196251869202, "learning_rate": 5.5593921691619465e-05, "loss": 0.8246932983398437, "memory(GiB)": 91.52, "step": 75695, "token_acc": 0.781990191645508, "train_speed(iter/s)": 0.137268 }, { "epoch": 0.9822590010672497, "grad_norm": 0.6249620914459229, "learning_rate": 5.55885915812753e-05, "loss": 0.8248653411865234, "memory(GiB)": 91.52, "step": 75700, "token_acc": 0.7588828978293931, "train_speed(iter/s)": 0.137267 }, { "epoch": 0.9823238794689054, "grad_norm": 0.8995378613471985, "learning_rate": 5.558326140661797e-05, "loss": 0.8400569915771484, "memory(GiB)": 91.52, "step": 75705, "token_acc": 0.7677874458409425, "train_speed(iter/s)": 0.137265 }, { "epoch": 0.9823887578705611, "grad_norm": 0.7753105759620667, "learning_rate": 5.557793116770883e-05, "loss": 0.8549581527709961, "memory(GiB)": 91.52, "step": 75710, "token_acc": 0.7710944906773411, "train_speed(iter/s)": 0.137264 }, { "epoch": 0.9824536362722168, "grad_norm": 0.8618111610412598, "learning_rate": 5.557260086460922e-05, "loss": 0.8585886001586914, "memory(GiB)": 91.52, "step": 75715, "token_acc": 0.750591575958353, "train_speed(iter/s)": 0.137264 }, { "epoch": 0.9825185146738725, "grad_norm": 0.6863151788711548, "learning_rate": 5.556727049738046e-05, "loss": 0.835243034362793, "memory(GiB)": 91.52, "step": 75720, "token_acc": 0.7594897324206596, "train_speed(iter/s)": 0.137262 }, { "epoch": 0.9825833930755282, "grad_norm": 0.7157309055328369, "learning_rate": 5.556194006608393e-05, "loss": 0.8570065498352051, "memory(GiB)": 91.52, "step": 75725, "token_acc": 0.7934753302777029, "train_speed(iter/s)": 0.137261 }, { "epoch": 0.9826482714771839, "grad_norm": 0.6739473342895508, "learning_rate": 5.555660957078094e-05, "loss": 0.8400415420532227, "memory(GiB)": 91.52, "step": 75730, "token_acc": 0.7618236314876163, "train_speed(iter/s)": 0.13726 }, { "epoch": 0.9827131498788396, "grad_norm": 0.6732835173606873, "learning_rate": 5.5551279011532855e-05, "loss": 0.8211879730224609, "memory(GiB)": 91.52, "step": 75735, "token_acc": 0.7900072639989894, "train_speed(iter/s)": 0.137259 }, { "epoch": 0.9827780282804953, "grad_norm": 0.6908504366874695, "learning_rate": 5.554594838840099e-05, "loss": 0.8153548240661621, "memory(GiB)": 91.52, "step": 75740, "token_acc": 0.7812857623695726, "train_speed(iter/s)": 0.137258 }, { "epoch": 0.982842906682151, "grad_norm": 0.7677936553955078, "learning_rate": 5.5540617701446715e-05, "loss": 0.8639581680297852, "memory(GiB)": 91.52, "step": 75745, "token_acc": 0.7645126353790613, "train_speed(iter/s)": 0.137256 }, { "epoch": 0.9829077850838067, "grad_norm": 0.6443750858306885, "learning_rate": 5.553528695073138e-05, "loss": 0.843927001953125, "memory(GiB)": 91.52, "step": 75750, "token_acc": 0.768682334301558, "train_speed(iter/s)": 0.137255 }, { "epoch": 0.9829726634854624, "grad_norm": 0.6605021953582764, "learning_rate": 5.552995613631632e-05, "loss": 0.780604076385498, "memory(GiB)": 91.52, "step": 75755, "token_acc": 0.7699204486176318, "train_speed(iter/s)": 0.137254 }, { "epoch": 0.9830375418871181, "grad_norm": 0.6648063659667969, "learning_rate": 5.5524625258262875e-05, "loss": 0.841318416595459, "memory(GiB)": 91.52, "step": 75760, "token_acc": 0.764797265807053, "train_speed(iter/s)": 0.137253 }, { "epoch": 0.9831024202887738, "grad_norm": 0.70379239320755, "learning_rate": 5.5519294316632373e-05, "loss": 0.8385261535644531, "memory(GiB)": 91.52, "step": 75765, "token_acc": 0.7787680120193949, "train_speed(iter/s)": 0.137251 }, { "epoch": 0.9831672986904295, "grad_norm": 0.698487401008606, "learning_rate": 5.551396331148622e-05, "loss": 0.8124822616577149, "memory(GiB)": 91.52, "step": 75770, "token_acc": 0.7760994388620645, "train_speed(iter/s)": 0.13725 }, { "epoch": 0.9832321770920852, "grad_norm": 0.7681393623352051, "learning_rate": 5.5508632242885715e-05, "loss": 0.8283021926879883, "memory(GiB)": 91.52, "step": 75775, "token_acc": 0.774242732262655, "train_speed(iter/s)": 0.137249 }, { "epoch": 0.9832970554937408, "grad_norm": 0.6210804581642151, "learning_rate": 5.5503301110892236e-05, "loss": 0.7744834899902344, "memory(GiB)": 91.52, "step": 75780, "token_acc": 0.7801620859760394, "train_speed(iter/s)": 0.137248 }, { "epoch": 0.9833619338953965, "grad_norm": 0.7512599229812622, "learning_rate": 5.5497969915567126e-05, "loss": 0.849721622467041, "memory(GiB)": 91.52, "step": 75785, "token_acc": 0.7511840228245363, "train_speed(iter/s)": 0.137247 }, { "epoch": 0.9834268122970522, "grad_norm": 0.7177263498306274, "learning_rate": 5.549263865697172e-05, "loss": 0.867542839050293, "memory(GiB)": 91.52, "step": 75790, "token_acc": 0.7397050441721288, "train_speed(iter/s)": 0.137246 }, { "epoch": 0.9834916906987079, "grad_norm": 0.7228215932846069, "learning_rate": 5.548730733516739e-05, "loss": 0.8828318595886231, "memory(GiB)": 91.52, "step": 75795, "token_acc": 0.7439074301447334, "train_speed(iter/s)": 0.137245 }, { "epoch": 0.9835565691003636, "grad_norm": 0.67594975233078, "learning_rate": 5.548197595021547e-05, "loss": 0.8286048889160156, "memory(GiB)": 91.52, "step": 75800, "token_acc": 0.7756844579646017, "train_speed(iter/s)": 0.137244 }, { "epoch": 0.9836214475020193, "grad_norm": 0.7391577363014221, "learning_rate": 5.5476644502177324e-05, "loss": 0.8701572418212891, "memory(GiB)": 91.52, "step": 75805, "token_acc": 0.7498695101036462, "train_speed(iter/s)": 0.137243 }, { "epoch": 0.983686325903675, "grad_norm": 0.7262820601463318, "learning_rate": 5.5471312991114314e-05, "loss": 0.8335042953491211, "memory(GiB)": 91.52, "step": 75810, "token_acc": 0.7727367397945015, "train_speed(iter/s)": 0.137242 }, { "epoch": 0.9837512043053307, "grad_norm": 0.6802393198013306, "learning_rate": 5.546598141708779e-05, "loss": 0.8800200462341309, "memory(GiB)": 91.52, "step": 75815, "token_acc": 0.7581158076056711, "train_speed(iter/s)": 0.13724 }, { "epoch": 0.9838160827069864, "grad_norm": 0.7475830912590027, "learning_rate": 5.546064978015908e-05, "loss": 0.8831271171569824, "memory(GiB)": 91.52, "step": 75820, "token_acc": 0.7434926644581165, "train_speed(iter/s)": 0.137239 }, { "epoch": 0.9838809611086421, "grad_norm": 0.7100279927253723, "learning_rate": 5.5455318080389586e-05, "loss": 0.8317141532897949, "memory(GiB)": 91.52, "step": 75825, "token_acc": 0.7688198312648621, "train_speed(iter/s)": 0.137237 }, { "epoch": 0.9839458395102978, "grad_norm": 0.729020357131958, "learning_rate": 5.544998631784065e-05, "loss": 0.8634048461914062, "memory(GiB)": 91.52, "step": 75830, "token_acc": 0.7509177616907401, "train_speed(iter/s)": 0.137237 }, { "epoch": 0.9840107179119535, "grad_norm": 0.7493804097175598, "learning_rate": 5.54446544925736e-05, "loss": 0.8341824531555175, "memory(GiB)": 91.52, "step": 75835, "token_acc": 0.7607567496551271, "train_speed(iter/s)": 0.137236 }, { "epoch": 0.9840755963136092, "grad_norm": 0.7155464887619019, "learning_rate": 5.5439322604649824e-05, "loss": 0.8688167572021485, "memory(GiB)": 91.52, "step": 75840, "token_acc": 0.7613525818125219, "train_speed(iter/s)": 0.137235 }, { "epoch": 0.9841404747152649, "grad_norm": 0.9008192420005798, "learning_rate": 5.543399065413067e-05, "loss": 0.8709409713745118, "memory(GiB)": 91.52, "step": 75845, "token_acc": 0.7464975516866159, "train_speed(iter/s)": 0.137233 }, { "epoch": 0.9842053531169206, "grad_norm": 0.7642700672149658, "learning_rate": 5.5428658641077515e-05, "loss": 0.7803454875946045, "memory(GiB)": 91.52, "step": 75850, "token_acc": 0.7919782460910945, "train_speed(iter/s)": 0.137232 }, { "epoch": 0.9842702315185763, "grad_norm": 0.7450535297393799, "learning_rate": 5.5423326565551694e-05, "loss": 0.8697018623352051, "memory(GiB)": 91.52, "step": 75855, "token_acc": 0.7515807440164475, "train_speed(iter/s)": 0.137231 }, { "epoch": 0.984335109920232, "grad_norm": 0.6853719353675842, "learning_rate": 5.541799442761458e-05, "loss": 0.868754768371582, "memory(GiB)": 91.52, "step": 75860, "token_acc": 0.7685920314662393, "train_speed(iter/s)": 0.13723 }, { "epoch": 0.9843999883218877, "grad_norm": 0.7696195244789124, "learning_rate": 5.541266222732755e-05, "loss": 0.8660516738891602, "memory(GiB)": 91.52, "step": 75865, "token_acc": 0.7418055301226881, "train_speed(iter/s)": 0.137229 }, { "epoch": 0.9844648667235434, "grad_norm": 0.6734578013420105, "learning_rate": 5.540732996475193e-05, "loss": 0.8492328643798828, "memory(GiB)": 91.52, "step": 75870, "token_acc": 0.7739608481922103, "train_speed(iter/s)": 0.137228 }, { "epoch": 0.9845297451251991, "grad_norm": 0.7578351497650146, "learning_rate": 5.540199763994912e-05, "loss": 0.8563814163208008, "memory(GiB)": 91.52, "step": 75875, "token_acc": 0.7692370427761693, "train_speed(iter/s)": 0.137226 }, { "epoch": 0.9845946235268548, "grad_norm": 0.6956011652946472, "learning_rate": 5.539666525298046e-05, "loss": 0.8339146614074707, "memory(GiB)": 91.52, "step": 75880, "token_acc": 0.7557894092814097, "train_speed(iter/s)": 0.137225 }, { "epoch": 0.9846595019285105, "grad_norm": 0.7027287483215332, "learning_rate": 5.539133280390732e-05, "loss": 0.8100370407104492, "memory(GiB)": 91.52, "step": 75885, "token_acc": 0.7740424162225957, "train_speed(iter/s)": 0.137224 }, { "epoch": 0.9847243803301662, "grad_norm": 0.6854948997497559, "learning_rate": 5.5386000292791075e-05, "loss": 0.8251060485839844, "memory(GiB)": 91.52, "step": 75890, "token_acc": 0.7622923024754154, "train_speed(iter/s)": 0.137222 }, { "epoch": 0.9847892587318219, "grad_norm": 0.6726873517036438, "learning_rate": 5.538066771969309e-05, "loss": 0.8378656387329102, "memory(GiB)": 91.52, "step": 75895, "token_acc": 0.7677604221267079, "train_speed(iter/s)": 0.137221 }, { "epoch": 0.9848541371334776, "grad_norm": 0.7404592037200928, "learning_rate": 5.537533508467473e-05, "loss": 0.8625782012939454, "memory(GiB)": 91.52, "step": 75900, "token_acc": 0.7736649385491063, "train_speed(iter/s)": 0.13722 }, { "epoch": 0.9849190155351333, "grad_norm": 0.7776161432266235, "learning_rate": 5.537000238779735e-05, "loss": 0.8668878555297852, "memory(GiB)": 91.52, "step": 75905, "token_acc": 0.7639205322154379, "train_speed(iter/s)": 0.13722 }, { "epoch": 0.984983893936789, "grad_norm": 0.7270159125328064, "learning_rate": 5.5364669629122335e-05, "loss": 0.8210866928100586, "memory(GiB)": 91.52, "step": 75910, "token_acc": 0.7672851819789981, "train_speed(iter/s)": 0.137219 }, { "epoch": 0.9850487723384447, "grad_norm": 0.7473382949829102, "learning_rate": 5.5359336808711024e-05, "loss": 0.8781878471374511, "memory(GiB)": 91.52, "step": 75915, "token_acc": 0.7592250313552049, "train_speed(iter/s)": 0.137217 }, { "epoch": 0.9851136507401004, "grad_norm": 0.69449383020401, "learning_rate": 5.535400392662485e-05, "loss": 0.8493034362792968, "memory(GiB)": 91.52, "step": 75920, "token_acc": 0.7637492408394628, "train_speed(iter/s)": 0.137216 }, { "epoch": 0.9851785291417561, "grad_norm": 0.7219908833503723, "learning_rate": 5.534867098292511e-05, "loss": 0.8470247268676758, "memory(GiB)": 91.52, "step": 75925, "token_acc": 0.7711846976552859, "train_speed(iter/s)": 0.137215 }, { "epoch": 0.9852434075434118, "grad_norm": 0.786992073059082, "learning_rate": 5.534333797767323e-05, "loss": 0.830657958984375, "memory(GiB)": 91.52, "step": 75930, "token_acc": 0.7446576462969906, "train_speed(iter/s)": 0.137214 }, { "epoch": 0.9853082859450675, "grad_norm": 0.7783929109573364, "learning_rate": 5.5338004910930553e-05, "loss": 0.8634861946105957, "memory(GiB)": 91.52, "step": 75935, "token_acc": 0.7774900475399065, "train_speed(iter/s)": 0.137213 }, { "epoch": 0.9853731643467232, "grad_norm": 0.7209420800209045, "learning_rate": 5.533267178275845e-05, "loss": 0.8343245506286621, "memory(GiB)": 91.52, "step": 75940, "token_acc": 0.7760323613686162, "train_speed(iter/s)": 0.137211 }, { "epoch": 0.9854380427483789, "grad_norm": 0.7329026460647583, "learning_rate": 5.532733859321832e-05, "loss": 0.8355931282043457, "memory(GiB)": 91.52, "step": 75945, "token_acc": 0.7629126077510687, "train_speed(iter/s)": 0.13721 }, { "epoch": 0.9855029211500346, "grad_norm": 0.7642766237258911, "learning_rate": 5.53220053423715e-05, "loss": 0.9040915489196777, "memory(GiB)": 91.52, "step": 75950, "token_acc": 0.7713195860714556, "train_speed(iter/s)": 0.137209 }, { "epoch": 0.9855677995516903, "grad_norm": 0.7968115210533142, "learning_rate": 5.5316672030279404e-05, "loss": 0.884954833984375, "memory(GiB)": 91.52, "step": 75955, "token_acc": 0.7631183884369654, "train_speed(iter/s)": 0.137208 }, { "epoch": 0.985632677953346, "grad_norm": 0.6891580820083618, "learning_rate": 5.531133865700337e-05, "loss": 0.81439208984375, "memory(GiB)": 91.52, "step": 75960, "token_acc": 0.7990958284814028, "train_speed(iter/s)": 0.137206 }, { "epoch": 0.9856975563550017, "grad_norm": 0.7640371322631836, "learning_rate": 5.530600522260481e-05, "loss": 0.7851504325866699, "memory(GiB)": 91.52, "step": 75965, "token_acc": 0.7786337627465396, "train_speed(iter/s)": 0.137205 }, { "epoch": 0.9857624347566574, "grad_norm": 0.7294161319732666, "learning_rate": 5.530067172714508e-05, "loss": 0.8499505996704102, "memory(GiB)": 91.52, "step": 75970, "token_acc": 0.7711621711621711, "train_speed(iter/s)": 0.137204 }, { "epoch": 0.9858273131583131, "grad_norm": 0.6926600337028503, "learning_rate": 5.5295338170685565e-05, "loss": 0.8487166404724121, "memory(GiB)": 91.52, "step": 75975, "token_acc": 0.7682247602932882, "train_speed(iter/s)": 0.137203 }, { "epoch": 0.9858921915599688, "grad_norm": 0.7589594125747681, "learning_rate": 5.529000455328763e-05, "loss": 0.8520735740661621, "memory(GiB)": 91.52, "step": 75980, "token_acc": 0.7679555804106415, "train_speed(iter/s)": 0.137203 }, { "epoch": 0.9859570699616245, "grad_norm": 0.7479588389396667, "learning_rate": 5.5284670875012655e-05, "loss": 0.8567339897155761, "memory(GiB)": 91.52, "step": 75985, "token_acc": 0.7563917331490672, "train_speed(iter/s)": 0.137202 }, { "epoch": 0.9860219483632802, "grad_norm": 0.7123844623565674, "learning_rate": 5.527933713592205e-05, "loss": 0.8423084259033203, "memory(GiB)": 91.52, "step": 75990, "token_acc": 0.7579135521611959, "train_speed(iter/s)": 0.137201 }, { "epoch": 0.9860868267649359, "grad_norm": 0.7352803945541382, "learning_rate": 5.527400333607715e-05, "loss": 0.8323801040649415, "memory(GiB)": 91.52, "step": 75995, "token_acc": 0.7539489421371716, "train_speed(iter/s)": 0.137199 }, { "epoch": 0.9861517051665916, "grad_norm": 0.6804897785186768, "learning_rate": 5.526866947553937e-05, "loss": 0.8352644920349122, "memory(GiB)": 91.52, "step": 76000, "token_acc": 0.7665892373209446, "train_speed(iter/s)": 0.137199 }, { "epoch": 0.9862165835682473, "grad_norm": 0.725236177444458, "learning_rate": 5.5263335554370086e-05, "loss": 0.8692098617553711, "memory(GiB)": 91.52, "step": 76005, "token_acc": 0.7494771569118384, "train_speed(iter/s)": 0.137198 }, { "epoch": 0.986281461969903, "grad_norm": 0.7259571552276611, "learning_rate": 5.525800157263067e-05, "loss": 0.8098100662231446, "memory(GiB)": 91.52, "step": 76010, "token_acc": 0.7772469045884923, "train_speed(iter/s)": 0.137196 }, { "epoch": 0.9863463403715587, "grad_norm": 0.7468973994255066, "learning_rate": 5.5252667530382504e-05, "loss": 0.8037927627563477, "memory(GiB)": 91.52, "step": 76015, "token_acc": 0.7667546993941277, "train_speed(iter/s)": 0.137195 }, { "epoch": 0.9864112187732142, "grad_norm": 0.8540180921554565, "learning_rate": 5.524733342768698e-05, "loss": 0.8940586090087891, "memory(GiB)": 91.52, "step": 76020, "token_acc": 0.7452983976780247, "train_speed(iter/s)": 0.137194 }, { "epoch": 0.98647609717487, "grad_norm": 0.7868250012397766, "learning_rate": 5.524199926460549e-05, "loss": 0.8751884460449219, "memory(GiB)": 91.52, "step": 76025, "token_acc": 0.760714165855623, "train_speed(iter/s)": 0.137193 }, { "epoch": 0.9865409755765256, "grad_norm": 0.7768582105636597, "learning_rate": 5.5236665041199395e-05, "loss": 0.8411117553710937, "memory(GiB)": 91.52, "step": 76030, "token_acc": 0.7732129131437356, "train_speed(iter/s)": 0.137192 }, { "epoch": 0.9866058539781813, "grad_norm": 0.7991483807563782, "learning_rate": 5.523133075753012e-05, "loss": 0.8533050537109375, "memory(GiB)": 91.52, "step": 76035, "token_acc": 0.7735036652195574, "train_speed(iter/s)": 0.137191 }, { "epoch": 0.986670732379837, "grad_norm": 0.8297145366668701, "learning_rate": 5.522599641365901e-05, "loss": 0.848918342590332, "memory(GiB)": 91.52, "step": 76040, "token_acc": 0.7670691881218197, "train_speed(iter/s)": 0.13719 }, { "epoch": 0.9867356107814927, "grad_norm": 0.8347150683403015, "learning_rate": 5.5220662009647476e-05, "loss": 0.7645246982574463, "memory(GiB)": 91.52, "step": 76045, "token_acc": 0.7946619665113936, "train_speed(iter/s)": 0.137189 }, { "epoch": 0.9868004891831484, "grad_norm": 0.8127349615097046, "learning_rate": 5.521532754555689e-05, "loss": 0.8177671432495117, "memory(GiB)": 91.52, "step": 76050, "token_acc": 0.7820655995367498, "train_speed(iter/s)": 0.137187 }, { "epoch": 0.9868653675848041, "grad_norm": 0.7645502686500549, "learning_rate": 5.5209993021448645e-05, "loss": 0.8243391036987304, "memory(GiB)": 91.52, "step": 76055, "token_acc": 0.7713603818615752, "train_speed(iter/s)": 0.137186 }, { "epoch": 0.9869302459864598, "grad_norm": 0.7752720713615417, "learning_rate": 5.5204658437384174e-05, "loss": 0.8381044387817382, "memory(GiB)": 91.52, "step": 76060, "token_acc": 0.7742617867802603, "train_speed(iter/s)": 0.137185 }, { "epoch": 0.9869951243881155, "grad_norm": 0.7529208064079285, "learning_rate": 5.519932379342478e-05, "loss": 0.8414477348327637, "memory(GiB)": 91.52, "step": 76065, "token_acc": 0.750142979696883, "train_speed(iter/s)": 0.137184 }, { "epoch": 0.9870600027897712, "grad_norm": 0.6555728912353516, "learning_rate": 5.5193989089631926e-05, "loss": 0.8365556716918945, "memory(GiB)": 91.52, "step": 76070, "token_acc": 0.7572666083124566, "train_speed(iter/s)": 0.137182 }, { "epoch": 0.9871248811914269, "grad_norm": 0.6472391486167908, "learning_rate": 5.518865432606698e-05, "loss": 0.8471478462219239, "memory(GiB)": 91.52, "step": 76075, "token_acc": 0.776460549172584, "train_speed(iter/s)": 0.137181 }, { "epoch": 0.9871897595930826, "grad_norm": 0.7303556203842163, "learning_rate": 5.518331950279134e-05, "loss": 0.7855862617492676, "memory(GiB)": 91.52, "step": 76080, "token_acc": 0.7956942298649828, "train_speed(iter/s)": 0.13718 }, { "epoch": 0.9872546379947383, "grad_norm": 0.7135400772094727, "learning_rate": 5.5177984619866384e-05, "loss": 0.8272186279296875, "memory(GiB)": 91.52, "step": 76085, "token_acc": 0.7654149817061429, "train_speed(iter/s)": 0.137179 }, { "epoch": 0.987319516396394, "grad_norm": 0.7241948246955872, "learning_rate": 5.5172649677353506e-05, "loss": 0.8341049194335938, "memory(GiB)": 91.52, "step": 76090, "token_acc": 0.7653278431098859, "train_speed(iter/s)": 0.137178 }, { "epoch": 0.9873843947980497, "grad_norm": 0.675728976726532, "learning_rate": 5.516731467531412e-05, "loss": 0.8496018409729004, "memory(GiB)": 91.52, "step": 76095, "token_acc": 0.7848720727822117, "train_speed(iter/s)": 0.137177 }, { "epoch": 0.9874492731997054, "grad_norm": 0.7062782049179077, "learning_rate": 5.51619796138096e-05, "loss": 0.8178475379943848, "memory(GiB)": 91.52, "step": 76100, "token_acc": 0.774650228563513, "train_speed(iter/s)": 0.137176 }, { "epoch": 0.9875141516013611, "grad_norm": 0.699133574962616, "learning_rate": 5.515664449290137e-05, "loss": 0.9040650367736817, "memory(GiB)": 91.52, "step": 76105, "token_acc": 0.7675257731958763, "train_speed(iter/s)": 0.137174 }, { "epoch": 0.9875790300030168, "grad_norm": 0.841491162776947, "learning_rate": 5.515130931265079e-05, "loss": 0.8231023788452149, "memory(GiB)": 91.52, "step": 76110, "token_acc": 0.7739650378946566, "train_speed(iter/s)": 0.137173 }, { "epoch": 0.9876439084046725, "grad_norm": 0.7111260294914246, "learning_rate": 5.514597407311928e-05, "loss": 0.8383146286010742, "memory(GiB)": 91.52, "step": 76115, "token_acc": 0.7764263881813551, "train_speed(iter/s)": 0.137172 }, { "epoch": 0.9877087868063282, "grad_norm": 0.7952229380607605, "learning_rate": 5.514063877436822e-05, "loss": 0.8102909088134765, "memory(GiB)": 91.52, "step": 76120, "token_acc": 0.7727047964397561, "train_speed(iter/s)": 0.137171 }, { "epoch": 0.9877736652079839, "grad_norm": 0.7128863334655762, "learning_rate": 5.5135303416459024e-05, "loss": 0.7781633377075196, "memory(GiB)": 91.52, "step": 76125, "token_acc": 0.7671415209727318, "train_speed(iter/s)": 0.13717 }, { "epoch": 0.9878385436096396, "grad_norm": 0.6824129223823547, "learning_rate": 5.51299679994531e-05, "loss": 0.8060672760009766, "memory(GiB)": 91.52, "step": 76130, "token_acc": 0.7841558534919665, "train_speed(iter/s)": 0.137168 }, { "epoch": 0.9879034220112953, "grad_norm": 0.7678859829902649, "learning_rate": 5.5124632523411824e-05, "loss": 0.8055974960327148, "memory(GiB)": 91.52, "step": 76135, "token_acc": 0.8018995929443691, "train_speed(iter/s)": 0.137167 }, { "epoch": 0.987968300412951, "grad_norm": 0.6734785437583923, "learning_rate": 5.5119296988396615e-05, "loss": 0.8513631820678711, "memory(GiB)": 91.52, "step": 76140, "token_acc": 0.7682077026824516, "train_speed(iter/s)": 0.137166 }, { "epoch": 0.9880331788146067, "grad_norm": 0.7110514044761658, "learning_rate": 5.511396139446886e-05, "loss": 0.8330635070800781, "memory(GiB)": 91.52, "step": 76145, "token_acc": 0.7565420220132663, "train_speed(iter/s)": 0.137165 }, { "epoch": 0.9880980572162624, "grad_norm": 0.6905040144920349, "learning_rate": 5.5108625741689965e-05, "loss": 0.8230724334716797, "memory(GiB)": 91.52, "step": 76150, "token_acc": 0.7669358380916985, "train_speed(iter/s)": 0.137164 }, { "epoch": 0.9881629356179181, "grad_norm": 0.7636767625808716, "learning_rate": 5.510329003012134e-05, "loss": 0.8891464233398437, "memory(GiB)": 91.52, "step": 76155, "token_acc": 0.7466666666666667, "train_speed(iter/s)": 0.137164 }, { "epoch": 0.9882278140195738, "grad_norm": 0.7364617586135864, "learning_rate": 5.509795425982437e-05, "loss": 0.8560202598571778, "memory(GiB)": 91.52, "step": 76160, "token_acc": 0.7664322274183452, "train_speed(iter/s)": 0.137163 }, { "epoch": 0.9882926924212295, "grad_norm": 0.7026956677436829, "learning_rate": 5.509261843086048e-05, "loss": 0.8336347579956055, "memory(GiB)": 91.52, "step": 76165, "token_acc": 0.7601142149442686, "train_speed(iter/s)": 0.137162 }, { "epoch": 0.9883575708228852, "grad_norm": 0.6747650504112244, "learning_rate": 5.508728254329106e-05, "loss": 0.8256451606750488, "memory(GiB)": 91.52, "step": 76170, "token_acc": 0.7525623735670938, "train_speed(iter/s)": 0.137161 }, { "epoch": 0.9884224492245409, "grad_norm": 0.7159609198570251, "learning_rate": 5.5081946597177524e-05, "loss": 0.8552619934082031, "memory(GiB)": 91.52, "step": 76175, "token_acc": 0.7593059936908517, "train_speed(iter/s)": 0.13716 }, { "epoch": 0.9884873276261966, "grad_norm": 0.7095815539360046, "learning_rate": 5.5076610592581276e-05, "loss": 0.8593697547912598, "memory(GiB)": 91.52, "step": 76180, "token_acc": 0.7621178966912009, "train_speed(iter/s)": 0.137159 }, { "epoch": 0.9885522060278523, "grad_norm": 0.6995316743850708, "learning_rate": 5.507127452956371e-05, "loss": 0.8055094718933106, "memory(GiB)": 91.52, "step": 76185, "token_acc": 0.7808372428726091, "train_speed(iter/s)": 0.137157 }, { "epoch": 0.988617084429508, "grad_norm": 0.7535672187805176, "learning_rate": 5.5065938408186256e-05, "loss": 0.8510162353515625, "memory(GiB)": 91.52, "step": 76190, "token_acc": 0.7449298833229048, "train_speed(iter/s)": 0.137156 }, { "epoch": 0.9886819628311637, "grad_norm": 0.7346996068954468, "learning_rate": 5.50606022285103e-05, "loss": 0.8285503387451172, "memory(GiB)": 91.52, "step": 76195, "token_acc": 0.7728897643539797, "train_speed(iter/s)": 0.137155 }, { "epoch": 0.9887468412328194, "grad_norm": 0.7246178984642029, "learning_rate": 5.505526599059726e-05, "loss": 0.8545300483703613, "memory(GiB)": 91.52, "step": 76200, "token_acc": 0.7638567160170544, "train_speed(iter/s)": 0.137155 }, { "epoch": 0.9888117196344751, "grad_norm": 0.7336471080780029, "learning_rate": 5.5049929694508553e-05, "loss": 0.861386775970459, "memory(GiB)": 91.52, "step": 76205, "token_acc": 0.7578813166434863, "train_speed(iter/s)": 0.137153 }, { "epoch": 0.9888765980361308, "grad_norm": 0.7197766900062561, "learning_rate": 5.5044593340305575e-05, "loss": 0.8221050262451172, "memory(GiB)": 91.52, "step": 76210, "token_acc": 0.7810973624226636, "train_speed(iter/s)": 0.137152 }, { "epoch": 0.9889414764377865, "grad_norm": 0.7558521628379822, "learning_rate": 5.503925692804973e-05, "loss": 0.7954296112060547, "memory(GiB)": 91.52, "step": 76215, "token_acc": 0.7831204306258536, "train_speed(iter/s)": 0.137151 }, { "epoch": 0.9890063548394422, "grad_norm": 0.7244538068771362, "learning_rate": 5.5033920457802446e-05, "loss": 0.8189274787902832, "memory(GiB)": 91.52, "step": 76220, "token_acc": 0.7550586600171688, "train_speed(iter/s)": 0.13715 }, { "epoch": 0.9890712332410979, "grad_norm": 0.6971786022186279, "learning_rate": 5.502858392962514e-05, "loss": 0.8187120437622071, "memory(GiB)": 91.52, "step": 76225, "token_acc": 0.7668724422556704, "train_speed(iter/s)": 0.137148 }, { "epoch": 0.9891361116427536, "grad_norm": 0.6961430311203003, "learning_rate": 5.50232473435792e-05, "loss": 0.8316228866577149, "memory(GiB)": 91.52, "step": 76230, "token_acc": 0.7506692544505421, "train_speed(iter/s)": 0.137148 }, { "epoch": 0.9892009900444093, "grad_norm": 0.6957839131355286, "learning_rate": 5.501791069972606e-05, "loss": 0.8822680473327636, "memory(GiB)": 91.52, "step": 76235, "token_acc": 0.7665796694104263, "train_speed(iter/s)": 0.137147 }, { "epoch": 0.989265868446065, "grad_norm": 0.7046269178390503, "learning_rate": 5.501257399812713e-05, "loss": 0.8250015258789063, "memory(GiB)": 91.52, "step": 76240, "token_acc": 0.7659621872811764, "train_speed(iter/s)": 0.137145 }, { "epoch": 0.9893307468477207, "grad_norm": 0.6748541593551636, "learning_rate": 5.500723723884382e-05, "loss": 0.8182032585144043, "memory(GiB)": 91.52, "step": 76245, "token_acc": 0.7709736469450489, "train_speed(iter/s)": 0.137144 }, { "epoch": 0.9893956252493764, "grad_norm": 0.7516461610794067, "learning_rate": 5.5001900421937536e-05, "loss": 0.8179620742797852, "memory(GiB)": 91.52, "step": 76250, "token_acc": 0.7715607629065007, "train_speed(iter/s)": 0.137142 }, { "epoch": 0.9894605036510321, "grad_norm": 0.7648779153823853, "learning_rate": 5.499656354746972e-05, "loss": 0.8250362396240234, "memory(GiB)": 91.52, "step": 76255, "token_acc": 0.7674271860781774, "train_speed(iter/s)": 0.137141 }, { "epoch": 0.9895253820526877, "grad_norm": 0.7813093066215515, "learning_rate": 5.499122661550177e-05, "loss": 0.822572135925293, "memory(GiB)": 91.52, "step": 76260, "token_acc": 0.7635163021048287, "train_speed(iter/s)": 0.13714 }, { "epoch": 0.9895902604543434, "grad_norm": 0.730943500995636, "learning_rate": 5.498588962609509e-05, "loss": 0.8073619842529297, "memory(GiB)": 91.52, "step": 76265, "token_acc": 0.7805299313052012, "train_speed(iter/s)": 0.137139 }, { "epoch": 0.9896551388559991, "grad_norm": 0.6416626572608948, "learning_rate": 5.498055257931112e-05, "loss": 0.8194429397583007, "memory(GiB)": 91.52, "step": 76270, "token_acc": 0.7680565766901236, "train_speed(iter/s)": 0.137138 }, { "epoch": 0.9897200172576548, "grad_norm": 0.7010806202888489, "learning_rate": 5.497521547521126e-05, "loss": 0.8491298675537109, "memory(GiB)": 91.52, "step": 76275, "token_acc": 0.7748504699515808, "train_speed(iter/s)": 0.137136 }, { "epoch": 0.9897848956593105, "grad_norm": 0.7412251234054565, "learning_rate": 5.496987831385697e-05, "loss": 0.8386634826660156, "memory(GiB)": 91.52, "step": 76280, "token_acc": 0.74718858848897, "train_speed(iter/s)": 0.137135 }, { "epoch": 0.9898497740609662, "grad_norm": 0.7012553811073303, "learning_rate": 5.4964541095309615e-05, "loss": 0.8377975463867188, "memory(GiB)": 91.52, "step": 76285, "token_acc": 0.7629622599727943, "train_speed(iter/s)": 0.137134 }, { "epoch": 0.9899146524626219, "grad_norm": 0.6553387641906738, "learning_rate": 5.4959203819630654e-05, "loss": 0.8235669136047363, "memory(GiB)": 91.52, "step": 76290, "token_acc": 0.7646910863071844, "train_speed(iter/s)": 0.137133 }, { "epoch": 0.9899795308642776, "grad_norm": 0.8378021717071533, "learning_rate": 5.49538664868815e-05, "loss": 0.8126741409301758, "memory(GiB)": 91.52, "step": 76295, "token_acc": 0.8085483758781387, "train_speed(iter/s)": 0.137132 }, { "epoch": 0.9900444092659333, "grad_norm": 0.7202343940734863, "learning_rate": 5.494852909712356e-05, "loss": 0.8452156066894532, "memory(GiB)": 91.52, "step": 76300, "token_acc": 0.7630297446696499, "train_speed(iter/s)": 0.137131 }, { "epoch": 0.990109287667589, "grad_norm": 0.7227016687393188, "learning_rate": 5.494319165041826e-05, "loss": 0.8334614753723144, "memory(GiB)": 91.52, "step": 76305, "token_acc": 0.7766438637807932, "train_speed(iter/s)": 0.13713 }, { "epoch": 0.9901741660692447, "grad_norm": 0.6756397485733032, "learning_rate": 5.493785414682703e-05, "loss": 0.8303134918212891, "memory(GiB)": 91.52, "step": 76310, "token_acc": 0.7575611325611326, "train_speed(iter/s)": 0.137129 }, { "epoch": 0.9902390444709004, "grad_norm": 0.7536317706108093, "learning_rate": 5.49325165864113e-05, "loss": 0.8859314918518066, "memory(GiB)": 91.52, "step": 76315, "token_acc": 0.7593591173918434, "train_speed(iter/s)": 0.137128 }, { "epoch": 0.9903039228725561, "grad_norm": 0.6719008684158325, "learning_rate": 5.492717896923247e-05, "loss": 0.8285102844238281, "memory(GiB)": 91.52, "step": 76320, "token_acc": 0.7626793072466462, "train_speed(iter/s)": 0.137127 }, { "epoch": 0.9903688012742118, "grad_norm": 0.7369667887687683, "learning_rate": 5.4921841295352005e-05, "loss": 0.819032859802246, "memory(GiB)": 91.52, "step": 76325, "token_acc": 0.766323768272875, "train_speed(iter/s)": 0.137125 }, { "epoch": 0.9904336796758675, "grad_norm": 0.685715913772583, "learning_rate": 5.49165035648313e-05, "loss": 0.8511665344238282, "memory(GiB)": 91.52, "step": 76330, "token_acc": 0.7610759996188904, "train_speed(iter/s)": 0.137124 }, { "epoch": 0.9904985580775232, "grad_norm": 0.6364680528640747, "learning_rate": 5.491116577773179e-05, "loss": 0.8200902938842773, "memory(GiB)": 91.52, "step": 76335, "token_acc": 0.7860666354341775, "train_speed(iter/s)": 0.137123 }, { "epoch": 0.9905634364791789, "grad_norm": 0.6955352425575256, "learning_rate": 5.490582793411489e-05, "loss": 0.7900218009948731, "memory(GiB)": 91.52, "step": 76340, "token_acc": 0.7814250947988002, "train_speed(iter/s)": 0.137121 }, { "epoch": 0.9906283148808346, "grad_norm": 0.6933519840240479, "learning_rate": 5.490049003404203e-05, "loss": 0.8558300018310547, "memory(GiB)": 91.52, "step": 76345, "token_acc": 0.7766983961403051, "train_speed(iter/s)": 0.13712 }, { "epoch": 0.9906931932824903, "grad_norm": 0.6974937319755554, "learning_rate": 5.489515207757468e-05, "loss": 0.8108283996582031, "memory(GiB)": 91.52, "step": 76350, "token_acc": 0.7877794532223169, "train_speed(iter/s)": 0.137119 }, { "epoch": 0.990758071684146, "grad_norm": 0.7422372102737427, "learning_rate": 5.4889814064774204e-05, "loss": 0.8345392227172852, "memory(GiB)": 91.52, "step": 76355, "token_acc": 0.772480749916304, "train_speed(iter/s)": 0.137118 }, { "epoch": 0.9908229500858017, "grad_norm": 0.6957353949546814, "learning_rate": 5.488447599570207e-05, "loss": 0.7937991142272949, "memory(GiB)": 91.52, "step": 76360, "token_acc": 0.7714000797766255, "train_speed(iter/s)": 0.137117 }, { "epoch": 0.9908878284874574, "grad_norm": 0.7046706676483154, "learning_rate": 5.4879137870419705e-05, "loss": 0.8298809051513671, "memory(GiB)": 91.52, "step": 76365, "token_acc": 0.7612200717309121, "train_speed(iter/s)": 0.137116 }, { "epoch": 0.9909527068891131, "grad_norm": 0.6994615197181702, "learning_rate": 5.487379968898854e-05, "loss": 0.8648293495178223, "memory(GiB)": 91.52, "step": 76370, "token_acc": 0.7677517947259933, "train_speed(iter/s)": 0.137114 }, { "epoch": 0.9910175852907688, "grad_norm": 0.7749255299568176, "learning_rate": 5.486846145147e-05, "loss": 0.8696882247924804, "memory(GiB)": 91.52, "step": 76375, "token_acc": 0.7684842139175257, "train_speed(iter/s)": 0.137113 }, { "epoch": 0.9910824636924245, "grad_norm": 0.7490757703781128, "learning_rate": 5.48631231579255e-05, "loss": 0.837071418762207, "memory(GiB)": 91.52, "step": 76380, "token_acc": 0.7721678655439267, "train_speed(iter/s)": 0.137112 }, { "epoch": 0.9911473420940802, "grad_norm": 0.7425956130027771, "learning_rate": 5.485778480841651e-05, "loss": 0.8695293426513672, "memory(GiB)": 91.52, "step": 76385, "token_acc": 0.7615843042721308, "train_speed(iter/s)": 0.137111 }, { "epoch": 0.9912122204957359, "grad_norm": 0.7767345309257507, "learning_rate": 5.485244640300444e-05, "loss": 0.8206138610839844, "memory(GiB)": 91.52, "step": 76390, "token_acc": 0.7772640540143843, "train_speed(iter/s)": 0.13711 }, { "epoch": 0.9912770988973916, "grad_norm": 0.6884821057319641, "learning_rate": 5.484710794175073e-05, "loss": 0.8230658531188965, "memory(GiB)": 91.52, "step": 76395, "token_acc": 0.7637763957188314, "train_speed(iter/s)": 0.137109 }, { "epoch": 0.9913419772990473, "grad_norm": 0.6918776035308838, "learning_rate": 5.484176942471682e-05, "loss": 0.8200899124145508, "memory(GiB)": 91.52, "step": 76400, "token_acc": 0.7775884665792923, "train_speed(iter/s)": 0.137107 }, { "epoch": 0.991406855700703, "grad_norm": 0.8150745034217834, "learning_rate": 5.483643085196413e-05, "loss": 0.8106369018554688, "memory(GiB)": 91.52, "step": 76405, "token_acc": 0.7760006429834432, "train_speed(iter/s)": 0.137106 }, { "epoch": 0.9914717341023587, "grad_norm": 0.7801563739776611, "learning_rate": 5.4831092223554115e-05, "loss": 0.8792306900024414, "memory(GiB)": 91.52, "step": 76410, "token_acc": 0.7419411599099099, "train_speed(iter/s)": 0.137105 }, { "epoch": 0.9915366125040144, "grad_norm": 0.6892586350440979, "learning_rate": 5.482575353954819e-05, "loss": 0.8433910369873047, "memory(GiB)": 91.52, "step": 76415, "token_acc": 0.7651731559381726, "train_speed(iter/s)": 0.137104 }, { "epoch": 0.9916014909056701, "grad_norm": 0.7007744312286377, "learning_rate": 5.482041480000781e-05, "loss": 0.8240663528442382, "memory(GiB)": 91.52, "step": 76420, "token_acc": 0.7694458566344687, "train_speed(iter/s)": 0.137103 }, { "epoch": 0.9916663693073258, "grad_norm": 0.7593598961830139, "learning_rate": 5.48150760049944e-05, "loss": 0.8339725494384765, "memory(GiB)": 91.52, "step": 76425, "token_acc": 0.7499588680487003, "train_speed(iter/s)": 0.137102 }, { "epoch": 0.9917312477089815, "grad_norm": 0.7677722573280334, "learning_rate": 5.4809737154569404e-05, "loss": 0.8164817810058593, "memory(GiB)": 91.52, "step": 76430, "token_acc": 0.7959351195166798, "train_speed(iter/s)": 0.1371 }, { "epoch": 0.9917961261106372, "grad_norm": 0.7807883024215698, "learning_rate": 5.480439824879427e-05, "loss": 0.8733606338500977, "memory(GiB)": 91.52, "step": 76435, "token_acc": 0.7511543373612339, "train_speed(iter/s)": 0.137099 }, { "epoch": 0.9918610045122929, "grad_norm": 0.7166024446487427, "learning_rate": 5.4799059287730434e-05, "loss": 0.8128385543823242, "memory(GiB)": 91.52, "step": 76440, "token_acc": 0.7832000269987514, "train_speed(iter/s)": 0.137098 }, { "epoch": 0.9919258829139486, "grad_norm": 0.703803300857544, "learning_rate": 5.479372027143933e-05, "loss": 0.819798469543457, "memory(GiB)": 91.52, "step": 76445, "token_acc": 0.7647888980359873, "train_speed(iter/s)": 0.137097 }, { "epoch": 0.9919907613156043, "grad_norm": 0.7139107584953308, "learning_rate": 5.47883811999824e-05, "loss": 0.8124117851257324, "memory(GiB)": 91.52, "step": 76450, "token_acc": 0.7840856212191717, "train_speed(iter/s)": 0.137096 }, { "epoch": 0.99205563971726, "grad_norm": 0.8243740200996399, "learning_rate": 5.478304207342109e-05, "loss": 0.8493201255798339, "memory(GiB)": 91.52, "step": 76455, "token_acc": 0.7717064684485213, "train_speed(iter/s)": 0.137094 }, { "epoch": 0.9921205181189157, "grad_norm": 0.7530072927474976, "learning_rate": 5.477770289181683e-05, "loss": 0.8825698852539062, "memory(GiB)": 91.52, "step": 76460, "token_acc": 0.7378381922286813, "train_speed(iter/s)": 0.137093 }, { "epoch": 0.9921853965205714, "grad_norm": 0.7569455504417419, "learning_rate": 5.477236365523109e-05, "loss": 0.8448172569274902, "memory(GiB)": 91.52, "step": 76465, "token_acc": 0.7772058290437178, "train_speed(iter/s)": 0.137092 }, { "epoch": 0.9922502749222271, "grad_norm": 0.7857473492622375, "learning_rate": 5.476702436372527e-05, "loss": 0.8609181404113769, "memory(GiB)": 91.52, "step": 76470, "token_acc": 0.7656976549693908, "train_speed(iter/s)": 0.137091 }, { "epoch": 0.9923151533238828, "grad_norm": 0.7210046648979187, "learning_rate": 5.476168501736086e-05, "loss": 0.848599624633789, "memory(GiB)": 91.52, "step": 76475, "token_acc": 0.7625802361518346, "train_speed(iter/s)": 0.13709 }, { "epoch": 0.9923800317255385, "grad_norm": 0.6495693922042847, "learning_rate": 5.475634561619929e-05, "loss": 0.8181592941284179, "memory(GiB)": 91.52, "step": 76480, "token_acc": 0.7836694050286284, "train_speed(iter/s)": 0.137088 }, { "epoch": 0.9924449101271942, "grad_norm": 0.6597015857696533, "learning_rate": 5.4751006160301976e-05, "loss": 0.8304509162902832, "memory(GiB)": 91.52, "step": 76485, "token_acc": 0.7656767739918725, "train_speed(iter/s)": 0.137087 }, { "epoch": 0.9925097885288499, "grad_norm": 0.8768933415412903, "learning_rate": 5.474566664973041e-05, "loss": 0.8539333343505859, "memory(GiB)": 91.52, "step": 76490, "token_acc": 0.760429166372556, "train_speed(iter/s)": 0.137086 }, { "epoch": 0.9925746669305054, "grad_norm": 0.7886553406715393, "learning_rate": 5.474032708454602e-05, "loss": 0.8182592391967773, "memory(GiB)": 91.52, "step": 76495, "token_acc": 0.7668380991565521, "train_speed(iter/s)": 0.137086 }, { "epoch": 0.9926395453321611, "grad_norm": 0.7918372750282288, "learning_rate": 5.473498746481024e-05, "loss": 0.8551433563232422, "memory(GiB)": 91.52, "step": 76500, "token_acc": 0.7502623983206508, "train_speed(iter/s)": 0.137086 }, { "epoch": 0.9927044237338168, "grad_norm": 0.7147414088249207, "learning_rate": 5.472964779058454e-05, "loss": 0.8226104736328125, "memory(GiB)": 91.52, "step": 76505, "token_acc": 0.7696078431372549, "train_speed(iter/s)": 0.137084 }, { "epoch": 0.9927693021354725, "grad_norm": 0.7075819373130798, "learning_rate": 5.472430806193034e-05, "loss": 0.8268850326538086, "memory(GiB)": 91.52, "step": 76510, "token_acc": 0.7786661708193876, "train_speed(iter/s)": 0.137083 }, { "epoch": 0.9928341805371282, "grad_norm": 0.6240552067756653, "learning_rate": 5.471896827890912e-05, "loss": 0.7968096256256103, "memory(GiB)": 91.52, "step": 76515, "token_acc": 0.7709561356503101, "train_speed(iter/s)": 0.137082 }, { "epoch": 0.9928990589387839, "grad_norm": 0.7048808336257935, "learning_rate": 5.471362844158231e-05, "loss": 0.8360553741455078, "memory(GiB)": 91.52, "step": 76520, "token_acc": 0.7562026061989251, "train_speed(iter/s)": 0.13708 }, { "epoch": 0.9929639373404396, "grad_norm": 0.7919250726699829, "learning_rate": 5.4708288550011364e-05, "loss": 0.8633506774902344, "memory(GiB)": 91.52, "step": 76525, "token_acc": 0.7522057241230902, "train_speed(iter/s)": 0.13708 }, { "epoch": 0.9930288157420953, "grad_norm": 0.737984836101532, "learning_rate": 5.470294860425775e-05, "loss": 0.823493766784668, "memory(GiB)": 91.52, "step": 76530, "token_acc": 0.7736415700516832, "train_speed(iter/s)": 0.137079 }, { "epoch": 0.993093694143751, "grad_norm": 0.6720760464668274, "learning_rate": 5.46976086043829e-05, "loss": 0.8645994186401367, "memory(GiB)": 91.52, "step": 76535, "token_acc": 0.7609202851587816, "train_speed(iter/s)": 0.137077 }, { "epoch": 0.9931585725454067, "grad_norm": 0.7292094230651855, "learning_rate": 5.469226855044827e-05, "loss": 0.8298903465270996, "memory(GiB)": 91.52, "step": 76540, "token_acc": 0.7694009799272957, "train_speed(iter/s)": 0.137076 }, { "epoch": 0.9932234509470624, "grad_norm": 0.7927757501602173, "learning_rate": 5.4686928442515304e-05, "loss": 0.8155958175659179, "memory(GiB)": 91.52, "step": 76545, "token_acc": 0.7631637484508564, "train_speed(iter/s)": 0.137075 }, { "epoch": 0.9932883293487181, "grad_norm": 0.726237952709198, "learning_rate": 5.4681588280645504e-05, "loss": 0.8416881561279297, "memory(GiB)": 91.52, "step": 76550, "token_acc": 0.7621038521686401, "train_speed(iter/s)": 0.137073 }, { "epoch": 0.9933532077503738, "grad_norm": 0.7227889895439148, "learning_rate": 5.467624806490025e-05, "loss": 0.8101405143737793, "memory(GiB)": 91.52, "step": 76555, "token_acc": 0.7713559599934415, "train_speed(iter/s)": 0.137072 }, { "epoch": 0.9934180861520295, "grad_norm": 0.7368015050888062, "learning_rate": 5.467090779534105e-05, "loss": 0.8058430671691894, "memory(GiB)": 91.52, "step": 76560, "token_acc": 0.7768099240780911, "train_speed(iter/s)": 0.137071 }, { "epoch": 0.9934829645536852, "grad_norm": 0.6665797829627991, "learning_rate": 5.466556747202935e-05, "loss": 0.818482494354248, "memory(GiB)": 91.52, "step": 76565, "token_acc": 0.7631478803561915, "train_speed(iter/s)": 0.137069 }, { "epoch": 0.9935478429553409, "grad_norm": 0.7682438492774963, "learning_rate": 5.466022709502658e-05, "loss": 0.8606003761291504, "memory(GiB)": 91.52, "step": 76570, "token_acc": 0.7820538252992131, "train_speed(iter/s)": 0.137069 }, { "epoch": 0.9936127213569966, "grad_norm": 0.7467478513717651, "learning_rate": 5.465488666439423e-05, "loss": 0.8490697860717773, "memory(GiB)": 91.52, "step": 76575, "token_acc": 0.7750084077575576, "train_speed(iter/s)": 0.137068 }, { "epoch": 0.9936775997586523, "grad_norm": 0.7869995832443237, "learning_rate": 5.4649546180193735e-05, "loss": 0.7676218986511231, "memory(GiB)": 91.52, "step": 76580, "token_acc": 0.7917295622467625, "train_speed(iter/s)": 0.137066 }, { "epoch": 0.993742478160308, "grad_norm": 0.7353180646896362, "learning_rate": 5.464420564248658e-05, "loss": 0.8668682098388671, "memory(GiB)": 91.52, "step": 76585, "token_acc": 0.7919731846226863, "train_speed(iter/s)": 0.137065 }, { "epoch": 0.9938073565619637, "grad_norm": 0.7722586989402771, "learning_rate": 5.463886505133419e-05, "loss": 0.8500621795654297, "memory(GiB)": 91.52, "step": 76590, "token_acc": 0.7763534486233724, "train_speed(iter/s)": 0.137064 }, { "epoch": 0.9938722349636194, "grad_norm": 0.7415828704833984, "learning_rate": 5.463352440679804e-05, "loss": 0.8401772499084472, "memory(GiB)": 91.52, "step": 76595, "token_acc": 0.7711881319850985, "train_speed(iter/s)": 0.137062 }, { "epoch": 0.9939371133652751, "grad_norm": 0.7757143974304199, "learning_rate": 5.4628183708939594e-05, "loss": 0.8423083305358887, "memory(GiB)": 91.52, "step": 76600, "token_acc": 0.7654461807535835, "train_speed(iter/s)": 0.137061 }, { "epoch": 0.9940019917669308, "grad_norm": 0.7686313986778259, "learning_rate": 5.4622842957820306e-05, "loss": 0.8148921012878418, "memory(GiB)": 91.52, "step": 76605, "token_acc": 0.7756450351837373, "train_speed(iter/s)": 0.13706 }, { "epoch": 0.9940668701685865, "grad_norm": 0.6504325866699219, "learning_rate": 5.4617502153501644e-05, "loss": 0.831393814086914, "memory(GiB)": 91.52, "step": 76610, "token_acc": 0.7705103831529687, "train_speed(iter/s)": 0.137059 }, { "epoch": 0.9941317485702422, "grad_norm": 0.7066813111305237, "learning_rate": 5.461216129604505e-05, "loss": 0.8479392051696777, "memory(GiB)": 91.52, "step": 76615, "token_acc": 0.7578002072379783, "train_speed(iter/s)": 0.137058 }, { "epoch": 0.9941966269718979, "grad_norm": 0.7245796322822571, "learning_rate": 5.4606820385512025e-05, "loss": 0.8541851997375488, "memory(GiB)": 91.52, "step": 76620, "token_acc": 0.7790549169859514, "train_speed(iter/s)": 0.137057 }, { "epoch": 0.9942615053735536, "grad_norm": 0.7548454403877258, "learning_rate": 5.460147942196399e-05, "loss": 0.8590299606323242, "memory(GiB)": 91.52, "step": 76625, "token_acc": 0.7592742106828708, "train_speed(iter/s)": 0.137056 }, { "epoch": 0.9943263837752093, "grad_norm": 0.7042162418365479, "learning_rate": 5.459613840546243e-05, "loss": 0.8465314865112304, "memory(GiB)": 91.52, "step": 76630, "token_acc": 0.7977606717984604, "train_speed(iter/s)": 0.137056 }, { "epoch": 0.994391262176865, "grad_norm": 0.7588555812835693, "learning_rate": 5.459079733606879e-05, "loss": 0.8503433227539062, "memory(GiB)": 91.52, "step": 76635, "token_acc": 0.7588421385637456, "train_speed(iter/s)": 0.137055 }, { "epoch": 0.9944561405785207, "grad_norm": 0.7370774745941162, "learning_rate": 5.4585456213844575e-05, "loss": 0.837526512145996, "memory(GiB)": 91.52, "step": 76640, "token_acc": 0.7653698133954271, "train_speed(iter/s)": 0.137054 }, { "epoch": 0.9945210189801764, "grad_norm": 0.724894642829895, "learning_rate": 5.458011503885122e-05, "loss": 0.8267437934875488, "memory(GiB)": 91.52, "step": 76645, "token_acc": 0.7870769655362749, "train_speed(iter/s)": 0.137053 }, { "epoch": 0.9945858973818321, "grad_norm": 0.7409496307373047, "learning_rate": 5.457477381115018e-05, "loss": 0.8076654434204101, "memory(GiB)": 91.52, "step": 76650, "token_acc": 0.7687909686817188, "train_speed(iter/s)": 0.137052 }, { "epoch": 0.9946507757834878, "grad_norm": 0.7043606042861938, "learning_rate": 5.4569432530802955e-05, "loss": 0.8933616638183594, "memory(GiB)": 91.52, "step": 76655, "token_acc": 0.7509367194004996, "train_speed(iter/s)": 0.13705 }, { "epoch": 0.9947156541851435, "grad_norm": 0.7286545038223267, "learning_rate": 5.456409119787098e-05, "loss": 0.8851715087890625, "memory(GiB)": 91.52, "step": 76660, "token_acc": 0.7541514918640784, "train_speed(iter/s)": 0.137049 }, { "epoch": 0.9947805325867992, "grad_norm": 0.7854582667350769, "learning_rate": 5.4558749812415745e-05, "loss": 0.8359359741210938, "memory(GiB)": 91.52, "step": 76665, "token_acc": 0.7669246787945464, "train_speed(iter/s)": 0.137048 }, { "epoch": 0.9948454109884549, "grad_norm": 0.6890934705734253, "learning_rate": 5.455340837449869e-05, "loss": 0.8193107604980469, "memory(GiB)": 91.52, "step": 76670, "token_acc": 0.7724518888096935, "train_speed(iter/s)": 0.137047 }, { "epoch": 0.9949102893901106, "grad_norm": 0.6934219598770142, "learning_rate": 5.4548066884181346e-05, "loss": 0.798517894744873, "memory(GiB)": 91.52, "step": 76675, "token_acc": 0.7706911342359479, "train_speed(iter/s)": 0.137046 }, { "epoch": 0.9949751677917663, "grad_norm": 0.6938386559486389, "learning_rate": 5.454272534152511e-05, "loss": 0.7881331443786621, "memory(GiB)": 91.52, "step": 76680, "token_acc": 0.7693305567883112, "train_speed(iter/s)": 0.137045 }, { "epoch": 0.995040046193422, "grad_norm": 0.6393419504165649, "learning_rate": 5.45373837465915e-05, "loss": 0.8411094665527343, "memory(GiB)": 91.52, "step": 76685, "token_acc": 0.762760116586457, "train_speed(iter/s)": 0.137044 }, { "epoch": 0.9951049245950777, "grad_norm": 0.6972588896751404, "learning_rate": 5.4532042099441974e-05, "loss": 0.8811627388000488, "memory(GiB)": 91.52, "step": 76690, "token_acc": 0.7618948824343015, "train_speed(iter/s)": 0.137043 }, { "epoch": 0.9951698029967334, "grad_norm": 0.6737754344940186, "learning_rate": 5.4526700400137965e-05, "loss": 0.8035133361816407, "memory(GiB)": 91.52, "step": 76695, "token_acc": 0.7730159280979977, "train_speed(iter/s)": 0.137041 }, { "epoch": 0.9952346813983891, "grad_norm": 0.7175257802009583, "learning_rate": 5.4521358648741004e-05, "loss": 0.8897940635681152, "memory(GiB)": 91.52, "step": 76700, "token_acc": 0.7509949954683375, "train_speed(iter/s)": 0.13704 }, { "epoch": 0.9952995598000448, "grad_norm": 0.7766075134277344, "learning_rate": 5.451601684531254e-05, "loss": 0.8583602905273438, "memory(GiB)": 91.52, "step": 76705, "token_acc": 0.742967527698841, "train_speed(iter/s)": 0.137039 }, { "epoch": 0.9953644382017005, "grad_norm": 0.7697558403015137, "learning_rate": 5.4510674989914045e-05, "loss": 0.821994400024414, "memory(GiB)": 91.52, "step": 76710, "token_acc": 0.7733159385241989, "train_speed(iter/s)": 0.137038 }, { "epoch": 0.9954293166033562, "grad_norm": 0.7608482837677002, "learning_rate": 5.450533308260698e-05, "loss": 0.8138885498046875, "memory(GiB)": 91.52, "step": 76715, "token_acc": 0.7858674463937622, "train_speed(iter/s)": 0.137037 }, { "epoch": 0.9954941950050119, "grad_norm": 0.8141472935676575, "learning_rate": 5.449999112345284e-05, "loss": 0.8580768585205079, "memory(GiB)": 91.52, "step": 76720, "token_acc": 0.7614451089427482, "train_speed(iter/s)": 0.137036 }, { "epoch": 0.9955590734066676, "grad_norm": 0.7656854391098022, "learning_rate": 5.44946491125131e-05, "loss": 0.8488594055175781, "memory(GiB)": 91.52, "step": 76725, "token_acc": 0.7625453050853791, "train_speed(iter/s)": 0.137035 }, { "epoch": 0.9956239518083233, "grad_norm": 0.7060045599937439, "learning_rate": 5.4489307049849216e-05, "loss": 0.8355133056640625, "memory(GiB)": 91.52, "step": 76730, "token_acc": 0.7540657017401203, "train_speed(iter/s)": 0.137035 }, { "epoch": 0.9956888302099789, "grad_norm": 0.7151519060134888, "learning_rate": 5.448396493552268e-05, "loss": 0.8171957969665528, "memory(GiB)": 91.52, "step": 76735, "token_acc": 0.7925459029871198, "train_speed(iter/s)": 0.137034 }, { "epoch": 0.9957537086116346, "grad_norm": 0.7508314847946167, "learning_rate": 5.447862276959494e-05, "loss": 0.9283039093017578, "memory(GiB)": 91.52, "step": 76740, "token_acc": 0.7509789129575745, "train_speed(iter/s)": 0.137033 }, { "epoch": 0.9958185870132903, "grad_norm": 0.6456958651542664, "learning_rate": 5.4473280552127535e-05, "loss": 0.7776919841766358, "memory(GiB)": 91.52, "step": 76745, "token_acc": 0.7715905478435944, "train_speed(iter/s)": 0.137031 }, { "epoch": 0.995883465414946, "grad_norm": 0.7087647914886475, "learning_rate": 5.446793828318188e-05, "loss": 0.847614860534668, "memory(GiB)": 91.52, "step": 76750, "token_acc": 0.7892615745882652, "train_speed(iter/s)": 0.13703 }, { "epoch": 0.9959483438166017, "grad_norm": 0.7131907343864441, "learning_rate": 5.4462595962819494e-05, "loss": 0.8123546600341797, "memory(GiB)": 91.52, "step": 76755, "token_acc": 0.7807771272146605, "train_speed(iter/s)": 0.137029 }, { "epoch": 0.9960132222182574, "grad_norm": 0.8206509351730347, "learning_rate": 5.445725359110184e-05, "loss": 0.9129724502563477, "memory(GiB)": 91.52, "step": 76760, "token_acc": 0.756276342312626, "train_speed(iter/s)": 0.137027 }, { "epoch": 0.9960781006199131, "grad_norm": 0.7039971351623535, "learning_rate": 5.44519111680904e-05, "loss": 0.8071699142456055, "memory(GiB)": 91.52, "step": 76765, "token_acc": 0.773854125149462, "train_speed(iter/s)": 0.137026 }, { "epoch": 0.9961429790215688, "grad_norm": 0.7446341514587402, "learning_rate": 5.4446568693846646e-05, "loss": 0.7844581127166748, "memory(GiB)": 91.52, "step": 76770, "token_acc": 0.7641815877040089, "train_speed(iter/s)": 0.137025 }, { "epoch": 0.9962078574232245, "grad_norm": 0.7243703603744507, "learning_rate": 5.4441226168432055e-05, "loss": 0.8383079528808594, "memory(GiB)": 91.52, "step": 76775, "token_acc": 0.7609033763203941, "train_speed(iter/s)": 0.137024 }, { "epoch": 0.9962727358248802, "grad_norm": 0.8684244751930237, "learning_rate": 5.443588359190815e-05, "loss": 0.8262024879455566, "memory(GiB)": 91.52, "step": 76780, "token_acc": 0.7752261188258633, "train_speed(iter/s)": 0.137022 }, { "epoch": 0.9963376142265359, "grad_norm": 0.6986083388328552, "learning_rate": 5.4430540964336354e-05, "loss": 0.8351995468139648, "memory(GiB)": 91.52, "step": 76785, "token_acc": 0.7623047552933009, "train_speed(iter/s)": 0.137021 }, { "epoch": 0.9964024926281916, "grad_norm": 0.7479248642921448, "learning_rate": 5.442519828577819e-05, "loss": 0.8261896133422851, "memory(GiB)": 91.52, "step": 76790, "token_acc": 0.7756652636084754, "train_speed(iter/s)": 0.137021 }, { "epoch": 0.9964673710298473, "grad_norm": 0.7444161772727966, "learning_rate": 5.4419855556295116e-05, "loss": 0.8472833633422852, "memory(GiB)": 91.52, "step": 76795, "token_acc": 0.7546963963042514, "train_speed(iter/s)": 0.13702 }, { "epoch": 0.996532249431503, "grad_norm": 0.7576394081115723, "learning_rate": 5.4414512775948646e-05, "loss": 0.870433235168457, "memory(GiB)": 91.52, "step": 76800, "token_acc": 0.7610512810543913, "train_speed(iter/s)": 0.137019 }, { "epoch": 0.9965971278331587, "grad_norm": 0.7358790636062622, "learning_rate": 5.4409169944800244e-05, "loss": 0.8418569564819336, "memory(GiB)": 91.52, "step": 76805, "token_acc": 0.7425894378194208, "train_speed(iter/s)": 0.137018 }, { "epoch": 0.9966620062348144, "grad_norm": 0.7786045074462891, "learning_rate": 5.440382706291138e-05, "loss": 0.8246582984924317, "memory(GiB)": 91.52, "step": 76810, "token_acc": 0.7655950540958268, "train_speed(iter/s)": 0.137017 }, { "epoch": 0.9967268846364701, "grad_norm": 0.7973394989967346, "learning_rate": 5.439848413034357e-05, "loss": 0.7978755474090576, "memory(GiB)": 91.52, "step": 76815, "token_acc": 0.7506219815600761, "train_speed(iter/s)": 0.137015 }, { "epoch": 0.9967917630381258, "grad_norm": 0.679719865322113, "learning_rate": 5.43931411471583e-05, "loss": 0.8098293304443359, "memory(GiB)": 91.52, "step": 76820, "token_acc": 0.7856939569300825, "train_speed(iter/s)": 0.137014 }, { "epoch": 0.9968566414397815, "grad_norm": 0.6969493627548218, "learning_rate": 5.438779811341703e-05, "loss": 0.8303560256958008, "memory(GiB)": 91.52, "step": 76825, "token_acc": 0.7810240628823552, "train_speed(iter/s)": 0.137013 }, { "epoch": 0.9969215198414372, "grad_norm": 0.7699350118637085, "learning_rate": 5.438245502918127e-05, "loss": 0.8409226417541504, "memory(GiB)": 91.52, "step": 76830, "token_acc": 0.7508433570722731, "train_speed(iter/s)": 0.137011 }, { "epoch": 0.9969863982430929, "grad_norm": 0.8340044021606445, "learning_rate": 5.4377111894512486e-05, "loss": 0.833919906616211, "memory(GiB)": 91.52, "step": 76835, "token_acc": 0.7622312152693888, "train_speed(iter/s)": 0.13701 }, { "epoch": 0.9970512766447486, "grad_norm": 0.7640237808227539, "learning_rate": 5.4371768709472194e-05, "loss": 0.8574625015258789, "memory(GiB)": 91.52, "step": 76840, "token_acc": 0.7713800063741634, "train_speed(iter/s)": 0.137009 }, { "epoch": 0.9971161550464043, "grad_norm": 0.6882856488227844, "learning_rate": 5.436642547412185e-05, "loss": 0.797459888458252, "memory(GiB)": 91.52, "step": 76845, "token_acc": 0.7848211373273893, "train_speed(iter/s)": 0.137008 }, { "epoch": 0.99718103344806, "grad_norm": 0.7763152718544006, "learning_rate": 5.4361082188522986e-05, "loss": 0.841305923461914, "memory(GiB)": 91.52, "step": 76850, "token_acc": 0.7550698956487497, "train_speed(iter/s)": 0.137007 }, { "epoch": 0.9972459118497157, "grad_norm": 0.6628446578979492, "learning_rate": 5.435573885273706e-05, "loss": 0.8416984558105469, "memory(GiB)": 91.52, "step": 76855, "token_acc": 0.7676193025901881, "train_speed(iter/s)": 0.137006 }, { "epoch": 0.9973107902513714, "grad_norm": 0.7697901129722595, "learning_rate": 5.4350395466825573e-05, "loss": 0.8173864364624024, "memory(GiB)": 91.52, "step": 76860, "token_acc": 0.7713657242673222, "train_speed(iter/s)": 0.137004 }, { "epoch": 0.9973756686530271, "grad_norm": 0.6563647389411926, "learning_rate": 5.434505203085002e-05, "loss": 0.8456816673278809, "memory(GiB)": 91.52, "step": 76865, "token_acc": 0.7748120674239163, "train_speed(iter/s)": 0.137003 }, { "epoch": 0.9974405470546828, "grad_norm": 0.6271795630455017, "learning_rate": 5.433970854487187e-05, "loss": 0.8354022979736329, "memory(GiB)": 91.52, "step": 76870, "token_acc": 0.7676782033970577, "train_speed(iter/s)": 0.137002 }, { "epoch": 0.9975054254563385, "grad_norm": 0.7151060104370117, "learning_rate": 5.433436500895265e-05, "loss": 0.8513429641723633, "memory(GiB)": 91.52, "step": 76875, "token_acc": 0.7815952980688498, "train_speed(iter/s)": 0.137001 }, { "epoch": 0.9975703038579942, "grad_norm": 0.7650177478790283, "learning_rate": 5.432902142315383e-05, "loss": 0.8734930038452149, "memory(GiB)": 91.52, "step": 76880, "token_acc": 0.7561334190939998, "train_speed(iter/s)": 0.137 }, { "epoch": 0.9976351822596499, "grad_norm": 0.6819790601730347, "learning_rate": 5.4323677787536906e-05, "loss": 0.8083061218261719, "memory(GiB)": 91.52, "step": 76885, "token_acc": 0.7716695244427336, "train_speed(iter/s)": 0.136999 }, { "epoch": 0.9977000606613056, "grad_norm": 0.7471675276756287, "learning_rate": 5.431833410216338e-05, "loss": 0.8176586151123046, "memory(GiB)": 91.52, "step": 76890, "token_acc": 0.7695669447755265, "train_speed(iter/s)": 0.136998 }, { "epoch": 0.9977649390629613, "grad_norm": 0.6618499755859375, "learning_rate": 5.4312990367094754e-05, "loss": 0.8220645904541015, "memory(GiB)": 91.52, "step": 76895, "token_acc": 0.7842819682245874, "train_speed(iter/s)": 0.136997 }, { "epoch": 0.997829817464617, "grad_norm": 0.711348295211792, "learning_rate": 5.430764658239249e-05, "loss": 0.814915943145752, "memory(GiB)": 91.52, "step": 76900, "token_acc": 0.7695897134561543, "train_speed(iter/s)": 0.136996 }, { "epoch": 0.9978946958662727, "grad_norm": 0.7597518563270569, "learning_rate": 5.4302302748118116e-05, "loss": 0.8285945892333985, "memory(GiB)": 91.52, "step": 76905, "token_acc": 0.7791842475386779, "train_speed(iter/s)": 0.136995 }, { "epoch": 0.9979595742679284, "grad_norm": 0.6445077657699585, "learning_rate": 5.429695886433314e-05, "loss": 0.8213428497314453, "memory(GiB)": 91.52, "step": 76910, "token_acc": 0.7612408247574786, "train_speed(iter/s)": 0.136994 }, { "epoch": 0.9980244526695841, "grad_norm": 0.6320115923881531, "learning_rate": 5.4291614931099e-05, "loss": 0.8154075622558594, "memory(GiB)": 91.52, "step": 76915, "token_acc": 0.7568108974358975, "train_speed(iter/s)": 0.136993 }, { "epoch": 0.9980893310712398, "grad_norm": 0.7826087474822998, "learning_rate": 5.428627094847726e-05, "loss": 0.887723159790039, "memory(GiB)": 91.52, "step": 76920, "token_acc": 0.7657871450406243, "train_speed(iter/s)": 0.136992 }, { "epoch": 0.9981542094728955, "grad_norm": 0.6907188892364502, "learning_rate": 5.428092691652939e-05, "loss": 0.8344932556152344, "memory(GiB)": 91.52, "step": 76925, "token_acc": 0.7762157556691571, "train_speed(iter/s)": 0.136991 }, { "epoch": 0.9982190878745512, "grad_norm": 0.7924807071685791, "learning_rate": 5.427558283531687e-05, "loss": 0.8691381454467774, "memory(GiB)": 91.52, "step": 76930, "token_acc": 0.7662159514011366, "train_speed(iter/s)": 0.13699 }, { "epoch": 0.9982839662762069, "grad_norm": 0.7406343221664429, "learning_rate": 5.427023870490123e-05, "loss": 0.8406169891357422, "memory(GiB)": 91.52, "step": 76935, "token_acc": 0.744998244998245, "train_speed(iter/s)": 0.136989 }, { "epoch": 0.9983488446778626, "grad_norm": 0.7130187153816223, "learning_rate": 5.426489452534395e-05, "loss": 0.8357587814331054, "memory(GiB)": 91.52, "step": 76940, "token_acc": 0.7613384758231555, "train_speed(iter/s)": 0.136988 }, { "epoch": 0.9984137230795183, "grad_norm": 0.7265575528144836, "learning_rate": 5.425955029670655e-05, "loss": 0.8462860107421875, "memory(GiB)": 91.52, "step": 76945, "token_acc": 0.7789120828889228, "train_speed(iter/s)": 0.136987 }, { "epoch": 0.998478601481174, "grad_norm": 0.6386379599571228, "learning_rate": 5.42542060190505e-05, "loss": 0.8120701789855957, "memory(GiB)": 91.52, "step": 76950, "token_acc": 0.7788256119147897, "train_speed(iter/s)": 0.136986 }, { "epoch": 0.9985434798828297, "grad_norm": 0.7644311785697937, "learning_rate": 5.4248861692437334e-05, "loss": 0.8444313049316406, "memory(GiB)": 91.52, "step": 76955, "token_acc": 0.76822769021431, "train_speed(iter/s)": 0.136985 }, { "epoch": 0.9986083582844854, "grad_norm": 0.6652116179466248, "learning_rate": 5.424351731692855e-05, "loss": 0.8432491302490235, "memory(GiB)": 91.52, "step": 76960, "token_acc": 0.7760787740287673, "train_speed(iter/s)": 0.136984 }, { "epoch": 0.998673236686141, "grad_norm": 0.6799245476722717, "learning_rate": 5.4238172892585636e-05, "loss": 0.825067138671875, "memory(GiB)": 91.52, "step": 76965, "token_acc": 0.7436324684598905, "train_speed(iter/s)": 0.136983 }, { "epoch": 0.9987381150877968, "grad_norm": 0.8422784209251404, "learning_rate": 5.423282841947009e-05, "loss": 0.8242876052856445, "memory(GiB)": 91.52, "step": 76970, "token_acc": 0.7857906179754579, "train_speed(iter/s)": 0.136982 }, { "epoch": 0.9988029934894523, "grad_norm": 0.7998625040054321, "learning_rate": 5.4227483897643426e-05, "loss": 0.8321032524108887, "memory(GiB)": 91.52, "step": 76975, "token_acc": 0.750581213782574, "train_speed(iter/s)": 0.13698 }, { "epoch": 0.998867871891108, "grad_norm": 0.7012109160423279, "learning_rate": 5.422213932716718e-05, "loss": 0.8324764251708985, "memory(GiB)": 91.52, "step": 76980, "token_acc": 0.7661998132586368, "train_speed(iter/s)": 0.136979 }, { "epoch": 0.9989327502927637, "grad_norm": 0.6897986531257629, "learning_rate": 5.421679470810279e-05, "loss": 0.8155111312866211, "memory(GiB)": 91.52, "step": 76985, "token_acc": 0.7692332907201626, "train_speed(iter/s)": 0.136978 }, { "epoch": 0.9989976286944194, "grad_norm": 0.7347491383552551, "learning_rate": 5.4211450040511815e-05, "loss": 0.8354594230651855, "memory(GiB)": 91.52, "step": 76990, "token_acc": 0.7796412556053811, "train_speed(iter/s)": 0.136977 }, { "epoch": 0.9990625070960751, "grad_norm": 0.7841550707817078, "learning_rate": 5.4206105324455736e-05, "loss": 0.8028110504150391, "memory(GiB)": 91.52, "step": 76995, "token_acc": 0.7759208603439877, "train_speed(iter/s)": 0.136976 }, { "epoch": 0.9991273854977308, "grad_norm": 0.679582417011261, "learning_rate": 5.4200760559996075e-05, "loss": 0.7801999092102051, "memory(GiB)": 91.52, "step": 77000, "token_acc": 0.767272475027747, "train_speed(iter/s)": 0.136975 }, { "epoch": 0.9991922638993865, "grad_norm": 0.8243599534034729, "learning_rate": 5.419541574719433e-05, "loss": 0.842461109161377, "memory(GiB)": 91.52, "step": 77005, "token_acc": 0.7690871611372564, "train_speed(iter/s)": 0.136974 }, { "epoch": 0.9992571423010422, "grad_norm": 0.6686335802078247, "learning_rate": 5.419007088611201e-05, "loss": 0.8705652236938477, "memory(GiB)": 91.52, "step": 77010, "token_acc": 0.7522594020277931, "train_speed(iter/s)": 0.136973 }, { "epoch": 0.9993220207026979, "grad_norm": 0.7052117586135864, "learning_rate": 5.418472597681064e-05, "loss": 0.8581656455993653, "memory(GiB)": 91.52, "step": 77015, "token_acc": 0.7636825906597083, "train_speed(iter/s)": 0.136972 }, { "epoch": 0.9993868991043536, "grad_norm": 0.7652562260627747, "learning_rate": 5.417938101935168e-05, "loss": 0.8383868217468262, "memory(GiB)": 91.52, "step": 77020, "token_acc": 0.7524748719893799, "train_speed(iter/s)": 0.136971 }, { "epoch": 0.9994517775060093, "grad_norm": 0.6968792676925659, "learning_rate": 5.4174036013796695e-05, "loss": 0.8334850311279297, "memory(GiB)": 91.52, "step": 77025, "token_acc": 0.7729555110783384, "train_speed(iter/s)": 0.136969 }, { "epoch": 0.999516655907665, "grad_norm": 0.8094812035560608, "learning_rate": 5.4168690960207155e-05, "loss": 0.8547723770141602, "memory(GiB)": 91.52, "step": 77030, "token_acc": 0.766041831920845, "train_speed(iter/s)": 0.136968 }, { "epoch": 0.9995815343093207, "grad_norm": 0.7910928726196289, "learning_rate": 5.416334585864461e-05, "loss": 0.7974098682403564, "memory(GiB)": 91.52, "step": 77035, "token_acc": 0.787378834152171, "train_speed(iter/s)": 0.136966 }, { "epoch": 0.9996464127109764, "grad_norm": 0.6583125591278076, "learning_rate": 5.415800070917054e-05, "loss": 0.8238845825195312, "memory(GiB)": 91.52, "step": 77040, "token_acc": 0.7688852592895059, "train_speed(iter/s)": 0.136965 }, { "epoch": 0.9997112911126321, "grad_norm": 0.6999167203903198, "learning_rate": 5.415265551184646e-05, "loss": 0.8538262367248535, "memory(GiB)": 91.52, "step": 77045, "token_acc": 0.7830180712717446, "train_speed(iter/s)": 0.136964 }, { "epoch": 0.9997761695142878, "grad_norm": 0.6956627368927002, "learning_rate": 5.41473102667339e-05, "loss": 0.8141539573669434, "memory(GiB)": 91.52, "step": 77050, "token_acc": 0.7704613095238095, "train_speed(iter/s)": 0.136963 }, { "epoch": 0.9998410479159435, "grad_norm": 0.7471052408218384, "learning_rate": 5.4141964973894345e-05, "loss": 0.8557767868041992, "memory(GiB)": 91.52, "step": 77055, "token_acc": 0.7686647836245074, "train_speed(iter/s)": 0.136962 }, { "epoch": 0.9999059263175992, "grad_norm": 0.8433261513710022, "learning_rate": 5.413661963338933e-05, "loss": 0.8436470031738281, "memory(GiB)": 91.52, "step": 77060, "token_acc": 0.7817421337811717, "train_speed(iter/s)": 0.13696 }, { "epoch": 0.9999708047192549, "grad_norm": 0.7512041330337524, "learning_rate": 5.413127424528035e-05, "loss": 0.8380195617675781, "memory(GiB)": 91.52, "step": 77065, "token_acc": 0.7624529859645904, "train_speed(iter/s)": 0.13696 }, { "epoch": 1.0000389270409935, "grad_norm": 0.7689199447631836, "learning_rate": 5.4125928809628954e-05, "loss": 0.9285550117492676, "memory(GiB)": 91.52, "step": 77070, "token_acc": 0.7686315713198797, "train_speed(iter/s)": 0.136957 }, { "epoch": 1.0001038054426492, "grad_norm": 0.6756064295768738, "learning_rate": 5.412058332649662e-05, "loss": 0.7430599212646485, "memory(GiB)": 91.52, "step": 77075, "token_acc": 0.7662723805830263, "train_speed(iter/s)": 0.136955 }, { "epoch": 1.000168683844305, "grad_norm": 0.7916273474693298, "learning_rate": 5.411523779594488e-05, "loss": 0.7898098468780518, "memory(GiB)": 91.52, "step": 77080, "token_acc": 0.7634827810266407, "train_speed(iter/s)": 0.136954 }, { "epoch": 1.0002335622459606, "grad_norm": 0.7293190360069275, "learning_rate": 5.4109892218035266e-05, "loss": 0.7912176609039306, "memory(GiB)": 91.52, "step": 77085, "token_acc": 0.7684891706286318, "train_speed(iter/s)": 0.136953 }, { "epoch": 1.0002984406476163, "grad_norm": 0.6885404586791992, "learning_rate": 5.410454659282926e-05, "loss": 0.7483326911926269, "memory(GiB)": 91.52, "step": 77090, "token_acc": 0.7866157681520723, "train_speed(iter/s)": 0.136952 }, { "epoch": 1.000363319049272, "grad_norm": 0.7313986420631409, "learning_rate": 5.4099200920388405e-05, "loss": 0.8071775436401367, "memory(GiB)": 91.52, "step": 77095, "token_acc": 0.7738457084961644, "train_speed(iter/s)": 0.136951 }, { "epoch": 1.0004281974509277, "grad_norm": 0.8072203993797302, "learning_rate": 5.409385520077419e-05, "loss": 0.7868602275848389, "memory(GiB)": 91.52, "step": 77100, "token_acc": 0.7697160883280757, "train_speed(iter/s)": 0.13695 }, { "epoch": 1.0004930758525834, "grad_norm": 0.6515564918518066, "learning_rate": 5.4088509434048177e-05, "loss": 0.7949819564819336, "memory(GiB)": 91.52, "step": 77105, "token_acc": 0.7646371871350567, "train_speed(iter/s)": 0.136949 }, { "epoch": 1.000557954254239, "grad_norm": 0.6500244736671448, "learning_rate": 5.4083163620271835e-05, "loss": 0.7503816604614257, "memory(GiB)": 91.52, "step": 77110, "token_acc": 0.778643359300779, "train_speed(iter/s)": 0.136948 }, { "epoch": 1.0006228326558946, "grad_norm": 0.7155803442001343, "learning_rate": 5.4077817759506735e-05, "loss": 0.8240274429321289, "memory(GiB)": 91.52, "step": 77115, "token_acc": 0.7747694857892737, "train_speed(iter/s)": 0.136947 }, { "epoch": 1.0006877110575503, "grad_norm": 0.6836833953857422, "learning_rate": 5.4072471851814355e-05, "loss": 0.7340295791625977, "memory(GiB)": 91.52, "step": 77120, "token_acc": 0.8004889121704208, "train_speed(iter/s)": 0.136946 }, { "epoch": 1.000752589459206, "grad_norm": 0.7845895290374756, "learning_rate": 5.4067125897256246e-05, "loss": 0.7600085258483886, "memory(GiB)": 91.52, "step": 77125, "token_acc": 0.7783832879200726, "train_speed(iter/s)": 0.136945 }, { "epoch": 1.0008174678608617, "grad_norm": 0.7077704668045044, "learning_rate": 5.4061779895893894e-05, "loss": 0.7465141296386719, "memory(GiB)": 91.52, "step": 77130, "token_acc": 0.7848456185392879, "train_speed(iter/s)": 0.136944 }, { "epoch": 1.0008823462625174, "grad_norm": 0.7365553975105286, "learning_rate": 5.405643384778884e-05, "loss": 0.776404857635498, "memory(GiB)": 91.52, "step": 77135, "token_acc": 0.7711710543379587, "train_speed(iter/s)": 0.136943 }, { "epoch": 1.000947224664173, "grad_norm": 0.8607676029205322, "learning_rate": 5.4051087753002626e-05, "loss": 0.8130586624145508, "memory(GiB)": 91.52, "step": 77140, "token_acc": 0.7779558961526432, "train_speed(iter/s)": 0.136942 }, { "epoch": 1.0010121030658288, "grad_norm": 0.6640310287475586, "learning_rate": 5.404574161159675e-05, "loss": 0.7449564933776855, "memory(GiB)": 91.52, "step": 77145, "token_acc": 0.7671241402919118, "train_speed(iter/s)": 0.136941 }, { "epoch": 1.0010769814674845, "grad_norm": 0.7308955192565918, "learning_rate": 5.4040395423632725e-05, "loss": 0.8440768241882324, "memory(GiB)": 91.52, "step": 77150, "token_acc": 0.7477241278858059, "train_speed(iter/s)": 0.13694 }, { "epoch": 1.0011418598691402, "grad_norm": 0.7284024953842163, "learning_rate": 5.4035049189172104e-05, "loss": 0.7693791389465332, "memory(GiB)": 91.52, "step": 77155, "token_acc": 0.765296154870793, "train_speed(iter/s)": 0.136939 }, { "epoch": 1.0012067382707959, "grad_norm": 0.7126418948173523, "learning_rate": 5.402970290827639e-05, "loss": 0.7798029899597168, "memory(GiB)": 91.52, "step": 77160, "token_acc": 0.7830242307965608, "train_speed(iter/s)": 0.136938 }, { "epoch": 1.0012716166724516, "grad_norm": 0.8039487600326538, "learning_rate": 5.4024356581007116e-05, "loss": 0.7781257629394531, "memory(GiB)": 91.52, "step": 77165, "token_acc": 0.7759396080950851, "train_speed(iter/s)": 0.136937 }, { "epoch": 1.0013364950741073, "grad_norm": 0.6347360014915466, "learning_rate": 5.4019010207425805e-05, "loss": 0.7692024230957031, "memory(GiB)": 91.52, "step": 77170, "token_acc": 0.7887431732006818, "train_speed(iter/s)": 0.136936 }, { "epoch": 1.001401373475763, "grad_norm": 0.6209936141967773, "learning_rate": 5.401366378759398e-05, "loss": 0.7925107955932618, "memory(GiB)": 91.52, "step": 77175, "token_acc": 0.7862970517337289, "train_speed(iter/s)": 0.136935 }, { "epoch": 1.0014662518774187, "grad_norm": 0.6611760854721069, "learning_rate": 5.4008317321573174e-05, "loss": 0.7409549713134765, "memory(GiB)": 91.52, "step": 77180, "token_acc": 0.8011992789611154, "train_speed(iter/s)": 0.136933 }, { "epoch": 1.0015311302790744, "grad_norm": 0.6895447969436646, "learning_rate": 5.400297080942491e-05, "loss": 0.733718729019165, "memory(GiB)": 91.52, "step": 77185, "token_acc": 0.7919481636162634, "train_speed(iter/s)": 0.136932 }, { "epoch": 1.00159600868073, "grad_norm": 0.744857907295227, "learning_rate": 5.399762425121072e-05, "loss": 0.7591989040374756, "memory(GiB)": 91.52, "step": 77190, "token_acc": 0.7875803137536284, "train_speed(iter/s)": 0.13693 }, { "epoch": 1.0016608870823858, "grad_norm": 0.7227488160133362, "learning_rate": 5.399227764699213e-05, "loss": 0.7735004425048828, "memory(GiB)": 91.52, "step": 77195, "token_acc": 0.7779208483241811, "train_speed(iter/s)": 0.136929 }, { "epoch": 1.0017257654840415, "grad_norm": 0.686784565448761, "learning_rate": 5.3986930996830655e-05, "loss": 0.7404465675354004, "memory(GiB)": 91.52, "step": 77200, "token_acc": 0.7896755508590662, "train_speed(iter/s)": 0.136928 }, { "epoch": 1.0017906438856972, "grad_norm": 0.6967642307281494, "learning_rate": 5.398158430078784e-05, "loss": 0.8036537170410156, "memory(GiB)": 91.52, "step": 77205, "token_acc": 0.7790260893320584, "train_speed(iter/s)": 0.136927 }, { "epoch": 1.0018555222873529, "grad_norm": 0.6249158978462219, "learning_rate": 5.39762375589252e-05, "loss": 0.7229156494140625, "memory(GiB)": 91.52, "step": 77210, "token_acc": 0.7837952488687783, "train_speed(iter/s)": 0.136925 }, { "epoch": 1.0019204006890086, "grad_norm": 0.694415271282196, "learning_rate": 5.3970890771304286e-05, "loss": 0.7332996368408203, "memory(GiB)": 91.52, "step": 77215, "token_acc": 0.7763511378002529, "train_speed(iter/s)": 0.136925 }, { "epoch": 1.0019852790906643, "grad_norm": 0.7913516163825989, "learning_rate": 5.3965543937986615e-05, "loss": 0.7842330932617188, "memory(GiB)": 91.52, "step": 77220, "token_acc": 0.763808025177026, "train_speed(iter/s)": 0.136924 }, { "epoch": 1.00205015749232, "grad_norm": 0.7070788145065308, "learning_rate": 5.396019705903372e-05, "loss": 0.7693229675292969, "memory(GiB)": 91.52, "step": 77225, "token_acc": 0.7700205989542069, "train_speed(iter/s)": 0.136922 }, { "epoch": 1.0021150358939757, "grad_norm": 0.7483124136924744, "learning_rate": 5.395485013450713e-05, "loss": 0.7522486686706543, "memory(GiB)": 91.52, "step": 77230, "token_acc": 0.7868106094574553, "train_speed(iter/s)": 0.136921 }, { "epoch": 1.0021799142956314, "grad_norm": 0.7851686477661133, "learning_rate": 5.3949503164468385e-05, "loss": 0.7509097099304199, "memory(GiB)": 91.52, "step": 77235, "token_acc": 0.7804990605531199, "train_speed(iter/s)": 0.13692 }, { "epoch": 1.002244792697287, "grad_norm": 0.7729852199554443, "learning_rate": 5.3944156148978984e-05, "loss": 0.8237994194030762, "memory(GiB)": 91.52, "step": 77240, "token_acc": 0.7850438155935021, "train_speed(iter/s)": 0.136919 }, { "epoch": 1.0023096710989428, "grad_norm": 0.7754432559013367, "learning_rate": 5.393880908810052e-05, "loss": 0.7435422897338867, "memory(GiB)": 91.52, "step": 77245, "token_acc": 0.7734475650605515, "train_speed(iter/s)": 0.136918 }, { "epoch": 1.0023745495005985, "grad_norm": 0.6894274353981018, "learning_rate": 5.3933461981894485e-05, "loss": 0.7436173439025879, "memory(GiB)": 91.52, "step": 77250, "token_acc": 0.8040083896527616, "train_speed(iter/s)": 0.136917 }, { "epoch": 1.0024394279022542, "grad_norm": 0.7084876298904419, "learning_rate": 5.3928114830422416e-05, "loss": 0.7210586547851563, "memory(GiB)": 91.52, "step": 77255, "token_acc": 0.7941243023080404, "train_speed(iter/s)": 0.136916 }, { "epoch": 1.0025043063039099, "grad_norm": 0.7164655327796936, "learning_rate": 5.3922767633745865e-05, "loss": 0.7915337085723877, "memory(GiB)": 91.52, "step": 77260, "token_acc": 0.7714149291739615, "train_speed(iter/s)": 0.136915 }, { "epoch": 1.0025691847055656, "grad_norm": 0.6931076049804688, "learning_rate": 5.391742039192634e-05, "loss": 0.7337028503417968, "memory(GiB)": 91.52, "step": 77265, "token_acc": 0.7798950251026928, "train_speed(iter/s)": 0.136914 }, { "epoch": 1.0026340631072213, "grad_norm": 0.724084198474884, "learning_rate": 5.391207310502542e-05, "loss": 0.76593918800354, "memory(GiB)": 91.52, "step": 77270, "token_acc": 0.7693975981613957, "train_speed(iter/s)": 0.136913 }, { "epoch": 1.002698941508877, "grad_norm": 0.7541322708129883, "learning_rate": 5.3906725773104594e-05, "loss": 0.7440389156341553, "memory(GiB)": 91.52, "step": 77275, "token_acc": 0.798604427333975, "train_speed(iter/s)": 0.136912 }, { "epoch": 1.0027638199105326, "grad_norm": 0.7011085748672485, "learning_rate": 5.390137839622542e-05, "loss": 0.775101613998413, "memory(GiB)": 91.52, "step": 77280, "token_acc": 0.7742119689355871, "train_speed(iter/s)": 0.136912 }, { "epoch": 1.0028286983121883, "grad_norm": 0.7722739577293396, "learning_rate": 5.389603097444944e-05, "loss": 0.7577145576477051, "memory(GiB)": 91.52, "step": 77285, "token_acc": 0.7888316331896098, "train_speed(iter/s)": 0.13691 }, { "epoch": 1.002893576713844, "grad_norm": 0.6783035397529602, "learning_rate": 5.389068350783819e-05, "loss": 0.7467704296112061, "memory(GiB)": 91.52, "step": 77290, "token_acc": 0.794827459348098, "train_speed(iter/s)": 0.136909 }, { "epoch": 1.0029584551154997, "grad_norm": 0.7008704543113708, "learning_rate": 5.3885335996453204e-05, "loss": 0.7380277156829834, "memory(GiB)": 91.52, "step": 77295, "token_acc": 0.7851874040513723, "train_speed(iter/s)": 0.136908 }, { "epoch": 1.0030233335171554, "grad_norm": 0.8408884406089783, "learning_rate": 5.3879988440355996e-05, "loss": 0.8315649032592773, "memory(GiB)": 91.52, "step": 77300, "token_acc": 0.7596313418681839, "train_speed(iter/s)": 0.136907 }, { "epoch": 1.0030882119188111, "grad_norm": 0.7342612147331238, "learning_rate": 5.387464083960817e-05, "loss": 0.7692934989929199, "memory(GiB)": 91.52, "step": 77305, "token_acc": 0.7797436169085709, "train_speed(iter/s)": 0.136905 }, { "epoch": 1.0031530903204668, "grad_norm": 0.7374550104141235, "learning_rate": 5.38692931942712e-05, "loss": 0.7608814239501953, "memory(GiB)": 91.52, "step": 77310, "token_acc": 0.7636111215339012, "train_speed(iter/s)": 0.136905 }, { "epoch": 1.0032179687221225, "grad_norm": 0.6957454085350037, "learning_rate": 5.3863945504406664e-05, "loss": 0.721634817123413, "memory(GiB)": 91.52, "step": 77315, "token_acc": 0.7989461907683424, "train_speed(iter/s)": 0.136904 }, { "epoch": 1.0032828471237782, "grad_norm": 0.7340061664581299, "learning_rate": 5.385859777007609e-05, "loss": 0.7679607391357421, "memory(GiB)": 91.52, "step": 77320, "token_acc": 0.7730189523951649, "train_speed(iter/s)": 0.136903 }, { "epoch": 1.003347725525434, "grad_norm": 0.7244923710823059, "learning_rate": 5.3853249991341024e-05, "loss": 0.7673204421997071, "memory(GiB)": 91.52, "step": 77325, "token_acc": 0.7799654078985299, "train_speed(iter/s)": 0.136902 }, { "epoch": 1.0034126039270896, "grad_norm": 0.6524991989135742, "learning_rate": 5.3847902168262995e-05, "loss": 0.7447549819946289, "memory(GiB)": 91.52, "step": 77330, "token_acc": 0.8028189167918723, "train_speed(iter/s)": 0.136901 }, { "epoch": 1.0034774823287453, "grad_norm": 0.7063096165657043, "learning_rate": 5.384255430090356e-05, "loss": 0.7984007835388184, "memory(GiB)": 91.52, "step": 77335, "token_acc": 0.7772278965287284, "train_speed(iter/s)": 0.1369 }, { "epoch": 1.003542360730401, "grad_norm": 0.6958215832710266, "learning_rate": 5.383720638932427e-05, "loss": 0.7711497783660889, "memory(GiB)": 91.52, "step": 77340, "token_acc": 0.7742777189532597, "train_speed(iter/s)": 0.136899 }, { "epoch": 1.0036072391320567, "grad_norm": 0.7566580772399902, "learning_rate": 5.383185843358663e-05, "loss": 0.7950732231140136, "memory(GiB)": 91.52, "step": 77345, "token_acc": 0.7692307692307693, "train_speed(iter/s)": 0.136898 }, { "epoch": 1.0036721175337124, "grad_norm": 0.6930193901062012, "learning_rate": 5.382651043375223e-05, "loss": 0.7757275581359864, "memory(GiB)": 91.52, "step": 77350, "token_acc": 0.7646568385947625, "train_speed(iter/s)": 0.136897 }, { "epoch": 1.0037369959353681, "grad_norm": 0.771277666091919, "learning_rate": 5.382116238988259e-05, "loss": 0.7537680625915527, "memory(GiB)": 91.52, "step": 77355, "token_acc": 0.7888697999754571, "train_speed(iter/s)": 0.136896 }, { "epoch": 1.0038018743370238, "grad_norm": 0.6971801519393921, "learning_rate": 5.3815814302039256e-05, "loss": 0.7541640281677247, "memory(GiB)": 91.52, "step": 77360, "token_acc": 0.7690042075736325, "train_speed(iter/s)": 0.136896 }, { "epoch": 1.0038667527386795, "grad_norm": 0.6557146310806274, "learning_rate": 5.381046617028379e-05, "loss": 0.7687993049621582, "memory(GiB)": 91.52, "step": 77365, "token_acc": 0.7914283918808521, "train_speed(iter/s)": 0.136895 }, { "epoch": 1.0039316311403352, "grad_norm": 0.6990090012550354, "learning_rate": 5.38051179946777e-05, "loss": 0.7606448173522949, "memory(GiB)": 91.52, "step": 77370, "token_acc": 0.7811872029600332, "train_speed(iter/s)": 0.136894 }, { "epoch": 1.003996509541991, "grad_norm": 0.6441535949707031, "learning_rate": 5.379976977528258e-05, "loss": 0.7401861667633056, "memory(GiB)": 91.52, "step": 77375, "token_acc": 0.7809642745640248, "train_speed(iter/s)": 0.136893 }, { "epoch": 1.0040613879436466, "grad_norm": 0.7486153244972229, "learning_rate": 5.379442151215994e-05, "loss": 0.7229686737060547, "memory(GiB)": 91.52, "step": 77380, "token_acc": 0.7945027820775635, "train_speed(iter/s)": 0.136891 }, { "epoch": 1.0041262663453023, "grad_norm": 0.744480550289154, "learning_rate": 5.378907320537134e-05, "loss": 0.760871696472168, "memory(GiB)": 91.52, "step": 77385, "token_acc": 0.7676474054287622, "train_speed(iter/s)": 0.136891 }, { "epoch": 1.004191144746958, "grad_norm": 0.6782874464988708, "learning_rate": 5.3783724854978344e-05, "loss": 0.7780380249023438, "memory(GiB)": 91.52, "step": 77390, "token_acc": 0.7695543833532895, "train_speed(iter/s)": 0.136889 }, { "epoch": 1.0042560231486137, "grad_norm": 0.7163611650466919, "learning_rate": 5.377837646104248e-05, "loss": 0.7350624084472657, "memory(GiB)": 91.52, "step": 77395, "token_acc": 0.7871244635193133, "train_speed(iter/s)": 0.136888 }, { "epoch": 1.0043209015502694, "grad_norm": 0.632969319820404, "learning_rate": 5.3773028023625306e-05, "loss": 0.736351728439331, "memory(GiB)": 91.52, "step": 77400, "token_acc": 0.8014025167692223, "train_speed(iter/s)": 0.136886 }, { "epoch": 1.0043857799519251, "grad_norm": 0.7172192931175232, "learning_rate": 5.3767679542788355e-05, "loss": 0.8012161254882812, "memory(GiB)": 91.52, "step": 77405, "token_acc": 0.7691408153321423, "train_speed(iter/s)": 0.136886 }, { "epoch": 1.0044506583535808, "grad_norm": 0.777874767780304, "learning_rate": 5.376233101859319e-05, "loss": 0.7696844100952148, "memory(GiB)": 91.52, "step": 77410, "token_acc": 0.7867484306397184, "train_speed(iter/s)": 0.136884 }, { "epoch": 1.0045155367552365, "grad_norm": 0.6948013305664062, "learning_rate": 5.3756982451101367e-05, "loss": 0.7397259712219239, "memory(GiB)": 91.52, "step": 77415, "token_acc": 0.7882719457370455, "train_speed(iter/s)": 0.136883 }, { "epoch": 1.0045804151568922, "grad_norm": 0.7097170352935791, "learning_rate": 5.375163384037444e-05, "loss": 0.7287763595581055, "memory(GiB)": 91.52, "step": 77420, "token_acc": 0.7631005216849646, "train_speed(iter/s)": 0.136882 }, { "epoch": 1.004645293558548, "grad_norm": 0.6663745641708374, "learning_rate": 5.374628518647394e-05, "loss": 0.7681007385253906, "memory(GiB)": 91.52, "step": 77425, "token_acc": 0.7947962888261396, "train_speed(iter/s)": 0.136881 }, { "epoch": 1.0047101719602036, "grad_norm": 0.7586739659309387, "learning_rate": 5.374093648946145e-05, "loss": 0.7511087417602539, "memory(GiB)": 91.52, "step": 77430, "token_acc": 0.7912190901881624, "train_speed(iter/s)": 0.136879 }, { "epoch": 1.0047750503618593, "grad_norm": 0.7307358980178833, "learning_rate": 5.3735587749398494e-05, "loss": 0.7636147022247315, "memory(GiB)": 91.52, "step": 77435, "token_acc": 0.7815985743515279, "train_speed(iter/s)": 0.136878 }, { "epoch": 1.004839928763515, "grad_norm": 0.7941005229949951, "learning_rate": 5.3730238966346625e-05, "loss": 0.80849609375, "memory(GiB)": 91.52, "step": 77440, "token_acc": 0.7589870157742247, "train_speed(iter/s)": 0.136877 }, { "epoch": 1.0049048071651707, "grad_norm": 0.7407634258270264, "learning_rate": 5.372489014036741e-05, "loss": 0.7472448348999023, "memory(GiB)": 91.52, "step": 77445, "token_acc": 0.7852422107633089, "train_speed(iter/s)": 0.136876 }, { "epoch": 1.0049696855668264, "grad_norm": 0.7655802965164185, "learning_rate": 5.37195412715224e-05, "loss": 0.7227265357971191, "memory(GiB)": 91.52, "step": 77450, "token_acc": 0.7903703945010914, "train_speed(iter/s)": 0.136875 }, { "epoch": 1.0050345639684821, "grad_norm": 0.7448201179504395, "learning_rate": 5.371419235987316e-05, "loss": 0.7436770439147949, "memory(GiB)": 91.52, "step": 77455, "token_acc": 0.7638908676449637, "train_speed(iter/s)": 0.136874 }, { "epoch": 1.0050994423701378, "grad_norm": 0.7712724804878235, "learning_rate": 5.370884340548121e-05, "loss": 0.7709986686706543, "memory(GiB)": 91.52, "step": 77460, "token_acc": 0.7827179451385496, "train_speed(iter/s)": 0.136873 }, { "epoch": 1.0051643207717935, "grad_norm": 0.6924340128898621, "learning_rate": 5.370349440840815e-05, "loss": 0.7577849864959717, "memory(GiB)": 91.52, "step": 77465, "token_acc": 0.7956413122082703, "train_speed(iter/s)": 0.136872 }, { "epoch": 1.0052291991734492, "grad_norm": 0.7170453667640686, "learning_rate": 5.369814536871551e-05, "loss": 0.7409110546112061, "memory(GiB)": 91.52, "step": 77470, "token_acc": 0.764769465876477, "train_speed(iter/s)": 0.136871 }, { "epoch": 1.005294077575105, "grad_norm": 0.7532721757888794, "learning_rate": 5.369279628646484e-05, "loss": 0.7811948776245117, "memory(GiB)": 91.52, "step": 77475, "token_acc": 0.7739887749177472, "train_speed(iter/s)": 0.13687 }, { "epoch": 1.0053589559767606, "grad_norm": 0.7987435460090637, "learning_rate": 5.368744716171772e-05, "loss": 0.8005194664001465, "memory(GiB)": 91.52, "step": 77480, "token_acc": 0.7724790680742628, "train_speed(iter/s)": 0.136869 }, { "epoch": 1.0054238343784163, "grad_norm": 0.721335768699646, "learning_rate": 5.3682097994535696e-05, "loss": 0.7975834846496582, "memory(GiB)": 91.52, "step": 77485, "token_acc": 0.7783987270539838, "train_speed(iter/s)": 0.136867 }, { "epoch": 1.005488712780072, "grad_norm": 0.6613350510597229, "learning_rate": 5.367674878498032e-05, "loss": 0.7530432224273682, "memory(GiB)": 91.52, "step": 77490, "token_acc": 0.7921389545788143, "train_speed(iter/s)": 0.136866 }, { "epoch": 1.0055535911817277, "grad_norm": 0.8135889172554016, "learning_rate": 5.3671399533113145e-05, "loss": 0.7643832206726074, "memory(GiB)": 91.52, "step": 77495, "token_acc": 0.7803197243479306, "train_speed(iter/s)": 0.136866 }, { "epoch": 1.0056184695833834, "grad_norm": 0.8301946520805359, "learning_rate": 5.3666050238995754e-05, "loss": 0.7847240447998047, "memory(GiB)": 91.52, "step": 77500, "token_acc": 0.7663624589620794, "train_speed(iter/s)": 0.136864 }, { "epoch": 1.0056833479850391, "grad_norm": 0.7409906983375549, "learning_rate": 5.36607009026897e-05, "loss": 0.7428267002105713, "memory(GiB)": 91.52, "step": 77505, "token_acc": 0.7923797082516874, "train_speed(iter/s)": 0.136863 }, { "epoch": 1.0057482263866948, "grad_norm": 0.7077964544296265, "learning_rate": 5.365535152425653e-05, "loss": 0.7507586479187012, "memory(GiB)": 91.52, "step": 77510, "token_acc": 0.762409673553869, "train_speed(iter/s)": 0.136862 }, { "epoch": 1.0058131047883505, "grad_norm": 0.7521745562553406, "learning_rate": 5.3650002103757804e-05, "loss": 0.7827336311340332, "memory(GiB)": 91.52, "step": 77515, "token_acc": 0.7662745860471145, "train_speed(iter/s)": 0.136861 }, { "epoch": 1.0058779831900062, "grad_norm": 0.779105544090271, "learning_rate": 5.364465264125509e-05, "loss": 0.7651727199554443, "memory(GiB)": 91.52, "step": 77520, "token_acc": 0.7872431769395891, "train_speed(iter/s)": 0.13686 }, { "epoch": 1.005942861591662, "grad_norm": 0.6895347833633423, "learning_rate": 5.363930313680995e-05, "loss": 0.7983731746673584, "memory(GiB)": 91.52, "step": 77525, "token_acc": 0.7602420532795438, "train_speed(iter/s)": 0.136859 }, { "epoch": 1.0060077399933176, "grad_norm": 0.7412010431289673, "learning_rate": 5.363395359048393e-05, "loss": 0.77952561378479, "memory(GiB)": 91.52, "step": 77530, "token_acc": 0.7982695528127519, "train_speed(iter/s)": 0.136858 }, { "epoch": 1.0060726183949733, "grad_norm": 0.700275182723999, "learning_rate": 5.3628604002338625e-05, "loss": 0.7730841636657715, "memory(GiB)": 91.52, "step": 77535, "token_acc": 0.7767586931011056, "train_speed(iter/s)": 0.136857 }, { "epoch": 1.006137496796629, "grad_norm": 0.7451398372650146, "learning_rate": 5.362325437243557e-05, "loss": 0.7727190494537354, "memory(GiB)": 91.52, "step": 77540, "token_acc": 0.7794454241867097, "train_speed(iter/s)": 0.136856 }, { "epoch": 1.0062023751982847, "grad_norm": 0.7232997417449951, "learning_rate": 5.361790470083634e-05, "loss": 0.748076343536377, "memory(GiB)": 91.52, "step": 77545, "token_acc": 0.772245330723432, "train_speed(iter/s)": 0.136856 }, { "epoch": 1.0062672535999404, "grad_norm": 0.7779895067214966, "learning_rate": 5.3612554987602506e-05, "loss": 0.7761691093444825, "memory(GiB)": 91.52, "step": 77550, "token_acc": 0.7935095408863505, "train_speed(iter/s)": 0.136855 }, { "epoch": 1.006332132001596, "grad_norm": 0.6711764335632324, "learning_rate": 5.360720523279561e-05, "loss": 0.7834954738616944, "memory(GiB)": 91.52, "step": 77555, "token_acc": 0.7860508053734004, "train_speed(iter/s)": 0.136854 }, { "epoch": 1.0063970104032518, "grad_norm": 0.7100803256034851, "learning_rate": 5.360185543647722e-05, "loss": 0.7657126903533935, "memory(GiB)": 91.52, "step": 77560, "token_acc": 0.783002853324665, "train_speed(iter/s)": 0.136852 }, { "epoch": 1.0064618888049075, "grad_norm": 0.714221179485321, "learning_rate": 5.3596505598708916e-05, "loss": 0.743287467956543, "memory(GiB)": 91.52, "step": 77565, "token_acc": 0.7999636231356857, "train_speed(iter/s)": 0.136851 }, { "epoch": 1.0065267672065632, "grad_norm": 0.7134130001068115, "learning_rate": 5.359115571955225e-05, "loss": 0.7555456161499023, "memory(GiB)": 91.52, "step": 77570, "token_acc": 0.7826047052344969, "train_speed(iter/s)": 0.13685 }, { "epoch": 1.006591645608219, "grad_norm": 0.6858759522438049, "learning_rate": 5.358580579906881e-05, "loss": 0.7729991912841797, "memory(GiB)": 91.52, "step": 77575, "token_acc": 0.7993503149387949, "train_speed(iter/s)": 0.136849 }, { "epoch": 1.0066565240098746, "grad_norm": 0.6654310822486877, "learning_rate": 5.358045583732015e-05, "loss": 0.7310939788818359, "memory(GiB)": 91.52, "step": 77580, "token_acc": 0.7884337064713354, "train_speed(iter/s)": 0.136848 }, { "epoch": 1.0067214024115303, "grad_norm": 0.7124375104904175, "learning_rate": 5.357510583436782e-05, "loss": 0.7854086875915527, "memory(GiB)": 91.52, "step": 77585, "token_acc": 0.784414207457148, "train_speed(iter/s)": 0.136847 }, { "epoch": 1.006786280813186, "grad_norm": 0.6683455109596252, "learning_rate": 5.356975579027341e-05, "loss": 0.7875701427459717, "memory(GiB)": 91.52, "step": 77590, "token_acc": 0.7754066557230264, "train_speed(iter/s)": 0.136847 }, { "epoch": 1.0068511592148415, "grad_norm": 0.7028453350067139, "learning_rate": 5.356440570509849e-05, "loss": 0.7605993270874023, "memory(GiB)": 91.52, "step": 77595, "token_acc": 0.7850084444284738, "train_speed(iter/s)": 0.136846 }, { "epoch": 1.0069160376164972, "grad_norm": 0.7605669498443604, "learning_rate": 5.355905557890459e-05, "loss": 0.7850402355194092, "memory(GiB)": 91.52, "step": 77600, "token_acc": 0.7849724854152013, "train_speed(iter/s)": 0.136845 }, { "epoch": 1.0069809160181529, "grad_norm": 0.6560158729553223, "learning_rate": 5.355370541175333e-05, "loss": 0.7435185432434082, "memory(GiB)": 91.52, "step": 77605, "token_acc": 0.7964847039012068, "train_speed(iter/s)": 0.136845 }, { "epoch": 1.0070457944198086, "grad_norm": 0.6927880644798279, "learning_rate": 5.3548355203706254e-05, "loss": 0.7557910919189453, "memory(GiB)": 91.52, "step": 77610, "token_acc": 0.7810935544430538, "train_speed(iter/s)": 0.136843 }, { "epoch": 1.0071106728214643, "grad_norm": 0.6769036054611206, "learning_rate": 5.354300495482494e-05, "loss": 0.7297313213348389, "memory(GiB)": 91.52, "step": 77615, "token_acc": 0.7857657843238233, "train_speed(iter/s)": 0.136842 }, { "epoch": 1.00717555122312, "grad_norm": 0.6596149206161499, "learning_rate": 5.353765466517094e-05, "loss": 0.7133214950561524, "memory(GiB)": 91.52, "step": 77620, "token_acc": 0.7938698089770705, "train_speed(iter/s)": 0.136841 }, { "epoch": 1.0072404296247757, "grad_norm": 0.7120230197906494, "learning_rate": 5.353230433480584e-05, "loss": 0.7779108047485351, "memory(GiB)": 91.52, "step": 77625, "token_acc": 0.8013538656426903, "train_speed(iter/s)": 0.13684 }, { "epoch": 1.0073053080264314, "grad_norm": 0.6563724279403687, "learning_rate": 5.3526953963791224e-05, "loss": 0.7884017944335937, "memory(GiB)": 91.52, "step": 77630, "token_acc": 0.7741599111978203, "train_speed(iter/s)": 0.136838 }, { "epoch": 1.007370186428087, "grad_norm": 0.6987583041191101, "learning_rate": 5.3521603552188626e-05, "loss": 0.7444091796875, "memory(GiB)": 91.52, "step": 77635, "token_acc": 0.7764921030756442, "train_speed(iter/s)": 0.136837 }, { "epoch": 1.0074350648297428, "grad_norm": 0.6962289810180664, "learning_rate": 5.3516253100059656e-05, "loss": 0.7985734939575195, "memory(GiB)": 91.52, "step": 77640, "token_acc": 0.760643478807302, "train_speed(iter/s)": 0.136836 }, { "epoch": 1.0074999432313985, "grad_norm": 0.7389109134674072, "learning_rate": 5.3510902607465875e-05, "loss": 0.7618379116058349, "memory(GiB)": 91.52, "step": 77645, "token_acc": 0.7701365817644887, "train_speed(iter/s)": 0.136835 }, { "epoch": 1.0075648216330542, "grad_norm": 0.7663214206695557, "learning_rate": 5.3505552074468847e-05, "loss": 0.7440874576568604, "memory(GiB)": 91.52, "step": 77650, "token_acc": 0.7911700627963604, "train_speed(iter/s)": 0.136834 }, { "epoch": 1.0076297000347099, "grad_norm": 0.7475079894065857, "learning_rate": 5.3500201501130146e-05, "loss": 0.7608360767364502, "memory(GiB)": 91.52, "step": 77655, "token_acc": 0.7640105581270801, "train_speed(iter/s)": 0.136834 }, { "epoch": 1.0076945784363656, "grad_norm": 0.6908311247825623, "learning_rate": 5.349485088751135e-05, "loss": 0.7645675659179687, "memory(GiB)": 91.52, "step": 77660, "token_acc": 0.7687137183012527, "train_speed(iter/s)": 0.136833 }, { "epoch": 1.0077594568380213, "grad_norm": 0.6610738039016724, "learning_rate": 5.348950023367405e-05, "loss": 0.7894001960754394, "memory(GiB)": 91.52, "step": 77665, "token_acc": 0.7640412774809316, "train_speed(iter/s)": 0.136831 }, { "epoch": 1.007824335239677, "grad_norm": 0.7985846400260925, "learning_rate": 5.3484149539679784e-05, "loss": 0.8056526184082031, "memory(GiB)": 91.52, "step": 77670, "token_acc": 0.7747593971309242, "train_speed(iter/s)": 0.13683 }, { "epoch": 1.0078892136413327, "grad_norm": 0.7020426392555237, "learning_rate": 5.347879880559017e-05, "loss": 0.8018650054931641, "memory(GiB)": 91.52, "step": 77675, "token_acc": 0.7736823187269222, "train_speed(iter/s)": 0.136829 }, { "epoch": 1.0079540920429884, "grad_norm": 0.7364782691001892, "learning_rate": 5.347344803146675e-05, "loss": 0.7720067977905274, "memory(GiB)": 91.52, "step": 77680, "token_acc": 0.7838146257964307, "train_speed(iter/s)": 0.136828 }, { "epoch": 1.008018970444644, "grad_norm": 0.683132529258728, "learning_rate": 5.346809721737111e-05, "loss": 0.744774580001831, "memory(GiB)": 91.52, "step": 77685, "token_acc": 0.77813517284142, "train_speed(iter/s)": 0.136827 }, { "epoch": 1.0080838488462998, "grad_norm": 0.777586042881012, "learning_rate": 5.346274636336485e-05, "loss": 0.7433743476867676, "memory(GiB)": 91.52, "step": 77690, "token_acc": 0.7714237869971114, "train_speed(iter/s)": 0.136826 }, { "epoch": 1.0081487272479555, "grad_norm": 0.7731541991233826, "learning_rate": 5.345739546950951e-05, "loss": 0.787635087966919, "memory(GiB)": 91.52, "step": 77695, "token_acc": 0.7894954042393547, "train_speed(iter/s)": 0.136825 }, { "epoch": 1.0082136056496112, "grad_norm": 0.6531187891960144, "learning_rate": 5.3452044535866696e-05, "loss": 0.7235321998596191, "memory(GiB)": 91.52, "step": 77700, "token_acc": 0.7808345525093824, "train_speed(iter/s)": 0.136824 }, { "epoch": 1.0082784840512669, "grad_norm": 0.744742751121521, "learning_rate": 5.3446693562497963e-05, "loss": 0.7717045783996582, "memory(GiB)": 91.52, "step": 77705, "token_acc": 0.7672052035249686, "train_speed(iter/s)": 0.136823 }, { "epoch": 1.0083433624529226, "grad_norm": 0.6786307096481323, "learning_rate": 5.3441342549464914e-05, "loss": 0.7687251091003418, "memory(GiB)": 91.52, "step": 77710, "token_acc": 0.7883480574410752, "train_speed(iter/s)": 0.136822 }, { "epoch": 1.0084082408545783, "grad_norm": 0.618228018283844, "learning_rate": 5.343599149682912e-05, "loss": 0.7546265602111817, "memory(GiB)": 91.52, "step": 77715, "token_acc": 0.7826190072844317, "train_speed(iter/s)": 0.136821 }, { "epoch": 1.008473119256234, "grad_norm": 0.7709228992462158, "learning_rate": 5.3430640404652155e-05, "loss": 0.7919313430786132, "memory(GiB)": 91.52, "step": 77720, "token_acc": 0.7704099142040038, "train_speed(iter/s)": 0.13682 }, { "epoch": 1.0085379976578897, "grad_norm": 0.661227285861969, "learning_rate": 5.3425289272995597e-05, "loss": 0.7962427139282227, "memory(GiB)": 91.52, "step": 77725, "token_acc": 0.756477023353313, "train_speed(iter/s)": 0.136819 }, { "epoch": 1.0086028760595454, "grad_norm": 0.6706599593162537, "learning_rate": 5.341993810192102e-05, "loss": 0.7504516124725342, "memory(GiB)": 91.52, "step": 77730, "token_acc": 0.7820210701413918, "train_speed(iter/s)": 0.136818 }, { "epoch": 1.008667754461201, "grad_norm": 0.6735967993736267, "learning_rate": 5.341458689149004e-05, "loss": 0.7905357837677002, "memory(GiB)": 91.52, "step": 77735, "token_acc": 0.790705162605989, "train_speed(iter/s)": 0.136817 }, { "epoch": 1.0087326328628567, "grad_norm": 0.7223976850509644, "learning_rate": 5.340923564176419e-05, "loss": 0.7952708244323731, "memory(GiB)": 91.52, "step": 77740, "token_acc": 0.7741995671318755, "train_speed(iter/s)": 0.136816 }, { "epoch": 1.0087975112645124, "grad_norm": 0.7146738767623901, "learning_rate": 5.340388435280508e-05, "loss": 0.7900753021240234, "memory(GiB)": 91.52, "step": 77745, "token_acc": 0.7934242480851859, "train_speed(iter/s)": 0.136815 }, { "epoch": 1.0088623896661681, "grad_norm": 0.6894820332527161, "learning_rate": 5.339853302467429e-05, "loss": 0.7651532649993896, "memory(GiB)": 91.52, "step": 77750, "token_acc": 0.7711032028469751, "train_speed(iter/s)": 0.136814 }, { "epoch": 1.0089272680678238, "grad_norm": 0.7551639676094055, "learning_rate": 5.339318165743341e-05, "loss": 0.7663548946380615, "memory(GiB)": 91.52, "step": 77755, "token_acc": 0.7913674391056489, "train_speed(iter/s)": 0.136813 }, { "epoch": 1.0089921464694795, "grad_norm": 0.7838989496231079, "learning_rate": 5.338783025114401e-05, "loss": 0.7810942649841308, "memory(GiB)": 91.52, "step": 77760, "token_acc": 0.7696947108054077, "train_speed(iter/s)": 0.136813 }, { "epoch": 1.0090570248711352, "grad_norm": 0.7707633376121521, "learning_rate": 5.338247880586767e-05, "loss": 0.7645491600036621, "memory(GiB)": 91.52, "step": 77765, "token_acc": 0.771721994504752, "train_speed(iter/s)": 0.136812 }, { "epoch": 1.009121903272791, "grad_norm": 0.6515711545944214, "learning_rate": 5.3377127321665974e-05, "loss": 0.7472669124603272, "memory(GiB)": 91.52, "step": 77770, "token_acc": 0.79668798682794, "train_speed(iter/s)": 0.136811 }, { "epoch": 1.0091867816744466, "grad_norm": 0.6848292350769043, "learning_rate": 5.337177579860053e-05, "loss": 0.7976884841918945, "memory(GiB)": 91.52, "step": 77775, "token_acc": 0.7681177343556764, "train_speed(iter/s)": 0.13681 }, { "epoch": 1.0092516600761023, "grad_norm": 0.7696681618690491, "learning_rate": 5.3366424236732905e-05, "loss": 0.773830795288086, "memory(GiB)": 91.52, "step": 77780, "token_acc": 0.7789363920750783, "train_speed(iter/s)": 0.136809 }, { "epoch": 1.009316538477758, "grad_norm": 0.735379159450531, "learning_rate": 5.336107263612469e-05, "loss": 0.7587471961975097, "memory(GiB)": 91.52, "step": 77785, "token_acc": 0.7933813147835669, "train_speed(iter/s)": 0.136808 }, { "epoch": 1.0093814168794137, "grad_norm": 0.647882342338562, "learning_rate": 5.335572099683747e-05, "loss": 0.7660172939300537, "memory(GiB)": 91.52, "step": 77790, "token_acc": 0.7982888432580424, "train_speed(iter/s)": 0.136807 }, { "epoch": 1.0094462952810694, "grad_norm": 0.6664140820503235, "learning_rate": 5.335036931893281e-05, "loss": 0.7539837837219239, "memory(GiB)": 91.52, "step": 77795, "token_acc": 0.76656009366416, "train_speed(iter/s)": 0.136806 }, { "epoch": 1.0095111736827251, "grad_norm": 0.7266139984130859, "learning_rate": 5.334501760247233e-05, "loss": 0.7367999076843261, "memory(GiB)": 91.52, "step": 77800, "token_acc": 0.7852805042182319, "train_speed(iter/s)": 0.136804 }, { "epoch": 1.0095760520843808, "grad_norm": 0.7443991899490356, "learning_rate": 5.33396658475176e-05, "loss": 0.7435768127441407, "memory(GiB)": 91.52, "step": 77805, "token_acc": 0.8002645502645502, "train_speed(iter/s)": 0.136803 }, { "epoch": 1.0096409304860365, "grad_norm": 0.6486986875534058, "learning_rate": 5.333431405413021e-05, "loss": 0.7301782608032227, "memory(GiB)": 91.52, "step": 77810, "token_acc": 0.7820864007304685, "train_speed(iter/s)": 0.136801 }, { "epoch": 1.0097058088876922, "grad_norm": 0.652957558631897, "learning_rate": 5.332896222237175e-05, "loss": 0.741268539428711, "memory(GiB)": 91.52, "step": 77815, "token_acc": 0.8073038941680641, "train_speed(iter/s)": 0.1368 }, { "epoch": 1.009770687289348, "grad_norm": 0.7417285442352295, "learning_rate": 5.33236103523038e-05, "loss": 0.7703342437744141, "memory(GiB)": 91.52, "step": 77820, "token_acc": 0.7726719320700528, "train_speed(iter/s)": 0.136799 }, { "epoch": 1.0098355656910036, "grad_norm": 0.7150958776473999, "learning_rate": 5.3318258443987965e-05, "loss": 0.7445359706878663, "memory(GiB)": 91.52, "step": 77825, "token_acc": 0.7851121536431608, "train_speed(iter/s)": 0.136798 }, { "epoch": 1.0099004440926593, "grad_norm": 0.6620991826057434, "learning_rate": 5.331290649748583e-05, "loss": 0.7285645484924317, "memory(GiB)": 91.52, "step": 77830, "token_acc": 0.8078710570552754, "train_speed(iter/s)": 0.136797 }, { "epoch": 1.009965322494315, "grad_norm": 0.7759124636650085, "learning_rate": 5.330755451285896e-05, "loss": 0.7936720371246337, "memory(GiB)": 91.52, "step": 77835, "token_acc": 0.7836584756945683, "train_speed(iter/s)": 0.136796 }, { "epoch": 1.0100302008959707, "grad_norm": 0.6965234875679016, "learning_rate": 5.330220249016897e-05, "loss": 0.7663425445556641, "memory(GiB)": 91.52, "step": 77840, "token_acc": 0.7880989280640579, "train_speed(iter/s)": 0.136795 }, { "epoch": 1.0100950792976264, "grad_norm": 0.5723393559455872, "learning_rate": 5.329685042947746e-05, "loss": 0.7921503067016602, "memory(GiB)": 91.52, "step": 77845, "token_acc": 0.7665673241134514, "train_speed(iter/s)": 0.136794 }, { "epoch": 1.0101599576992821, "grad_norm": 0.7313956618309021, "learning_rate": 5.3291498330846005e-05, "loss": 0.7498560905456543, "memory(GiB)": 91.52, "step": 77850, "token_acc": 0.7873691831764629, "train_speed(iter/s)": 0.136792 }, { "epoch": 1.0102248361009378, "grad_norm": 0.7112502455711365, "learning_rate": 5.328614619433618e-05, "loss": 0.7650917053222657, "memory(GiB)": 91.52, "step": 77855, "token_acc": 0.7796568324898161, "train_speed(iter/s)": 0.136792 }, { "epoch": 1.0102897145025935, "grad_norm": 0.640375018119812, "learning_rate": 5.328079402000962e-05, "loss": 0.7725927352905273, "memory(GiB)": 91.52, "step": 77860, "token_acc": 0.785152871272235, "train_speed(iter/s)": 0.136791 }, { "epoch": 1.0103545929042492, "grad_norm": 0.7651796340942383, "learning_rate": 5.3275441807927896e-05, "loss": 0.7721458435058594, "memory(GiB)": 91.52, "step": 77865, "token_acc": 0.7771194808858796, "train_speed(iter/s)": 0.13679 }, { "epoch": 1.010419471305905, "grad_norm": 0.7610244750976562, "learning_rate": 5.327008955815258e-05, "loss": 0.7787117481231689, "memory(GiB)": 91.52, "step": 77870, "token_acc": 0.78426832702356, "train_speed(iter/s)": 0.136788 }, { "epoch": 1.0104843497075606, "grad_norm": 0.6811971068382263, "learning_rate": 5.326473727074529e-05, "loss": 0.7672486305236816, "memory(GiB)": 91.52, "step": 77875, "token_acc": 0.7755351455447744, "train_speed(iter/s)": 0.136788 }, { "epoch": 1.0105492281092163, "grad_norm": 0.632283627986908, "learning_rate": 5.3259384945767624e-05, "loss": 0.8092478752136231, "memory(GiB)": 91.52, "step": 77880, "token_acc": 0.7776120992650974, "train_speed(iter/s)": 0.136786 }, { "epoch": 1.010614106510872, "grad_norm": 0.6973105072975159, "learning_rate": 5.3254032583281154e-05, "loss": 0.7795803070068359, "memory(GiB)": 91.52, "step": 77885, "token_acc": 0.7662158987953425, "train_speed(iter/s)": 0.136785 }, { "epoch": 1.0106789849125277, "grad_norm": 0.8148519396781921, "learning_rate": 5.3248680183347474e-05, "loss": 0.7744483470916748, "memory(GiB)": 91.52, "step": 77890, "token_acc": 0.7902129218743639, "train_speed(iter/s)": 0.136785 }, { "epoch": 1.0107438633141834, "grad_norm": 0.6401830315589905, "learning_rate": 5.3243327746028205e-05, "loss": 0.7580384254455567, "memory(GiB)": 91.52, "step": 77895, "token_acc": 0.7691818567189487, "train_speed(iter/s)": 0.136784 }, { "epoch": 1.0108087417158391, "grad_norm": 0.7097728848457336, "learning_rate": 5.3237975271384944e-05, "loss": 0.7611923217773438, "memory(GiB)": 91.52, "step": 77900, "token_acc": 0.7991304858703954, "train_speed(iter/s)": 0.136783 }, { "epoch": 1.0108736201174948, "grad_norm": 0.732585608959198, "learning_rate": 5.3232622759479254e-05, "loss": 0.7651056289672852, "memory(GiB)": 91.52, "step": 77905, "token_acc": 0.782919447179375, "train_speed(iter/s)": 0.136782 }, { "epoch": 1.0109384985191505, "grad_norm": 0.6045264601707458, "learning_rate": 5.322727021037276e-05, "loss": 0.771910285949707, "memory(GiB)": 91.52, "step": 77910, "token_acc": 0.7879556530908883, "train_speed(iter/s)": 0.136781 }, { "epoch": 1.0110033769208062, "grad_norm": 0.6781831383705139, "learning_rate": 5.322191762412704e-05, "loss": 0.7360121726989746, "memory(GiB)": 91.52, "step": 77915, "token_acc": 0.7809431613884663, "train_speed(iter/s)": 0.136779 }, { "epoch": 1.011068255322462, "grad_norm": 0.7088205218315125, "learning_rate": 5.32165650008037e-05, "loss": 0.7720551490783691, "memory(GiB)": 91.52, "step": 77920, "token_acc": 0.7982530290222598, "train_speed(iter/s)": 0.136778 }, { "epoch": 1.0111331337241176, "grad_norm": 0.7029349207878113, "learning_rate": 5.321121234046433e-05, "loss": 0.7995171546936035, "memory(GiB)": 91.52, "step": 77925, "token_acc": 0.7486686390532544, "train_speed(iter/s)": 0.136778 }, { "epoch": 1.0111980121257733, "grad_norm": 0.7349357604980469, "learning_rate": 5.320585964317054e-05, "loss": 0.7419671535491943, "memory(GiB)": 91.52, "step": 77930, "token_acc": 0.7725275659740922, "train_speed(iter/s)": 0.136777 }, { "epoch": 1.011262890527429, "grad_norm": 0.6479551792144775, "learning_rate": 5.320050690898394e-05, "loss": 0.7209484100341796, "memory(GiB)": 91.52, "step": 77935, "token_acc": 0.7950081711484178, "train_speed(iter/s)": 0.136775 }, { "epoch": 1.0113277689290847, "grad_norm": 0.8445757627487183, "learning_rate": 5.31951541379661e-05, "loss": 0.7875427722930908, "memory(GiB)": 91.52, "step": 77940, "token_acc": 0.8022380467955239, "train_speed(iter/s)": 0.136774 }, { "epoch": 1.0113926473307404, "grad_norm": 0.6723659038543701, "learning_rate": 5.3189801330178626e-05, "loss": 0.7574209213256836, "memory(GiB)": 91.52, "step": 77945, "token_acc": 0.7750124822368168, "train_speed(iter/s)": 0.136773 }, { "epoch": 1.0114575257323961, "grad_norm": 0.7771655321121216, "learning_rate": 5.3184448485683125e-05, "loss": 0.7729352951049805, "memory(GiB)": 91.52, "step": 77950, "token_acc": 0.7761693780227957, "train_speed(iter/s)": 0.136772 }, { "epoch": 1.0115224041340518, "grad_norm": 0.7854791879653931, "learning_rate": 5.3179095604541204e-05, "loss": 0.7590945720672607, "memory(GiB)": 91.52, "step": 77955, "token_acc": 0.7716291953874895, "train_speed(iter/s)": 0.136772 }, { "epoch": 1.0115872825357075, "grad_norm": 0.8371906876564026, "learning_rate": 5.317374268681444e-05, "loss": 0.8051967620849609, "memory(GiB)": 91.52, "step": 77960, "token_acc": 0.7632205616685827, "train_speed(iter/s)": 0.136771 }, { "epoch": 1.0116521609373632, "grad_norm": 0.6941065192222595, "learning_rate": 5.3168389732564475e-05, "loss": 0.7574185371398926, "memory(GiB)": 91.52, "step": 77965, "token_acc": 0.7986811045749185, "train_speed(iter/s)": 0.13677 }, { "epoch": 1.011717039339019, "grad_norm": 0.7033621668815613, "learning_rate": 5.3163036741852866e-05, "loss": 0.8022295951843261, "memory(GiB)": 91.52, "step": 77970, "token_acc": 0.7595713066295723, "train_speed(iter/s)": 0.13677 }, { "epoch": 1.0117819177406746, "grad_norm": 0.704086422920227, "learning_rate": 5.3157683714741235e-05, "loss": 0.7498523712158203, "memory(GiB)": 91.52, "step": 77975, "token_acc": 0.7804809941010158, "train_speed(iter/s)": 0.136768 }, { "epoch": 1.0118467961423303, "grad_norm": 0.6751002073287964, "learning_rate": 5.3152330651291193e-05, "loss": 0.7472271919250488, "memory(GiB)": 91.52, "step": 77980, "token_acc": 0.7952531106187148, "train_speed(iter/s)": 0.136767 }, { "epoch": 1.011911674543986, "grad_norm": 0.7588029503822327, "learning_rate": 5.3146977551564316e-05, "loss": 0.7469841957092285, "memory(GiB)": 91.52, "step": 77985, "token_acc": 0.7776063847624157, "train_speed(iter/s)": 0.136766 }, { "epoch": 1.0119765529456417, "grad_norm": 0.6312860250473022, "learning_rate": 5.314162441562225e-05, "loss": 0.7703300476074219, "memory(GiB)": 91.52, "step": 77990, "token_acc": 0.777563775008844, "train_speed(iter/s)": 0.136765 }, { "epoch": 1.0120414313472974, "grad_norm": 0.6728101968765259, "learning_rate": 5.3136271243526535e-05, "loss": 0.7646445274353028, "memory(GiB)": 91.52, "step": 77995, "token_acc": 0.784025539860856, "train_speed(iter/s)": 0.136764 }, { "epoch": 1.012106309748953, "grad_norm": 0.7243165373802185, "learning_rate": 5.313091803533884e-05, "loss": 0.7619126319885254, "memory(GiB)": 91.52, "step": 78000, "token_acc": 0.796168811733014, "train_speed(iter/s)": 0.136763 }, { "epoch": 1.0121711881506088, "grad_norm": 0.6967560648918152, "learning_rate": 5.312556479112074e-05, "loss": 0.7622894287109375, "memory(GiB)": 91.52, "step": 78005, "token_acc": 0.7964226876912215, "train_speed(iter/s)": 0.136762 }, { "epoch": 1.0122360665522645, "grad_norm": 0.7250383496284485, "learning_rate": 5.3120211510933835e-05, "loss": 0.764243221282959, "memory(GiB)": 91.52, "step": 78010, "token_acc": 0.7832358312440029, "train_speed(iter/s)": 0.13676 }, { "epoch": 1.0123009449539202, "grad_norm": 0.7362536191940308, "learning_rate": 5.3114858194839743e-05, "loss": 0.803378677368164, "memory(GiB)": 91.52, "step": 78015, "token_acc": 0.7682421085101279, "train_speed(iter/s)": 0.136759 }, { "epoch": 1.012365823355576, "grad_norm": 0.5488595366477966, "learning_rate": 5.310950484290005e-05, "loss": 0.7483748912811279, "memory(GiB)": 91.52, "step": 78020, "token_acc": 0.7953818241794491, "train_speed(iter/s)": 0.136758 }, { "epoch": 1.0124307017572316, "grad_norm": 0.7542641162872314, "learning_rate": 5.31041514551764e-05, "loss": 0.7964344978332519, "memory(GiB)": 91.52, "step": 78025, "token_acc": 0.7725385043000556, "train_speed(iter/s)": 0.136757 }, { "epoch": 1.0124955801588873, "grad_norm": 0.7254071831703186, "learning_rate": 5.3098798031730344e-05, "loss": 0.7614259719848633, "memory(GiB)": 91.52, "step": 78030, "token_acc": 0.780977098165749, "train_speed(iter/s)": 0.136756 }, { "epoch": 1.012560458560543, "grad_norm": 0.7193824052810669, "learning_rate": 5.309344457262354e-05, "loss": 0.7671039581298829, "memory(GiB)": 91.52, "step": 78035, "token_acc": 0.7704488885074242, "train_speed(iter/s)": 0.136755 }, { "epoch": 1.0126253369621987, "grad_norm": 0.753710925579071, "learning_rate": 5.308809107791757e-05, "loss": 0.7866105556488037, "memory(GiB)": 91.52, "step": 78040, "token_acc": 0.7853817341687962, "train_speed(iter/s)": 0.136754 }, { "epoch": 1.0126902153638544, "grad_norm": 0.6326672434806824, "learning_rate": 5.308273754767404e-05, "loss": 0.7746378898620605, "memory(GiB)": 91.52, "step": 78045, "token_acc": 0.7612776212620074, "train_speed(iter/s)": 0.136753 }, { "epoch": 1.01275509376551, "grad_norm": 0.6649174690246582, "learning_rate": 5.307738398195458e-05, "loss": 0.763526725769043, "memory(GiB)": 91.52, "step": 78050, "token_acc": 0.7809749492213948, "train_speed(iter/s)": 0.136752 }, { "epoch": 1.0128199721671658, "grad_norm": 0.8196828365325928, "learning_rate": 5.307203038082076e-05, "loss": 0.7774588584899902, "memory(GiB)": 91.52, "step": 78055, "token_acc": 0.7801796221740477, "train_speed(iter/s)": 0.136751 }, { "epoch": 1.0128848505688215, "grad_norm": 0.6713426113128662, "learning_rate": 5.306667674433424e-05, "loss": 0.7954731941223144, "memory(GiB)": 91.52, "step": 78060, "token_acc": 0.7844780962682532, "train_speed(iter/s)": 0.13675 }, { "epoch": 1.0129497289704772, "grad_norm": 0.7202021479606628, "learning_rate": 5.306132307255658e-05, "loss": 0.7765284538269043, "memory(GiB)": 91.52, "step": 78065, "token_acc": 0.7917496552780757, "train_speed(iter/s)": 0.136748 }, { "epoch": 1.0130146073721327, "grad_norm": 0.72385573387146, "learning_rate": 5.305596936554942e-05, "loss": 0.8067218780517578, "memory(GiB)": 91.52, "step": 78070, "token_acc": 0.7837533018727926, "train_speed(iter/s)": 0.136747 }, { "epoch": 1.0130794857737884, "grad_norm": 0.6548576354980469, "learning_rate": 5.305061562337437e-05, "loss": 0.7329280376434326, "memory(GiB)": 91.52, "step": 78075, "token_acc": 0.7752817291705154, "train_speed(iter/s)": 0.136746 }, { "epoch": 1.013144364175444, "grad_norm": 0.7068772912025452, "learning_rate": 5.304526184609302e-05, "loss": 0.7990233421325683, "memory(GiB)": 91.52, "step": 78080, "token_acc": 0.7821674955879994, "train_speed(iter/s)": 0.136745 }, { "epoch": 1.0132092425770998, "grad_norm": 0.7328919172286987, "learning_rate": 5.303990803376699e-05, "loss": 0.7608366489410401, "memory(GiB)": 91.52, "step": 78085, "token_acc": 0.7828555764411027, "train_speed(iter/s)": 0.136744 }, { "epoch": 1.0132741209787555, "grad_norm": 0.7718411684036255, "learning_rate": 5.3034554186457894e-05, "loss": 0.8125345230102539, "memory(GiB)": 91.52, "step": 78090, "token_acc": 0.7867373024917609, "train_speed(iter/s)": 0.136743 }, { "epoch": 1.0133389993804112, "grad_norm": 0.6391164064407349, "learning_rate": 5.302920030422736e-05, "loss": 0.7042100429534912, "memory(GiB)": 91.52, "step": 78095, "token_acc": 0.7890316205533597, "train_speed(iter/s)": 0.136742 }, { "epoch": 1.0134038777820669, "grad_norm": 0.6992268562316895, "learning_rate": 5.302384638713697e-05, "loss": 0.7478847026824951, "memory(GiB)": 91.52, "step": 78100, "token_acc": 0.7717990668363328, "train_speed(iter/s)": 0.136741 }, { "epoch": 1.0134687561837226, "grad_norm": 0.7884940505027771, "learning_rate": 5.3018492435248346e-05, "loss": 0.7645626544952393, "memory(GiB)": 91.52, "step": 78105, "token_acc": 0.771285872839702, "train_speed(iter/s)": 0.13674 }, { "epoch": 1.0135336345853783, "grad_norm": 0.6830796599388123, "learning_rate": 5.301313844862311e-05, "loss": 0.7470963478088379, "memory(GiB)": 91.52, "step": 78110, "token_acc": 0.7822671156004489, "train_speed(iter/s)": 0.136739 }, { "epoch": 1.013598512987034, "grad_norm": 0.6660162806510925, "learning_rate": 5.3007784427322884e-05, "loss": 0.7728487968444824, "memory(GiB)": 91.52, "step": 78115, "token_acc": 0.7763099505034989, "train_speed(iter/s)": 0.136738 }, { "epoch": 1.0136633913886897, "grad_norm": 0.7801540493965149, "learning_rate": 5.300243037140925e-05, "loss": 0.7991840362548828, "memory(GiB)": 91.52, "step": 78120, "token_acc": 0.7759320645089203, "train_speed(iter/s)": 0.136737 }, { "epoch": 1.0137282697903454, "grad_norm": 0.752903401851654, "learning_rate": 5.2997076280943836e-05, "loss": 0.7648154735565186, "memory(GiB)": 91.52, "step": 78125, "token_acc": 0.7805461864867805, "train_speed(iter/s)": 0.136736 }, { "epoch": 1.013793148192001, "grad_norm": 0.723585844039917, "learning_rate": 5.2991722155988276e-05, "loss": 0.7372781753540039, "memory(GiB)": 91.52, "step": 78130, "token_acc": 0.7765144024784127, "train_speed(iter/s)": 0.136735 }, { "epoch": 1.0138580265936568, "grad_norm": 0.6333579421043396, "learning_rate": 5.298636799660417e-05, "loss": 0.746585464477539, "memory(GiB)": 91.52, "step": 78135, "token_acc": 0.7885629074767652, "train_speed(iter/s)": 0.136733 }, { "epoch": 1.0139229049953125, "grad_norm": 0.6361833810806274, "learning_rate": 5.298101380285313e-05, "loss": 0.7440228462219238, "memory(GiB)": 91.52, "step": 78140, "token_acc": 0.791149429399349, "train_speed(iter/s)": 0.136732 }, { "epoch": 1.0139877833969682, "grad_norm": 0.7058197259902954, "learning_rate": 5.297565957479678e-05, "loss": 0.772855281829834, "memory(GiB)": 91.52, "step": 78145, "token_acc": 0.7658454411455043, "train_speed(iter/s)": 0.136731 }, { "epoch": 1.0140526617986239, "grad_norm": 0.6821759343147278, "learning_rate": 5.297030531249673e-05, "loss": 0.7984649181365967, "memory(GiB)": 91.52, "step": 78150, "token_acc": 0.7708273826525754, "train_speed(iter/s)": 0.13673 }, { "epoch": 1.0141175402002796, "grad_norm": 0.6791120171546936, "learning_rate": 5.2964951016014586e-05, "loss": 0.7380484580993653, "memory(GiB)": 91.52, "step": 78155, "token_acc": 0.7822903247677808, "train_speed(iter/s)": 0.136729 }, { "epoch": 1.0141824186019353, "grad_norm": 0.6808285713195801, "learning_rate": 5.295959668541197e-05, "loss": 0.728704309463501, "memory(GiB)": 91.52, "step": 78160, "token_acc": 0.7891872158242341, "train_speed(iter/s)": 0.136728 }, { "epoch": 1.014247297003591, "grad_norm": 0.8455430269241333, "learning_rate": 5.295424232075052e-05, "loss": 0.7731089115142822, "memory(GiB)": 91.52, "step": 78165, "token_acc": 0.7732137548310376, "train_speed(iter/s)": 0.136727 }, { "epoch": 1.0143121754052467, "grad_norm": 0.6460274457931519, "learning_rate": 5.294888792209185e-05, "loss": 0.7727834224700928, "memory(GiB)": 91.52, "step": 78170, "token_acc": 0.7915215326155794, "train_speed(iter/s)": 0.136726 }, { "epoch": 1.0143770538069024, "grad_norm": 0.6320897936820984, "learning_rate": 5.294353348949756e-05, "loss": 0.7311423301696778, "memory(GiB)": 91.52, "step": 78175, "token_acc": 0.7887175893482831, "train_speed(iter/s)": 0.136726 }, { "epoch": 1.014441932208558, "grad_norm": 0.7661504149436951, "learning_rate": 5.293817902302928e-05, "loss": 0.7633329868316651, "memory(GiB)": 91.52, "step": 78180, "token_acc": 0.7854258311098291, "train_speed(iter/s)": 0.136724 }, { "epoch": 1.0145068106102137, "grad_norm": 0.6790255308151245, "learning_rate": 5.2932824522748614e-05, "loss": 0.7535256385803223, "memory(GiB)": 91.52, "step": 78185, "token_acc": 0.7636942675159236, "train_speed(iter/s)": 0.136724 }, { "epoch": 1.0145716890118694, "grad_norm": 0.7625606656074524, "learning_rate": 5.292746998871719e-05, "loss": 0.7769845962524414, "memory(GiB)": 91.52, "step": 78190, "token_acc": 0.7695928887018725, "train_speed(iter/s)": 0.136722 }, { "epoch": 1.0146365674135251, "grad_norm": 0.6667648553848267, "learning_rate": 5.292211542099663e-05, "loss": 0.75135817527771, "memory(GiB)": 91.52, "step": 78195, "token_acc": 0.7717685791693817, "train_speed(iter/s)": 0.136721 }, { "epoch": 1.0147014458151808, "grad_norm": 0.6681122183799744, "learning_rate": 5.291676081964856e-05, "loss": 0.7864141464233398, "memory(GiB)": 91.52, "step": 78200, "token_acc": 0.7790500671846702, "train_speed(iter/s)": 0.13672 }, { "epoch": 1.0147663242168365, "grad_norm": 0.8006243109703064, "learning_rate": 5.2911406184734605e-05, "loss": 0.7662688732147217, "memory(GiB)": 91.52, "step": 78205, "token_acc": 0.7910767036772944, "train_speed(iter/s)": 0.136719 }, { "epoch": 1.0148312026184922, "grad_norm": 0.6979716420173645, "learning_rate": 5.290605151631636e-05, "loss": 0.787994384765625, "memory(GiB)": 91.52, "step": 78210, "token_acc": 0.7888320011786806, "train_speed(iter/s)": 0.136719 }, { "epoch": 1.014896081020148, "grad_norm": 0.601337194442749, "learning_rate": 5.2900696814455466e-05, "loss": 0.7378915309906006, "memory(GiB)": 91.52, "step": 78215, "token_acc": 0.8068523604027723, "train_speed(iter/s)": 0.136718 }, { "epoch": 1.0149609594218036, "grad_norm": 0.7001345157623291, "learning_rate": 5.289534207921353e-05, "loss": 0.7540120124816895, "memory(GiB)": 91.52, "step": 78220, "token_acc": 0.775569067664484, "train_speed(iter/s)": 0.136717 }, { "epoch": 1.0150258378234593, "grad_norm": 0.6364879608154297, "learning_rate": 5.28899873106522e-05, "loss": 0.7171754360198974, "memory(GiB)": 91.52, "step": 78225, "token_acc": 0.7987555555555556, "train_speed(iter/s)": 0.136715 }, { "epoch": 1.015090716225115, "grad_norm": 0.7290219068527222, "learning_rate": 5.288463250883307e-05, "loss": 0.7330464363098145, "memory(GiB)": 91.52, "step": 78230, "token_acc": 0.7833154174403075, "train_speed(iter/s)": 0.136714 }, { "epoch": 1.0151555946267707, "grad_norm": 0.7547829747200012, "learning_rate": 5.2879277673817783e-05, "loss": 0.7579799652099609, "memory(GiB)": 91.52, "step": 78235, "token_acc": 0.7682700677440109, "train_speed(iter/s)": 0.136713 }, { "epoch": 1.0152204730284264, "grad_norm": 0.6247082948684692, "learning_rate": 5.287392280566795e-05, "loss": 0.7863883018493653, "memory(GiB)": 91.52, "step": 78240, "token_acc": 0.7818680731264032, "train_speed(iter/s)": 0.136712 }, { "epoch": 1.0152853514300821, "grad_norm": 0.6625982522964478, "learning_rate": 5.28685679044452e-05, "loss": 0.772573709487915, "memory(GiB)": 91.52, "step": 78245, "token_acc": 0.7754268997891013, "train_speed(iter/s)": 0.136711 }, { "epoch": 1.0153502298317378, "grad_norm": 0.7327286601066589, "learning_rate": 5.286321297021114e-05, "loss": 0.7703625679016113, "memory(GiB)": 91.52, "step": 78250, "token_acc": 0.7811593326885881, "train_speed(iter/s)": 0.136709 }, { "epoch": 1.0154151082333935, "grad_norm": 0.7408847212791443, "learning_rate": 5.285785800302743e-05, "loss": 0.7611690998077393, "memory(GiB)": 91.52, "step": 78255, "token_acc": 0.8067038390053195, "train_speed(iter/s)": 0.136709 }, { "epoch": 1.0154799866350492, "grad_norm": 0.7140238881111145, "learning_rate": 5.2852503002955675e-05, "loss": 0.7383753776550293, "memory(GiB)": 91.52, "step": 78260, "token_acc": 0.7956710646948619, "train_speed(iter/s)": 0.136708 }, { "epoch": 1.015544865036705, "grad_norm": 0.7381482124328613, "learning_rate": 5.284714797005749e-05, "loss": 0.7384642124176025, "memory(GiB)": 91.52, "step": 78265, "token_acc": 0.787511977004152, "train_speed(iter/s)": 0.136707 }, { "epoch": 1.0156097434383606, "grad_norm": 0.7669535875320435, "learning_rate": 5.2841792904394504e-05, "loss": 0.759367561340332, "memory(GiB)": 91.52, "step": 78270, "token_acc": 0.7969503399958788, "train_speed(iter/s)": 0.136706 }, { "epoch": 1.0156746218400163, "grad_norm": 0.7752165198326111, "learning_rate": 5.283643780602836e-05, "loss": 0.7733256340026855, "memory(GiB)": 91.52, "step": 78275, "token_acc": 0.7911635470366398, "train_speed(iter/s)": 0.136705 }, { "epoch": 1.015739500241672, "grad_norm": 0.6314480304718018, "learning_rate": 5.283108267502066e-05, "loss": 0.7844873905181885, "memory(GiB)": 91.52, "step": 78280, "token_acc": 0.7803326643763393, "train_speed(iter/s)": 0.136704 }, { "epoch": 1.0158043786433277, "grad_norm": 0.745585024356842, "learning_rate": 5.2825727511433044e-05, "loss": 0.7925992012023926, "memory(GiB)": 91.52, "step": 78285, "token_acc": 0.777196780914253, "train_speed(iter/s)": 0.136703 }, { "epoch": 1.0158692570449834, "grad_norm": 0.7108488082885742, "learning_rate": 5.282037231532715e-05, "loss": 0.8061868667602539, "memory(GiB)": 91.52, "step": 78290, "token_acc": 0.7759352251936626, "train_speed(iter/s)": 0.136701 }, { "epoch": 1.0159341354466391, "grad_norm": 0.7339954376220703, "learning_rate": 5.2815017086764595e-05, "loss": 0.7909403324127198, "memory(GiB)": 91.52, "step": 78295, "token_acc": 0.7923978878707874, "train_speed(iter/s)": 0.1367 }, { "epoch": 1.0159990138482948, "grad_norm": 0.7274677157402039, "learning_rate": 5.280966182580699e-05, "loss": 0.7466679573059082, "memory(GiB)": 91.52, "step": 78300, "token_acc": 0.7794520970077964, "train_speed(iter/s)": 0.136699 }, { "epoch": 1.0160638922499505, "grad_norm": 0.689826250076294, "learning_rate": 5.280430653251598e-05, "loss": 0.7649580955505371, "memory(GiB)": 91.52, "step": 78305, "token_acc": 0.7650123095586399, "train_speed(iter/s)": 0.136698 }, { "epoch": 1.0161287706516062, "grad_norm": 0.7278439402580261, "learning_rate": 5.279895120695318e-05, "loss": 0.7692827224731446, "memory(GiB)": 91.52, "step": 78310, "token_acc": 0.7783801849307438, "train_speed(iter/s)": 0.136697 }, { "epoch": 1.016193649053262, "grad_norm": 0.7042855024337769, "learning_rate": 5.2793595849180266e-05, "loss": 0.781041431427002, "memory(GiB)": 91.52, "step": 78315, "token_acc": 0.7802610011347876, "train_speed(iter/s)": 0.136695 }, { "epoch": 1.0162585274549176, "grad_norm": 0.693800151348114, "learning_rate": 5.27882404592588e-05, "loss": 0.7680037021636963, "memory(GiB)": 91.52, "step": 78320, "token_acc": 0.7820112207939777, "train_speed(iter/s)": 0.136695 }, { "epoch": 1.0163234058565733, "grad_norm": 0.6584135890007019, "learning_rate": 5.2782885037250454e-05, "loss": 0.7360758781433105, "memory(GiB)": 91.52, "step": 78325, "token_acc": 0.7847658527106678, "train_speed(iter/s)": 0.136694 }, { "epoch": 1.016388284258229, "grad_norm": 0.7890973687171936, "learning_rate": 5.277752958321683e-05, "loss": 0.7838775634765625, "memory(GiB)": 91.52, "step": 78330, "token_acc": 0.7701487787413478, "train_speed(iter/s)": 0.136693 }, { "epoch": 1.0164531626598847, "grad_norm": 0.7389476895332336, "learning_rate": 5.2772174097219594e-05, "loss": 0.801585578918457, "memory(GiB)": 91.52, "step": 78335, "token_acc": 0.7820243717567489, "train_speed(iter/s)": 0.136692 }, { "epoch": 1.0165180410615404, "grad_norm": 0.665626585483551, "learning_rate": 5.276681857932035e-05, "loss": 0.7505769729614258, "memory(GiB)": 91.52, "step": 78340, "token_acc": 0.7853570714142829, "train_speed(iter/s)": 0.136691 }, { "epoch": 1.0165829194631961, "grad_norm": 0.7097321152687073, "learning_rate": 5.276146302958073e-05, "loss": 0.7694943904876709, "memory(GiB)": 91.52, "step": 78345, "token_acc": 0.7869347780787664, "train_speed(iter/s)": 0.13669 }, { "epoch": 1.0166477978648518, "grad_norm": 0.7456353902816772, "learning_rate": 5.2756107448062384e-05, "loss": 0.7409406661987304, "memory(GiB)": 91.52, "step": 78350, "token_acc": 0.7802391954335417, "train_speed(iter/s)": 0.136689 }, { "epoch": 1.0167126762665075, "grad_norm": 0.7136179208755493, "learning_rate": 5.2750751834826914e-05, "loss": 0.7379464149475098, "memory(GiB)": 91.52, "step": 78355, "token_acc": 0.7933840059950755, "train_speed(iter/s)": 0.136688 }, { "epoch": 1.0167775546681632, "grad_norm": 0.6274749040603638, "learning_rate": 5.274539618993599e-05, "loss": 0.7638990402221679, "memory(GiB)": 91.52, "step": 78360, "token_acc": 0.7807843923297922, "train_speed(iter/s)": 0.136687 }, { "epoch": 1.016842433069819, "grad_norm": 0.6662155985832214, "learning_rate": 5.2740040513451216e-05, "loss": 0.7448227882385254, "memory(GiB)": 91.52, "step": 78365, "token_acc": 0.7887727272727273, "train_speed(iter/s)": 0.136687 }, { "epoch": 1.0169073114714746, "grad_norm": 0.7172187566757202, "learning_rate": 5.273468480543422e-05, "loss": 0.7704180240631103, "memory(GiB)": 91.52, "step": 78370, "token_acc": 0.7749801482265749, "train_speed(iter/s)": 0.136686 }, { "epoch": 1.0169721898731303, "grad_norm": 0.6854945421218872, "learning_rate": 5.272932906594666e-05, "loss": 0.7304014205932617, "memory(GiB)": 91.52, "step": 78375, "token_acc": 0.7847462574538177, "train_speed(iter/s)": 0.136684 }, { "epoch": 1.017037068274786, "grad_norm": 0.7408952116966248, "learning_rate": 5.272397329505014e-05, "loss": 0.7470179080963135, "memory(GiB)": 91.52, "step": 78380, "token_acc": 0.7901666084119772, "train_speed(iter/s)": 0.136683 }, { "epoch": 1.0171019466764417, "grad_norm": 0.7289255261421204, "learning_rate": 5.2718617492806344e-05, "loss": 0.761402940750122, "memory(GiB)": 91.52, "step": 78385, "token_acc": 0.7815390376334019, "train_speed(iter/s)": 0.136682 }, { "epoch": 1.0171668250780974, "grad_norm": 0.7266189455986023, "learning_rate": 5.2713261659276835e-05, "loss": 0.7505598545074463, "memory(GiB)": 91.52, "step": 78390, "token_acc": 0.7603838656470235, "train_speed(iter/s)": 0.136681 }, { "epoch": 1.0172317034797531, "grad_norm": 0.7257473468780518, "learning_rate": 5.2707905794523306e-05, "loss": 0.7639615535736084, "memory(GiB)": 91.52, "step": 78395, "token_acc": 0.7661334402215502, "train_speed(iter/s)": 0.13668 }, { "epoch": 1.0172965818814088, "grad_norm": 0.6801700592041016, "learning_rate": 5.270254989860737e-05, "loss": 0.7308954715728759, "memory(GiB)": 91.52, "step": 78400, "token_acc": 0.7892001438237505, "train_speed(iter/s)": 0.136679 }, { "epoch": 1.0173614602830645, "grad_norm": 0.6584367156028748, "learning_rate": 5.269719397159066e-05, "loss": 0.7297523498535157, "memory(GiB)": 91.52, "step": 78405, "token_acc": 0.7912373585264559, "train_speed(iter/s)": 0.136678 }, { "epoch": 1.0174263386847202, "grad_norm": 0.7158973217010498, "learning_rate": 5.269183801353482e-05, "loss": 0.7432986259460449, "memory(GiB)": 91.52, "step": 78410, "token_acc": 0.8178521720020512, "train_speed(iter/s)": 0.136677 }, { "epoch": 1.017491217086376, "grad_norm": 0.7330589890480042, "learning_rate": 5.2686482024501473e-05, "loss": 0.7725314140319824, "memory(GiB)": 91.52, "step": 78415, "token_acc": 0.7777040268032114, "train_speed(iter/s)": 0.136677 }, { "epoch": 1.0175560954880316, "grad_norm": 0.7925567030906677, "learning_rate": 5.2681126004552284e-05, "loss": 0.7711844444274902, "memory(GiB)": 91.52, "step": 78420, "token_acc": 0.7802654356607039, "train_speed(iter/s)": 0.136676 }, { "epoch": 1.0176209738896873, "grad_norm": 0.6598532795906067, "learning_rate": 5.267576995374884e-05, "loss": 0.7710552215576172, "memory(GiB)": 91.52, "step": 78425, "token_acc": 0.7658912241296326, "train_speed(iter/s)": 0.136674 }, { "epoch": 1.017685852291343, "grad_norm": 0.7204222083091736, "learning_rate": 5.2670413872152826e-05, "loss": 0.7421331882476807, "memory(GiB)": 91.52, "step": 78430, "token_acc": 0.8004015735125389, "train_speed(iter/s)": 0.136673 }, { "epoch": 1.0177507306929987, "grad_norm": 0.7950604557991028, "learning_rate": 5.2665057759825864e-05, "loss": 0.7589781761169434, "memory(GiB)": 91.52, "step": 78435, "token_acc": 0.7947651570071349, "train_speed(iter/s)": 0.136673 }, { "epoch": 1.0178156090946544, "grad_norm": 0.8029578924179077, "learning_rate": 5.265970161682958e-05, "loss": 0.8055273056030273, "memory(GiB)": 91.52, "step": 78440, "token_acc": 0.7796187734756205, "train_speed(iter/s)": 0.136671 }, { "epoch": 1.01788048749631, "grad_norm": 0.684826672077179, "learning_rate": 5.265434544322563e-05, "loss": 0.7901440620422363, "memory(GiB)": 91.52, "step": 78445, "token_acc": 0.7701491516146689, "train_speed(iter/s)": 0.136671 }, { "epoch": 1.0179453658979658, "grad_norm": 0.8129456639289856, "learning_rate": 5.264898923907562e-05, "loss": 0.7808342933654785, "memory(GiB)": 91.52, "step": 78450, "token_acc": 0.759998504896464, "train_speed(iter/s)": 0.13667 }, { "epoch": 1.0180102442996215, "grad_norm": 0.7239347696304321, "learning_rate": 5.264363300444124e-05, "loss": 0.7693772792816163, "memory(GiB)": 91.52, "step": 78455, "token_acc": 0.7829814348191271, "train_speed(iter/s)": 0.136669 }, { "epoch": 1.0180751227012772, "grad_norm": 0.7956210970878601, "learning_rate": 5.263827673938409e-05, "loss": 0.7733941555023194, "memory(GiB)": 91.52, "step": 78460, "token_acc": 0.7858932928839939, "train_speed(iter/s)": 0.136668 }, { "epoch": 1.018140001102933, "grad_norm": 0.7085729241371155, "learning_rate": 5.263292044396582e-05, "loss": 0.7474484920501709, "memory(GiB)": 91.52, "step": 78465, "token_acc": 0.7956032099264954, "train_speed(iter/s)": 0.136667 }, { "epoch": 1.0182048795045886, "grad_norm": 0.6557479500770569, "learning_rate": 5.2627564118248075e-05, "loss": 0.7217426300048828, "memory(GiB)": 91.52, "step": 78470, "token_acc": 0.7857974812550047, "train_speed(iter/s)": 0.136666 }, { "epoch": 1.0182697579062443, "grad_norm": 0.7388215065002441, "learning_rate": 5.262220776229248e-05, "loss": 0.804236888885498, "memory(GiB)": 91.52, "step": 78475, "token_acc": 0.7703629474211631, "train_speed(iter/s)": 0.136665 }, { "epoch": 1.0183346363079, "grad_norm": 0.6625247597694397, "learning_rate": 5.2616851376160705e-05, "loss": 0.7586106777191162, "memory(GiB)": 91.52, "step": 78480, "token_acc": 0.7847385428907168, "train_speed(iter/s)": 0.136664 }, { "epoch": 1.0183995147095557, "grad_norm": 0.720950186252594, "learning_rate": 5.261149495991435e-05, "loss": 0.779009485244751, "memory(GiB)": 91.52, "step": 78485, "token_acc": 0.777564649048025, "train_speed(iter/s)": 0.136663 }, { "epoch": 1.0184643931112114, "grad_norm": 0.7304147481918335, "learning_rate": 5.2606138513615085e-05, "loss": 0.7516862392425537, "memory(GiB)": 91.52, "step": 78490, "token_acc": 0.7740084630280427, "train_speed(iter/s)": 0.136662 }, { "epoch": 1.018529271512867, "grad_norm": 0.7248108983039856, "learning_rate": 5.260078203732456e-05, "loss": 0.7366519927978515, "memory(GiB)": 91.52, "step": 78495, "token_acc": 0.7777374514571916, "train_speed(iter/s)": 0.136661 }, { "epoch": 1.0185941499145228, "grad_norm": 0.7604814171791077, "learning_rate": 5.2595425531104406e-05, "loss": 0.7646555423736572, "memory(GiB)": 91.52, "step": 78500, "token_acc": 0.7761059492442103, "train_speed(iter/s)": 0.13666 }, { "epoch": 1.0186590283161785, "grad_norm": 0.6360858082771301, "learning_rate": 5.259006899501625e-05, "loss": 0.7807451725006104, "memory(GiB)": 91.52, "step": 78505, "token_acc": 0.7861115525186715, "train_speed(iter/s)": 0.136659 }, { "epoch": 1.0187239067178342, "grad_norm": 0.7121089696884155, "learning_rate": 5.258471242912175e-05, "loss": 0.7790977954864502, "memory(GiB)": 91.52, "step": 78510, "token_acc": 0.7852576596242301, "train_speed(iter/s)": 0.136659 }, { "epoch": 1.01878878511949, "grad_norm": 0.7081284523010254, "learning_rate": 5.2579355833482545e-05, "loss": 0.7290503501892089, "memory(GiB)": 91.52, "step": 78515, "token_acc": 0.7843994172957224, "train_speed(iter/s)": 0.136657 }, { "epoch": 1.0188536635211456, "grad_norm": 0.8058992624282837, "learning_rate": 5.2573999208160263e-05, "loss": 0.7657215118408203, "memory(GiB)": 91.52, "step": 78520, "token_acc": 0.7817865681373615, "train_speed(iter/s)": 0.136657 }, { "epoch": 1.0189185419228013, "grad_norm": 0.8033435940742493, "learning_rate": 5.256864255321658e-05, "loss": 0.7475056648254395, "memory(GiB)": 91.52, "step": 78525, "token_acc": 0.7888845595267614, "train_speed(iter/s)": 0.136656 }, { "epoch": 1.018983420324457, "grad_norm": 0.7463361620903015, "learning_rate": 5.256328586871313e-05, "loss": 0.7850982189178467, "memory(GiB)": 91.52, "step": 78530, "token_acc": 0.7939512988112575, "train_speed(iter/s)": 0.136655 }, { "epoch": 1.0190482987261127, "grad_norm": 0.6636669635772705, "learning_rate": 5.2557929154711537e-05, "loss": 0.7839640140533447, "memory(GiB)": 91.52, "step": 78535, "token_acc": 0.7630779502587214, "train_speed(iter/s)": 0.136654 }, { "epoch": 1.0191131771277684, "grad_norm": 0.7157753705978394, "learning_rate": 5.255257241127347e-05, "loss": 0.7543162345886231, "memory(GiB)": 91.52, "step": 78540, "token_acc": 0.7787184396314467, "train_speed(iter/s)": 0.136653 }, { "epoch": 1.0191780555294239, "grad_norm": 0.7000411748886108, "learning_rate": 5.254721563846057e-05, "loss": 0.7731513023376465, "memory(GiB)": 91.52, "step": 78545, "token_acc": 0.7951381674631207, "train_speed(iter/s)": 0.136651 }, { "epoch": 1.0192429339310798, "grad_norm": 0.7227159142494202, "learning_rate": 5.2541858836334465e-05, "loss": 0.752402400970459, "memory(GiB)": 91.52, "step": 78550, "token_acc": 0.7759869019465163, "train_speed(iter/s)": 0.13665 }, { "epoch": 1.0193078123327353, "grad_norm": 0.6749991178512573, "learning_rate": 5.253650200495681e-05, "loss": 0.7581730365753174, "memory(GiB)": 91.52, "step": 78555, "token_acc": 0.7911912503694946, "train_speed(iter/s)": 0.136649 }, { "epoch": 1.019372690734391, "grad_norm": 0.6437214612960815, "learning_rate": 5.253114514438926e-05, "loss": 0.7710569381713868, "memory(GiB)": 91.52, "step": 78560, "token_acc": 0.7846561365643089, "train_speed(iter/s)": 0.136648 }, { "epoch": 1.0194375691360467, "grad_norm": 0.7060742378234863, "learning_rate": 5.2525788254693454e-05, "loss": 0.7637410163879395, "memory(GiB)": 91.52, "step": 78565, "token_acc": 0.777414384100129, "train_speed(iter/s)": 0.136647 }, { "epoch": 1.0195024475377024, "grad_norm": 0.7729095220565796, "learning_rate": 5.252043133593103e-05, "loss": 0.7715409278869629, "memory(GiB)": 91.52, "step": 78570, "token_acc": 0.7593132375941988, "train_speed(iter/s)": 0.136646 }, { "epoch": 1.019567325939358, "grad_norm": 0.7401872873306274, "learning_rate": 5.251507438816365e-05, "loss": 0.7394147872924804, "memory(GiB)": 91.52, "step": 78575, "token_acc": 0.7950473320449408, "train_speed(iter/s)": 0.136645 }, { "epoch": 1.0196322043410138, "grad_norm": 0.646437406539917, "learning_rate": 5.250971741145295e-05, "loss": 0.7910037994384765, "memory(GiB)": 91.52, "step": 78580, "token_acc": 0.7922765065622208, "train_speed(iter/s)": 0.136644 }, { "epoch": 1.0196970827426695, "grad_norm": 0.7522865533828735, "learning_rate": 5.250436040586061e-05, "loss": 0.7700334548950195, "memory(GiB)": 91.52, "step": 78585, "token_acc": 0.7801006852188506, "train_speed(iter/s)": 0.136643 }, { "epoch": 1.0197619611443252, "grad_norm": 0.7807669043540955, "learning_rate": 5.249900337144822e-05, "loss": 0.796846580505371, "memory(GiB)": 91.52, "step": 78590, "token_acc": 0.7777222200853879, "train_speed(iter/s)": 0.136642 }, { "epoch": 1.0198268395459809, "grad_norm": 0.6913983821868896, "learning_rate": 5.249364630827748e-05, "loss": 0.7541831970214844, "memory(GiB)": 91.52, "step": 78595, "token_acc": 0.7962822697721391, "train_speed(iter/s)": 0.136641 }, { "epoch": 1.0198917179476366, "grad_norm": 0.6663830876350403, "learning_rate": 5.248828921641001e-05, "loss": 0.7668700218200684, "memory(GiB)": 91.52, "step": 78600, "token_acc": 0.7835362716515177, "train_speed(iter/s)": 0.13664 }, { "epoch": 1.0199565963492923, "grad_norm": 0.693307101726532, "learning_rate": 5.2482932095907467e-05, "loss": 0.743437385559082, "memory(GiB)": 91.52, "step": 78605, "token_acc": 0.7776904387594683, "train_speed(iter/s)": 0.136639 }, { "epoch": 1.020021474750948, "grad_norm": 0.7107958793640137, "learning_rate": 5.247757494683151e-05, "loss": 0.7766812324523926, "memory(GiB)": 91.52, "step": 78610, "token_acc": 0.7631112237142134, "train_speed(iter/s)": 0.136639 }, { "epoch": 1.0200863531526037, "grad_norm": 0.7478624582290649, "learning_rate": 5.247221776924376e-05, "loss": 0.8112556457519531, "memory(GiB)": 91.52, "step": 78615, "token_acc": 0.7624866108620622, "train_speed(iter/s)": 0.136637 }, { "epoch": 1.0201512315542594, "grad_norm": 0.632082998752594, "learning_rate": 5.246686056320592e-05, "loss": 0.7822860717773438, "memory(GiB)": 91.52, "step": 78620, "token_acc": 0.7755590805884731, "train_speed(iter/s)": 0.136636 }, { "epoch": 1.020216109955915, "grad_norm": 0.8066333532333374, "learning_rate": 5.246150332877957e-05, "loss": 0.8029045104980469, "memory(GiB)": 91.52, "step": 78625, "token_acc": 0.7701050585112349, "train_speed(iter/s)": 0.136636 }, { "epoch": 1.0202809883575708, "grad_norm": 0.7044313549995422, "learning_rate": 5.245614606602642e-05, "loss": 0.7413643360137939, "memory(GiB)": 91.52, "step": 78630, "token_acc": 0.7954637232467358, "train_speed(iter/s)": 0.136634 }, { "epoch": 1.0203458667592264, "grad_norm": 0.6954547166824341, "learning_rate": 5.24507887750081e-05, "loss": 0.7614453315734864, "memory(GiB)": 91.52, "step": 78635, "token_acc": 0.7817848181852226, "train_speed(iter/s)": 0.136633 }, { "epoch": 1.0204107451608821, "grad_norm": 0.74737149477005, "learning_rate": 5.2445431455786256e-05, "loss": 0.7539299964904785, "memory(GiB)": 91.52, "step": 78640, "token_acc": 0.7757211864024197, "train_speed(iter/s)": 0.136632 }, { "epoch": 1.0204756235625378, "grad_norm": 0.7686874270439148, "learning_rate": 5.244007410842254e-05, "loss": 0.7508686542510986, "memory(GiB)": 91.52, "step": 78645, "token_acc": 0.7804893754024469, "train_speed(iter/s)": 0.13663 }, { "epoch": 1.0205405019641935, "grad_norm": 0.7438802719116211, "learning_rate": 5.2434716732978604e-05, "loss": 0.7806023597717285, "memory(GiB)": 91.52, "step": 78650, "token_acc": 0.7605812399328284, "train_speed(iter/s)": 0.136629 }, { "epoch": 1.0206053803658492, "grad_norm": 0.675865888595581, "learning_rate": 5.2429359329516136e-05, "loss": 0.7593938827514648, "memory(GiB)": 91.52, "step": 78655, "token_acc": 0.7990981240981241, "train_speed(iter/s)": 0.136628 }, { "epoch": 1.020670258767505, "grad_norm": 0.7022305130958557, "learning_rate": 5.242400189809672e-05, "loss": 0.7692251205444336, "memory(GiB)": 91.52, "step": 78660, "token_acc": 0.7734227207859016, "train_speed(iter/s)": 0.136627 }, { "epoch": 1.0207351371691606, "grad_norm": 0.6378419995307922, "learning_rate": 5.241864443878206e-05, "loss": 0.7664566993713379, "memory(GiB)": 91.52, "step": 78665, "token_acc": 0.7974226618055806, "train_speed(iter/s)": 0.136625 }, { "epoch": 1.0208000155708163, "grad_norm": 0.708621084690094, "learning_rate": 5.241328695163379e-05, "loss": 0.7626423835754395, "memory(GiB)": 91.52, "step": 78670, "token_acc": 0.7747125605611397, "train_speed(iter/s)": 0.136624 }, { "epoch": 1.020864893972472, "grad_norm": 0.7192848324775696, "learning_rate": 5.240792943671359e-05, "loss": 0.7562172889709473, "memory(GiB)": 91.52, "step": 78675, "token_acc": 0.7740772470907403, "train_speed(iter/s)": 0.136623 }, { "epoch": 1.0209297723741277, "grad_norm": 0.6866594552993774, "learning_rate": 5.2402571894083065e-05, "loss": 0.7788180351257324, "memory(GiB)": 91.52, "step": 78680, "token_acc": 0.7883898455529913, "train_speed(iter/s)": 0.136622 }, { "epoch": 1.0209946507757834, "grad_norm": 0.6447156071662903, "learning_rate": 5.239721432380391e-05, "loss": 0.7438021659851074, "memory(GiB)": 91.52, "step": 78685, "token_acc": 0.8096168294515402, "train_speed(iter/s)": 0.136621 }, { "epoch": 1.0210595291774391, "grad_norm": 0.6975151896476746, "learning_rate": 5.2391856725937785e-05, "loss": 0.7728402137756347, "memory(GiB)": 91.52, "step": 78690, "token_acc": 0.768000322333696, "train_speed(iter/s)": 0.136621 }, { "epoch": 1.0211244075790948, "grad_norm": 0.6889374852180481, "learning_rate": 5.2386499100546295e-05, "loss": 0.798073148727417, "memory(GiB)": 91.52, "step": 78695, "token_acc": 0.7905002216387629, "train_speed(iter/s)": 0.13662 }, { "epoch": 1.0211892859807505, "grad_norm": 0.6822054982185364, "learning_rate": 5.238114144769114e-05, "loss": 0.7707197189331054, "memory(GiB)": 91.52, "step": 78700, "token_acc": 0.768256920105858, "train_speed(iter/s)": 0.136619 }, { "epoch": 1.0212541643824062, "grad_norm": 0.7817798256874084, "learning_rate": 5.237578376743395e-05, "loss": 0.7374820709228516, "memory(GiB)": 91.52, "step": 78705, "token_acc": 0.7799782372143634, "train_speed(iter/s)": 0.136618 }, { "epoch": 1.021319042784062, "grad_norm": 0.8342962265014648, "learning_rate": 5.237042605983642e-05, "loss": 0.7246249675750732, "memory(GiB)": 91.52, "step": 78710, "token_acc": 0.7895815748221392, "train_speed(iter/s)": 0.136616 }, { "epoch": 1.0213839211857176, "grad_norm": 0.6270992159843445, "learning_rate": 5.2365068324960155e-05, "loss": 0.739178466796875, "memory(GiB)": 91.52, "step": 78715, "token_acc": 0.7977695466623722, "train_speed(iter/s)": 0.136615 }, { "epoch": 1.0214487995873733, "grad_norm": 0.706987202167511, "learning_rate": 5.235971056286685e-05, "loss": 0.7742997169494629, "memory(GiB)": 91.52, "step": 78720, "token_acc": 0.7780794865465317, "train_speed(iter/s)": 0.136614 }, { "epoch": 1.021513677989029, "grad_norm": 0.7160231471061707, "learning_rate": 5.235435277361815e-05, "loss": 0.7670510292053223, "memory(GiB)": 91.52, "step": 78725, "token_acc": 0.7777242542063793, "train_speed(iter/s)": 0.136612 }, { "epoch": 1.0215785563906847, "grad_norm": 0.6625851392745972, "learning_rate": 5.23489949572757e-05, "loss": 0.7496816635131835, "memory(GiB)": 91.52, "step": 78730, "token_acc": 0.7774277446060295, "train_speed(iter/s)": 0.136611 }, { "epoch": 1.0216434347923404, "grad_norm": 0.6881566643714905, "learning_rate": 5.234363711390118e-05, "loss": 0.7495965003967285, "memory(GiB)": 91.52, "step": 78735, "token_acc": 0.7901360811767482, "train_speed(iter/s)": 0.13661 }, { "epoch": 1.0217083131939961, "grad_norm": 0.6815569400787354, "learning_rate": 5.233827924355621e-05, "loss": 0.7659197807312011, "memory(GiB)": 91.52, "step": 78740, "token_acc": 0.7736943243428728, "train_speed(iter/s)": 0.13661 }, { "epoch": 1.0217731915956518, "grad_norm": 0.6511089205741882, "learning_rate": 5.233292134630251e-05, "loss": 0.7280341625213623, "memory(GiB)": 91.52, "step": 78745, "token_acc": 0.7759872438594111, "train_speed(iter/s)": 0.136608 }, { "epoch": 1.0218380699973075, "grad_norm": 0.7084004878997803, "learning_rate": 5.2327563422201664e-05, "loss": 0.7691402435302734, "memory(GiB)": 91.52, "step": 78750, "token_acc": 0.7633964179537379, "train_speed(iter/s)": 0.136608 }, { "epoch": 1.0219029483989632, "grad_norm": 0.6856834888458252, "learning_rate": 5.232220547131539e-05, "loss": 0.7774484634399415, "memory(GiB)": 91.52, "step": 78755, "token_acc": 0.7807925740806855, "train_speed(iter/s)": 0.136607 }, { "epoch": 1.021967826800619, "grad_norm": 0.8112438917160034, "learning_rate": 5.2316847493705326e-05, "loss": 0.7658432006835938, "memory(GiB)": 91.52, "step": 78760, "token_acc": 0.7886316127994855, "train_speed(iter/s)": 0.136606 }, { "epoch": 1.0220327052022746, "grad_norm": 0.7099340558052063, "learning_rate": 5.231148948943312e-05, "loss": 0.7496802806854248, "memory(GiB)": 91.52, "step": 78765, "token_acc": 0.7753316749585406, "train_speed(iter/s)": 0.136605 }, { "epoch": 1.0220975836039303, "grad_norm": 0.7371734976768494, "learning_rate": 5.2306131458560445e-05, "loss": 0.742781114578247, "memory(GiB)": 91.52, "step": 78770, "token_acc": 0.7816987956037782, "train_speed(iter/s)": 0.136603 }, { "epoch": 1.022162462005586, "grad_norm": 0.6297431588172913, "learning_rate": 5.230077340114895e-05, "loss": 0.7727603912353516, "memory(GiB)": 91.52, "step": 78775, "token_acc": 0.7886216324148312, "train_speed(iter/s)": 0.136602 }, { "epoch": 1.0222273404072417, "grad_norm": 0.7194907069206238, "learning_rate": 5.229541531726032e-05, "loss": 0.8115985870361329, "memory(GiB)": 91.52, "step": 78780, "token_acc": 0.7584617114584504, "train_speed(iter/s)": 0.136601 }, { "epoch": 1.0222922188088974, "grad_norm": 0.787218451499939, "learning_rate": 5.2290057206956175e-05, "loss": 0.7533225536346435, "memory(GiB)": 91.52, "step": 78785, "token_acc": 0.7779123763375732, "train_speed(iter/s)": 0.1366 }, { "epoch": 1.0223570972105531, "grad_norm": 0.7264562845230103, "learning_rate": 5.228469907029823e-05, "loss": 0.738164234161377, "memory(GiB)": 91.52, "step": 78790, "token_acc": 0.7820497234210686, "train_speed(iter/s)": 0.136599 }, { "epoch": 1.0224219756122088, "grad_norm": 0.7462866306304932, "learning_rate": 5.22793409073481e-05, "loss": 0.7697932720184326, "memory(GiB)": 91.52, "step": 78795, "token_acc": 0.7710272043355796, "train_speed(iter/s)": 0.136598 }, { "epoch": 1.0224868540138645, "grad_norm": 0.7876254916191101, "learning_rate": 5.227398271816745e-05, "loss": 0.7587531566619873, "memory(GiB)": 91.52, "step": 78800, "token_acc": 0.8037810453478315, "train_speed(iter/s)": 0.136597 }, { "epoch": 1.0225517324155202, "grad_norm": 0.7094051837921143, "learning_rate": 5.2268624502817963e-05, "loss": 0.7914377212524414, "memory(GiB)": 91.52, "step": 78805, "token_acc": 0.7581704195570724, "train_speed(iter/s)": 0.136596 }, { "epoch": 1.022616610817176, "grad_norm": 0.7181963920593262, "learning_rate": 5.226326626136129e-05, "loss": 0.7843343257904053, "memory(GiB)": 91.52, "step": 78810, "token_acc": 0.7819899081381809, "train_speed(iter/s)": 0.136595 }, { "epoch": 1.0226814892188316, "grad_norm": 0.7309262156486511, "learning_rate": 5.22579079938591e-05, "loss": 0.7876576900482177, "memory(GiB)": 91.52, "step": 78815, "token_acc": 0.778240058910162, "train_speed(iter/s)": 0.136594 }, { "epoch": 1.0227463676204873, "grad_norm": 0.8036940693855286, "learning_rate": 5.2252549700373044e-05, "loss": 0.7393639087677002, "memory(GiB)": 91.52, "step": 78820, "token_acc": 0.7993207110854169, "train_speed(iter/s)": 0.136593 }, { "epoch": 1.022811246022143, "grad_norm": 0.6618633270263672, "learning_rate": 5.224719138096479e-05, "loss": 0.7273045539855957, "memory(GiB)": 91.52, "step": 78825, "token_acc": 0.7843700159489633, "train_speed(iter/s)": 0.136592 }, { "epoch": 1.0228761244237987, "grad_norm": 0.6500225067138672, "learning_rate": 5.224183303569601e-05, "loss": 0.752760934829712, "memory(GiB)": 91.52, "step": 78830, "token_acc": 0.7735258117935057, "train_speed(iter/s)": 0.136591 }, { "epoch": 1.0229410028254544, "grad_norm": 0.7190791964530945, "learning_rate": 5.2236474664628355e-05, "loss": 0.7847108840942383, "memory(GiB)": 91.52, "step": 78835, "token_acc": 0.7814476458186929, "train_speed(iter/s)": 0.13659 }, { "epoch": 1.0230058812271101, "grad_norm": 0.725708544254303, "learning_rate": 5.223111626782349e-05, "loss": 0.7854715824127197, "memory(GiB)": 91.52, "step": 78840, "token_acc": 0.7813463514902363, "train_speed(iter/s)": 0.136589 }, { "epoch": 1.0230707596287658, "grad_norm": 0.6838071942329407, "learning_rate": 5.222575784534308e-05, "loss": 0.7575002670288086, "memory(GiB)": 91.52, "step": 78845, "token_acc": 0.7909922870958718, "train_speed(iter/s)": 0.136588 }, { "epoch": 1.0231356380304215, "grad_norm": 0.7216826677322388, "learning_rate": 5.22203993972488e-05, "loss": 0.7531206130981445, "memory(GiB)": 91.52, "step": 78850, "token_acc": 0.7824282457473549, "train_speed(iter/s)": 0.136587 }, { "epoch": 1.0232005164320772, "grad_norm": 0.786919116973877, "learning_rate": 5.2215040923602296e-05, "loss": 0.7865641117095947, "memory(GiB)": 91.52, "step": 78855, "token_acc": 0.7627308876346844, "train_speed(iter/s)": 0.136586 }, { "epoch": 1.023265394833733, "grad_norm": 0.737248957157135, "learning_rate": 5.220968242446526e-05, "loss": 0.7697032928466797, "memory(GiB)": 91.52, "step": 78860, "token_acc": 0.7685430463576159, "train_speed(iter/s)": 0.136585 }, { "epoch": 1.0233302732353886, "grad_norm": 0.7591983079910278, "learning_rate": 5.220432389989933e-05, "loss": 0.8173048019409179, "memory(GiB)": 91.52, "step": 78865, "token_acc": 0.7637238358525114, "train_speed(iter/s)": 0.136584 }, { "epoch": 1.0233951516370443, "grad_norm": 0.6776074171066284, "learning_rate": 5.219896534996619e-05, "loss": 0.7932240486145019, "memory(GiB)": 91.52, "step": 78870, "token_acc": 0.7770392749244713, "train_speed(iter/s)": 0.136583 }, { "epoch": 1.0234600300387, "grad_norm": 1.1839888095855713, "learning_rate": 5.21936067747275e-05, "loss": 0.7898487091064453, "memory(GiB)": 91.52, "step": 78875, "token_acc": 0.7743034388783304, "train_speed(iter/s)": 0.136582 }, { "epoch": 1.0235249084403557, "grad_norm": 0.6211388111114502, "learning_rate": 5.2188248174244903e-05, "loss": 0.7707997322082519, "memory(GiB)": 91.52, "step": 78880, "token_acc": 0.791607979298372, "train_speed(iter/s)": 0.136582 }, { "epoch": 1.0235897868420114, "grad_norm": 0.725097119808197, "learning_rate": 5.2182889548580104e-05, "loss": 0.7998459339141846, "memory(GiB)": 91.52, "step": 78885, "token_acc": 0.76425, "train_speed(iter/s)": 0.13658 }, { "epoch": 1.023654665243667, "grad_norm": 0.6732567548751831, "learning_rate": 5.217753089779476e-05, "loss": 0.7316243171691894, "memory(GiB)": 91.52, "step": 78890, "token_acc": 0.7795307592832071, "train_speed(iter/s)": 0.136579 }, { "epoch": 1.0237195436453228, "grad_norm": 0.7367329597473145, "learning_rate": 5.217217222195052e-05, "loss": 0.749946117401123, "memory(GiB)": 91.52, "step": 78895, "token_acc": 0.7654343404997095, "train_speed(iter/s)": 0.136578 }, { "epoch": 1.0237844220469785, "grad_norm": 0.7853792309761047, "learning_rate": 5.216681352110906e-05, "loss": 0.7916267395019532, "memory(GiB)": 91.52, "step": 78900, "token_acc": 0.7745101754651912, "train_speed(iter/s)": 0.136577 }, { "epoch": 1.0238493004486342, "grad_norm": 0.6892027258872986, "learning_rate": 5.216145479533204e-05, "loss": 0.7437900543212891, "memory(GiB)": 91.52, "step": 78905, "token_acc": 0.7924294239924897, "train_speed(iter/s)": 0.136576 }, { "epoch": 1.02391417885029, "grad_norm": 0.7900153994560242, "learning_rate": 5.215609604468117e-05, "loss": 0.7775883674621582, "memory(GiB)": 91.52, "step": 78910, "token_acc": 0.7849018621036739, "train_speed(iter/s)": 0.136575 }, { "epoch": 1.0239790572519456, "grad_norm": 0.7357990145683289, "learning_rate": 5.2150737269218055e-05, "loss": 0.7408722400665283, "memory(GiB)": 91.52, "step": 78915, "token_acc": 0.7634803073302415, "train_speed(iter/s)": 0.136574 }, { "epoch": 1.0240439356536013, "grad_norm": 0.6805192232131958, "learning_rate": 5.21453784690044e-05, "loss": 0.7728579998016357, "memory(GiB)": 91.52, "step": 78920, "token_acc": 0.7745767745767745, "train_speed(iter/s)": 0.136573 }, { "epoch": 1.024108814055257, "grad_norm": 0.6299238801002502, "learning_rate": 5.2140019644101875e-05, "loss": 0.7676626205444336, "memory(GiB)": 91.52, "step": 78925, "token_acc": 0.7811378460194114, "train_speed(iter/s)": 0.136572 }, { "epoch": 1.0241736924569127, "grad_norm": 0.7713234424591064, "learning_rate": 5.213466079457214e-05, "loss": 0.7508107662200928, "memory(GiB)": 91.52, "step": 78930, "token_acc": 0.7755234469631425, "train_speed(iter/s)": 0.136571 }, { "epoch": 1.0242385708585684, "grad_norm": 0.7186282277107239, "learning_rate": 5.212930192047687e-05, "loss": 0.7474300384521484, "memory(GiB)": 91.52, "step": 78935, "token_acc": 0.7814988545871059, "train_speed(iter/s)": 0.13657 }, { "epoch": 1.024303449260224, "grad_norm": 0.7132551670074463, "learning_rate": 5.212394302187771e-05, "loss": 0.7961137294769287, "memory(GiB)": 91.52, "step": 78940, "token_acc": 0.76612, "train_speed(iter/s)": 0.13657 }, { "epoch": 1.0243683276618798, "grad_norm": 0.7058007717132568, "learning_rate": 5.2118584098836374e-05, "loss": 0.7619867324829102, "memory(GiB)": 91.52, "step": 78945, "token_acc": 0.7883456904541242, "train_speed(iter/s)": 0.136569 }, { "epoch": 1.0244332060635355, "grad_norm": 0.7353172302246094, "learning_rate": 5.211322515141449e-05, "loss": 0.7918061256408692, "memory(GiB)": 91.52, "step": 78950, "token_acc": 0.7798416434838433, "train_speed(iter/s)": 0.136568 }, { "epoch": 1.0244980844651912, "grad_norm": 0.6632817387580872, "learning_rate": 5.210786617967376e-05, "loss": 0.7496211051940918, "memory(GiB)": 91.52, "step": 78955, "token_acc": 0.7672718311016183, "train_speed(iter/s)": 0.136567 }, { "epoch": 1.024562962866847, "grad_norm": 0.6618356108665466, "learning_rate": 5.210250718367584e-05, "loss": 0.7589276790618896, "memory(GiB)": 91.52, "step": 78960, "token_acc": 0.7861462545913256, "train_speed(iter/s)": 0.136566 }, { "epoch": 1.0246278412685026, "grad_norm": 0.7855767011642456, "learning_rate": 5.209714816348241e-05, "loss": 0.7994279861450195, "memory(GiB)": 91.52, "step": 78965, "token_acc": 0.7683710258931498, "train_speed(iter/s)": 0.136565 }, { "epoch": 1.0246927196701583, "grad_norm": 0.7034984827041626, "learning_rate": 5.209178911915512e-05, "loss": 0.767535924911499, "memory(GiB)": 91.52, "step": 78970, "token_acc": 0.7719054242002782, "train_speed(iter/s)": 0.136564 }, { "epoch": 1.024757598071814, "grad_norm": 0.6752921342849731, "learning_rate": 5.208643005075565e-05, "loss": 0.7213680744171143, "memory(GiB)": 91.52, "step": 78975, "token_acc": 0.791395872301303, "train_speed(iter/s)": 0.136563 }, { "epoch": 1.0248224764734697, "grad_norm": 0.6666625142097473, "learning_rate": 5.2081070958345704e-05, "loss": 0.753175163269043, "memory(GiB)": 91.52, "step": 78980, "token_acc": 0.7870086731869349, "train_speed(iter/s)": 0.136562 }, { "epoch": 1.0248873548751254, "grad_norm": 0.7711260318756104, "learning_rate": 5.2075711841986905e-05, "loss": 0.7700652122497559, "memory(GiB)": 91.52, "step": 78985, "token_acc": 0.7784209463552534, "train_speed(iter/s)": 0.136561 }, { "epoch": 1.024952233276781, "grad_norm": 0.7225708961486816, "learning_rate": 5.2070352701740956e-05, "loss": 0.7709050178527832, "memory(GiB)": 91.52, "step": 78990, "token_acc": 0.7813241043389092, "train_speed(iter/s)": 0.13656 }, { "epoch": 1.0250171116784368, "grad_norm": 0.7009119987487793, "learning_rate": 5.2064993537669516e-05, "loss": 0.745121955871582, "memory(GiB)": 91.52, "step": 78995, "token_acc": 0.779037905312833, "train_speed(iter/s)": 0.136559 }, { "epoch": 1.0250819900800925, "grad_norm": 0.6928693652153015, "learning_rate": 5.2059634349834275e-05, "loss": 0.75418119430542, "memory(GiB)": 91.52, "step": 79000, "token_acc": 0.7878301855913064, "train_speed(iter/s)": 0.136558 }, { "epoch": 1.0251468684817482, "grad_norm": 0.7149299383163452, "learning_rate": 5.2054275138296884e-05, "loss": 0.8078003883361816, "memory(GiB)": 91.52, "step": 79005, "token_acc": 0.7817294801132132, "train_speed(iter/s)": 0.136557 }, { "epoch": 1.0252117468834039, "grad_norm": 0.6890568137168884, "learning_rate": 5.204891590311902e-05, "loss": 0.7750741958618164, "memory(GiB)": 91.52, "step": 79010, "token_acc": 0.7662897331586568, "train_speed(iter/s)": 0.136556 }, { "epoch": 1.0252766252850596, "grad_norm": 0.7393261194229126, "learning_rate": 5.204355664436238e-05, "loss": 0.7627361297607422, "memory(GiB)": 91.52, "step": 79015, "token_acc": 0.791226362625139, "train_speed(iter/s)": 0.136554 }, { "epoch": 1.0253415036867153, "grad_norm": 0.6580654382705688, "learning_rate": 5.2038197362088605e-05, "loss": 0.7809018611907959, "memory(GiB)": 91.52, "step": 79020, "token_acc": 0.790432877439915, "train_speed(iter/s)": 0.136553 }, { "epoch": 1.025406382088371, "grad_norm": 0.7780637741088867, "learning_rate": 5.20328380563594e-05, "loss": 0.7564807891845703, "memory(GiB)": 91.52, "step": 79025, "token_acc": 0.7904401836348907, "train_speed(iter/s)": 0.136552 }, { "epoch": 1.0254712604900265, "grad_norm": 0.6929134726524353, "learning_rate": 5.202747872723641e-05, "loss": 0.7541197776794434, "memory(GiB)": 91.52, "step": 79030, "token_acc": 0.7744022503516175, "train_speed(iter/s)": 0.136551 }, { "epoch": 1.0255361388916822, "grad_norm": 0.6576738953590393, "learning_rate": 5.202211937478134e-05, "loss": 0.7446506500244141, "memory(GiB)": 91.52, "step": 79035, "token_acc": 0.7791723979825768, "train_speed(iter/s)": 0.13655 }, { "epoch": 1.0256010172933379, "grad_norm": 0.715398371219635, "learning_rate": 5.201675999905585e-05, "loss": 0.7811668395996094, "memory(GiB)": 91.52, "step": 79040, "token_acc": 0.7783844427823485, "train_speed(iter/s)": 0.13655 }, { "epoch": 1.0256658956949936, "grad_norm": 0.7551885843276978, "learning_rate": 5.2011400600121605e-05, "loss": 0.7977568626403808, "memory(GiB)": 91.52, "step": 79045, "token_acc": 0.7922050034000214, "train_speed(iter/s)": 0.136549 }, { "epoch": 1.0257307740966493, "grad_norm": 0.7035340666770935, "learning_rate": 5.200604117804031e-05, "loss": 0.7906815528869628, "memory(GiB)": 91.52, "step": 79050, "token_acc": 0.7605999534919774, "train_speed(iter/s)": 0.136548 }, { "epoch": 1.025795652498305, "grad_norm": 0.670132577419281, "learning_rate": 5.20006817328736e-05, "loss": 0.7606088161468506, "memory(GiB)": 91.52, "step": 79055, "token_acc": 0.8012181246235192, "train_speed(iter/s)": 0.136547 }, { "epoch": 1.0258605308999607, "grad_norm": 0.721659779548645, "learning_rate": 5.199532226468319e-05, "loss": 0.7769978523254395, "memory(GiB)": 91.52, "step": 79060, "token_acc": 0.78499642285286, "train_speed(iter/s)": 0.136546 }, { "epoch": 1.0259254093016164, "grad_norm": 0.684212863445282, "learning_rate": 5.198996277353072e-05, "loss": 0.7613007545471191, "memory(GiB)": 91.52, "step": 79065, "token_acc": 0.7910544257670153, "train_speed(iter/s)": 0.136545 }, { "epoch": 1.025990287703272, "grad_norm": 0.7414145469665527, "learning_rate": 5.198460325947792e-05, "loss": 0.7344356536865234, "memory(GiB)": 91.52, "step": 79070, "token_acc": 0.782720182342676, "train_speed(iter/s)": 0.136544 }, { "epoch": 1.0260551661049278, "grad_norm": 0.6704226732254028, "learning_rate": 5.1979243722586416e-05, "loss": 0.7358918190002441, "memory(GiB)": 91.52, "step": 79075, "token_acc": 0.794769077967224, "train_speed(iter/s)": 0.136543 }, { "epoch": 1.0261200445065835, "grad_norm": 0.7465217709541321, "learning_rate": 5.197388416291789e-05, "loss": 0.7641199111938477, "memory(GiB)": 91.52, "step": 79080, "token_acc": 0.78743810809875, "train_speed(iter/s)": 0.136542 }, { "epoch": 1.0261849229082391, "grad_norm": 0.6741266250610352, "learning_rate": 5.196852458053404e-05, "loss": 0.780639934539795, "memory(GiB)": 91.52, "step": 79085, "token_acc": 0.7979775135693977, "train_speed(iter/s)": 0.136541 }, { "epoch": 1.0262498013098948, "grad_norm": 0.7045394778251648, "learning_rate": 5.196316497549654e-05, "loss": 0.7753199577331543, "memory(GiB)": 91.52, "step": 79090, "token_acc": 0.7682859151281957, "train_speed(iter/s)": 0.136539 }, { "epoch": 1.0263146797115505, "grad_norm": 0.6951555609703064, "learning_rate": 5.195780534786706e-05, "loss": 0.7498458862304688, "memory(GiB)": 91.52, "step": 79095, "token_acc": 0.7897556322007778, "train_speed(iter/s)": 0.136538 }, { "epoch": 1.0263795581132062, "grad_norm": 0.5993849635124207, "learning_rate": 5.195244569770728e-05, "loss": 0.734846544265747, "memory(GiB)": 91.52, "step": 79100, "token_acc": 0.7800961328381035, "train_speed(iter/s)": 0.136537 }, { "epoch": 1.026444436514862, "grad_norm": 0.7102833986282349, "learning_rate": 5.1947086025078895e-05, "loss": 0.8270648002624512, "memory(GiB)": 91.52, "step": 79105, "token_acc": 0.7557063541024059, "train_speed(iter/s)": 0.136537 }, { "epoch": 1.0265093149165176, "grad_norm": 0.6802724599838257, "learning_rate": 5.194172633004355e-05, "loss": 0.7503883361816406, "memory(GiB)": 91.52, "step": 79110, "token_acc": 0.7818342555908451, "train_speed(iter/s)": 0.136536 }, { "epoch": 1.0265741933181733, "grad_norm": 0.7275196313858032, "learning_rate": 5.1936366612662956e-05, "loss": 0.7736556529998779, "memory(GiB)": 91.52, "step": 79115, "token_acc": 0.8014103939904952, "train_speed(iter/s)": 0.136534 }, { "epoch": 1.026639071719829, "grad_norm": 0.73482745885849, "learning_rate": 5.193100687299878e-05, "loss": 0.7931971073150634, "memory(GiB)": 91.52, "step": 79120, "token_acc": 0.7584938325037871, "train_speed(iter/s)": 0.136533 }, { "epoch": 1.0267039501214847, "grad_norm": 0.7175012826919556, "learning_rate": 5.1925647111112696e-05, "loss": 0.7875468730926514, "memory(GiB)": 91.52, "step": 79125, "token_acc": 0.7664707942561388, "train_speed(iter/s)": 0.136532 }, { "epoch": 1.0267688285231404, "grad_norm": 0.8000180721282959, "learning_rate": 5.1920287327066386e-05, "loss": 0.7780075073242188, "memory(GiB)": 91.52, "step": 79130, "token_acc": 0.7810755562544225, "train_speed(iter/s)": 0.136531 }, { "epoch": 1.0268337069247961, "grad_norm": 0.6400423049926758, "learning_rate": 5.191492752092153e-05, "loss": 0.7755795955657959, "memory(GiB)": 91.52, "step": 79135, "token_acc": 0.7760142627299212, "train_speed(iter/s)": 0.13653 }, { "epoch": 1.0268985853264518, "grad_norm": 0.7142317891120911, "learning_rate": 5.1909567692739836e-05, "loss": 0.8072576522827148, "memory(GiB)": 91.52, "step": 79140, "token_acc": 0.7698976189542227, "train_speed(iter/s)": 0.136529 }, { "epoch": 1.0269634637281075, "grad_norm": 0.6202300190925598, "learning_rate": 5.190420784258293e-05, "loss": 0.7839102745056152, "memory(GiB)": 91.52, "step": 79145, "token_acc": 0.7580757692039349, "train_speed(iter/s)": 0.136528 }, { "epoch": 1.0270283421297632, "grad_norm": 0.6951985955238342, "learning_rate": 5.189884797051253e-05, "loss": 0.7366557121276855, "memory(GiB)": 91.52, "step": 79150, "token_acc": 0.796446108736355, "train_speed(iter/s)": 0.136527 }, { "epoch": 1.027093220531419, "grad_norm": 0.7207387089729309, "learning_rate": 5.1893488076590326e-05, "loss": 0.776141357421875, "memory(GiB)": 91.52, "step": 79155, "token_acc": 0.7787763100104733, "train_speed(iter/s)": 0.136527 }, { "epoch": 1.0271580989330746, "grad_norm": 0.7097761631011963, "learning_rate": 5.188812816087797e-05, "loss": 0.7730761528015136, "memory(GiB)": 91.52, "step": 79160, "token_acc": 0.7740522738420573, "train_speed(iter/s)": 0.136526 }, { "epoch": 1.0272229773347303, "grad_norm": 0.8018854856491089, "learning_rate": 5.1882768223437164e-05, "loss": 0.8142611503601074, "memory(GiB)": 91.52, "step": 79165, "token_acc": 0.7771214121898673, "train_speed(iter/s)": 0.136525 }, { "epoch": 1.027287855736386, "grad_norm": 0.6476094722747803, "learning_rate": 5.187740826432956e-05, "loss": 0.7560929298400879, "memory(GiB)": 91.52, "step": 79170, "token_acc": 0.7917311328863675, "train_speed(iter/s)": 0.136523 }, { "epoch": 1.0273527341380417, "grad_norm": 0.8005877137184143, "learning_rate": 5.187204828361688e-05, "loss": 0.7413735389709473, "memory(GiB)": 91.52, "step": 79175, "token_acc": 0.7856101304681504, "train_speed(iter/s)": 0.136522 }, { "epoch": 1.0274176125396974, "grad_norm": 0.7461915612220764, "learning_rate": 5.186668828136079e-05, "loss": 0.7683990001678467, "memory(GiB)": 91.52, "step": 79180, "token_acc": 0.7795741849634065, "train_speed(iter/s)": 0.136521 }, { "epoch": 1.0274824909413531, "grad_norm": 0.6868959069252014, "learning_rate": 5.186132825762298e-05, "loss": 0.8041717529296875, "memory(GiB)": 91.52, "step": 79185, "token_acc": 0.7621702419908698, "train_speed(iter/s)": 0.13652 }, { "epoch": 1.0275473693430088, "grad_norm": 0.7555394768714905, "learning_rate": 5.185596821246511e-05, "loss": 0.8017688751220703, "memory(GiB)": 91.52, "step": 79190, "token_acc": 0.7708806137126418, "train_speed(iter/s)": 0.136519 }, { "epoch": 1.0276122477446645, "grad_norm": 0.7621507048606873, "learning_rate": 5.1850608145948876e-05, "loss": 0.7738729476928711, "memory(GiB)": 91.52, "step": 79195, "token_acc": 0.7629225736095965, "train_speed(iter/s)": 0.136517 }, { "epoch": 1.0276771261463202, "grad_norm": 0.6977618336677551, "learning_rate": 5.184524805813598e-05, "loss": 0.7791648864746094, "memory(GiB)": 91.52, "step": 79200, "token_acc": 0.7812375249500998, "train_speed(iter/s)": 0.136516 }, { "epoch": 1.027742004547976, "grad_norm": 0.755185604095459, "learning_rate": 5.183988794908806e-05, "loss": 0.7640946388244629, "memory(GiB)": 91.52, "step": 79205, "token_acc": 0.7728877919119043, "train_speed(iter/s)": 0.136515 }, { "epoch": 1.0278068829496316, "grad_norm": 0.7085602283477783, "learning_rate": 5.183452781886684e-05, "loss": 0.7093895435333252, "memory(GiB)": 91.52, "step": 79210, "token_acc": 0.7902408311034157, "train_speed(iter/s)": 0.136514 }, { "epoch": 1.0278717613512873, "grad_norm": 0.6895774602890015, "learning_rate": 5.1829167667534006e-05, "loss": 0.7537165641784668, "memory(GiB)": 91.52, "step": 79215, "token_acc": 0.7842015048952674, "train_speed(iter/s)": 0.136513 }, { "epoch": 1.027936639752943, "grad_norm": 0.6999403238296509, "learning_rate": 5.182380749515121e-05, "loss": 0.7560393333435058, "memory(GiB)": 91.52, "step": 79220, "token_acc": 0.7749691955641612, "train_speed(iter/s)": 0.136512 }, { "epoch": 1.0280015181545987, "grad_norm": 0.6964171528816223, "learning_rate": 5.1818447301780174e-05, "loss": 0.7736682415008544, "memory(GiB)": 91.52, "step": 79225, "token_acc": 0.7828579181223078, "train_speed(iter/s)": 0.136511 }, { "epoch": 1.0280663965562544, "grad_norm": 0.7022285461425781, "learning_rate": 5.181308708748255e-05, "loss": 0.7801724910736084, "memory(GiB)": 91.52, "step": 79230, "token_acc": 0.7717950687456794, "train_speed(iter/s)": 0.13651 }, { "epoch": 1.0281312749579101, "grad_norm": 0.9091563820838928, "learning_rate": 5.180772685232004e-05, "loss": 0.8456127166748046, "memory(GiB)": 91.52, "step": 79235, "token_acc": 0.748432464107823, "train_speed(iter/s)": 0.136509 }, { "epoch": 1.0281961533595658, "grad_norm": 0.7356053590774536, "learning_rate": 5.180236659635431e-05, "loss": 0.7364912033081055, "memory(GiB)": 91.52, "step": 79240, "token_acc": 0.7819416209581528, "train_speed(iter/s)": 0.136508 }, { "epoch": 1.0282610317612215, "grad_norm": 0.7310732007026672, "learning_rate": 5.179700631964708e-05, "loss": 0.7440000534057617, "memory(GiB)": 91.52, "step": 79245, "token_acc": 0.7881397637795275, "train_speed(iter/s)": 0.136507 }, { "epoch": 1.0283259101628772, "grad_norm": 0.7056140303611755, "learning_rate": 5.179164602226002e-05, "loss": 0.7757935523986816, "memory(GiB)": 91.52, "step": 79250, "token_acc": 0.774668891808401, "train_speed(iter/s)": 0.136507 }, { "epoch": 1.028390788564533, "grad_norm": 0.694461464881897, "learning_rate": 5.17862857042548e-05, "loss": 0.7682044029235839, "memory(GiB)": 91.52, "step": 79255, "token_acc": 0.7721788772597526, "train_speed(iter/s)": 0.136506 }, { "epoch": 1.0284556669661886, "grad_norm": 0.6738429069519043, "learning_rate": 5.178092536569313e-05, "loss": 0.7718301296234131, "memory(GiB)": 91.52, "step": 79260, "token_acc": 0.7796819403093457, "train_speed(iter/s)": 0.136504 }, { "epoch": 1.0285205453678443, "grad_norm": 0.8305057883262634, "learning_rate": 5.177556500663667e-05, "loss": 0.7584543704986573, "memory(GiB)": 91.52, "step": 79265, "token_acc": 0.7818531417791322, "train_speed(iter/s)": 0.136503 }, { "epoch": 1.0285854237695, "grad_norm": 0.6926250457763672, "learning_rate": 5.1770204627147143e-05, "loss": 0.758342170715332, "memory(GiB)": 91.52, "step": 79270, "token_acc": 0.780330943927112, "train_speed(iter/s)": 0.136502 }, { "epoch": 1.0286503021711557, "grad_norm": 0.6997409462928772, "learning_rate": 5.176484422728619e-05, "loss": 0.7593247413635253, "memory(GiB)": 91.52, "step": 79275, "token_acc": 0.7976473883209635, "train_speed(iter/s)": 0.136501 }, { "epoch": 1.0287151805728114, "grad_norm": 0.6724953055381775, "learning_rate": 5.175948380711555e-05, "loss": 0.7256168365478516, "memory(GiB)": 91.52, "step": 79280, "token_acc": 0.7932354566264109, "train_speed(iter/s)": 0.1365 }, { "epoch": 1.0287800589744671, "grad_norm": 0.6494723558425903, "learning_rate": 5.175412336669687e-05, "loss": 0.8095048904418946, "memory(GiB)": 91.52, "step": 79285, "token_acc": 0.7866871143243145, "train_speed(iter/s)": 0.136499 }, { "epoch": 1.0288449373761228, "grad_norm": 0.7171235680580139, "learning_rate": 5.174876290609185e-05, "loss": 0.772434139251709, "memory(GiB)": 91.52, "step": 79290, "token_acc": 0.786531377237769, "train_speed(iter/s)": 0.136498 }, { "epoch": 1.0289098157777785, "grad_norm": 0.7031006217002869, "learning_rate": 5.174340242536219e-05, "loss": 0.7693235397338867, "memory(GiB)": 91.52, "step": 79295, "token_acc": 0.782139422220639, "train_speed(iter/s)": 0.136497 }, { "epoch": 1.0289746941794342, "grad_norm": 0.7625597715377808, "learning_rate": 5.173804192456955e-05, "loss": 0.7337358474731446, "memory(GiB)": 91.52, "step": 79300, "token_acc": 0.7908303204089265, "train_speed(iter/s)": 0.136496 }, { "epoch": 1.02903957258109, "grad_norm": 0.710790753364563, "learning_rate": 5.1732681403775654e-05, "loss": 0.7249044418334961, "memory(GiB)": 91.52, "step": 79305, "token_acc": 0.7890069312025182, "train_speed(iter/s)": 0.136494 }, { "epoch": 1.0291044509827456, "grad_norm": 0.7405180931091309, "learning_rate": 5.172732086304215e-05, "loss": 0.738617992401123, "memory(GiB)": 91.52, "step": 79310, "token_acc": 0.770978210477515, "train_speed(iter/s)": 0.136493 }, { "epoch": 1.0291693293844013, "grad_norm": 0.7114624977111816, "learning_rate": 5.1721960302430764e-05, "loss": 0.7261841297149658, "memory(GiB)": 91.52, "step": 79315, "token_acc": 0.7766116464713927, "train_speed(iter/s)": 0.136492 }, { "epoch": 1.029234207786057, "grad_norm": 0.7672656774520874, "learning_rate": 5.1716599722003155e-05, "loss": 0.7647034645080566, "memory(GiB)": 91.52, "step": 79320, "token_acc": 0.7697952732199308, "train_speed(iter/s)": 0.136492 }, { "epoch": 1.0292990861877127, "grad_norm": 0.6671985387802124, "learning_rate": 5.1711239121821045e-05, "loss": 0.7175870895385742, "memory(GiB)": 91.52, "step": 79325, "token_acc": 0.7995256448265639, "train_speed(iter/s)": 0.13649 }, { "epoch": 1.0293639645893684, "grad_norm": 0.7615451812744141, "learning_rate": 5.17058785019461e-05, "loss": 0.733409309387207, "memory(GiB)": 91.52, "step": 79330, "token_acc": 0.7929938900203666, "train_speed(iter/s)": 0.136489 }, { "epoch": 1.029428842991024, "grad_norm": 0.7047197818756104, "learning_rate": 5.170051786244e-05, "loss": 0.7819977760314941, "memory(GiB)": 91.52, "step": 79335, "token_acc": 0.7623583771124754, "train_speed(iter/s)": 0.136489 }, { "epoch": 1.0294937213926798, "grad_norm": 0.7093276977539062, "learning_rate": 5.1695157203364474e-05, "loss": 0.7585484027862549, "memory(GiB)": 91.52, "step": 79340, "token_acc": 0.8066391514667377, "train_speed(iter/s)": 0.136488 }, { "epoch": 1.0295585997943355, "grad_norm": 0.6607406139373779, "learning_rate": 5.168979652478118e-05, "loss": 0.7696023941040039, "memory(GiB)": 91.52, "step": 79345, "token_acc": 0.7864035843529209, "train_speed(iter/s)": 0.136487 }, { "epoch": 1.0296234781959912, "grad_norm": 0.7476579546928406, "learning_rate": 5.168443582675181e-05, "loss": 0.785734748840332, "memory(GiB)": 91.52, "step": 79350, "token_acc": 0.7826993778280543, "train_speed(iter/s)": 0.136486 }, { "epoch": 1.029688356597647, "grad_norm": 0.6605002880096436, "learning_rate": 5.167907510933806e-05, "loss": 0.7899171829223632, "memory(GiB)": 91.52, "step": 79355, "token_acc": 0.7689483912761822, "train_speed(iter/s)": 0.136484 }, { "epoch": 1.0297532349993026, "grad_norm": 0.7088109254837036, "learning_rate": 5.167371437260163e-05, "loss": 0.8443920135498046, "memory(GiB)": 91.52, "step": 79360, "token_acc": 0.7578132622199096, "train_speed(iter/s)": 0.136483 }, { "epoch": 1.0298181134009583, "grad_norm": 0.6525179147720337, "learning_rate": 5.1668353616604207e-05, "loss": 0.7528791427612305, "memory(GiB)": 91.52, "step": 79365, "token_acc": 0.7811466184645817, "train_speed(iter/s)": 0.136482 }, { "epoch": 1.029882991802614, "grad_norm": 0.7215098738670349, "learning_rate": 5.166299284140745e-05, "loss": 0.7372828483581543, "memory(GiB)": 91.52, "step": 79370, "token_acc": 0.7731799070590839, "train_speed(iter/s)": 0.136481 }, { "epoch": 1.0299478702042697, "grad_norm": 0.7397992014884949, "learning_rate": 5.165763204707311e-05, "loss": 0.7639634132385253, "memory(GiB)": 91.52, "step": 79375, "token_acc": 0.7770639254738194, "train_speed(iter/s)": 0.13648 }, { "epoch": 1.0300127486059254, "grad_norm": 0.7391231656074524, "learning_rate": 5.165227123366283e-05, "loss": 0.768190574645996, "memory(GiB)": 91.52, "step": 79380, "token_acc": 0.7632377452027864, "train_speed(iter/s)": 0.136479 }, { "epoch": 1.030077627007581, "grad_norm": 0.7367064952850342, "learning_rate": 5.1646910401238324e-05, "loss": 0.7685773372650146, "memory(GiB)": 91.52, "step": 79385, "token_acc": 0.7680813569278214, "train_speed(iter/s)": 0.136478 }, { "epoch": 1.0301425054092368, "grad_norm": 0.6836325526237488, "learning_rate": 5.164154954986128e-05, "loss": 0.767595911026001, "memory(GiB)": 91.52, "step": 79390, "token_acc": 0.7827587623782718, "train_speed(iter/s)": 0.136477 }, { "epoch": 1.0302073838108925, "grad_norm": 0.79823237657547, "learning_rate": 5.163618867959339e-05, "loss": 0.7926017284393311, "memory(GiB)": 91.52, "step": 79395, "token_acc": 0.7615663371213454, "train_speed(iter/s)": 0.136476 }, { "epoch": 1.0302722622125482, "grad_norm": 0.7243512272834778, "learning_rate": 5.163082779049635e-05, "loss": 0.7841235160827636, "memory(GiB)": 91.52, "step": 79400, "token_acc": 0.7593114453536143, "train_speed(iter/s)": 0.136475 }, { "epoch": 1.030337140614204, "grad_norm": 0.6542015075683594, "learning_rate": 5.1625466882631834e-05, "loss": 0.7425355911254883, "memory(GiB)": 91.52, "step": 79405, "token_acc": 0.7906036196959995, "train_speed(iter/s)": 0.136474 }, { "epoch": 1.0304020190158596, "grad_norm": 0.7401036620140076, "learning_rate": 5.162010595606157e-05, "loss": 0.7764778137207031, "memory(GiB)": 91.52, "step": 79410, "token_acc": 0.780354750207363, "train_speed(iter/s)": 0.136473 }, { "epoch": 1.0304668974175153, "grad_norm": 0.717633068561554, "learning_rate": 5.161474501084721e-05, "loss": 0.7363037109375, "memory(GiB)": 91.52, "step": 79415, "token_acc": 0.7826551077181859, "train_speed(iter/s)": 0.136473 }, { "epoch": 1.030531775819171, "grad_norm": 0.6651545763015747, "learning_rate": 5.160938404705048e-05, "loss": 0.7250804424285888, "memory(GiB)": 91.52, "step": 79420, "token_acc": 0.7759649520356522, "train_speed(iter/s)": 0.136471 }, { "epoch": 1.0305966542208267, "grad_norm": 0.6488865613937378, "learning_rate": 5.1604023064733055e-05, "loss": 0.7525702953338623, "memory(GiB)": 91.52, "step": 79425, "token_acc": 0.7941219263785828, "train_speed(iter/s)": 0.13647 }, { "epoch": 1.0306615326224824, "grad_norm": 0.828601062297821, "learning_rate": 5.1598662063956646e-05, "loss": 0.7703707695007325, "memory(GiB)": 91.52, "step": 79430, "token_acc": 0.7847503132271344, "train_speed(iter/s)": 0.136469 }, { "epoch": 1.030726411024138, "grad_norm": 0.6985267996788025, "learning_rate": 5.159330104478294e-05, "loss": 0.7846197128295899, "memory(GiB)": 91.52, "step": 79435, "token_acc": 0.7937126689720214, "train_speed(iter/s)": 0.136468 }, { "epoch": 1.0307912894257938, "grad_norm": 0.7457543611526489, "learning_rate": 5.1587940007273605e-05, "loss": 0.7631275177001953, "memory(GiB)": 91.52, "step": 79440, "token_acc": 0.7894718147266722, "train_speed(iter/s)": 0.136467 }, { "epoch": 1.0308561678274495, "grad_norm": 0.7033593654632568, "learning_rate": 5.158257895149038e-05, "loss": 0.7618921279907227, "memory(GiB)": 91.52, "step": 79445, "token_acc": 0.7838237183605293, "train_speed(iter/s)": 0.136466 }, { "epoch": 1.0309210462291052, "grad_norm": 0.7618846893310547, "learning_rate": 5.157721787749493e-05, "loss": 0.7722308158874511, "memory(GiB)": 91.52, "step": 79450, "token_acc": 0.7742619322255822, "train_speed(iter/s)": 0.136465 }, { "epoch": 1.0309859246307609, "grad_norm": 0.7942498922348022, "learning_rate": 5.157185678534896e-05, "loss": 0.7651679039001464, "memory(GiB)": 91.52, "step": 79455, "token_acc": 0.7709021601016518, "train_speed(iter/s)": 0.136464 }, { "epoch": 1.0310508030324166, "grad_norm": 0.6819554567337036, "learning_rate": 5.1566495675114154e-05, "loss": 0.7793168067932129, "memory(GiB)": 91.52, "step": 79460, "token_acc": 0.7688184051820416, "train_speed(iter/s)": 0.136463 }, { "epoch": 1.0311156814340723, "grad_norm": 0.7338221669197083, "learning_rate": 5.156113454685224e-05, "loss": 0.7920013427734375, "memory(GiB)": 91.52, "step": 79465, "token_acc": 0.771149640242163, "train_speed(iter/s)": 0.136462 }, { "epoch": 1.031180559835728, "grad_norm": 0.6952688694000244, "learning_rate": 5.1555773400624885e-05, "loss": 0.7869906425476074, "memory(GiB)": 91.52, "step": 79470, "token_acc": 0.7818268025874623, "train_speed(iter/s)": 0.13646 }, { "epoch": 1.0312454382373837, "grad_norm": 0.7093207240104675, "learning_rate": 5.155041223649376e-05, "loss": 0.7596148490905762, "memory(GiB)": 91.52, "step": 79475, "token_acc": 0.7801468669623887, "train_speed(iter/s)": 0.13646 }, { "epoch": 1.0313103166390394, "grad_norm": 0.7575538754463196, "learning_rate": 5.154505105452062e-05, "loss": 0.7702289581298828, "memory(GiB)": 91.52, "step": 79480, "token_acc": 0.7746888872716687, "train_speed(iter/s)": 0.136458 }, { "epoch": 1.031375195040695, "grad_norm": 0.7721045613288879, "learning_rate": 5.1539689854767124e-05, "loss": 0.8088750839233398, "memory(GiB)": 91.52, "step": 79485, "token_acc": 0.7668014829794405, "train_speed(iter/s)": 0.136457 }, { "epoch": 1.0314400734423508, "grad_norm": 0.7527465224266052, "learning_rate": 5.153432863729498e-05, "loss": 0.76841139793396, "memory(GiB)": 91.52, "step": 79490, "token_acc": 0.7727598321792817, "train_speed(iter/s)": 0.136457 }, { "epoch": 1.0315049518440063, "grad_norm": 0.694841206073761, "learning_rate": 5.152896740216586e-05, "loss": 0.7714265823364258, "memory(GiB)": 91.52, "step": 79495, "token_acc": 0.7816480191401027, "train_speed(iter/s)": 0.136456 }, { "epoch": 1.0315698302456622, "grad_norm": 0.701941192150116, "learning_rate": 5.1523606149441514e-05, "loss": 0.7779996871948243, "memory(GiB)": 91.52, "step": 79500, "token_acc": 0.7838791988535077, "train_speed(iter/s)": 0.136454 }, { "epoch": 1.0316347086473177, "grad_norm": 0.7633001208305359, "learning_rate": 5.1518244879183585e-05, "loss": 0.7600783348083496, "memory(GiB)": 91.52, "step": 79505, "token_acc": 0.78201043380834, "train_speed(iter/s)": 0.136453 }, { "epoch": 1.0316995870489734, "grad_norm": 0.8162091374397278, "learning_rate": 5.15128835914538e-05, "loss": 0.7644593238830566, "memory(GiB)": 91.52, "step": 79510, "token_acc": 0.7829816201497617, "train_speed(iter/s)": 0.136452 }, { "epoch": 1.031764465450629, "grad_norm": 0.6535295844078064, "learning_rate": 5.150752228631385e-05, "loss": 0.7755391597747803, "memory(GiB)": 91.52, "step": 79515, "token_acc": 0.7893200939892367, "train_speed(iter/s)": 0.136452 }, { "epoch": 1.0318293438522848, "grad_norm": 0.7232823371887207, "learning_rate": 5.150216096382542e-05, "loss": 0.7475738525390625, "memory(GiB)": 91.52, "step": 79520, "token_acc": 0.7710366182983474, "train_speed(iter/s)": 0.136451 }, { "epoch": 1.0318942222539405, "grad_norm": 0.6693138480186462, "learning_rate": 5.149679962405023e-05, "loss": 0.7386527061462402, "memory(GiB)": 91.52, "step": 79525, "token_acc": 0.7750836386808985, "train_speed(iter/s)": 0.13645 }, { "epoch": 1.0319591006555962, "grad_norm": 0.703423261642456, "learning_rate": 5.149143826704995e-05, "loss": 0.7762267589569092, "memory(GiB)": 91.52, "step": 79530, "token_acc": 0.7872875092387287, "train_speed(iter/s)": 0.136449 }, { "epoch": 1.0320239790572519, "grad_norm": 0.6932246088981628, "learning_rate": 5.1486076892886306e-05, "loss": 0.7717637062072754, "memory(GiB)": 91.52, "step": 79535, "token_acc": 0.7743882720707125, "train_speed(iter/s)": 0.136448 }, { "epoch": 1.0320888574589075, "grad_norm": 0.6692540645599365, "learning_rate": 5.148071550162098e-05, "loss": 0.7496689796447754, "memory(GiB)": 91.52, "step": 79540, "token_acc": 0.7961724694489278, "train_speed(iter/s)": 0.136446 }, { "epoch": 1.0321537358605632, "grad_norm": 0.6988245844841003, "learning_rate": 5.147535409331569e-05, "loss": 0.7738398551940918, "memory(GiB)": 91.52, "step": 79545, "token_acc": 0.778632090308601, "train_speed(iter/s)": 0.136445 }, { "epoch": 1.032218614262219, "grad_norm": 0.7221930623054504, "learning_rate": 5.146999266803211e-05, "loss": 0.8177314758300781, "memory(GiB)": 91.52, "step": 79550, "token_acc": 0.7656860953773068, "train_speed(iter/s)": 0.136445 }, { "epoch": 1.0322834926638746, "grad_norm": 0.735886812210083, "learning_rate": 5.146463122583195e-05, "loss": 0.771791648864746, "memory(GiB)": 91.52, "step": 79555, "token_acc": 0.7690460212623177, "train_speed(iter/s)": 0.136443 }, { "epoch": 1.0323483710655303, "grad_norm": 0.7783571481704712, "learning_rate": 5.1459269766776906e-05, "loss": 0.7591367244720459, "memory(GiB)": 91.52, "step": 79560, "token_acc": 0.8075472957191953, "train_speed(iter/s)": 0.136442 }, { "epoch": 1.032413249467186, "grad_norm": 0.769300103187561, "learning_rate": 5.1453908290928666e-05, "loss": 0.7744411945343017, "memory(GiB)": 91.52, "step": 79565, "token_acc": 0.7889911250326286, "train_speed(iter/s)": 0.136441 }, { "epoch": 1.0324781278688417, "grad_norm": 0.7541791200637817, "learning_rate": 5.144854679834896e-05, "loss": 0.7825934410095214, "memory(GiB)": 91.52, "step": 79570, "token_acc": 0.7826520070838253, "train_speed(iter/s)": 0.13644 }, { "epoch": 1.0325430062704974, "grad_norm": 0.774229109287262, "learning_rate": 5.144318528909947e-05, "loss": 0.7329370498657226, "memory(GiB)": 91.52, "step": 79575, "token_acc": 0.7912449908306731, "train_speed(iter/s)": 0.13644 }, { "epoch": 1.0326078846721531, "grad_norm": 0.7043852210044861, "learning_rate": 5.143782376324189e-05, "loss": 0.7565747261047363, "memory(GiB)": 91.52, "step": 79580, "token_acc": 0.8023684504524634, "train_speed(iter/s)": 0.136439 }, { "epoch": 1.0326727630738088, "grad_norm": 0.655867874622345, "learning_rate": 5.143246222083793e-05, "loss": 0.7758944511413575, "memory(GiB)": 91.52, "step": 79585, "token_acc": 0.7762173210694623, "train_speed(iter/s)": 0.136438 }, { "epoch": 1.0327376414754645, "grad_norm": 0.7003276944160461, "learning_rate": 5.142710066194929e-05, "loss": 0.7881380081176758, "memory(GiB)": 91.52, "step": 79590, "token_acc": 0.7826901945575593, "train_speed(iter/s)": 0.136436 }, { "epoch": 1.0328025198771202, "grad_norm": 0.7242348194122314, "learning_rate": 5.142173908663766e-05, "loss": 0.7535893440246582, "memory(GiB)": 91.52, "step": 79595, "token_acc": 0.7664883407522359, "train_speed(iter/s)": 0.136436 }, { "epoch": 1.032867398278776, "grad_norm": 0.736491858959198, "learning_rate": 5.1416377494964755e-05, "loss": 0.7716244220733642, "memory(GiB)": 91.52, "step": 79600, "token_acc": 0.777904631426723, "train_speed(iter/s)": 0.136435 }, { "epoch": 1.0329322766804316, "grad_norm": 0.7312732934951782, "learning_rate": 5.141101588699226e-05, "loss": 0.8003142356872559, "memory(GiB)": 91.52, "step": 79605, "token_acc": 0.7864548863887051, "train_speed(iter/s)": 0.136434 }, { "epoch": 1.0329971550820873, "grad_norm": 0.6652575731277466, "learning_rate": 5.14056542627819e-05, "loss": 0.7747220039367676, "memory(GiB)": 91.52, "step": 79610, "token_acc": 0.7786178835741518, "train_speed(iter/s)": 0.136433 }, { "epoch": 1.033062033483743, "grad_norm": 0.7684310674667358, "learning_rate": 5.140029262239535e-05, "loss": 0.7898714065551757, "memory(GiB)": 91.52, "step": 79615, "token_acc": 0.7754881113473806, "train_speed(iter/s)": 0.136432 }, { "epoch": 1.0331269118853987, "grad_norm": 0.6050150394439697, "learning_rate": 5.139493096589433e-05, "loss": 0.7627677917480469, "memory(GiB)": 91.52, "step": 79620, "token_acc": 0.7832029013170453, "train_speed(iter/s)": 0.136431 }, { "epoch": 1.0331917902870544, "grad_norm": 0.7967265248298645, "learning_rate": 5.138956929334052e-05, "loss": 0.775019359588623, "memory(GiB)": 91.52, "step": 79625, "token_acc": 0.7546815698377611, "train_speed(iter/s)": 0.13643 }, { "epoch": 1.0332566686887101, "grad_norm": 0.7094757556915283, "learning_rate": 5.138420760479567e-05, "loss": 0.7597431182861328, "memory(GiB)": 91.52, "step": 79630, "token_acc": 0.7832873639863002, "train_speed(iter/s)": 0.136428 }, { "epoch": 1.0333215470903658, "grad_norm": 0.6519648432731628, "learning_rate": 5.13788459003214e-05, "loss": 0.7322704315185546, "memory(GiB)": 91.52, "step": 79635, "token_acc": 0.7716061601805297, "train_speed(iter/s)": 0.136427 }, { "epoch": 1.0333864254920215, "grad_norm": 0.6558685898780823, "learning_rate": 5.1373484179979493e-05, "loss": 0.7564274311065674, "memory(GiB)": 91.52, "step": 79640, "token_acc": 0.777511050144795, "train_speed(iter/s)": 0.136426 }, { "epoch": 1.0334513038936772, "grad_norm": 0.7950233221054077, "learning_rate": 5.136812244383161e-05, "loss": 0.7667797565460205, "memory(GiB)": 91.52, "step": 79645, "token_acc": 0.7706098299889511, "train_speed(iter/s)": 0.136425 }, { "epoch": 1.033516182295333, "grad_norm": 0.7727617621421814, "learning_rate": 5.136276069193946e-05, "loss": 0.7780771732330323, "memory(GiB)": 91.52, "step": 79650, "token_acc": 0.7877756160830091, "train_speed(iter/s)": 0.136424 }, { "epoch": 1.0335810606969886, "grad_norm": 0.7422450184822083, "learning_rate": 5.135739892436475e-05, "loss": 0.7716615676879883, "memory(GiB)": 91.52, "step": 79655, "token_acc": 0.7847030508954485, "train_speed(iter/s)": 0.136423 }, { "epoch": 1.0336459390986443, "grad_norm": 0.6936274766921997, "learning_rate": 5.1352037141169174e-05, "loss": 0.7587038516998291, "memory(GiB)": 91.52, "step": 79660, "token_acc": 0.7831189183279273, "train_speed(iter/s)": 0.136422 }, { "epoch": 1.0337108175003, "grad_norm": 0.6813404560089111, "learning_rate": 5.134667534241445e-05, "loss": 0.739483118057251, "memory(GiB)": 91.52, "step": 79665, "token_acc": 0.7798507462686567, "train_speed(iter/s)": 0.136421 }, { "epoch": 1.0337756959019557, "grad_norm": 0.7184536457061768, "learning_rate": 5.134131352816227e-05, "loss": 0.7754827499389648, "memory(GiB)": 91.52, "step": 79670, "token_acc": 0.7853046297065913, "train_speed(iter/s)": 0.13642 }, { "epoch": 1.0338405743036114, "grad_norm": 0.6658360362052917, "learning_rate": 5.1335951698474336e-05, "loss": 0.7795533180236817, "memory(GiB)": 91.52, "step": 79675, "token_acc": 0.7760005201729575, "train_speed(iter/s)": 0.136419 }, { "epoch": 1.0339054527052671, "grad_norm": 0.7188195586204529, "learning_rate": 5.133058985341237e-05, "loss": 0.8048624038696289, "memory(GiB)": 91.52, "step": 79680, "token_acc": 0.772310989867498, "train_speed(iter/s)": 0.136418 }, { "epoch": 1.0339703311069228, "grad_norm": 0.746459424495697, "learning_rate": 5.132522799303805e-05, "loss": 0.7346200942993164, "memory(GiB)": 91.52, "step": 79685, "token_acc": 0.8157551826258638, "train_speed(iter/s)": 0.136417 }, { "epoch": 1.0340352095085785, "grad_norm": 0.7262412309646606, "learning_rate": 5.1319866117413095e-05, "loss": 0.8050653457641601, "memory(GiB)": 91.52, "step": 79690, "token_acc": 0.7671688695812922, "train_speed(iter/s)": 0.136416 }, { "epoch": 1.0341000879102342, "grad_norm": 0.6483908295631409, "learning_rate": 5.131450422659919e-05, "loss": 0.733293867111206, "memory(GiB)": 91.52, "step": 79695, "token_acc": 0.7757277397260274, "train_speed(iter/s)": 0.136415 }, { "epoch": 1.03416496631189, "grad_norm": 0.769356906414032, "learning_rate": 5.1309142320658086e-05, "loss": 0.7913452625274658, "memory(GiB)": 91.52, "step": 79700, "token_acc": 0.7810037934053108, "train_speed(iter/s)": 0.136414 }, { "epoch": 1.0342298447135456, "grad_norm": 0.7020399570465088, "learning_rate": 5.130378039965142e-05, "loss": 0.7534576892852783, "memory(GiB)": 91.52, "step": 79705, "token_acc": 0.7837677518406085, "train_speed(iter/s)": 0.136413 }, { "epoch": 1.0342947231152013, "grad_norm": 0.7255473136901855, "learning_rate": 5.1298418463640966e-05, "loss": 0.7840064525604248, "memory(GiB)": 91.52, "step": 79710, "token_acc": 0.7817208271787297, "train_speed(iter/s)": 0.136411 }, { "epoch": 1.034359601516857, "grad_norm": 0.658454954624176, "learning_rate": 5.1293056512688374e-05, "loss": 0.798548412322998, "memory(GiB)": 91.52, "step": 79715, "token_acc": 0.7665236829386177, "train_speed(iter/s)": 0.136411 }, { "epoch": 1.0344244799185127, "grad_norm": 0.7406232953071594, "learning_rate": 5.128769454685538e-05, "loss": 0.7835005760192871, "memory(GiB)": 91.52, "step": 79720, "token_acc": 0.7705362732141615, "train_speed(iter/s)": 0.13641 }, { "epoch": 1.0344893583201684, "grad_norm": 0.7912604212760925, "learning_rate": 5.128233256620368e-05, "loss": 0.7672584533691407, "memory(GiB)": 91.52, "step": 79725, "token_acc": 0.77954061094009, "train_speed(iter/s)": 0.136409 }, { "epoch": 1.0345542367218241, "grad_norm": 0.7197816371917725, "learning_rate": 5.127697057079498e-05, "loss": 0.7444810390472412, "memory(GiB)": 91.52, "step": 79730, "token_acc": 0.7942686056458511, "train_speed(iter/s)": 0.136408 }, { "epoch": 1.0346191151234798, "grad_norm": 0.7259830832481384, "learning_rate": 5.127160856069099e-05, "loss": 0.7866226673126221, "memory(GiB)": 91.52, "step": 79735, "token_acc": 0.798099469332346, "train_speed(iter/s)": 0.136408 }, { "epoch": 1.0346839935251355, "grad_norm": 0.6865537762641907, "learning_rate": 5.12662465359534e-05, "loss": 0.7709092140197754, "memory(GiB)": 91.52, "step": 79740, "token_acc": 0.773528491464729, "train_speed(iter/s)": 0.136406 }, { "epoch": 1.0347488719267912, "grad_norm": 0.7943596839904785, "learning_rate": 5.126088449664392e-05, "loss": 0.7784010887145996, "memory(GiB)": 91.52, "step": 79745, "token_acc": 0.7799799656554093, "train_speed(iter/s)": 0.136406 }, { "epoch": 1.034813750328447, "grad_norm": 0.7590733766555786, "learning_rate": 5.1255522442824276e-05, "loss": 0.7769711971282959, "memory(GiB)": 91.52, "step": 79750, "token_acc": 0.7766099326813158, "train_speed(iter/s)": 0.136405 }, { "epoch": 1.0348786287301026, "grad_norm": 0.761559247970581, "learning_rate": 5.125016037455616e-05, "loss": 0.7976520538330079, "memory(GiB)": 91.52, "step": 79755, "token_acc": 0.7527540319027055, "train_speed(iter/s)": 0.136404 }, { "epoch": 1.0349435071317583, "grad_norm": 0.7114458680152893, "learning_rate": 5.1244798291901285e-05, "loss": 0.7509853363037109, "memory(GiB)": 91.52, "step": 79760, "token_acc": 0.7737544551452188, "train_speed(iter/s)": 0.136404 }, { "epoch": 1.035008385533414, "grad_norm": 0.7345972061157227, "learning_rate": 5.123943619492133e-05, "loss": 0.7321459293365479, "memory(GiB)": 91.52, "step": 79765, "token_acc": 0.7869609856262834, "train_speed(iter/s)": 0.136403 }, { "epoch": 1.0350732639350697, "grad_norm": 0.7850273251533508, "learning_rate": 5.123407408367805e-05, "loss": 0.775583553314209, "memory(GiB)": 91.52, "step": 79770, "token_acc": 0.7704684382225319, "train_speed(iter/s)": 0.136402 }, { "epoch": 1.0351381423367254, "grad_norm": 0.6842387914657593, "learning_rate": 5.12287119582331e-05, "loss": 0.8130085945129395, "memory(GiB)": 91.52, "step": 79775, "token_acc": 0.7610873729188153, "train_speed(iter/s)": 0.136401 }, { "epoch": 1.035203020738381, "grad_norm": 0.7255518436431885, "learning_rate": 5.1223349818648225e-05, "loss": 0.7427582263946533, "memory(GiB)": 91.52, "step": 79780, "token_acc": 0.7924360795454546, "train_speed(iter/s)": 0.1364 }, { "epoch": 1.0352678991400368, "grad_norm": 0.6686609983444214, "learning_rate": 5.121798766498511e-05, "loss": 0.7301677703857422, "memory(GiB)": 91.52, "step": 79785, "token_acc": 0.7885578236056161, "train_speed(iter/s)": 0.136399 }, { "epoch": 1.0353327775416925, "grad_norm": 0.7254069447517395, "learning_rate": 5.121262549730549e-05, "loss": 0.7425395011901855, "memory(GiB)": 91.52, "step": 79790, "token_acc": 0.7686671551322187, "train_speed(iter/s)": 0.136397 }, { "epoch": 1.0353976559433482, "grad_norm": 0.7317370176315308, "learning_rate": 5.120726331567104e-05, "loss": 0.781855297088623, "memory(GiB)": 91.52, "step": 79795, "token_acc": 0.7701258782201406, "train_speed(iter/s)": 0.136396 }, { "epoch": 1.035462534345004, "grad_norm": 0.7064619064331055, "learning_rate": 5.120190112014347e-05, "loss": 0.7369573593139649, "memory(GiB)": 91.52, "step": 79800, "token_acc": 0.7862757824895428, "train_speed(iter/s)": 0.136395 }, { "epoch": 1.0355274127466596, "grad_norm": 0.6786861419677734, "learning_rate": 5.119653891078452e-05, "loss": 0.7621649742126465, "memory(GiB)": 91.52, "step": 79805, "token_acc": 0.7780339941235541, "train_speed(iter/s)": 0.136394 }, { "epoch": 1.0355922911483153, "grad_norm": 0.6361120939254761, "learning_rate": 5.1191176687655865e-05, "loss": 0.7920873165130615, "memory(GiB)": 91.52, "step": 79810, "token_acc": 0.7826580410850074, "train_speed(iter/s)": 0.136393 }, { "epoch": 1.035657169549971, "grad_norm": 0.7897443175315857, "learning_rate": 5.118581445081924e-05, "loss": 0.7572909355163574, "memory(GiB)": 91.52, "step": 79815, "token_acc": 0.7971698113207547, "train_speed(iter/s)": 0.136392 }, { "epoch": 1.0357220479516267, "grad_norm": 0.7415107488632202, "learning_rate": 5.118045220033633e-05, "loss": 0.7817679405212402, "memory(GiB)": 91.52, "step": 79820, "token_acc": 0.7736079388832008, "train_speed(iter/s)": 0.136391 }, { "epoch": 1.0357869263532824, "grad_norm": 0.7225651741027832, "learning_rate": 5.117508993626884e-05, "loss": 0.7839945793151856, "memory(GiB)": 91.52, "step": 79825, "token_acc": 0.7737871313482627, "train_speed(iter/s)": 0.13639 }, { "epoch": 1.035851804754938, "grad_norm": 0.7248324751853943, "learning_rate": 5.11697276586785e-05, "loss": 0.7422624588012695, "memory(GiB)": 91.52, "step": 79830, "token_acc": 0.8123770670940482, "train_speed(iter/s)": 0.136389 }, { "epoch": 1.0359166831565938, "grad_norm": 0.748927891254425, "learning_rate": 5.116436536762701e-05, "loss": 0.7892748832702636, "memory(GiB)": 91.52, "step": 79835, "token_acc": 0.7719528178243774, "train_speed(iter/s)": 0.136388 }, { "epoch": 1.0359815615582495, "grad_norm": 0.742313802242279, "learning_rate": 5.115900306317608e-05, "loss": 0.7314449310302734, "memory(GiB)": 91.52, "step": 79840, "token_acc": 0.7938847003274694, "train_speed(iter/s)": 0.136388 }, { "epoch": 1.0360464399599052, "grad_norm": 0.7031592726707458, "learning_rate": 5.115364074538741e-05, "loss": 0.7600334167480469, "memory(GiB)": 91.52, "step": 79845, "token_acc": 0.7746055597295267, "train_speed(iter/s)": 0.136387 }, { "epoch": 1.036111318361561, "grad_norm": 0.7399635910987854, "learning_rate": 5.114827841432272e-05, "loss": 0.7713394165039062, "memory(GiB)": 91.52, "step": 79850, "token_acc": 0.7782049386366997, "train_speed(iter/s)": 0.136387 }, { "epoch": 1.0361761967632166, "grad_norm": 0.6646125912666321, "learning_rate": 5.1142916070043714e-05, "loss": 0.7676692485809327, "memory(GiB)": 91.52, "step": 79855, "token_acc": 0.774746096710626, "train_speed(iter/s)": 0.136386 }, { "epoch": 1.0362410751648723, "grad_norm": 0.7841183543205261, "learning_rate": 5.113755371261212e-05, "loss": 0.764217472076416, "memory(GiB)": 91.52, "step": 79860, "token_acc": 0.7823026968233149, "train_speed(iter/s)": 0.136385 }, { "epoch": 1.036305953566528, "grad_norm": 0.6995758414268494, "learning_rate": 5.113219134208962e-05, "loss": 0.7539443969726562, "memory(GiB)": 91.52, "step": 79865, "token_acc": 0.7919021030459059, "train_speed(iter/s)": 0.136384 }, { "epoch": 1.0363708319681837, "grad_norm": 0.7910720705986023, "learning_rate": 5.112682895853793e-05, "loss": 0.7214831352233887, "memory(GiB)": 91.52, "step": 79870, "token_acc": 0.7853129445234709, "train_speed(iter/s)": 0.136382 }, { "epoch": 1.0364357103698394, "grad_norm": 0.6315916776657104, "learning_rate": 5.112146656201877e-05, "loss": 0.7482437133789063, "memory(GiB)": 91.52, "step": 79875, "token_acc": 0.7912955935878572, "train_speed(iter/s)": 0.136381 }, { "epoch": 1.036500588771495, "grad_norm": 0.7583797574043274, "learning_rate": 5.111610415259385e-05, "loss": 0.7447722911834717, "memory(GiB)": 91.52, "step": 79880, "token_acc": 0.7832337200671902, "train_speed(iter/s)": 0.136381 }, { "epoch": 1.0365654671731508, "grad_norm": 0.7379243969917297, "learning_rate": 5.111074173032486e-05, "loss": 0.7824993133544922, "memory(GiB)": 91.52, "step": 79885, "token_acc": 0.7768206734534064, "train_speed(iter/s)": 0.13638 }, { "epoch": 1.0366303455748065, "grad_norm": 0.6575890183448792, "learning_rate": 5.110537929527353e-05, "loss": 0.7502547264099121, "memory(GiB)": 91.52, "step": 79890, "token_acc": 0.7810998398291511, "train_speed(iter/s)": 0.136378 }, { "epoch": 1.0366952239764622, "grad_norm": 0.6657263040542603, "learning_rate": 5.110001684750157e-05, "loss": 0.7632406234741211, "memory(GiB)": 91.52, "step": 79895, "token_acc": 0.7909795056146462, "train_speed(iter/s)": 0.136377 }, { "epoch": 1.0367601023781179, "grad_norm": 0.6980007886886597, "learning_rate": 5.1094654387070704e-05, "loss": 0.7769976139068604, "memory(GiB)": 91.52, "step": 79900, "token_acc": 0.771342383107089, "train_speed(iter/s)": 0.136377 }, { "epoch": 1.0368249807797736, "grad_norm": 0.722992479801178, "learning_rate": 5.1089291914042606e-05, "loss": 0.7662429809570312, "memory(GiB)": 91.52, "step": 79905, "token_acc": 0.7804599395592245, "train_speed(iter/s)": 0.136376 }, { "epoch": 1.0368898591814293, "grad_norm": 0.7309004664421082, "learning_rate": 5.1083929428479e-05, "loss": 0.74658203125, "memory(GiB)": 91.52, "step": 79910, "token_acc": 0.781324395439168, "train_speed(iter/s)": 0.136375 }, { "epoch": 1.036954737583085, "grad_norm": 0.7343015074729919, "learning_rate": 5.107856693044162e-05, "loss": 0.754628038406372, "memory(GiB)": 91.52, "step": 79915, "token_acc": 0.7794385432473444, "train_speed(iter/s)": 0.136374 }, { "epoch": 1.0370196159847407, "grad_norm": 0.6586419343948364, "learning_rate": 5.107320441999216e-05, "loss": 0.8510431289672852, "memory(GiB)": 91.52, "step": 79920, "token_acc": 0.7642670645281612, "train_speed(iter/s)": 0.136373 }, { "epoch": 1.0370844943863964, "grad_norm": 0.7475935816764832, "learning_rate": 5.106784189719231e-05, "loss": 0.7783169746398926, "memory(GiB)": 91.52, "step": 79925, "token_acc": 0.7915405349367614, "train_speed(iter/s)": 0.136372 }, { "epoch": 1.037149372788052, "grad_norm": 0.7766937017440796, "learning_rate": 5.106247936210383e-05, "loss": 0.7626414775848389, "memory(GiB)": 91.52, "step": 79930, "token_acc": 0.7746165264720435, "train_speed(iter/s)": 0.13637 }, { "epoch": 1.0372142511897078, "grad_norm": 0.7268058061599731, "learning_rate": 5.1057116814788395e-05, "loss": 0.7546932697296143, "memory(GiB)": 91.52, "step": 79935, "token_acc": 0.7900569981000634, "train_speed(iter/s)": 0.136369 }, { "epoch": 1.0372791295913635, "grad_norm": 0.7326198816299438, "learning_rate": 5.105175425530774e-05, "loss": 0.7617499351501464, "memory(GiB)": 91.52, "step": 79940, "token_acc": 0.7761161285604592, "train_speed(iter/s)": 0.136368 }, { "epoch": 1.0373440079930192, "grad_norm": 0.7940647602081299, "learning_rate": 5.104639168372355e-05, "loss": 0.7722295761108399, "memory(GiB)": 91.52, "step": 79945, "token_acc": 0.7780319414406921, "train_speed(iter/s)": 0.136367 }, { "epoch": 1.0374088863946749, "grad_norm": 0.671683132648468, "learning_rate": 5.104102910009756e-05, "loss": 0.7481236457824707, "memory(GiB)": 91.52, "step": 79950, "token_acc": 0.7871968244356982, "train_speed(iter/s)": 0.136366 }, { "epoch": 1.0374737647963306, "grad_norm": 0.7282978892326355, "learning_rate": 5.103566650449147e-05, "loss": 0.7859326362609863, "memory(GiB)": 91.52, "step": 79955, "token_acc": 0.7708884576938908, "train_speed(iter/s)": 0.136365 }, { "epoch": 1.0375386431979863, "grad_norm": 0.7119449377059937, "learning_rate": 5.103030389696699e-05, "loss": 0.7512977123260498, "memory(GiB)": 91.52, "step": 79960, "token_acc": 0.7806110257723161, "train_speed(iter/s)": 0.136365 }, { "epoch": 1.037603521599642, "grad_norm": 0.7898194193840027, "learning_rate": 5.102494127758585e-05, "loss": 0.8254958152770996, "memory(GiB)": 91.52, "step": 79965, "token_acc": 0.7724421087206438, "train_speed(iter/s)": 0.136364 }, { "epoch": 1.0376684000012977, "grad_norm": 0.6395187973976135, "learning_rate": 5.1019578646409746e-05, "loss": 0.7497650146484375, "memory(GiB)": 91.52, "step": 79970, "token_acc": 0.7737191823100106, "train_speed(iter/s)": 0.136363 }, { "epoch": 1.0377332784029534, "grad_norm": 0.6878753900527954, "learning_rate": 5.10142160035004e-05, "loss": 0.7472286224365234, "memory(GiB)": 91.52, "step": 79975, "token_acc": 0.7635502720128954, "train_speed(iter/s)": 0.136362 }, { "epoch": 1.0377981568046089, "grad_norm": 0.6718879342079163, "learning_rate": 5.1008853348919526e-05, "loss": 0.7477897644042969, "memory(GiB)": 91.52, "step": 79980, "token_acc": 0.7880227747511571, "train_speed(iter/s)": 0.136361 }, { "epoch": 1.0378630352062648, "grad_norm": 0.7121539115905762, "learning_rate": 5.1003490682728816e-05, "loss": 0.7586565017700195, "memory(GiB)": 91.52, "step": 79985, "token_acc": 0.7814091548195594, "train_speed(iter/s)": 0.13636 }, { "epoch": 1.0379279136079202, "grad_norm": 0.6742861270904541, "learning_rate": 5.099812800499003e-05, "loss": 0.7362567901611328, "memory(GiB)": 91.52, "step": 79990, "token_acc": 0.8001928711843033, "train_speed(iter/s)": 0.136359 }, { "epoch": 1.037992792009576, "grad_norm": 0.6696576476097107, "learning_rate": 5.0992765315764825e-05, "loss": 0.7415318012237548, "memory(GiB)": 91.52, "step": 79995, "token_acc": 0.771809196390202, "train_speed(iter/s)": 0.136358 }, { "epoch": 1.0380576704112316, "grad_norm": 0.7298002243041992, "learning_rate": 5.0987402615114945e-05, "loss": 0.8358657836914063, "memory(GiB)": 91.52, "step": 80000, "token_acc": 0.7678373040365387, "train_speed(iter/s)": 0.136357 }, { "epoch": 1.0380576704112316, "eval_loss": 0.8290087580680847, "eval_runtime": 2091.9262, "eval_samples_per_second": 23.816, "eval_steps_per_second": 1.489, "eval_token_acc": 0.7698574665577299, "step": 80000 }, { "epoch": 1.0381225488128873, "grad_norm": 0.7116577625274658, "learning_rate": 5.098203990310211e-05, "loss": 0.7543182373046875, "memory(GiB)": 91.52, "step": 80005, "token_acc": 0.7714395602375965, "train_speed(iter/s)": 0.135843 }, { "epoch": 1.038187427214543, "grad_norm": 0.7455072999000549, "learning_rate": 5.0976677179788015e-05, "loss": 0.7318243026733399, "memory(GiB)": 91.52, "step": 80010, "token_acc": 0.7865758478785299, "train_speed(iter/s)": 0.135842 }, { "epoch": 1.0382523056161987, "grad_norm": 0.7277557253837585, "learning_rate": 5.0971314445234374e-05, "loss": 0.8061102867126465, "memory(GiB)": 91.52, "step": 80015, "token_acc": 0.7602394340649374, "train_speed(iter/s)": 0.135841 }, { "epoch": 1.0383171840178544, "grad_norm": 0.7057135105133057, "learning_rate": 5.0965951699502914e-05, "loss": 0.7358424186706543, "memory(GiB)": 91.52, "step": 80020, "token_acc": 0.7875530293669426, "train_speed(iter/s)": 0.13584 }, { "epoch": 1.0383820624195101, "grad_norm": 0.7262944579124451, "learning_rate": 5.0960588942655366e-05, "loss": 0.7361271858215332, "memory(GiB)": 91.52, "step": 80025, "token_acc": 0.7766165413533834, "train_speed(iter/s)": 0.135839 }, { "epoch": 1.0384469408211658, "grad_norm": 0.6011651754379272, "learning_rate": 5.095522617475339e-05, "loss": 0.7291868209838868, "memory(GiB)": 91.52, "step": 80030, "token_acc": 0.7976486052285673, "train_speed(iter/s)": 0.135838 }, { "epoch": 1.0385118192228215, "grad_norm": 0.751143217086792, "learning_rate": 5.0949863395858744e-05, "loss": 0.7877243995666504, "memory(GiB)": 91.52, "step": 80035, "token_acc": 0.785081509647668, "train_speed(iter/s)": 0.135837 }, { "epoch": 1.0385766976244772, "grad_norm": 0.6975885033607483, "learning_rate": 5.094450060603313e-05, "loss": 0.7981991291046142, "memory(GiB)": 91.52, "step": 80040, "token_acc": 0.7661879302512492, "train_speed(iter/s)": 0.135836 }, { "epoch": 1.038641576026133, "grad_norm": 0.7174736261367798, "learning_rate": 5.0939137805338275e-05, "loss": 0.7599362373352051, "memory(GiB)": 91.52, "step": 80045, "token_acc": 0.7882707774798927, "train_speed(iter/s)": 0.135835 }, { "epoch": 1.0387064544277886, "grad_norm": 0.7418023943901062, "learning_rate": 5.093377499383587e-05, "loss": 0.7613081932067871, "memory(GiB)": 91.52, "step": 80050, "token_acc": 0.7832420010606328, "train_speed(iter/s)": 0.135834 }, { "epoch": 1.0387713328294443, "grad_norm": 0.6800834536552429, "learning_rate": 5.0928412171587636e-05, "loss": 0.7832059860229492, "memory(GiB)": 91.52, "step": 80055, "token_acc": 0.7524130988334221, "train_speed(iter/s)": 0.135833 }, { "epoch": 1.0388362112311, "grad_norm": 0.7784898281097412, "learning_rate": 5.092304933865533e-05, "loss": 0.7899654865264892, "memory(GiB)": 91.52, "step": 80060, "token_acc": 0.7759526721089335, "train_speed(iter/s)": 0.135832 }, { "epoch": 1.0389010896327557, "grad_norm": 0.7468331456184387, "learning_rate": 5.0917686495100594e-05, "loss": 0.7924422740936279, "memory(GiB)": 91.52, "step": 80065, "token_acc": 0.7715014685376924, "train_speed(iter/s)": 0.135831 }, { "epoch": 1.0389659680344114, "grad_norm": 0.6633062362670898, "learning_rate": 5.0912323640985197e-05, "loss": 0.7867154598236084, "memory(GiB)": 91.52, "step": 80070, "token_acc": 0.7619098143236074, "train_speed(iter/s)": 0.13583 }, { "epoch": 1.0390308464360671, "grad_norm": 0.6993347406387329, "learning_rate": 5.0906960776370826e-05, "loss": 0.7524308681488037, "memory(GiB)": 91.52, "step": 80075, "token_acc": 0.7649849649181424, "train_speed(iter/s)": 0.13583 }, { "epoch": 1.0390957248377228, "grad_norm": 0.7305500507354736, "learning_rate": 5.090159790131922e-05, "loss": 0.768779706954956, "memory(GiB)": 91.52, "step": 80080, "token_acc": 0.7554455445544555, "train_speed(iter/s)": 0.135829 }, { "epoch": 1.0391606032393785, "grad_norm": 0.6886422634124756, "learning_rate": 5.089623501589209e-05, "loss": 0.757956075668335, "memory(GiB)": 91.52, "step": 80085, "token_acc": 0.7564475873544093, "train_speed(iter/s)": 0.135828 }, { "epoch": 1.0392254816410342, "grad_norm": 0.6915972232818604, "learning_rate": 5.089087212015112e-05, "loss": 0.7526005268096924, "memory(GiB)": 91.52, "step": 80090, "token_acc": 0.7812329514457174, "train_speed(iter/s)": 0.135827 }, { "epoch": 1.03929036004269, "grad_norm": 0.6935597658157349, "learning_rate": 5.088550921415809e-05, "loss": 0.8117351531982422, "memory(GiB)": 91.52, "step": 80095, "token_acc": 0.7683502647016741, "train_speed(iter/s)": 0.135826 }, { "epoch": 1.0393552384443456, "grad_norm": 0.7349245548248291, "learning_rate": 5.088014629797464e-05, "loss": 0.760751485824585, "memory(GiB)": 91.52, "step": 80100, "token_acc": 0.7850172582775813, "train_speed(iter/s)": 0.135826 }, { "epoch": 1.0394201168460013, "grad_norm": 0.7245868444442749, "learning_rate": 5.0874783371662536e-05, "loss": 0.74708251953125, "memory(GiB)": 91.52, "step": 80105, "token_acc": 0.7882249868582443, "train_speed(iter/s)": 0.135824 }, { "epoch": 1.039484995247657, "grad_norm": 0.6261598467826843, "learning_rate": 5.086942043528349e-05, "loss": 0.7536363124847412, "memory(GiB)": 91.52, "step": 80110, "token_acc": 0.8009833395966429, "train_speed(iter/s)": 0.135823 }, { "epoch": 1.0395498736493127, "grad_norm": 0.6814044713973999, "learning_rate": 5.08640574888992e-05, "loss": 0.7292052268981933, "memory(GiB)": 91.52, "step": 80115, "token_acc": 0.7843019983112862, "train_speed(iter/s)": 0.135822 }, { "epoch": 1.0396147520509684, "grad_norm": 0.7318150997161865, "learning_rate": 5.0858694532571395e-05, "loss": 0.7590327262878418, "memory(GiB)": 91.52, "step": 80120, "token_acc": 0.7863817592823324, "train_speed(iter/s)": 0.135821 }, { "epoch": 1.0396796304526241, "grad_norm": 0.7231958508491516, "learning_rate": 5.0853331566361785e-05, "loss": 0.7341209888458252, "memory(GiB)": 91.52, "step": 80125, "token_acc": 0.7797605149928288, "train_speed(iter/s)": 0.13582 }, { "epoch": 1.0397445088542798, "grad_norm": 0.7725694179534912, "learning_rate": 5.0847968590332095e-05, "loss": 0.7653436660766602, "memory(GiB)": 91.52, "step": 80130, "token_acc": 0.7803305785123967, "train_speed(iter/s)": 0.13582 }, { "epoch": 1.0398093872559355, "grad_norm": 0.7552673816680908, "learning_rate": 5.084260560454404e-05, "loss": 0.789630126953125, "memory(GiB)": 91.52, "step": 80135, "token_acc": 0.7841440136905153, "train_speed(iter/s)": 0.135819 }, { "epoch": 1.0398742656575912, "grad_norm": 0.8049811124801636, "learning_rate": 5.083724260905932e-05, "loss": 0.7657569885253906, "memory(GiB)": 91.52, "step": 80140, "token_acc": 0.7804595621494481, "train_speed(iter/s)": 0.135818 }, { "epoch": 1.039939144059247, "grad_norm": 0.7043212056159973, "learning_rate": 5.083187960393968e-05, "loss": 0.7564770221710205, "memory(GiB)": 91.52, "step": 80145, "token_acc": 0.7835040877574252, "train_speed(iter/s)": 0.135817 }, { "epoch": 1.0400040224609026, "grad_norm": 0.7450372576713562, "learning_rate": 5.082651658924682e-05, "loss": 0.7971369743347168, "memory(GiB)": 91.52, "step": 80150, "token_acc": 0.7632184663948441, "train_speed(iter/s)": 0.135817 }, { "epoch": 1.0400689008625583, "grad_norm": 0.7047946453094482, "learning_rate": 5.082115356504246e-05, "loss": 0.7675673961639404, "memory(GiB)": 91.52, "step": 80155, "token_acc": 0.7863049095607235, "train_speed(iter/s)": 0.135816 }, { "epoch": 1.040133779264214, "grad_norm": 0.7233739495277405, "learning_rate": 5.081579053138831e-05, "loss": 0.7633278846740723, "memory(GiB)": 91.52, "step": 80160, "token_acc": 0.7790598726981858, "train_speed(iter/s)": 0.135815 }, { "epoch": 1.0401986576658697, "grad_norm": 0.7116095423698425, "learning_rate": 5.08104274883461e-05, "loss": 0.792530107498169, "memory(GiB)": 91.52, "step": 80165, "token_acc": 0.7748442485944386, "train_speed(iter/s)": 0.135814 }, { "epoch": 1.0402635360675254, "grad_norm": 0.7632558345794678, "learning_rate": 5.080506443597754e-05, "loss": 0.7514158725738526, "memory(GiB)": 91.52, "step": 80170, "token_acc": 0.7719743961990233, "train_speed(iter/s)": 0.135813 }, { "epoch": 1.0403284144691811, "grad_norm": 0.7445675730705261, "learning_rate": 5.079970137434436e-05, "loss": 0.7635716915130615, "memory(GiB)": 91.52, "step": 80175, "token_acc": 0.7876482354622089, "train_speed(iter/s)": 0.135813 }, { "epoch": 1.0403932928708368, "grad_norm": 0.6714195609092712, "learning_rate": 5.079433830350826e-05, "loss": 0.7337889194488525, "memory(GiB)": 91.52, "step": 80180, "token_acc": 0.7900623395672901, "train_speed(iter/s)": 0.135812 }, { "epoch": 1.0404581712724925, "grad_norm": 0.7141760587692261, "learning_rate": 5.078897522353097e-05, "loss": 0.8003416061401367, "memory(GiB)": 91.52, "step": 80185, "token_acc": 0.7709040782705174, "train_speed(iter/s)": 0.13581 }, { "epoch": 1.0405230496741482, "grad_norm": 0.7258458733558655, "learning_rate": 5.0783612134474204e-05, "loss": 0.6970147132873535, "memory(GiB)": 91.52, "step": 80190, "token_acc": 0.8062712421883566, "train_speed(iter/s)": 0.135809 }, { "epoch": 1.040587928075804, "grad_norm": 0.7779659032821655, "learning_rate": 5.077824903639967e-05, "loss": 0.7719518661499023, "memory(GiB)": 91.52, "step": 80195, "token_acc": 0.7728626283206645, "train_speed(iter/s)": 0.135808 }, { "epoch": 1.0406528064774596, "grad_norm": 0.7092911005020142, "learning_rate": 5.0772885929369105e-05, "loss": 0.7784822463989258, "memory(GiB)": 91.52, "step": 80200, "token_acc": 0.7641712050318324, "train_speed(iter/s)": 0.135807 }, { "epoch": 1.0407176848791153, "grad_norm": 0.6634369492530823, "learning_rate": 5.076752281344423e-05, "loss": 0.7954444885253906, "memory(GiB)": 91.52, "step": 80205, "token_acc": 0.7728047394903425, "train_speed(iter/s)": 0.135807 }, { "epoch": 1.040782563280771, "grad_norm": 0.7231289148330688, "learning_rate": 5.0762159688686725e-05, "loss": 0.7618669509887696, "memory(GiB)": 91.52, "step": 80210, "token_acc": 0.7923455684870188, "train_speed(iter/s)": 0.135806 }, { "epoch": 1.0408474416824267, "grad_norm": 0.6600418090820312, "learning_rate": 5.075679655515836e-05, "loss": 0.7623867988586426, "memory(GiB)": 91.52, "step": 80215, "token_acc": 0.7846273346749991, "train_speed(iter/s)": 0.135805 }, { "epoch": 1.0409123200840824, "grad_norm": 0.6573085784912109, "learning_rate": 5.075143341292081e-05, "loss": 0.7818430900573731, "memory(GiB)": 91.52, "step": 80220, "token_acc": 0.7910648950510303, "train_speed(iter/s)": 0.135804 }, { "epoch": 1.040977198485738, "grad_norm": 0.721973717212677, "learning_rate": 5.0746070262035825e-05, "loss": 0.7609207153320312, "memory(GiB)": 91.52, "step": 80225, "token_acc": 0.7707039698597642, "train_speed(iter/s)": 0.135804 }, { "epoch": 1.0410420768873938, "grad_norm": 0.6735854744911194, "learning_rate": 5.07407071025651e-05, "loss": 0.7538669586181641, "memory(GiB)": 91.52, "step": 80230, "token_acc": 0.7806010177417136, "train_speed(iter/s)": 0.135802 }, { "epoch": 1.0411069552890495, "grad_norm": 0.70987868309021, "learning_rate": 5.073534393457038e-05, "loss": 0.787759256362915, "memory(GiB)": 91.52, "step": 80235, "token_acc": 0.7728895352680637, "train_speed(iter/s)": 0.135802 }, { "epoch": 1.0411718336907052, "grad_norm": 0.6900408267974854, "learning_rate": 5.072998075811336e-05, "loss": 0.789670753479004, "memory(GiB)": 91.52, "step": 80240, "token_acc": 0.7780121444550975, "train_speed(iter/s)": 0.135801 }, { "epoch": 1.041236712092361, "grad_norm": 0.7091284990310669, "learning_rate": 5.072461757325577e-05, "loss": 0.7826891899108886, "memory(GiB)": 91.52, "step": 80245, "token_acc": 0.7720227377043393, "train_speed(iter/s)": 0.135799 }, { "epoch": 1.0413015904940166, "grad_norm": 0.6787387132644653, "learning_rate": 5.0719254380059335e-05, "loss": 0.7856148719787598, "memory(GiB)": 91.52, "step": 80250, "token_acc": 0.7786408398062985, "train_speed(iter/s)": 0.135798 }, { "epoch": 1.0413664688956723, "grad_norm": 0.6466469168663025, "learning_rate": 5.071389117858575e-05, "loss": 0.7765227317810058, "memory(GiB)": 91.52, "step": 80255, "token_acc": 0.7645449308755761, "train_speed(iter/s)": 0.135797 }, { "epoch": 1.041431347297328, "grad_norm": 0.6905257701873779, "learning_rate": 5.070852796889677e-05, "loss": 0.7316072463989258, "memory(GiB)": 91.52, "step": 80260, "token_acc": 0.8053187170117743, "train_speed(iter/s)": 0.135796 }, { "epoch": 1.0414962256989837, "grad_norm": 0.7297045588493347, "learning_rate": 5.070316475105408e-05, "loss": 0.7879535675048828, "memory(GiB)": 91.52, "step": 80265, "token_acc": 0.7730395976482255, "train_speed(iter/s)": 0.135795 }, { "epoch": 1.0415611041006394, "grad_norm": 0.6436935663223267, "learning_rate": 5.069780152511943e-05, "loss": 0.7720870971679688, "memory(GiB)": 91.52, "step": 80270, "token_acc": 0.7767898442719844, "train_speed(iter/s)": 0.135794 }, { "epoch": 1.041625982502295, "grad_norm": 0.6653308868408203, "learning_rate": 5.069243829115451e-05, "loss": 0.7167458057403564, "memory(GiB)": 91.52, "step": 80275, "token_acc": 0.8070587299060958, "train_speed(iter/s)": 0.135793 }, { "epoch": 1.0416908609039508, "grad_norm": 0.7674450278282166, "learning_rate": 5.068707504922107e-05, "loss": 0.7668247222900391, "memory(GiB)": 91.52, "step": 80280, "token_acc": 0.7871103884872176, "train_speed(iter/s)": 0.135791 }, { "epoch": 1.0417557393056065, "grad_norm": 0.7286208271980286, "learning_rate": 5.0681711799380794e-05, "loss": 0.7736459732055664, "memory(GiB)": 91.52, "step": 80285, "token_acc": 0.7731014190472644, "train_speed(iter/s)": 0.135791 }, { "epoch": 1.0418206177072622, "grad_norm": 0.7181944847106934, "learning_rate": 5.067634854169544e-05, "loss": 0.7651153564453125, "memory(GiB)": 91.52, "step": 80290, "token_acc": 0.8014772806833298, "train_speed(iter/s)": 0.13579 }, { "epoch": 1.041885496108918, "grad_norm": 0.7571409940719604, "learning_rate": 5.067098527622671e-05, "loss": 0.7830626487731933, "memory(GiB)": 91.52, "step": 80295, "token_acc": 0.7908181752300891, "train_speed(iter/s)": 0.13579 }, { "epoch": 1.0419503745105736, "grad_norm": 0.735978364944458, "learning_rate": 5.066562200303632e-05, "loss": 0.7654484748840332, "memory(GiB)": 91.52, "step": 80300, "token_acc": 0.768965040762747, "train_speed(iter/s)": 0.135789 }, { "epoch": 1.0420152529122293, "grad_norm": 0.6649114489555359, "learning_rate": 5.0660258722185995e-05, "loss": 0.7816740036010742, "memory(GiB)": 91.52, "step": 80305, "token_acc": 0.7980439936775553, "train_speed(iter/s)": 0.135788 }, { "epoch": 1.042080131313885, "grad_norm": 0.6760402321815491, "learning_rate": 5.065489543373746e-05, "loss": 0.7368619918823243, "memory(GiB)": 91.52, "step": 80310, "token_acc": 0.7970135651442392, "train_speed(iter/s)": 0.135786 }, { "epoch": 1.0421450097155407, "grad_norm": 0.6900481581687927, "learning_rate": 5.064953213775242e-05, "loss": 0.7444366455078125, "memory(GiB)": 91.52, "step": 80315, "token_acc": 0.7960870579608705, "train_speed(iter/s)": 0.135785 }, { "epoch": 1.0422098881171964, "grad_norm": 0.8577722907066345, "learning_rate": 5.064416883429261e-05, "loss": 0.7704380512237549, "memory(GiB)": 91.52, "step": 80320, "token_acc": 0.7937282229965157, "train_speed(iter/s)": 0.135785 }, { "epoch": 1.042274766518852, "grad_norm": 0.6453102231025696, "learning_rate": 5.063880552341974e-05, "loss": 0.72097749710083, "memory(GiB)": 91.52, "step": 80325, "token_acc": 0.8068274104567476, "train_speed(iter/s)": 0.135784 }, { "epoch": 1.0423396449205078, "grad_norm": 0.7275775074958801, "learning_rate": 5.063344220519557e-05, "loss": 0.7761544227600098, "memory(GiB)": 91.52, "step": 80330, "token_acc": 0.7831829797715447, "train_speed(iter/s)": 0.135782 }, { "epoch": 1.0424045233221635, "grad_norm": 0.6309330463409424, "learning_rate": 5.062807887968175e-05, "loss": 0.7292740821838379, "memory(GiB)": 91.52, "step": 80335, "token_acc": 0.7902738919157709, "train_speed(iter/s)": 0.135781 }, { "epoch": 1.0424694017238192, "grad_norm": 0.742746889591217, "learning_rate": 5.062271554694005e-05, "loss": 0.7657909870147706, "memory(GiB)": 91.52, "step": 80340, "token_acc": 0.7887935319261739, "train_speed(iter/s)": 0.13578 }, { "epoch": 1.0425342801254749, "grad_norm": 0.8552654385566711, "learning_rate": 5.0617352207032175e-05, "loss": 0.8207612037658691, "memory(GiB)": 91.52, "step": 80345, "token_acc": 0.7846302181909082, "train_speed(iter/s)": 0.13578 }, { "epoch": 1.0425991585271306, "grad_norm": 0.7300941348075867, "learning_rate": 5.061198886001986e-05, "loss": 0.8194927215576172, "memory(GiB)": 91.52, "step": 80350, "token_acc": 0.7846926645960351, "train_speed(iter/s)": 0.135779 }, { "epoch": 1.0426640369287863, "grad_norm": 0.7044118046760559, "learning_rate": 5.060662550596481e-05, "loss": 0.7681937694549561, "memory(GiB)": 91.52, "step": 80355, "token_acc": 0.8029396746662776, "train_speed(iter/s)": 0.135778 }, { "epoch": 1.042728915330442, "grad_norm": 0.6498668193817139, "learning_rate": 5.060126214492876e-05, "loss": 0.7434318542480469, "memory(GiB)": 91.52, "step": 80360, "token_acc": 0.7628824913996016, "train_speed(iter/s)": 0.135777 }, { "epoch": 1.0427937937320977, "grad_norm": 0.6023567914962769, "learning_rate": 5.0595898776973425e-05, "loss": 0.7394783020019531, "memory(GiB)": 91.52, "step": 80365, "token_acc": 0.7918302078790522, "train_speed(iter/s)": 0.135776 }, { "epoch": 1.0428586721337534, "grad_norm": 0.7686255574226379, "learning_rate": 5.059053540216052e-05, "loss": 0.744927978515625, "memory(GiB)": 91.52, "step": 80370, "token_acc": 0.7907315763225534, "train_speed(iter/s)": 0.135775 }, { "epoch": 1.042923550535409, "grad_norm": 0.6911531686782837, "learning_rate": 5.058517202055177e-05, "loss": 0.7465503692626954, "memory(GiB)": 91.52, "step": 80375, "token_acc": 0.79418618198765, "train_speed(iter/s)": 0.135773 }, { "epoch": 1.0429884289370648, "grad_norm": 0.6573554277420044, "learning_rate": 5.057980863220889e-05, "loss": 0.7914387702941894, "memory(GiB)": 91.52, "step": 80380, "token_acc": 0.7737583868791934, "train_speed(iter/s)": 0.135773 }, { "epoch": 1.0430533073387205, "grad_norm": 0.6348486542701721, "learning_rate": 5.057444523719364e-05, "loss": 0.7647155284881592, "memory(GiB)": 91.52, "step": 80385, "token_acc": 0.7796014962974273, "train_speed(iter/s)": 0.135772 }, { "epoch": 1.0431181857403762, "grad_norm": 0.7378503084182739, "learning_rate": 5.056908183556768e-05, "loss": 0.7736390113830567, "memory(GiB)": 91.52, "step": 80390, "token_acc": 0.7667437516677044, "train_speed(iter/s)": 0.135771 }, { "epoch": 1.0431830641420319, "grad_norm": 0.7489373087882996, "learning_rate": 5.056371842739277e-05, "loss": 0.774648380279541, "memory(GiB)": 91.52, "step": 80395, "token_acc": 0.7652787396121884, "train_speed(iter/s)": 0.135771 }, { "epoch": 1.0432479425436876, "grad_norm": 0.7256471514701843, "learning_rate": 5.055835501273063e-05, "loss": 0.7782176017761231, "memory(GiB)": 91.52, "step": 80400, "token_acc": 0.7864749372583599, "train_speed(iter/s)": 0.13577 }, { "epoch": 1.0433128209453433, "grad_norm": 0.6843057870864868, "learning_rate": 5.0552991591642984e-05, "loss": 0.7647544384002686, "memory(GiB)": 91.52, "step": 80405, "token_acc": 0.7854848605146343, "train_speed(iter/s)": 0.135769 }, { "epoch": 1.043377699346999, "grad_norm": 0.7893680930137634, "learning_rate": 5.054762816419154e-05, "loss": 0.7422918319702149, "memory(GiB)": 91.52, "step": 80410, "token_acc": 0.7780725388601036, "train_speed(iter/s)": 0.135768 }, { "epoch": 1.0434425777486547, "grad_norm": 0.6532461047172546, "learning_rate": 5.0542264730438e-05, "loss": 0.7419334411621094, "memory(GiB)": 91.52, "step": 80415, "token_acc": 0.7835539085936122, "train_speed(iter/s)": 0.135767 }, { "epoch": 1.0435074561503104, "grad_norm": 0.6773496866226196, "learning_rate": 5.053690129044416e-05, "loss": 0.7747287750244141, "memory(GiB)": 91.52, "step": 80420, "token_acc": 0.7977966913525663, "train_speed(iter/s)": 0.135766 }, { "epoch": 1.043572334551966, "grad_norm": 0.6702592372894287, "learning_rate": 5.053153784427166e-05, "loss": 0.7897360324859619, "memory(GiB)": 91.52, "step": 80425, "token_acc": 0.7934089191232049, "train_speed(iter/s)": 0.135765 }, { "epoch": 1.0436372129536218, "grad_norm": 0.737278163433075, "learning_rate": 5.052617439198227e-05, "loss": 0.7823181629180909, "memory(GiB)": 91.52, "step": 80430, "token_acc": 0.7731508953192083, "train_speed(iter/s)": 0.135764 }, { "epoch": 1.0437020913552775, "grad_norm": 0.7550613284111023, "learning_rate": 5.0520810933637695e-05, "loss": 0.7579774856567383, "memory(GiB)": 91.52, "step": 80435, "token_acc": 0.7769528624915998, "train_speed(iter/s)": 0.135762 }, { "epoch": 1.0437669697569332, "grad_norm": 0.6373404860496521, "learning_rate": 5.051544746929966e-05, "loss": 0.7442153453826904, "memory(GiB)": 91.52, "step": 80440, "token_acc": 0.7865528610606107, "train_speed(iter/s)": 0.135761 }, { "epoch": 1.0438318481585889, "grad_norm": 0.7081704139709473, "learning_rate": 5.05100839990299e-05, "loss": 0.7807039260864258, "memory(GiB)": 91.52, "step": 80445, "token_acc": 0.768286790994299, "train_speed(iter/s)": 0.13576 }, { "epoch": 1.0438967265602446, "grad_norm": 0.6443536877632141, "learning_rate": 5.05047205228901e-05, "loss": 0.7315930366516114, "memory(GiB)": 91.52, "step": 80450, "token_acc": 0.7857352401906857, "train_speed(iter/s)": 0.135758 }, { "epoch": 1.0439616049619, "grad_norm": 0.7352525591850281, "learning_rate": 5.0499357040942044e-05, "loss": 0.766627311706543, "memory(GiB)": 91.52, "step": 80455, "token_acc": 0.7644129896971784, "train_speed(iter/s)": 0.135757 }, { "epoch": 1.044026483363556, "grad_norm": 0.6581948399543762, "learning_rate": 5.0493993553247385e-05, "loss": 0.7794268608093262, "memory(GiB)": 91.52, "step": 80460, "token_acc": 0.7875699146305564, "train_speed(iter/s)": 0.135756 }, { "epoch": 1.0440913617652114, "grad_norm": 0.7754033207893372, "learning_rate": 5.04886300598679e-05, "loss": 0.7579595565795898, "memory(GiB)": 91.52, "step": 80465, "token_acc": 0.7859719969547516, "train_speed(iter/s)": 0.135756 }, { "epoch": 1.0441562401668671, "grad_norm": 0.7035544514656067, "learning_rate": 5.04832665608653e-05, "loss": 0.7888789653778077, "memory(GiB)": 91.52, "step": 80470, "token_acc": 0.7759052773035994, "train_speed(iter/s)": 0.135755 }, { "epoch": 1.0442211185685228, "grad_norm": 0.6757075786590576, "learning_rate": 5.047790305630128e-05, "loss": 0.7500141143798829, "memory(GiB)": 91.52, "step": 80475, "token_acc": 0.7967872311275517, "train_speed(iter/s)": 0.135754 }, { "epoch": 1.0442859969701785, "grad_norm": 0.662060558795929, "learning_rate": 5.04725395462376e-05, "loss": 0.7410335063934326, "memory(GiB)": 91.52, "step": 80480, "token_acc": 0.794497020679986, "train_speed(iter/s)": 0.135753 }, { "epoch": 1.0443508753718342, "grad_norm": 0.7830069661140442, "learning_rate": 5.0467176030735944e-05, "loss": 0.7931562423706054, "memory(GiB)": 91.52, "step": 80485, "token_acc": 0.7514594666312857, "train_speed(iter/s)": 0.135752 }, { "epoch": 1.04441575377349, "grad_norm": 0.775900661945343, "learning_rate": 5.0461812509858064e-05, "loss": 0.7808231353759766, "memory(GiB)": 91.52, "step": 80490, "token_acc": 0.770516717325228, "train_speed(iter/s)": 0.135751 }, { "epoch": 1.0444806321751456, "grad_norm": 0.8007524609565735, "learning_rate": 5.045644898366568e-05, "loss": 0.7728855609893799, "memory(GiB)": 91.52, "step": 80495, "token_acc": 0.7800880943127348, "train_speed(iter/s)": 0.13575 }, { "epoch": 1.0445455105768013, "grad_norm": 0.6888449788093567, "learning_rate": 5.045108545222052e-05, "loss": 0.7925292015075683, "memory(GiB)": 91.52, "step": 80500, "token_acc": 0.7757489806771849, "train_speed(iter/s)": 0.135749 }, { "epoch": 1.044610388978457, "grad_norm": 0.7284309267997742, "learning_rate": 5.044572191558429e-05, "loss": 0.7711541175842285, "memory(GiB)": 91.52, "step": 80505, "token_acc": 0.7775534921275736, "train_speed(iter/s)": 0.135748 }, { "epoch": 1.0446752673801127, "grad_norm": 0.7293274998664856, "learning_rate": 5.044035837381871e-05, "loss": 0.8155637741088867, "memory(GiB)": 91.52, "step": 80510, "token_acc": 0.7878709582164477, "train_speed(iter/s)": 0.135748 }, { "epoch": 1.0447401457817684, "grad_norm": 0.6582865715026855, "learning_rate": 5.0434994826985525e-05, "loss": 0.8025156021118164, "memory(GiB)": 91.52, "step": 80515, "token_acc": 0.7793279058924792, "train_speed(iter/s)": 0.135747 }, { "epoch": 1.0448050241834241, "grad_norm": 0.6771129965782166, "learning_rate": 5.0429631275146436e-05, "loss": 0.7260346412658691, "memory(GiB)": 91.52, "step": 80520, "token_acc": 0.8032333293806079, "train_speed(iter/s)": 0.135746 }, { "epoch": 1.0448699025850798, "grad_norm": 0.6844883561134338, "learning_rate": 5.042426771836318e-05, "loss": 0.7414481163024902, "memory(GiB)": 91.52, "step": 80525, "token_acc": 0.7772156954105235, "train_speed(iter/s)": 0.135745 }, { "epoch": 1.0449347809867355, "grad_norm": 0.696982204914093, "learning_rate": 5.04189041566975e-05, "loss": 0.7800278663635254, "memory(GiB)": 91.52, "step": 80530, "token_acc": 0.7729133167665546, "train_speed(iter/s)": 0.135744 }, { "epoch": 1.0449996593883912, "grad_norm": 0.6439664959907532, "learning_rate": 5.0413540590211084e-05, "loss": 0.7643799781799316, "memory(GiB)": 91.52, "step": 80535, "token_acc": 0.7889505615076804, "train_speed(iter/s)": 0.135743 }, { "epoch": 1.045064537790047, "grad_norm": 0.6872556805610657, "learning_rate": 5.0408177018965664e-05, "loss": 0.7661314964294433, "memory(GiB)": 91.52, "step": 80540, "token_acc": 0.7790200193735873, "train_speed(iter/s)": 0.135742 }, { "epoch": 1.0451294161917026, "grad_norm": 0.6867358684539795, "learning_rate": 5.040281344302298e-05, "loss": 0.7052226066589355, "memory(GiB)": 91.52, "step": 80545, "token_acc": 0.7919087739686229, "train_speed(iter/s)": 0.135741 }, { "epoch": 1.0451942945933583, "grad_norm": 0.7961623668670654, "learning_rate": 5.039744986244473e-05, "loss": 0.7780416488647461, "memory(GiB)": 91.52, "step": 80550, "token_acc": 0.7985825913138288, "train_speed(iter/s)": 0.13574 }, { "epoch": 1.045259172995014, "grad_norm": 0.8220556378364563, "learning_rate": 5.039208627729266e-05, "loss": 0.7939352989196777, "memory(GiB)": 91.52, "step": 80555, "token_acc": 0.7615060066346743, "train_speed(iter/s)": 0.135739 }, { "epoch": 1.0453240513966697, "grad_norm": 0.7010329365730286, "learning_rate": 5.0386722687628485e-05, "loss": 0.7847723960876465, "memory(GiB)": 91.52, "step": 80560, "token_acc": 0.7822914291298824, "train_speed(iter/s)": 0.135738 }, { "epoch": 1.0453889297983254, "grad_norm": 0.770839512348175, "learning_rate": 5.0381359093513935e-05, "loss": 0.7133594036102295, "memory(GiB)": 91.52, "step": 80565, "token_acc": 0.8060179027213574, "train_speed(iter/s)": 0.135737 }, { "epoch": 1.0454538081999811, "grad_norm": 0.6642371416091919, "learning_rate": 5.037599549501073e-05, "loss": 0.7442507743835449, "memory(GiB)": 91.52, "step": 80570, "token_acc": 0.7899225566120969, "train_speed(iter/s)": 0.135736 }, { "epoch": 1.0455186866016368, "grad_norm": 0.730010449886322, "learning_rate": 5.037063189218059e-05, "loss": 0.7414247989654541, "memory(GiB)": 91.52, "step": 80575, "token_acc": 0.7869683827644096, "train_speed(iter/s)": 0.135734 }, { "epoch": 1.0455835650032925, "grad_norm": 0.7586297988891602, "learning_rate": 5.036526828508523e-05, "loss": 0.7660394191741944, "memory(GiB)": 91.52, "step": 80580, "token_acc": 0.7696675651392633, "train_speed(iter/s)": 0.135733 }, { "epoch": 1.0456484434049482, "grad_norm": 0.6717393398284912, "learning_rate": 5.035990467378641e-05, "loss": 0.7510047435760498, "memory(GiB)": 91.52, "step": 80585, "token_acc": 0.8094822861129105, "train_speed(iter/s)": 0.135731 }, { "epoch": 1.045713321806604, "grad_norm": 0.759983241558075, "learning_rate": 5.0354541058345804e-05, "loss": 0.7862461090087891, "memory(GiB)": 91.52, "step": 80590, "token_acc": 0.7810767363275483, "train_speed(iter/s)": 0.135731 }, { "epoch": 1.0457782002082596, "grad_norm": 0.744686484336853, "learning_rate": 5.034917743882518e-05, "loss": 0.7742136955261231, "memory(GiB)": 91.52, "step": 80595, "token_acc": 0.7740319208945814, "train_speed(iter/s)": 0.13573 }, { "epoch": 1.0458430786099153, "grad_norm": 0.6766502261161804, "learning_rate": 5.034381381528625e-05, "loss": 0.7446435928344727, "memory(GiB)": 91.52, "step": 80600, "token_acc": 0.7819616930374018, "train_speed(iter/s)": 0.135729 }, { "epoch": 1.045907957011571, "grad_norm": 0.7154499888420105, "learning_rate": 5.033845018779072e-05, "loss": 0.7724788188934326, "memory(GiB)": 91.52, "step": 80605, "token_acc": 0.7866339893188967, "train_speed(iter/s)": 0.135728 }, { "epoch": 1.0459728354132267, "grad_norm": 0.7600975036621094, "learning_rate": 5.033308655640033e-05, "loss": 0.7645124435424805, "memory(GiB)": 91.52, "step": 80610, "token_acc": 0.7690457097032879, "train_speed(iter/s)": 0.135727 }, { "epoch": 1.0460377138148824, "grad_norm": 0.7227625846862793, "learning_rate": 5.03277229211768e-05, "loss": 0.8062971115112305, "memory(GiB)": 91.52, "step": 80615, "token_acc": 0.7792792792792793, "train_speed(iter/s)": 0.135726 }, { "epoch": 1.0461025922165381, "grad_norm": 0.6462640166282654, "learning_rate": 5.032235928218186e-05, "loss": 0.8055353164672852, "memory(GiB)": 91.52, "step": 80620, "token_acc": 0.769952584634769, "train_speed(iter/s)": 0.135726 }, { "epoch": 1.0461674706181938, "grad_norm": 0.7584245800971985, "learning_rate": 5.0316995639477226e-05, "loss": 0.7675885200500489, "memory(GiB)": 91.52, "step": 80625, "token_acc": 0.7895097192867562, "train_speed(iter/s)": 0.135725 }, { "epoch": 1.0462323490198495, "grad_norm": 0.7183228731155396, "learning_rate": 5.031163199312463e-05, "loss": 0.8078468322753907, "memory(GiB)": 91.52, "step": 80630, "token_acc": 0.7842249611737104, "train_speed(iter/s)": 0.135724 }, { "epoch": 1.0462972274215052, "grad_norm": 0.6507315039634705, "learning_rate": 5.030626834318579e-05, "loss": 0.7806442260742188, "memory(GiB)": 91.52, "step": 80635, "token_acc": 0.7769259771270579, "train_speed(iter/s)": 0.135723 }, { "epoch": 1.046362105823161, "grad_norm": 0.5990819334983826, "learning_rate": 5.030090468972244e-05, "loss": 0.77166748046875, "memory(GiB)": 91.52, "step": 80640, "token_acc": 0.7755910939546425, "train_speed(iter/s)": 0.135722 }, { "epoch": 1.0464269842248166, "grad_norm": 0.7229293584823608, "learning_rate": 5.029554103279629e-05, "loss": 0.7587263107299804, "memory(GiB)": 91.52, "step": 80645, "token_acc": 0.7937152711247365, "train_speed(iter/s)": 0.135721 }, { "epoch": 1.0464918626264723, "grad_norm": 0.6686103343963623, "learning_rate": 5.029017737246907e-05, "loss": 0.7617045402526855, "memory(GiB)": 91.52, "step": 80650, "token_acc": 0.7818174086830804, "train_speed(iter/s)": 0.13572 }, { "epoch": 1.046556741028128, "grad_norm": 0.770506739616394, "learning_rate": 5.028481370880252e-05, "loss": 0.7395797729492187, "memory(GiB)": 91.52, "step": 80655, "token_acc": 0.8007551456829725, "train_speed(iter/s)": 0.135719 }, { "epoch": 1.0466216194297837, "grad_norm": 0.7253752946853638, "learning_rate": 5.0279450041858344e-05, "loss": 0.7539228439331055, "memory(GiB)": 91.52, "step": 80660, "token_acc": 0.796852751104458, "train_speed(iter/s)": 0.135718 }, { "epoch": 1.0466864978314394, "grad_norm": 0.7756445407867432, "learning_rate": 5.027408637169827e-05, "loss": 0.7702560424804688, "memory(GiB)": 91.52, "step": 80665, "token_acc": 0.7683729433272395, "train_speed(iter/s)": 0.135717 }, { "epoch": 1.046751376233095, "grad_norm": 0.6742216348648071, "learning_rate": 5.026872269838403e-05, "loss": 0.7819499969482422, "memory(GiB)": 91.52, "step": 80670, "token_acc": 0.7798391489634401, "train_speed(iter/s)": 0.135716 }, { "epoch": 1.0468162546347508, "grad_norm": 0.7285366654396057, "learning_rate": 5.0263359021977366e-05, "loss": 0.7763802528381347, "memory(GiB)": 91.52, "step": 80675, "token_acc": 0.783606687581907, "train_speed(iter/s)": 0.135715 }, { "epoch": 1.0468811330364065, "grad_norm": 0.6987460255622864, "learning_rate": 5.025799534253995e-05, "loss": 0.7690751075744628, "memory(GiB)": 91.52, "step": 80680, "token_acc": 0.7876131322094055, "train_speed(iter/s)": 0.135714 }, { "epoch": 1.0469460114380622, "grad_norm": 0.7326175570487976, "learning_rate": 5.025263166013357e-05, "loss": 0.7795119285583496, "memory(GiB)": 91.52, "step": 80685, "token_acc": 0.769224874516265, "train_speed(iter/s)": 0.135713 }, { "epoch": 1.047010889839718, "grad_norm": 0.6862456202507019, "learning_rate": 5.024726797481992e-05, "loss": 0.7585929870605469, "memory(GiB)": 91.52, "step": 80690, "token_acc": 0.7782633593538081, "train_speed(iter/s)": 0.135712 }, { "epoch": 1.0470757682413736, "grad_norm": 0.7291443943977356, "learning_rate": 5.02419042866607e-05, "loss": 0.8023035049438476, "memory(GiB)": 91.52, "step": 80695, "token_acc": 0.7786535176269607, "train_speed(iter/s)": 0.135711 }, { "epoch": 1.0471406466430293, "grad_norm": 0.6992384195327759, "learning_rate": 5.0236540595717684e-05, "loss": 0.7980524063110351, "memory(GiB)": 91.52, "step": 80700, "token_acc": 0.765022100107514, "train_speed(iter/s)": 0.135711 }, { "epoch": 1.047205525044685, "grad_norm": 0.742405891418457, "learning_rate": 5.023117690205256e-05, "loss": 0.7702448844909668, "memory(GiB)": 91.52, "step": 80705, "token_acc": 0.7913625520703529, "train_speed(iter/s)": 0.13571 }, { "epoch": 1.0472704034463407, "grad_norm": 0.7776187062263489, "learning_rate": 5.0225813205727076e-05, "loss": 0.7620813369750976, "memory(GiB)": 91.52, "step": 80710, "token_acc": 0.760331910094256, "train_speed(iter/s)": 0.135709 }, { "epoch": 1.0473352818479964, "grad_norm": 0.6513503789901733, "learning_rate": 5.022044950680294e-05, "loss": 0.7784717082977295, "memory(GiB)": 91.52, "step": 80715, "token_acc": 0.7824040066777963, "train_speed(iter/s)": 0.135708 }, { "epoch": 1.047400160249652, "grad_norm": 0.6396037340164185, "learning_rate": 5.0215085805341896e-05, "loss": 0.737298583984375, "memory(GiB)": 91.52, "step": 80720, "token_acc": 0.8027732902530964, "train_speed(iter/s)": 0.135707 }, { "epoch": 1.0474650386513078, "grad_norm": 0.7631036639213562, "learning_rate": 5.020972210140567e-05, "loss": 0.7488308429718018, "memory(GiB)": 91.52, "step": 80725, "token_acc": 0.7856319870457447, "train_speed(iter/s)": 0.135706 }, { "epoch": 1.0475299170529635, "grad_norm": 0.6757858991622925, "learning_rate": 5.020435839505594e-05, "loss": 0.7120416641235352, "memory(GiB)": 91.52, "step": 80730, "token_acc": 0.7760054015276195, "train_speed(iter/s)": 0.135705 }, { "epoch": 1.0475947954546192, "grad_norm": 0.7284958958625793, "learning_rate": 5.019899468635449e-05, "loss": 0.7506070137023926, "memory(GiB)": 91.52, "step": 80735, "token_acc": 0.7855583915954574, "train_speed(iter/s)": 0.135704 }, { "epoch": 1.047659673856275, "grad_norm": 0.6789535284042358, "learning_rate": 5.0193630975363e-05, "loss": 0.7623260498046875, "memory(GiB)": 91.52, "step": 80740, "token_acc": 0.7818896522416602, "train_speed(iter/s)": 0.135703 }, { "epoch": 1.0477245522579306, "grad_norm": 0.6069558262825012, "learning_rate": 5.018826726214325e-05, "loss": 0.7413181781768798, "memory(GiB)": 91.52, "step": 80745, "token_acc": 0.7888142889267242, "train_speed(iter/s)": 0.135703 }, { "epoch": 1.0477894306595863, "grad_norm": 0.7435779571533203, "learning_rate": 5.018290354675691e-05, "loss": 0.8020789146423339, "memory(GiB)": 91.52, "step": 80750, "token_acc": 0.7764520032266738, "train_speed(iter/s)": 0.135701 }, { "epoch": 1.047854309061242, "grad_norm": 0.6570408940315247, "learning_rate": 5.017753982926574e-05, "loss": 0.7811325550079345, "memory(GiB)": 91.52, "step": 80755, "token_acc": 0.7855288330144848, "train_speed(iter/s)": 0.1357 }, { "epoch": 1.0479191874628977, "grad_norm": 0.8301880955696106, "learning_rate": 5.017217610973145e-05, "loss": 0.7760303497314454, "memory(GiB)": 91.52, "step": 80760, "token_acc": 0.7826665266547657, "train_speed(iter/s)": 0.1357 }, { "epoch": 1.0479840658645534, "grad_norm": 0.6844547986984253, "learning_rate": 5.016681238821577e-05, "loss": 0.7442722797393799, "memory(GiB)": 91.52, "step": 80765, "token_acc": 0.8034369437217582, "train_speed(iter/s)": 0.135699 }, { "epoch": 1.048048944266209, "grad_norm": 0.6643788814544678, "learning_rate": 5.016144866478042e-05, "loss": 0.7670141696929932, "memory(GiB)": 91.52, "step": 80770, "token_acc": 0.7918467549531483, "train_speed(iter/s)": 0.135698 }, { "epoch": 1.0481138226678648, "grad_norm": 0.7488315105438232, "learning_rate": 5.0156084939487124e-05, "loss": 0.7672759056091308, "memory(GiB)": 91.52, "step": 80775, "token_acc": 0.7810307369020907, "train_speed(iter/s)": 0.135697 }, { "epoch": 1.0481787010695205, "grad_norm": 0.6745046973228455, "learning_rate": 5.015072121239762e-05, "loss": 0.7520868301391601, "memory(GiB)": 91.52, "step": 80780, "token_acc": 0.7668743873407086, "train_speed(iter/s)": 0.135696 }, { "epoch": 1.0482435794711762, "grad_norm": 0.7849716544151306, "learning_rate": 5.0145357483573615e-05, "loss": 0.7553213596343994, "memory(GiB)": 91.52, "step": 80785, "token_acc": 0.7776794369283938, "train_speed(iter/s)": 0.135694 }, { "epoch": 1.048308457872832, "grad_norm": 0.6565046906471252, "learning_rate": 5.0139993753076866e-05, "loss": 0.7670061588287354, "memory(GiB)": 91.52, "step": 80790, "token_acc": 0.7696057442113238, "train_speed(iter/s)": 0.135694 }, { "epoch": 1.0483733362744876, "grad_norm": 0.7661032676696777, "learning_rate": 5.013463002096907e-05, "loss": 0.7489359378814697, "memory(GiB)": 91.52, "step": 80795, "token_acc": 0.7945012336975679, "train_speed(iter/s)": 0.135692 }, { "epoch": 1.0484382146761433, "grad_norm": 0.6687561869621277, "learning_rate": 5.012926628731195e-05, "loss": 0.7784754753112793, "memory(GiB)": 91.52, "step": 80800, "token_acc": 0.7657859381997313, "train_speed(iter/s)": 0.135691 }, { "epoch": 1.048503093077799, "grad_norm": 0.7383682727813721, "learning_rate": 5.0123902552167255e-05, "loss": 0.7766969680786133, "memory(GiB)": 91.52, "step": 80805, "token_acc": 0.7700872485671522, "train_speed(iter/s)": 0.135691 }, { "epoch": 1.0485679714794547, "grad_norm": 0.6939226984977722, "learning_rate": 5.0118538815596683e-05, "loss": 0.7584702491760253, "memory(GiB)": 91.52, "step": 80810, "token_acc": 0.8049479388316736, "train_speed(iter/s)": 0.13569 }, { "epoch": 1.0486328498811104, "grad_norm": 0.7385684251785278, "learning_rate": 5.011317507766199e-05, "loss": 0.7894269943237304, "memory(GiB)": 91.52, "step": 80815, "token_acc": 0.7678937709165498, "train_speed(iter/s)": 0.135689 }, { "epoch": 1.048697728282766, "grad_norm": 0.7671500444412231, "learning_rate": 5.010781133842487e-05, "loss": 0.7878998279571533, "memory(GiB)": 91.52, "step": 80820, "token_acc": 0.7933875714038429, "train_speed(iter/s)": 0.135688 }, { "epoch": 1.0487626066844218, "grad_norm": 0.6894187331199646, "learning_rate": 5.010244759794708e-05, "loss": 0.7460880756378174, "memory(GiB)": 91.52, "step": 80825, "token_acc": 0.7771327095143774, "train_speed(iter/s)": 0.135687 }, { "epoch": 1.0488274850860775, "grad_norm": 0.6712985038757324, "learning_rate": 5.009708385629032e-05, "loss": 0.7794939517974854, "memory(GiB)": 91.52, "step": 80830, "token_acc": 0.7780235706930089, "train_speed(iter/s)": 0.135686 }, { "epoch": 1.0488923634877332, "grad_norm": 0.6999342441558838, "learning_rate": 5.009172011351633e-05, "loss": 0.7756319999694824, "memory(GiB)": 91.52, "step": 80835, "token_acc": 0.7750017254468907, "train_speed(iter/s)": 0.135685 }, { "epoch": 1.0489572418893889, "grad_norm": 0.7308348417282104, "learning_rate": 5.008635636968683e-05, "loss": 0.763735294342041, "memory(GiB)": 91.52, "step": 80840, "token_acc": 0.7800254716701714, "train_speed(iter/s)": 0.135684 }, { "epoch": 1.0490221202910446, "grad_norm": 0.6924304366111755, "learning_rate": 5.008099262486352e-05, "loss": 0.7365918159484863, "memory(GiB)": 91.52, "step": 80845, "token_acc": 0.7947314403010606, "train_speed(iter/s)": 0.135683 }, { "epoch": 1.0490869986927003, "grad_norm": 0.7685492038726807, "learning_rate": 5.007562887910818e-05, "loss": 0.7629077911376954, "memory(GiB)": 91.52, "step": 80850, "token_acc": 0.7704803042049692, "train_speed(iter/s)": 0.135682 }, { "epoch": 1.049151877094356, "grad_norm": 0.6952677965164185, "learning_rate": 5.007026513248251e-05, "loss": 0.7645166397094727, "memory(GiB)": 91.52, "step": 80855, "token_acc": 0.7881229483736198, "train_speed(iter/s)": 0.135681 }, { "epoch": 1.0492167554960117, "grad_norm": 0.687980592250824, "learning_rate": 5.006490138504823e-05, "loss": 0.7740771293640136, "memory(GiB)": 91.52, "step": 80860, "token_acc": 0.7819054839524893, "train_speed(iter/s)": 0.135681 }, { "epoch": 1.0492816338976674, "grad_norm": 0.674182653427124, "learning_rate": 5.005953763686706e-05, "loss": 0.768023681640625, "memory(GiB)": 91.52, "step": 80865, "token_acc": 0.7791185254457148, "train_speed(iter/s)": 0.13568 }, { "epoch": 1.049346512299323, "grad_norm": 0.779373288154602, "learning_rate": 5.0054173888000745e-05, "loss": 0.7462573051452637, "memory(GiB)": 91.52, "step": 80870, "token_acc": 0.7918853451756157, "train_speed(iter/s)": 0.135679 }, { "epoch": 1.0494113907009788, "grad_norm": 0.7545949816703796, "learning_rate": 5.0048810138511003e-05, "loss": 0.7740977287292481, "memory(GiB)": 91.52, "step": 80875, "token_acc": 0.7554030373831776, "train_speed(iter/s)": 0.135679 }, { "epoch": 1.0494762691026345, "grad_norm": 0.6447601914405823, "learning_rate": 5.004344638845954e-05, "loss": 0.752830171585083, "memory(GiB)": 91.52, "step": 80880, "token_acc": 0.7907807174160039, "train_speed(iter/s)": 0.135678 }, { "epoch": 1.0495411475042902, "grad_norm": 0.687736988067627, "learning_rate": 5.003808263790811e-05, "loss": 0.758596658706665, "memory(GiB)": 91.52, "step": 80885, "token_acc": 0.7679679225298834, "train_speed(iter/s)": 0.135677 }, { "epoch": 1.0496060259059459, "grad_norm": 0.7021349668502808, "learning_rate": 5.003271888691843e-05, "loss": 0.768272066116333, "memory(GiB)": 91.52, "step": 80890, "token_acc": 0.7826153964706789, "train_speed(iter/s)": 0.135676 }, { "epoch": 1.0496709043076016, "grad_norm": 0.7173088192939758, "learning_rate": 5.002735513555221e-05, "loss": 0.7327693462371826, "memory(GiB)": 91.52, "step": 80895, "token_acc": 0.7645351779989877, "train_speed(iter/s)": 0.135675 }, { "epoch": 1.0497357827092573, "grad_norm": 0.7534662485122681, "learning_rate": 5.0021991383871203e-05, "loss": 0.7520478248596192, "memory(GiB)": 91.52, "step": 80900, "token_acc": 0.7992203378535968, "train_speed(iter/s)": 0.135674 }, { "epoch": 1.049800661110913, "grad_norm": 0.6286566257476807, "learning_rate": 5.001662763193712e-05, "loss": 0.7230573654174804, "memory(GiB)": 91.52, "step": 80905, "token_acc": 0.8021647193749551, "train_speed(iter/s)": 0.135674 }, { "epoch": 1.0498655395125687, "grad_norm": 0.6868165135383606, "learning_rate": 5.0011263879811685e-05, "loss": 0.7418311595916748, "memory(GiB)": 91.52, "step": 80910, "token_acc": 0.7834989821578254, "train_speed(iter/s)": 0.135673 }, { "epoch": 1.0499304179142244, "grad_norm": 0.7882614731788635, "learning_rate": 5.000590012755662e-05, "loss": 0.8101753234863281, "memory(GiB)": 91.52, "step": 80915, "token_acc": 0.7694568336617916, "train_speed(iter/s)": 0.135672 }, { "epoch": 1.04999529631588, "grad_norm": 0.7071865797042847, "learning_rate": 5.000053637523365e-05, "loss": 0.740226936340332, "memory(GiB)": 91.52, "step": 80920, "token_acc": 0.7740688187300461, "train_speed(iter/s)": 0.135671 }, { "epoch": 1.0500601747175358, "grad_norm": 0.6725398302078247, "learning_rate": 4.999517262290453e-05, "loss": 0.7457611083984375, "memory(GiB)": 91.52, "step": 80925, "token_acc": 0.7829747100740534, "train_speed(iter/s)": 0.13567 }, { "epoch": 1.0501250531191912, "grad_norm": 0.6380279064178467, "learning_rate": 4.998980887063094e-05, "loss": 0.7407719612121582, "memory(GiB)": 91.52, "step": 80930, "token_acc": 0.8009934802856256, "train_speed(iter/s)": 0.135668 }, { "epoch": 1.0501899315208472, "grad_norm": 0.7313293814659119, "learning_rate": 4.998444511847465e-05, "loss": 0.7488584041595459, "memory(GiB)": 91.52, "step": 80935, "token_acc": 0.7791437906295261, "train_speed(iter/s)": 0.135667 }, { "epoch": 1.0502548099225026, "grad_norm": 0.6819082498550415, "learning_rate": 4.997908136649736e-05, "loss": 0.7772449493408203, "memory(GiB)": 91.52, "step": 80940, "token_acc": 0.773340007866414, "train_speed(iter/s)": 0.135667 }, { "epoch": 1.0503196883241583, "grad_norm": 0.7202655673027039, "learning_rate": 4.99737176147608e-05, "loss": 0.7615849494934082, "memory(GiB)": 91.52, "step": 80945, "token_acc": 0.8018802343672415, "train_speed(iter/s)": 0.135666 }, { "epoch": 1.050384566725814, "grad_norm": 0.6786247491836548, "learning_rate": 4.9968353863326675e-05, "loss": 0.7671717643737793, "memory(GiB)": 91.52, "step": 80950, "token_acc": 0.7940386579415214, "train_speed(iter/s)": 0.135665 }, { "epoch": 1.0504494451274697, "grad_norm": 0.7409055829048157, "learning_rate": 4.996299011225675e-05, "loss": 0.7520557403564453, "memory(GiB)": 91.52, "step": 80955, "token_acc": 0.7523613676762748, "train_speed(iter/s)": 0.135664 }, { "epoch": 1.0505143235291254, "grad_norm": 0.7067068219184875, "learning_rate": 4.995762636161273e-05, "loss": 0.7862460613250732, "memory(GiB)": 91.52, "step": 80960, "token_acc": 0.7694377398805049, "train_speed(iter/s)": 0.135664 }, { "epoch": 1.0505792019307811, "grad_norm": 0.6769319176673889, "learning_rate": 4.9952262611456336e-05, "loss": 0.7436795234680176, "memory(GiB)": 91.52, "step": 80965, "token_acc": 0.7915108862986585, "train_speed(iter/s)": 0.135663 }, { "epoch": 1.0506440803324368, "grad_norm": 0.7101556062698364, "learning_rate": 4.9946898861849324e-05, "loss": 0.7529419898986817, "memory(GiB)": 91.52, "step": 80970, "token_acc": 0.7820420851267336, "train_speed(iter/s)": 0.135662 }, { "epoch": 1.0507089587340925, "grad_norm": 0.6908684968948364, "learning_rate": 4.9941535112853376e-05, "loss": 0.7848719596862793, "memory(GiB)": 91.52, "step": 80975, "token_acc": 0.7717568755494161, "train_speed(iter/s)": 0.135661 }, { "epoch": 1.0507738371357482, "grad_norm": 0.6437702775001526, "learning_rate": 4.993617136453025e-05, "loss": 0.7945569038391114, "memory(GiB)": 91.52, "step": 80980, "token_acc": 0.7764144967159003, "train_speed(iter/s)": 0.13566 }, { "epoch": 1.050838715537404, "grad_norm": 0.7525973916053772, "learning_rate": 4.993080761694164e-05, "loss": 0.7790314674377441, "memory(GiB)": 91.52, "step": 80985, "token_acc": 0.7900917176209005, "train_speed(iter/s)": 0.13566 }, { "epoch": 1.0509035939390596, "grad_norm": 0.6831509470939636, "learning_rate": 4.992544387014931e-05, "loss": 0.7304903030395508, "memory(GiB)": 91.52, "step": 80990, "token_acc": 0.7894387246761874, "train_speed(iter/s)": 0.135659 }, { "epoch": 1.0509684723407153, "grad_norm": 0.7384469509124756, "learning_rate": 4.992008012421496e-05, "loss": 0.7761649131774903, "memory(GiB)": 91.52, "step": 80995, "token_acc": 0.7878557999934035, "train_speed(iter/s)": 0.135658 }, { "epoch": 1.051033350742371, "grad_norm": 0.7475671768188477, "learning_rate": 4.9914716379200315e-05, "loss": 0.8157510757446289, "memory(GiB)": 91.52, "step": 81000, "token_acc": 0.7681736231588258, "train_speed(iter/s)": 0.135657 }, { "epoch": 1.0510982291440267, "grad_norm": 0.6984883546829224, "learning_rate": 4.9909352635167126e-05, "loss": 0.7871140480041504, "memory(GiB)": 91.52, "step": 81005, "token_acc": 0.7590652966523036, "train_speed(iter/s)": 0.135656 }, { "epoch": 1.0511631075456824, "grad_norm": 0.6561785340309143, "learning_rate": 4.9903988892177095e-05, "loss": 0.7465472221374512, "memory(GiB)": 91.52, "step": 81010, "token_acc": 0.7976368649074439, "train_speed(iter/s)": 0.135655 }, { "epoch": 1.0512279859473381, "grad_norm": 0.7930207848548889, "learning_rate": 4.9898625150291955e-05, "loss": 0.7881531715393066, "memory(GiB)": 91.52, "step": 81015, "token_acc": 0.7767302828236377, "train_speed(iter/s)": 0.135654 }, { "epoch": 1.0512928643489938, "grad_norm": 0.664578914642334, "learning_rate": 4.9893261409573416e-05, "loss": 0.7432303428649902, "memory(GiB)": 91.52, "step": 81020, "token_acc": 0.8027637269994868, "train_speed(iter/s)": 0.135653 }, { "epoch": 1.0513577427506495, "grad_norm": 0.7285761833190918, "learning_rate": 4.988789767008323e-05, "loss": 0.7694379806518554, "memory(GiB)": 91.52, "step": 81025, "token_acc": 0.7871374756047747, "train_speed(iter/s)": 0.135652 }, { "epoch": 1.0514226211523052, "grad_norm": 0.6930109262466431, "learning_rate": 4.98825339318831e-05, "loss": 0.8194171905517578, "memory(GiB)": 91.52, "step": 81030, "token_acc": 0.7939895470383276, "train_speed(iter/s)": 0.135652 }, { "epoch": 1.051487499553961, "grad_norm": 0.7598806619644165, "learning_rate": 4.987717019503477e-05, "loss": 0.7954629898071289, "memory(GiB)": 91.52, "step": 81035, "token_acc": 0.7880829433596902, "train_speed(iter/s)": 0.135651 }, { "epoch": 1.0515523779556166, "grad_norm": 0.721805214881897, "learning_rate": 4.987180645959997e-05, "loss": 0.7656588077545166, "memory(GiB)": 91.52, "step": 81040, "token_acc": 0.7797190947772715, "train_speed(iter/s)": 0.13565 }, { "epoch": 1.0516172563572723, "grad_norm": 0.7011282444000244, "learning_rate": 4.98664427256404e-05, "loss": 0.7617023468017579, "memory(GiB)": 91.52, "step": 81045, "token_acc": 0.7730336374181008, "train_speed(iter/s)": 0.135649 }, { "epoch": 1.051682134758928, "grad_norm": 0.6665641069412231, "learning_rate": 4.9861078993217806e-05, "loss": 0.7988775253295899, "memory(GiB)": 91.52, "step": 81050, "token_acc": 0.783884489986027, "train_speed(iter/s)": 0.135648 }, { "epoch": 1.0517470131605837, "grad_norm": 0.6429014801979065, "learning_rate": 4.985571526239389e-05, "loss": 0.7301380157470703, "memory(GiB)": 91.52, "step": 81055, "token_acc": 0.7893905677984352, "train_speed(iter/s)": 0.135648 }, { "epoch": 1.0518118915622394, "grad_norm": 0.7593722343444824, "learning_rate": 4.985035153323041e-05, "loss": 0.7438491821289063, "memory(GiB)": 91.52, "step": 81060, "token_acc": 0.7756190168437209, "train_speed(iter/s)": 0.135647 }, { "epoch": 1.0518767699638951, "grad_norm": 0.8160414099693298, "learning_rate": 4.9844987805789055e-05, "loss": 0.7497736930847168, "memory(GiB)": 91.52, "step": 81065, "token_acc": 0.7866057155677707, "train_speed(iter/s)": 0.135646 }, { "epoch": 1.0519416483655508, "grad_norm": 0.7113611102104187, "learning_rate": 4.983962408013159e-05, "loss": 0.7705791473388672, "memory(GiB)": 91.52, "step": 81070, "token_acc": 0.8014816715307399, "train_speed(iter/s)": 0.135645 }, { "epoch": 1.0520065267672065, "grad_norm": 0.671649158000946, "learning_rate": 4.983426035631971e-05, "loss": 0.7635675430297851, "memory(GiB)": 91.52, "step": 81075, "token_acc": 0.7831376282661588, "train_speed(iter/s)": 0.135644 }, { "epoch": 1.0520714051688622, "grad_norm": 0.8193967938423157, "learning_rate": 4.982889663441517e-05, "loss": 0.7645220756530762, "memory(GiB)": 91.52, "step": 81080, "token_acc": 0.8257170077860639, "train_speed(iter/s)": 0.135643 }, { "epoch": 1.052136283570518, "grad_norm": 0.7340214252471924, "learning_rate": 4.982353291447966e-05, "loss": 0.7590010166168213, "memory(GiB)": 91.52, "step": 81085, "token_acc": 0.7947952458739368, "train_speed(iter/s)": 0.135642 }, { "epoch": 1.0522011619721736, "grad_norm": 0.7511987090110779, "learning_rate": 4.981816919657491e-05, "loss": 0.7695080757141113, "memory(GiB)": 91.52, "step": 81090, "token_acc": 0.7651995643538984, "train_speed(iter/s)": 0.135641 }, { "epoch": 1.0522660403738293, "grad_norm": 0.6706491112709045, "learning_rate": 4.981280548076267e-05, "loss": 0.7385478973388672, "memory(GiB)": 91.52, "step": 81095, "token_acc": 0.7937480063795853, "train_speed(iter/s)": 0.13564 }, { "epoch": 1.052330918775485, "grad_norm": 0.7653605341911316, "learning_rate": 4.9807441767104654e-05, "loss": 0.7642183303833008, "memory(GiB)": 91.52, "step": 81100, "token_acc": 0.786178127553434, "train_speed(iter/s)": 0.13564 }, { "epoch": 1.0523957971771407, "grad_norm": 0.6714946031570435, "learning_rate": 4.980207805566258e-05, "loss": 0.7495768070220947, "memory(GiB)": 91.52, "step": 81105, "token_acc": 0.7799233836291142, "train_speed(iter/s)": 0.135638 }, { "epoch": 1.0524606755787964, "grad_norm": 0.7593004703521729, "learning_rate": 4.979671434649819e-05, "loss": 0.7638710975646973, "memory(GiB)": 91.52, "step": 81110, "token_acc": 0.7796880667392093, "train_speed(iter/s)": 0.135637 }, { "epoch": 1.052525553980452, "grad_norm": 0.8097706437110901, "learning_rate": 4.9791350639673206e-05, "loss": 0.7527228355407715, "memory(GiB)": 91.52, "step": 81115, "token_acc": 0.7932182658331068, "train_speed(iter/s)": 0.135637 }, { "epoch": 1.0525904323821078, "grad_norm": 0.813147246837616, "learning_rate": 4.978598693524933e-05, "loss": 0.7860183238983154, "memory(GiB)": 91.52, "step": 81120, "token_acc": 0.785809906291834, "train_speed(iter/s)": 0.135636 }, { "epoch": 1.0526553107837635, "grad_norm": 0.7799100875854492, "learning_rate": 4.9780623233288295e-05, "loss": 0.7773765563964844, "memory(GiB)": 91.52, "step": 81125, "token_acc": 0.7853115618364209, "train_speed(iter/s)": 0.135635 }, { "epoch": 1.0527201891854192, "grad_norm": 0.6617407202720642, "learning_rate": 4.9775259533851845e-05, "loss": 0.7502443313598632, "memory(GiB)": 91.52, "step": 81130, "token_acc": 0.7840679176419248, "train_speed(iter/s)": 0.135634 }, { "epoch": 1.052785067587075, "grad_norm": 0.7157840728759766, "learning_rate": 4.9769895837001684e-05, "loss": 0.748545789718628, "memory(GiB)": 91.52, "step": 81135, "token_acc": 0.7803322149041051, "train_speed(iter/s)": 0.135633 }, { "epoch": 1.0528499459887306, "grad_norm": 0.6992666125297546, "learning_rate": 4.976453214279956e-05, "loss": 0.769019889831543, "memory(GiB)": 91.52, "step": 81140, "token_acc": 0.7659634516333874, "train_speed(iter/s)": 0.135632 }, { "epoch": 1.0529148243903863, "grad_norm": 0.7447926998138428, "learning_rate": 4.975916845130717e-05, "loss": 0.7336762905120849, "memory(GiB)": 91.52, "step": 81145, "token_acc": 0.8045043536503684, "train_speed(iter/s)": 0.135631 }, { "epoch": 1.052979702792042, "grad_norm": 0.7011080980300903, "learning_rate": 4.975380476258628e-05, "loss": 0.7526142120361328, "memory(GiB)": 91.52, "step": 81150, "token_acc": 0.7772184457100864, "train_speed(iter/s)": 0.13563 }, { "epoch": 1.0530445811936977, "grad_norm": 0.6941817402839661, "learning_rate": 4.9748441076698574e-05, "loss": 0.7666382789611816, "memory(GiB)": 91.52, "step": 81155, "token_acc": 0.7677312177922431, "train_speed(iter/s)": 0.135628 }, { "epoch": 1.0531094595953534, "grad_norm": 0.854539692401886, "learning_rate": 4.9743077393705785e-05, "loss": 0.7588274478912354, "memory(GiB)": 91.52, "step": 81160, "token_acc": 0.7690475434306222, "train_speed(iter/s)": 0.135628 }, { "epoch": 1.053174337997009, "grad_norm": 0.7605555653572083, "learning_rate": 4.9737713713669655e-05, "loss": 0.7968343257904053, "memory(GiB)": 91.52, "step": 81165, "token_acc": 0.7712300914944087, "train_speed(iter/s)": 0.135627 }, { "epoch": 1.0532392163986648, "grad_norm": 0.734922468662262, "learning_rate": 4.973235003665188e-05, "loss": 0.7738957405090332, "memory(GiB)": 91.52, "step": 81170, "token_acc": 0.7930759719741722, "train_speed(iter/s)": 0.135626 }, { "epoch": 1.0533040948003205, "grad_norm": 0.7536882162094116, "learning_rate": 4.9726986362714214e-05, "loss": 0.7824395179748536, "memory(GiB)": 91.52, "step": 81175, "token_acc": 0.7679997135798933, "train_speed(iter/s)": 0.135625 }, { "epoch": 1.0533689732019762, "grad_norm": 0.7923726439476013, "learning_rate": 4.972162269191839e-05, "loss": 0.7944036483764648, "memory(GiB)": 91.52, "step": 81180, "token_acc": 0.7844274809160305, "train_speed(iter/s)": 0.135624 }, { "epoch": 1.053433851603632, "grad_norm": 0.7208966612815857, "learning_rate": 4.971625902432611e-05, "loss": 0.7178873538970947, "memory(GiB)": 91.52, "step": 81185, "token_acc": 0.7919309905011245, "train_speed(iter/s)": 0.135623 }, { "epoch": 1.0534987300052876, "grad_norm": 0.6970635652542114, "learning_rate": 4.971089535999908e-05, "loss": 0.7493049144744873, "memory(GiB)": 91.52, "step": 81190, "token_acc": 0.7828660716968658, "train_speed(iter/s)": 0.135622 }, { "epoch": 1.0535636084069433, "grad_norm": 0.7116245627403259, "learning_rate": 4.970553169899906e-05, "loss": 0.7755885124206543, "memory(GiB)": 91.52, "step": 81195, "token_acc": 0.7859105973648047, "train_speed(iter/s)": 0.135621 }, { "epoch": 1.053628486808599, "grad_norm": 0.7112133502960205, "learning_rate": 4.970016804138776e-05, "loss": 0.7339110374450684, "memory(GiB)": 91.52, "step": 81200, "token_acc": 0.769815676024305, "train_speed(iter/s)": 0.13562 }, { "epoch": 1.0536933652102547, "grad_norm": 0.7254423499107361, "learning_rate": 4.96948043872269e-05, "loss": 0.7667679786682129, "memory(GiB)": 91.52, "step": 81205, "token_acc": 0.7842291644752989, "train_speed(iter/s)": 0.135619 }, { "epoch": 1.0537582436119104, "grad_norm": 0.7490929961204529, "learning_rate": 4.968944073657822e-05, "loss": 0.7729598045349121, "memory(GiB)": 91.52, "step": 81210, "token_acc": 0.7775277636640778, "train_speed(iter/s)": 0.135618 }, { "epoch": 1.053823122013566, "grad_norm": 0.6217684745788574, "learning_rate": 4.968407708950344e-05, "loss": 0.7605720043182373, "memory(GiB)": 91.52, "step": 81215, "token_acc": 0.7935132295975313, "train_speed(iter/s)": 0.135617 }, { "epoch": 1.0538880004152218, "grad_norm": 0.6702299118041992, "learning_rate": 4.967871344606429e-05, "loss": 0.7516839981079102, "memory(GiB)": 91.52, "step": 81220, "token_acc": 0.7719339622641509, "train_speed(iter/s)": 0.135616 }, { "epoch": 1.0539528788168775, "grad_norm": 0.6822916865348816, "learning_rate": 4.967334980632246e-05, "loss": 0.7923895835876464, "memory(GiB)": 91.52, "step": 81225, "token_acc": 0.7641641368489273, "train_speed(iter/s)": 0.135615 }, { "epoch": 1.0540177572185332, "grad_norm": 0.734447717666626, "learning_rate": 4.9667986170339716e-05, "loss": 0.7642396926879883, "memory(GiB)": 91.52, "step": 81230, "token_acc": 0.7635420045410315, "train_speed(iter/s)": 0.135614 }, { "epoch": 1.054082635620189, "grad_norm": 0.6942445635795593, "learning_rate": 4.966262253817776e-05, "loss": 0.7460093975067139, "memory(GiB)": 91.52, "step": 81235, "token_acc": 0.766850954256286, "train_speed(iter/s)": 0.135614 }, { "epoch": 1.0541475140218446, "grad_norm": 0.6755661368370056, "learning_rate": 4.965725890989831e-05, "loss": 0.7980900287628174, "memory(GiB)": 91.52, "step": 81240, "token_acc": 0.7959995286631709, "train_speed(iter/s)": 0.135613 }, { "epoch": 1.0542123924235003, "grad_norm": 0.669464111328125, "learning_rate": 4.965189528556312e-05, "loss": 0.7850618839263916, "memory(GiB)": 91.52, "step": 81245, "token_acc": 0.7781056571181805, "train_speed(iter/s)": 0.135612 }, { "epoch": 1.054277270825156, "grad_norm": 0.6641419529914856, "learning_rate": 4.964653166523389e-05, "loss": 0.7424266815185547, "memory(GiB)": 91.52, "step": 81250, "token_acc": 0.7691253710892898, "train_speed(iter/s)": 0.135611 }, { "epoch": 1.0543421492268117, "grad_norm": 0.7283433079719543, "learning_rate": 4.964116804897235e-05, "loss": 0.7325877189636231, "memory(GiB)": 91.52, "step": 81255, "token_acc": 0.7802016498625115, "train_speed(iter/s)": 0.13561 }, { "epoch": 1.0544070276284674, "grad_norm": 0.6916073560714722, "learning_rate": 4.963580443684024e-05, "loss": 0.7924105167388916, "memory(GiB)": 91.52, "step": 81260, "token_acc": 0.7795192645321927, "train_speed(iter/s)": 0.135609 }, { "epoch": 1.054471906030123, "grad_norm": 0.7094005346298218, "learning_rate": 4.9630440828899254e-05, "loss": 0.7761828422546386, "memory(GiB)": 91.52, "step": 81265, "token_acc": 0.7908322956281763, "train_speed(iter/s)": 0.135608 }, { "epoch": 1.0545367844317788, "grad_norm": 0.6939356327056885, "learning_rate": 4.962507722521114e-05, "loss": 0.7701343536376953, "memory(GiB)": 91.52, "step": 81270, "token_acc": 0.7697275213337287, "train_speed(iter/s)": 0.135607 }, { "epoch": 1.0546016628334345, "grad_norm": 0.7779294848442078, "learning_rate": 4.9619713625837595e-05, "loss": 0.7582773685455322, "memory(GiB)": 91.52, "step": 81275, "token_acc": 0.7915741230389565, "train_speed(iter/s)": 0.135606 }, { "epoch": 1.0546665412350902, "grad_norm": 0.723688542842865, "learning_rate": 4.9614350030840375e-05, "loss": 0.7257180690765381, "memory(GiB)": 91.52, "step": 81280, "token_acc": 0.7662273844092026, "train_speed(iter/s)": 0.135605 }, { "epoch": 1.0547314196367459, "grad_norm": 0.6513846516609192, "learning_rate": 4.9608986440281194e-05, "loss": 0.7094307899475097, "memory(GiB)": 91.52, "step": 81285, "token_acc": 0.7783792453586752, "train_speed(iter/s)": 0.135604 }, { "epoch": 1.0547962980384016, "grad_norm": 0.7298066020011902, "learning_rate": 4.960362285422176e-05, "loss": 0.7767674446105957, "memory(GiB)": 91.52, "step": 81290, "token_acc": 0.7712921199257098, "train_speed(iter/s)": 0.135603 }, { "epoch": 1.0548611764400573, "grad_norm": 0.808903694152832, "learning_rate": 4.9598259272723816e-05, "loss": 0.7496402740478516, "memory(GiB)": 91.52, "step": 81295, "token_acc": 0.7674344464812306, "train_speed(iter/s)": 0.135602 }, { "epoch": 1.054926054841713, "grad_norm": 0.7285813093185425, "learning_rate": 4.959289569584908e-05, "loss": 0.7632663726806641, "memory(GiB)": 91.52, "step": 81300, "token_acc": 0.7912997147961263, "train_speed(iter/s)": 0.135601 }, { "epoch": 1.0549909332433687, "grad_norm": 0.7060366868972778, "learning_rate": 4.9587532123659275e-05, "loss": 0.7685245513916016, "memory(GiB)": 91.52, "step": 81305, "token_acc": 0.777379006467202, "train_speed(iter/s)": 0.1356 }, { "epoch": 1.0550558116450244, "grad_norm": 0.6852076053619385, "learning_rate": 4.958216855621611e-05, "loss": 0.8045031547546386, "memory(GiB)": 91.52, "step": 81310, "token_acc": 0.7650483220455064, "train_speed(iter/s)": 0.1356 }, { "epoch": 1.05512069004668, "grad_norm": 0.7403397560119629, "learning_rate": 4.9576804993581337e-05, "loss": 0.8098220825195312, "memory(GiB)": 91.52, "step": 81315, "token_acc": 0.7617396473152368, "train_speed(iter/s)": 0.135599 }, { "epoch": 1.0551855684483358, "grad_norm": 0.6829013228416443, "learning_rate": 4.957144143581666e-05, "loss": 0.7363306045532226, "memory(GiB)": 91.52, "step": 81320, "token_acc": 0.7944286526435475, "train_speed(iter/s)": 0.135598 }, { "epoch": 1.0552504468499915, "grad_norm": 0.6442685127258301, "learning_rate": 4.95660778829838e-05, "loss": 0.7908729553222656, "memory(GiB)": 91.52, "step": 81325, "token_acc": 0.7949661908339595, "train_speed(iter/s)": 0.135598 }, { "epoch": 1.0553153252516472, "grad_norm": 0.729356050491333, "learning_rate": 4.956071433514451e-05, "loss": 0.784150505065918, "memory(GiB)": 91.52, "step": 81330, "token_acc": 0.7862674573085772, "train_speed(iter/s)": 0.135597 }, { "epoch": 1.0553802036533029, "grad_norm": 0.7141989469528198, "learning_rate": 4.955535079236048e-05, "loss": 0.7849835395812989, "memory(GiB)": 91.52, "step": 81335, "token_acc": 0.7584732340829902, "train_speed(iter/s)": 0.135596 }, { "epoch": 1.0554450820549586, "grad_norm": 0.7408648133277893, "learning_rate": 4.9549987254693445e-05, "loss": 0.7576370239257812, "memory(GiB)": 91.52, "step": 81340, "token_acc": 0.7647555484078482, "train_speed(iter/s)": 0.135595 }, { "epoch": 1.0555099604566143, "grad_norm": 0.7688354253768921, "learning_rate": 4.954462372220511e-05, "loss": 0.77403564453125, "memory(GiB)": 91.52, "step": 81345, "token_acc": 0.7709215836696575, "train_speed(iter/s)": 0.135593 }, { "epoch": 1.05557483885827, "grad_norm": 0.7038239240646362, "learning_rate": 4.953926019495723e-05, "loss": 0.7093947410583497, "memory(GiB)": 91.52, "step": 81350, "token_acc": 0.7955042044626895, "train_speed(iter/s)": 0.135592 }, { "epoch": 1.0556397172599257, "grad_norm": 0.6307413578033447, "learning_rate": 4.953389667301152e-05, "loss": 0.7662993431091308, "memory(GiB)": 91.52, "step": 81355, "token_acc": 0.7872607862238004, "train_speed(iter/s)": 0.135591 }, { "epoch": 1.0557045956615814, "grad_norm": 0.7674939632415771, "learning_rate": 4.952853315642968e-05, "loss": 0.7582688331604004, "memory(GiB)": 91.52, "step": 81360, "token_acc": 0.7714360177221787, "train_speed(iter/s)": 0.13559 }, { "epoch": 1.055769474063237, "grad_norm": 0.7784945368766785, "learning_rate": 4.952316964527347e-05, "loss": 0.714875602722168, "memory(GiB)": 91.52, "step": 81365, "token_acc": 0.785818660042376, "train_speed(iter/s)": 0.135589 }, { "epoch": 1.0558343524648928, "grad_norm": 0.719404935836792, "learning_rate": 4.951780613960459e-05, "loss": 0.77784104347229, "memory(GiB)": 91.52, "step": 81370, "token_acc": 0.7794871794871795, "train_speed(iter/s)": 0.135588 }, { "epoch": 1.0558992308665485, "grad_norm": 0.6783004999160767, "learning_rate": 4.951244263948476e-05, "loss": 0.7078537940979004, "memory(GiB)": 91.52, "step": 81375, "token_acc": 0.8269721857961212, "train_speed(iter/s)": 0.135587 }, { "epoch": 1.0559641092682042, "grad_norm": 0.6414528489112854, "learning_rate": 4.95070791449757e-05, "loss": 0.7689896583557129, "memory(GiB)": 91.52, "step": 81380, "token_acc": 0.7751940647111355, "train_speed(iter/s)": 0.135586 }, { "epoch": 1.0560289876698599, "grad_norm": 0.6984164118766785, "learning_rate": 4.950171565613915e-05, "loss": 0.7664527893066406, "memory(GiB)": 91.52, "step": 81385, "token_acc": 0.7778389538885065, "train_speed(iter/s)": 0.135585 }, { "epoch": 1.0560938660715156, "grad_norm": 0.7698705196380615, "learning_rate": 4.949635217303683e-05, "loss": 0.76150484085083, "memory(GiB)": 91.52, "step": 81390, "token_acc": 0.792271540469974, "train_speed(iter/s)": 0.135584 }, { "epoch": 1.0561587444731713, "grad_norm": 0.7933986186981201, "learning_rate": 4.949098869573044e-05, "loss": 0.8045093536376953, "memory(GiB)": 91.52, "step": 81395, "token_acc": 0.7777870563674322, "train_speed(iter/s)": 0.135583 }, { "epoch": 1.056223622874827, "grad_norm": 0.7010364532470703, "learning_rate": 4.948562522428174e-05, "loss": 0.7887187480926514, "memory(GiB)": 91.52, "step": 81400, "token_acc": 0.7922308468794598, "train_speed(iter/s)": 0.135582 }, { "epoch": 1.0562885012764824, "grad_norm": 0.7593325972557068, "learning_rate": 4.948026175875243e-05, "loss": 0.7747961044311523, "memory(GiB)": 91.52, "step": 81405, "token_acc": 0.7917230352970115, "train_speed(iter/s)": 0.135581 }, { "epoch": 1.0563533796781384, "grad_norm": 0.7180027961730957, "learning_rate": 4.947489829920423e-05, "loss": 0.7511000633239746, "memory(GiB)": 91.52, "step": 81410, "token_acc": 0.7798020615732338, "train_speed(iter/s)": 0.13558 }, { "epoch": 1.0564182580797938, "grad_norm": 0.6832510828971863, "learning_rate": 4.946953484569885e-05, "loss": 0.7887825965881348, "memory(GiB)": 91.52, "step": 81415, "token_acc": 0.7771596352959357, "train_speed(iter/s)": 0.135579 }, { "epoch": 1.0564831364814495, "grad_norm": 0.6692165732383728, "learning_rate": 4.946417139829804e-05, "loss": 0.7297235012054444, "memory(GiB)": 91.52, "step": 81420, "token_acc": 0.7942475638129278, "train_speed(iter/s)": 0.135578 }, { "epoch": 1.0565480148831052, "grad_norm": 0.8551298379898071, "learning_rate": 4.945880795706352e-05, "loss": 0.75442476272583, "memory(GiB)": 91.52, "step": 81425, "token_acc": 0.7954386781475448, "train_speed(iter/s)": 0.135577 }, { "epoch": 1.056612893284761, "grad_norm": 0.6435257792472839, "learning_rate": 4.9453444522056984e-05, "loss": 0.754216480255127, "memory(GiB)": 91.52, "step": 81430, "token_acc": 0.7633227157224151, "train_speed(iter/s)": 0.135576 }, { "epoch": 1.0566777716864166, "grad_norm": 0.7604333758354187, "learning_rate": 4.9448081093340185e-05, "loss": 0.747586441040039, "memory(GiB)": 91.52, "step": 81435, "token_acc": 0.7733333333333333, "train_speed(iter/s)": 0.135575 }, { "epoch": 1.0567426500880723, "grad_norm": 0.680002748966217, "learning_rate": 4.9442717670974845e-05, "loss": 0.7586349487304688, "memory(GiB)": 91.52, "step": 81440, "token_acc": 0.7980320598697279, "train_speed(iter/s)": 0.135575 }, { "epoch": 1.056807528489728, "grad_norm": 0.6804476976394653, "learning_rate": 4.943735425502266e-05, "loss": 0.7330291748046875, "memory(GiB)": 91.52, "step": 81445, "token_acc": 0.7876267748478701, "train_speed(iter/s)": 0.135573 }, { "epoch": 1.0568724068913837, "grad_norm": 0.7225799560546875, "learning_rate": 4.9431990845545364e-05, "loss": 0.7424246788024902, "memory(GiB)": 91.52, "step": 81450, "token_acc": 0.7839406532266373, "train_speed(iter/s)": 0.135573 }, { "epoch": 1.0569372852930394, "grad_norm": 0.6517876386642456, "learning_rate": 4.942662744260469e-05, "loss": 0.7341450691223145, "memory(GiB)": 91.52, "step": 81455, "token_acc": 0.7713103559090544, "train_speed(iter/s)": 0.135572 }, { "epoch": 1.0570021636946951, "grad_norm": 0.6891558170318604, "learning_rate": 4.942126404626234e-05, "loss": 0.7663050651550293, "memory(GiB)": 91.52, "step": 81460, "token_acc": 0.7817932602300081, "train_speed(iter/s)": 0.135571 }, { "epoch": 1.0570670420963508, "grad_norm": 0.7231773138046265, "learning_rate": 4.941590065658005e-05, "loss": 0.7915240287780761, "memory(GiB)": 91.52, "step": 81465, "token_acc": 0.7821929530825613, "train_speed(iter/s)": 0.13557 }, { "epoch": 1.0571319204980065, "grad_norm": 0.7221091985702515, "learning_rate": 4.9410537273619545e-05, "loss": 0.7743100643157959, "memory(GiB)": 91.52, "step": 81470, "token_acc": 0.7762853027741249, "train_speed(iter/s)": 0.13557 }, { "epoch": 1.0571967988996622, "grad_norm": 0.6812413930892944, "learning_rate": 4.9405173897442546e-05, "loss": 0.7369885444641113, "memory(GiB)": 91.52, "step": 81475, "token_acc": 0.7874969542970518, "train_speed(iter/s)": 0.135569 }, { "epoch": 1.057261677301318, "grad_norm": 0.7242047786712646, "learning_rate": 4.939981052811076e-05, "loss": 0.7166065692901611, "memory(GiB)": 91.52, "step": 81480, "token_acc": 0.7981567172740831, "train_speed(iter/s)": 0.135568 }, { "epoch": 1.0573265557029736, "grad_norm": 0.818869411945343, "learning_rate": 4.93944471656859e-05, "loss": 0.7717620849609375, "memory(GiB)": 91.52, "step": 81485, "token_acc": 0.7773177135932134, "train_speed(iter/s)": 0.135567 }, { "epoch": 1.0573914341046293, "grad_norm": 0.7179471850395203, "learning_rate": 4.9389083810229716e-05, "loss": 0.7598649024963379, "memory(GiB)": 91.52, "step": 81490, "token_acc": 0.7909816363811172, "train_speed(iter/s)": 0.135566 }, { "epoch": 1.057456312506285, "grad_norm": 0.71434485912323, "learning_rate": 4.938372046180391e-05, "loss": 0.7716831207275391, "memory(GiB)": 91.52, "step": 81495, "token_acc": 0.7693786258474873, "train_speed(iter/s)": 0.135565 }, { "epoch": 1.0575211909079407, "grad_norm": 0.6766026020050049, "learning_rate": 4.937835712047021e-05, "loss": 0.7302968978881836, "memory(GiB)": 91.52, "step": 81500, "token_acc": 0.7936304874764342, "train_speed(iter/s)": 0.135563 }, { "epoch": 1.0575860693095964, "grad_norm": 0.6866010427474976, "learning_rate": 4.937299378629034e-05, "loss": 0.7910798072814942, "memory(GiB)": 91.52, "step": 81505, "token_acc": 0.777456542951104, "train_speed(iter/s)": 0.135563 }, { "epoch": 1.0576509477112521, "grad_norm": 0.7979319095611572, "learning_rate": 4.936763045932603e-05, "loss": 0.7597405910491943, "memory(GiB)": 91.52, "step": 81510, "token_acc": 0.8025458144217316, "train_speed(iter/s)": 0.135562 }, { "epoch": 1.0577158261129078, "grad_norm": 0.664604663848877, "learning_rate": 4.936226713963898e-05, "loss": 0.7663152694702149, "memory(GiB)": 91.52, "step": 81515, "token_acc": 0.7789502915856706, "train_speed(iter/s)": 0.135561 }, { "epoch": 1.0577807045145635, "grad_norm": 0.7650021910667419, "learning_rate": 4.93569038272909e-05, "loss": 0.7481083869934082, "memory(GiB)": 91.52, "step": 81520, "token_acc": 0.7841942251973224, "train_speed(iter/s)": 0.135561 }, { "epoch": 1.0578455829162192, "grad_norm": 0.7399307489395142, "learning_rate": 4.935154052234355e-05, "loss": 0.7634332656860352, "memory(GiB)": 91.52, "step": 81525, "token_acc": 0.7739487338341207, "train_speed(iter/s)": 0.13556 }, { "epoch": 1.057910461317875, "grad_norm": 0.7457265853881836, "learning_rate": 4.9346177224858613e-05, "loss": 0.7702818393707276, "memory(GiB)": 91.52, "step": 81530, "token_acc": 0.7741004950688029, "train_speed(iter/s)": 0.135559 }, { "epoch": 1.0579753397195306, "grad_norm": 0.7142650485038757, "learning_rate": 4.934081393489784e-05, "loss": 0.7988316535949707, "memory(GiB)": 91.52, "step": 81535, "token_acc": 0.7676099166457983, "train_speed(iter/s)": 0.135558 }, { "epoch": 1.0580402181211863, "grad_norm": 0.7783240079879761, "learning_rate": 4.9335450652522936e-05, "loss": 0.8080120086669922, "memory(GiB)": 91.52, "step": 81540, "token_acc": 0.7592068878833584, "train_speed(iter/s)": 0.135557 }, { "epoch": 1.058105096522842, "grad_norm": 0.7329580783843994, "learning_rate": 4.933008737779563e-05, "loss": 0.7754592418670654, "memory(GiB)": 91.52, "step": 81545, "token_acc": 0.7656986675777246, "train_speed(iter/s)": 0.135556 }, { "epoch": 1.0581699749244977, "grad_norm": 0.762951672077179, "learning_rate": 4.932472411077763e-05, "loss": 0.7681626796722412, "memory(GiB)": 91.52, "step": 81550, "token_acc": 0.7798149299081555, "train_speed(iter/s)": 0.135555 }, { "epoch": 1.0582348533261534, "grad_norm": 0.7857989072799683, "learning_rate": 4.931936085153065e-05, "loss": 0.777832317352295, "memory(GiB)": 91.52, "step": 81555, "token_acc": 0.7872953305033353, "train_speed(iter/s)": 0.135554 }, { "epoch": 1.0582997317278091, "grad_norm": 0.717637836933136, "learning_rate": 4.931399760011643e-05, "loss": 0.7635963916778564, "memory(GiB)": 91.52, "step": 81560, "token_acc": 0.7861832435080842, "train_speed(iter/s)": 0.135553 }, { "epoch": 1.0583646101294648, "grad_norm": 0.6732408404350281, "learning_rate": 4.9308634356596673e-05, "loss": 0.8016821861267089, "memory(GiB)": 91.52, "step": 81565, "token_acc": 0.7805741660013191, "train_speed(iter/s)": 0.135552 }, { "epoch": 1.0584294885311205, "grad_norm": 0.6684451103210449, "learning_rate": 4.930327112103312e-05, "loss": 0.7596005439758301, "memory(GiB)": 91.52, "step": 81570, "token_acc": 0.7867064188004209, "train_speed(iter/s)": 0.135551 }, { "epoch": 1.0584943669327762, "grad_norm": 0.7190641760826111, "learning_rate": 4.929790789348747e-05, "loss": 0.7972587585449219, "memory(GiB)": 91.52, "step": 81575, "token_acc": 0.7724033463008229, "train_speed(iter/s)": 0.13555 }, { "epoch": 1.058559245334432, "grad_norm": 0.7320365309715271, "learning_rate": 4.929254467402148e-05, "loss": 0.751095199584961, "memory(GiB)": 91.52, "step": 81580, "token_acc": 0.7686126853505163, "train_speed(iter/s)": 0.135549 }, { "epoch": 1.0586241237360876, "grad_norm": 0.776622474193573, "learning_rate": 4.928718146269681e-05, "loss": 0.7700712203979492, "memory(GiB)": 91.52, "step": 81585, "token_acc": 0.7644767760002575, "train_speed(iter/s)": 0.135548 }, { "epoch": 1.0586890021377433, "grad_norm": 0.7103320956230164, "learning_rate": 4.92818182595752e-05, "loss": 0.7843049526214599, "memory(GiB)": 91.52, "step": 81590, "token_acc": 0.7648191497394851, "train_speed(iter/s)": 0.135548 }, { "epoch": 1.058753880539399, "grad_norm": 0.7191620469093323, "learning_rate": 4.9276455064718405e-05, "loss": 0.7783380031585694, "memory(GiB)": 91.52, "step": 81595, "token_acc": 0.78076171875, "train_speed(iter/s)": 0.135546 }, { "epoch": 1.0588187589410547, "grad_norm": 0.7001842260360718, "learning_rate": 4.927109187818809e-05, "loss": 0.759756326675415, "memory(GiB)": 91.52, "step": 81600, "token_acc": 0.7814096016343207, "train_speed(iter/s)": 0.135545 }, { "epoch": 1.0588836373427104, "grad_norm": 0.7295702695846558, "learning_rate": 4.926572870004603e-05, "loss": 0.7746411323547363, "memory(GiB)": 91.52, "step": 81605, "token_acc": 0.7925332644361194, "train_speed(iter/s)": 0.135544 }, { "epoch": 1.058948515744366, "grad_norm": 0.6884212493896484, "learning_rate": 4.92603655303539e-05, "loss": 0.7744997024536133, "memory(GiB)": 91.52, "step": 81610, "token_acc": 0.7835839033366897, "train_speed(iter/s)": 0.135544 }, { "epoch": 1.0590133941460218, "grad_norm": 0.764630913734436, "learning_rate": 4.9255002369173436e-05, "loss": 0.7806317329406738, "memory(GiB)": 91.52, "step": 81615, "token_acc": 0.7742386970905081, "train_speed(iter/s)": 0.135543 }, { "epoch": 1.0590782725476775, "grad_norm": 0.702872097492218, "learning_rate": 4.924963921656638e-05, "loss": 0.747600507736206, "memory(GiB)": 91.52, "step": 81620, "token_acc": 0.793723415662866, "train_speed(iter/s)": 0.135541 }, { "epoch": 1.0591431509493332, "grad_norm": 0.6677613258361816, "learning_rate": 4.9244276072594416e-05, "loss": 0.7919340610504151, "memory(GiB)": 91.52, "step": 81625, "token_acc": 0.7630353945737501, "train_speed(iter/s)": 0.13554 }, { "epoch": 1.059208029350989, "grad_norm": 0.8157834410667419, "learning_rate": 4.9238912937319266e-05, "loss": 0.7766669273376465, "memory(GiB)": 91.52, "step": 81630, "token_acc": 0.788287507367983, "train_speed(iter/s)": 0.135539 }, { "epoch": 1.0592729077526446, "grad_norm": 0.7608585953712463, "learning_rate": 4.9233549810802654e-05, "loss": 0.7661555290222168, "memory(GiB)": 91.52, "step": 81635, "token_acc": 0.7734938711640441, "train_speed(iter/s)": 0.135538 }, { "epoch": 1.0593377861543003, "grad_norm": 0.645785927772522, "learning_rate": 4.922818669310631e-05, "loss": 0.7692509651184082, "memory(GiB)": 91.52, "step": 81640, "token_acc": 0.7890196078431373, "train_speed(iter/s)": 0.135537 }, { "epoch": 1.059402664555956, "grad_norm": 0.6953866481781006, "learning_rate": 4.922282358429194e-05, "loss": 0.7413348197937012, "memory(GiB)": 91.52, "step": 81645, "token_acc": 0.7825884209771146, "train_speed(iter/s)": 0.135536 }, { "epoch": 1.0594675429576117, "grad_norm": 0.5815725922584534, "learning_rate": 4.9217460484421265e-05, "loss": 0.7585153579711914, "memory(GiB)": 91.52, "step": 81650, "token_acc": 0.773641361074386, "train_speed(iter/s)": 0.135535 }, { "epoch": 1.0595324213592674, "grad_norm": 0.7554927468299866, "learning_rate": 4.921209739355602e-05, "loss": 0.7504354000091553, "memory(GiB)": 91.52, "step": 81655, "token_acc": 0.7865328944759488, "train_speed(iter/s)": 0.135534 }, { "epoch": 1.059597299760923, "grad_norm": 0.6751965880393982, "learning_rate": 4.9206734311757894e-05, "loss": 0.7670783519744873, "memory(GiB)": 91.52, "step": 81660, "token_acc": 0.7732911311464927, "train_speed(iter/s)": 0.135533 }, { "epoch": 1.0596621781625788, "grad_norm": 0.7606663107872009, "learning_rate": 4.920137123908862e-05, "loss": 0.7696501255035401, "memory(GiB)": 91.52, "step": 81665, "token_acc": 0.7725702016304895, "train_speed(iter/s)": 0.135532 }, { "epoch": 1.0597270565642345, "grad_norm": 0.7741261720657349, "learning_rate": 4.91960081756099e-05, "loss": 0.7815265655517578, "memory(GiB)": 91.52, "step": 81670, "token_acc": 0.772358647870244, "train_speed(iter/s)": 0.135531 }, { "epoch": 1.0597919349658902, "grad_norm": 0.702896237373352, "learning_rate": 4.919064512138349e-05, "loss": 0.7848117828369141, "memory(GiB)": 91.52, "step": 81675, "token_acc": 0.7804784115708845, "train_speed(iter/s)": 0.135531 }, { "epoch": 1.059856813367546, "grad_norm": 0.6728907823562622, "learning_rate": 4.918528207647107e-05, "loss": 0.716639232635498, "memory(GiB)": 91.52, "step": 81680, "token_acc": 0.7943281938325991, "train_speed(iter/s)": 0.13553 }, { "epoch": 1.0599216917692016, "grad_norm": 0.664400041103363, "learning_rate": 4.917991904093437e-05, "loss": 0.7144608497619629, "memory(GiB)": 91.52, "step": 81685, "token_acc": 0.7818409126426004, "train_speed(iter/s)": 0.135529 }, { "epoch": 1.0599865701708573, "grad_norm": 0.7656662464141846, "learning_rate": 4.9174556014835126e-05, "loss": 0.7568617343902588, "memory(GiB)": 91.52, "step": 81690, "token_acc": 0.7986856868087835, "train_speed(iter/s)": 0.135528 }, { "epoch": 1.060051448572513, "grad_norm": 0.7927575707435608, "learning_rate": 4.916919299823503e-05, "loss": 0.7205464839935303, "memory(GiB)": 91.52, "step": 81695, "token_acc": 0.7906622277485861, "train_speed(iter/s)": 0.135527 }, { "epoch": 1.0601163269741687, "grad_norm": 0.7077029347419739, "learning_rate": 4.9163829991195805e-05, "loss": 0.7392401695251465, "memory(GiB)": 91.52, "step": 81700, "token_acc": 0.7767814251401121, "train_speed(iter/s)": 0.135526 }, { "epoch": 1.0601812053758244, "grad_norm": 0.7411674857139587, "learning_rate": 4.915846699377916e-05, "loss": 0.7266138553619385, "memory(GiB)": 91.52, "step": 81705, "token_acc": 0.7697063369397218, "train_speed(iter/s)": 0.135525 }, { "epoch": 1.06024608377748, "grad_norm": 0.7814721465110779, "learning_rate": 4.915310400604683e-05, "loss": 0.7699197292327881, "memory(GiB)": 91.52, "step": 81710, "token_acc": 0.7763347186700768, "train_speed(iter/s)": 0.135524 }, { "epoch": 1.0603109621791358, "grad_norm": 0.6932356357574463, "learning_rate": 4.914774102806054e-05, "loss": 0.7727042675018311, "memory(GiB)": 91.52, "step": 81715, "token_acc": 0.7709000785830865, "train_speed(iter/s)": 0.135522 }, { "epoch": 1.0603758405807915, "grad_norm": 0.6579569578170776, "learning_rate": 4.914237805988196e-05, "loss": 0.7852666854858399, "memory(GiB)": 91.52, "step": 81720, "token_acc": 0.7709620476610768, "train_speed(iter/s)": 0.135521 }, { "epoch": 1.0604407189824472, "grad_norm": 0.6880123019218445, "learning_rate": 4.913701510157287e-05, "loss": 0.7999283313751221, "memory(GiB)": 91.52, "step": 81725, "token_acc": 0.777773842830329, "train_speed(iter/s)": 0.13552 }, { "epoch": 1.0605055973841029, "grad_norm": 0.7173017859458923, "learning_rate": 4.9131652153194946e-05, "loss": 0.7860030651092529, "memory(GiB)": 91.52, "step": 81730, "token_acc": 0.7898760608918227, "train_speed(iter/s)": 0.135519 }, { "epoch": 1.0605704757857586, "grad_norm": 0.7860402464866638, "learning_rate": 4.912628921480991e-05, "loss": 0.7228653430938721, "memory(GiB)": 91.52, "step": 81735, "token_acc": 0.7981300782844211, "train_speed(iter/s)": 0.135519 }, { "epoch": 1.0606353541874143, "grad_norm": 0.7313002347946167, "learning_rate": 4.912092628647947e-05, "loss": 0.791215181350708, "memory(GiB)": 91.52, "step": 81740, "token_acc": 0.7682322313754859, "train_speed(iter/s)": 0.135518 }, { "epoch": 1.06070023258907, "grad_norm": 0.7422685623168945, "learning_rate": 4.911556336826537e-05, "loss": 0.7907013416290283, "memory(GiB)": 91.52, "step": 81745, "token_acc": 0.7564192960808508, "train_speed(iter/s)": 0.135517 }, { "epoch": 1.0607651109907257, "grad_norm": 0.6652098894119263, "learning_rate": 4.91102004602293e-05, "loss": 0.7513158321380615, "memory(GiB)": 91.52, "step": 81750, "token_acc": 0.7839699228107533, "train_speed(iter/s)": 0.135516 }, { "epoch": 1.0608299893923814, "grad_norm": 0.6620153784751892, "learning_rate": 4.910483756243297e-05, "loss": 0.7501035213470459, "memory(GiB)": 91.52, "step": 81755, "token_acc": 0.7840731884811311, "train_speed(iter/s)": 0.135514 }, { "epoch": 1.060894867794037, "grad_norm": 0.6907922625541687, "learning_rate": 4.909947467493815e-05, "loss": 0.7466144561767578, "memory(GiB)": 91.52, "step": 81760, "token_acc": 0.7864648444099064, "train_speed(iter/s)": 0.135514 }, { "epoch": 1.0609597461956928, "grad_norm": 0.7380203604698181, "learning_rate": 4.90941117978065e-05, "loss": 0.7670942783355713, "memory(GiB)": 91.52, "step": 81765, "token_acc": 0.7833564493758669, "train_speed(iter/s)": 0.135513 }, { "epoch": 1.0610246245973485, "grad_norm": 0.6930800080299377, "learning_rate": 4.9088748931099755e-05, "loss": 0.7480539798736572, "memory(GiB)": 91.52, "step": 81770, "token_acc": 0.7860865536810409, "train_speed(iter/s)": 0.135511 }, { "epoch": 1.0610895029990042, "grad_norm": 0.6159957647323608, "learning_rate": 4.908338607487962e-05, "loss": 0.8120068550109864, "memory(GiB)": 91.52, "step": 81775, "token_acc": 0.7699351024866671, "train_speed(iter/s)": 0.13551 }, { "epoch": 1.0611543814006599, "grad_norm": 0.7164489030838013, "learning_rate": 4.907802322920782e-05, "loss": 0.7513457298278808, "memory(GiB)": 91.52, "step": 81780, "token_acc": 0.7826865043160923, "train_speed(iter/s)": 0.135509 }, { "epoch": 1.0612192598023156, "grad_norm": 0.6586650013923645, "learning_rate": 4.907266039414607e-05, "loss": 0.7189037322998046, "memory(GiB)": 91.52, "step": 81785, "token_acc": 0.7847305150863785, "train_speed(iter/s)": 0.135509 }, { "epoch": 1.0612841382039713, "grad_norm": 0.6657170653343201, "learning_rate": 4.906729756975609e-05, "loss": 0.7251841068267822, "memory(GiB)": 91.52, "step": 81790, "token_acc": 0.8031100006575054, "train_speed(iter/s)": 0.135508 }, { "epoch": 1.061349016605627, "grad_norm": 0.750282883644104, "learning_rate": 4.906193475609959e-05, "loss": 0.7155013084411621, "memory(GiB)": 91.52, "step": 81795, "token_acc": 0.7907734998265695, "train_speed(iter/s)": 0.135507 }, { "epoch": 1.0614138950072827, "grad_norm": 0.7130810022354126, "learning_rate": 4.905657195323829e-05, "loss": 0.7737560272216797, "memory(GiB)": 91.52, "step": 81800, "token_acc": 0.7608464444199948, "train_speed(iter/s)": 0.135506 }, { "epoch": 1.0614787734089384, "grad_norm": 0.705406665802002, "learning_rate": 4.9051209161233894e-05, "loss": 0.753267478942871, "memory(GiB)": 91.52, "step": 81805, "token_acc": 0.7849879829913108, "train_speed(iter/s)": 0.135505 }, { "epoch": 1.061543651810594, "grad_norm": 0.7056085467338562, "learning_rate": 4.9045846380148105e-05, "loss": 0.777228593826294, "memory(GiB)": 91.52, "step": 81810, "token_acc": 0.7757757009345795, "train_speed(iter/s)": 0.135505 }, { "epoch": 1.0616085302122498, "grad_norm": 0.7588369846343994, "learning_rate": 4.9040483610042676e-05, "loss": 0.7982119083404541, "memory(GiB)": 91.52, "step": 81815, "token_acc": 0.7858144866048516, "train_speed(iter/s)": 0.135504 }, { "epoch": 1.0616734086139055, "grad_norm": 0.732617974281311, "learning_rate": 4.9035120850979296e-05, "loss": 0.8157739639282227, "memory(GiB)": 91.52, "step": 81820, "token_acc": 0.7663544966258091, "train_speed(iter/s)": 0.135503 }, { "epoch": 1.0617382870155612, "grad_norm": 0.6949700713157654, "learning_rate": 4.9029758103019665e-05, "loss": 0.7598185539245605, "memory(GiB)": 91.52, "step": 81825, "token_acc": 0.7865980028889028, "train_speed(iter/s)": 0.135503 }, { "epoch": 1.0618031654172169, "grad_norm": 0.659429132938385, "learning_rate": 4.9024395366225534e-05, "loss": 0.7493988990783691, "memory(GiB)": 91.52, "step": 81830, "token_acc": 0.800763382858585, "train_speed(iter/s)": 0.135501 }, { "epoch": 1.0618680438188726, "grad_norm": 0.6711229085922241, "learning_rate": 4.901903264065861e-05, "loss": 0.7930018424987793, "memory(GiB)": 91.52, "step": 81835, "token_acc": 0.7817711837399526, "train_speed(iter/s)": 0.1355 }, { "epoch": 1.0619329222205283, "grad_norm": 0.737237811088562, "learning_rate": 4.901366992638058e-05, "loss": 0.7876171112060547, "memory(GiB)": 91.52, "step": 81840, "token_acc": 0.7853277209861695, "train_speed(iter/s)": 0.135499 }, { "epoch": 1.061997800622184, "grad_norm": 0.7226777672767639, "learning_rate": 4.900830722345316e-05, "loss": 0.7598518371582031, "memory(GiB)": 91.52, "step": 81845, "token_acc": 0.7818766263584876, "train_speed(iter/s)": 0.135498 }, { "epoch": 1.0620626790238397, "grad_norm": 0.7228595018386841, "learning_rate": 4.900294453193809e-05, "loss": 0.7162016868591309, "memory(GiB)": 91.52, "step": 81850, "token_acc": 0.7957985789311091, "train_speed(iter/s)": 0.135498 }, { "epoch": 1.0621275574254954, "grad_norm": 0.7949235439300537, "learning_rate": 4.899758185189706e-05, "loss": 0.7944580078125, "memory(GiB)": 91.52, "step": 81855, "token_acc": 0.7742678275886882, "train_speed(iter/s)": 0.135497 }, { "epoch": 1.062192435827151, "grad_norm": 0.7460607290267944, "learning_rate": 4.899221918339181e-05, "loss": 0.7667782783508301, "memory(GiB)": 91.52, "step": 81860, "token_acc": 0.783859176606387, "train_speed(iter/s)": 0.135496 }, { "epoch": 1.0622573142288068, "grad_norm": 0.6737319231033325, "learning_rate": 4.8986856526484025e-05, "loss": 0.7882328033447266, "memory(GiB)": 91.52, "step": 81865, "token_acc": 0.7643136198302574, "train_speed(iter/s)": 0.135495 }, { "epoch": 1.0623221926304625, "grad_norm": 0.673688530921936, "learning_rate": 4.898149388123545e-05, "loss": 0.7151175498962402, "memory(GiB)": 91.52, "step": 81870, "token_acc": 0.8024753823998755, "train_speed(iter/s)": 0.135494 }, { "epoch": 1.0623870710321182, "grad_norm": 0.6723122000694275, "learning_rate": 4.8976131247707764e-05, "loss": 0.7824352264404297, "memory(GiB)": 91.52, "step": 81875, "token_acc": 0.7591335273197543, "train_speed(iter/s)": 0.135493 }, { "epoch": 1.0624519494337739, "grad_norm": 0.690835177898407, "learning_rate": 4.897076862596268e-05, "loss": 0.775139331817627, "memory(GiB)": 91.52, "step": 81880, "token_acc": 0.7769628862930652, "train_speed(iter/s)": 0.135492 }, { "epoch": 1.0625168278354296, "grad_norm": 0.7580346465110779, "learning_rate": 4.896540601606194e-05, "loss": 0.7874648094177246, "memory(GiB)": 91.52, "step": 81885, "token_acc": 0.78343837535014, "train_speed(iter/s)": 0.135492 }, { "epoch": 1.062581706237085, "grad_norm": 0.7024588584899902, "learning_rate": 4.896004341806723e-05, "loss": 0.7456154823303223, "memory(GiB)": 91.52, "step": 81890, "token_acc": 0.7751731836238879, "train_speed(iter/s)": 0.135491 }, { "epoch": 1.062646584638741, "grad_norm": 0.7670663595199585, "learning_rate": 4.895468083204029e-05, "loss": 0.7761030197143555, "memory(GiB)": 91.52, "step": 81895, "token_acc": 0.7687495317299767, "train_speed(iter/s)": 0.13549 }, { "epoch": 1.0627114630403964, "grad_norm": 0.7505084276199341, "learning_rate": 4.8949318258042806e-05, "loss": 0.7706680297851562, "memory(GiB)": 91.52, "step": 81900, "token_acc": 0.7685705719211453, "train_speed(iter/s)": 0.135489 }, { "epoch": 1.0627763414420521, "grad_norm": 0.6508678793907166, "learning_rate": 4.894395569613651e-05, "loss": 0.7805405139923096, "memory(GiB)": 91.52, "step": 81905, "token_acc": 0.7661245806972103, "train_speed(iter/s)": 0.135488 }, { "epoch": 1.0628412198437078, "grad_norm": 0.7324135899543762, "learning_rate": 4.8938593146383096e-05, "loss": 0.752021312713623, "memory(GiB)": 91.52, "step": 81910, "token_acc": 0.7736988369300443, "train_speed(iter/s)": 0.135487 }, { "epoch": 1.0629060982453635, "grad_norm": 0.6495526432991028, "learning_rate": 4.893323060884427e-05, "loss": 0.7344998836517334, "memory(GiB)": 91.52, "step": 81915, "token_acc": 0.7695574589578872, "train_speed(iter/s)": 0.135485 }, { "epoch": 1.0629709766470192, "grad_norm": 0.8162173628807068, "learning_rate": 4.892786808358177e-05, "loss": 0.7906741142272949, "memory(GiB)": 91.52, "step": 81920, "token_acc": 0.7792174980148704, "train_speed(iter/s)": 0.135484 }, { "epoch": 1.063035855048675, "grad_norm": 0.7419809699058533, "learning_rate": 4.892250557065729e-05, "loss": 0.7833548545837402, "memory(GiB)": 91.52, "step": 81925, "token_acc": 0.7930638584266994, "train_speed(iter/s)": 0.135483 }, { "epoch": 1.0631007334503306, "grad_norm": 0.6864563226699829, "learning_rate": 4.891714307013256e-05, "loss": 0.7420392513275147, "memory(GiB)": 91.52, "step": 81930, "token_acc": 0.7867560375896797, "train_speed(iter/s)": 0.135482 }, { "epoch": 1.0631656118519863, "grad_norm": 0.6505784392356873, "learning_rate": 4.891178058206926e-05, "loss": 0.7644993305206299, "memory(GiB)": 91.52, "step": 81935, "token_acc": 0.7626959574542042, "train_speed(iter/s)": 0.135481 }, { "epoch": 1.063230490253642, "grad_norm": 0.7361071705818176, "learning_rate": 4.8906418106529144e-05, "loss": 0.786791181564331, "memory(GiB)": 91.52, "step": 81940, "token_acc": 0.7723187081048142, "train_speed(iter/s)": 0.13548 }, { "epoch": 1.0632953686552977, "grad_norm": 0.7951660752296448, "learning_rate": 4.890105564357389e-05, "loss": 0.7520317077636719, "memory(GiB)": 91.52, "step": 81945, "token_acc": 0.796317930951134, "train_speed(iter/s)": 0.13548 }, { "epoch": 1.0633602470569534, "grad_norm": 0.7367089986801147, "learning_rate": 4.889569319326519e-05, "loss": 0.796725082397461, "memory(GiB)": 91.52, "step": 81950, "token_acc": 0.76733073091937, "train_speed(iter/s)": 0.135479 }, { "epoch": 1.0634251254586091, "grad_norm": 0.737725555896759, "learning_rate": 4.88903307556648e-05, "loss": 0.7486386299133301, "memory(GiB)": 91.52, "step": 81955, "token_acc": 0.7904401587155117, "train_speed(iter/s)": 0.135478 }, { "epoch": 1.0634900038602648, "grad_norm": 0.6597769856452942, "learning_rate": 4.888496833083441e-05, "loss": 0.7434737205505371, "memory(GiB)": 91.52, "step": 81960, "token_acc": 0.7752836791685814, "train_speed(iter/s)": 0.135477 }, { "epoch": 1.0635548822619205, "grad_norm": 0.751481831073761, "learning_rate": 4.887960591883573e-05, "loss": 0.7382354736328125, "memory(GiB)": 91.52, "step": 81965, "token_acc": 0.789752121111296, "train_speed(iter/s)": 0.135476 }, { "epoch": 1.0636197606635762, "grad_norm": 0.780863344669342, "learning_rate": 4.887424351973049e-05, "loss": 0.8014368057250977, "memory(GiB)": 91.52, "step": 81970, "token_acc": 0.7760586593118294, "train_speed(iter/s)": 0.135475 }, { "epoch": 1.063684639065232, "grad_norm": 0.736621081829071, "learning_rate": 4.886888113358036e-05, "loss": 0.7812507629394532, "memory(GiB)": 91.52, "step": 81975, "token_acc": 0.7740732535814503, "train_speed(iter/s)": 0.135474 }, { "epoch": 1.0637495174668876, "grad_norm": 0.6298269629478455, "learning_rate": 4.8863518760447105e-05, "loss": 0.7519814968109131, "memory(GiB)": 91.52, "step": 81980, "token_acc": 0.7862466556249204, "train_speed(iter/s)": 0.135474 }, { "epoch": 1.0638143958685433, "grad_norm": 0.6818090677261353, "learning_rate": 4.885815640039237e-05, "loss": 0.7552734851837158, "memory(GiB)": 91.52, "step": 81985, "token_acc": 0.7757810624507013, "train_speed(iter/s)": 0.135473 }, { "epoch": 1.063879274270199, "grad_norm": 0.732215940952301, "learning_rate": 4.885279405347792e-05, "loss": 0.7749707698822021, "memory(GiB)": 91.52, "step": 81990, "token_acc": 0.7818491208167896, "train_speed(iter/s)": 0.135472 }, { "epoch": 1.0639441526718547, "grad_norm": 0.707182765007019, "learning_rate": 4.884743171976542e-05, "loss": 0.7476361274719239, "memory(GiB)": 91.52, "step": 81995, "token_acc": 0.7889233954451346, "train_speed(iter/s)": 0.135471 }, { "epoch": 1.0640090310735104, "grad_norm": 0.7063621878623962, "learning_rate": 4.884206939931662e-05, "loss": 0.7488307476043701, "memory(GiB)": 91.52, "step": 82000, "token_acc": 0.7869407057111677, "train_speed(iter/s)": 0.13547 }, { "epoch": 1.0640739094751661, "grad_norm": 0.8292285799980164, "learning_rate": 4.8836707092193206e-05, "loss": 0.7845363616943359, "memory(GiB)": 91.52, "step": 82005, "token_acc": 0.7758736882641764, "train_speed(iter/s)": 0.13547 }, { "epoch": 1.0641387878768218, "grad_norm": 0.7377310991287231, "learning_rate": 4.8831344798456876e-05, "loss": 0.7523958683013916, "memory(GiB)": 91.52, "step": 82010, "token_acc": 0.7924048276503987, "train_speed(iter/s)": 0.135469 }, { "epoch": 1.0642036662784775, "grad_norm": 0.7279332876205444, "learning_rate": 4.882598251816939e-05, "loss": 0.7604662895202636, "memory(GiB)": 91.52, "step": 82015, "token_acc": 0.7623511798673986, "train_speed(iter/s)": 0.135468 }, { "epoch": 1.0642685446801332, "grad_norm": 0.7153233289718628, "learning_rate": 4.88206202513924e-05, "loss": 0.7375532627105713, "memory(GiB)": 91.52, "step": 82020, "token_acc": 0.789983549625297, "train_speed(iter/s)": 0.135466 }, { "epoch": 1.064333423081789, "grad_norm": 0.7283233404159546, "learning_rate": 4.881525799818764e-05, "loss": 0.7776591300964355, "memory(GiB)": 91.52, "step": 82025, "token_acc": 0.7633227784140673, "train_speed(iter/s)": 0.135466 }, { "epoch": 1.0643983014834446, "grad_norm": 0.6778507232666016, "learning_rate": 4.8809895758616804e-05, "loss": 0.7299898624420166, "memory(GiB)": 91.52, "step": 82030, "token_acc": 0.773193384663191, "train_speed(iter/s)": 0.135465 }, { "epoch": 1.0644631798851003, "grad_norm": 0.7426540851593018, "learning_rate": 4.8804533532741616e-05, "loss": 0.7715797424316406, "memory(GiB)": 91.52, "step": 82035, "token_acc": 0.7893845518380834, "train_speed(iter/s)": 0.135464 }, { "epoch": 1.064528058286756, "grad_norm": 0.7354211211204529, "learning_rate": 4.879917132062379e-05, "loss": 0.7748534679412842, "memory(GiB)": 91.52, "step": 82040, "token_acc": 0.7778736458632921, "train_speed(iter/s)": 0.135462 }, { "epoch": 1.0645929366884117, "grad_norm": 0.6466814875602722, "learning_rate": 4.879380912232501e-05, "loss": 0.7328545570373535, "memory(GiB)": 91.52, "step": 82045, "token_acc": 0.8013453678474114, "train_speed(iter/s)": 0.135461 }, { "epoch": 1.0646578150900674, "grad_norm": 0.6878348588943481, "learning_rate": 4.8788446937907014e-05, "loss": 0.7516501903533935, "memory(GiB)": 91.52, "step": 82050, "token_acc": 0.797254363600706, "train_speed(iter/s)": 0.13546 }, { "epoch": 1.064722693491723, "grad_norm": 0.6249597072601318, "learning_rate": 4.878308476743148e-05, "loss": 0.7197183132171631, "memory(GiB)": 91.52, "step": 82055, "token_acc": 0.7730798958005325, "train_speed(iter/s)": 0.135458 }, { "epoch": 1.0647875718933788, "grad_norm": 0.7523845434188843, "learning_rate": 4.877772261096013e-05, "loss": 0.778211259841919, "memory(GiB)": 91.52, "step": 82060, "token_acc": 0.7746088266511453, "train_speed(iter/s)": 0.135458 }, { "epoch": 1.0648524502950345, "grad_norm": 0.6923003792762756, "learning_rate": 4.8772360468554664e-05, "loss": 0.7405235767364502, "memory(GiB)": 91.52, "step": 82065, "token_acc": 0.7870134594397963, "train_speed(iter/s)": 0.135457 }, { "epoch": 1.0649173286966902, "grad_norm": 0.6756604313850403, "learning_rate": 4.87669983402768e-05, "loss": 0.726048755645752, "memory(GiB)": 91.52, "step": 82070, "token_acc": 0.7956726574768801, "train_speed(iter/s)": 0.135456 }, { "epoch": 1.064982207098346, "grad_norm": 0.7415913939476013, "learning_rate": 4.876163622618824e-05, "loss": 0.7387220859527588, "memory(GiB)": 91.52, "step": 82075, "token_acc": 0.808861140364074, "train_speed(iter/s)": 0.135455 }, { "epoch": 1.0650470855000016, "grad_norm": 0.6780940890312195, "learning_rate": 4.875627412635068e-05, "loss": 0.8007094383239746, "memory(GiB)": 91.52, "step": 82080, "token_acc": 0.7416676840434623, "train_speed(iter/s)": 0.135454 }, { "epoch": 1.0651119639016573, "grad_norm": 0.7453498244285583, "learning_rate": 4.8750912040825864e-05, "loss": 0.754359245300293, "memory(GiB)": 91.52, "step": 82085, "token_acc": 0.7921495611201884, "train_speed(iter/s)": 0.135454 }, { "epoch": 1.065176842303313, "grad_norm": 0.6347827315330505, "learning_rate": 4.874554996967546e-05, "loss": 0.7628084659576416, "memory(GiB)": 91.52, "step": 82090, "token_acc": 0.7805901924811095, "train_speed(iter/s)": 0.135452 }, { "epoch": 1.0652417207049687, "grad_norm": 0.6539183855056763, "learning_rate": 4.8740187912961176e-05, "loss": 0.7781964778900147, "memory(GiB)": 91.52, "step": 82095, "token_acc": 0.781062010210386, "train_speed(iter/s)": 0.135452 }, { "epoch": 1.0653065991066244, "grad_norm": 0.6391057372093201, "learning_rate": 4.873482587074473e-05, "loss": 0.7569745540618896, "memory(GiB)": 91.52, "step": 82100, "token_acc": 0.7970856499146045, "train_speed(iter/s)": 0.13545 }, { "epoch": 1.06537147750828, "grad_norm": 0.7308133244514465, "learning_rate": 4.872946384308783e-05, "loss": 0.8047616958618165, "memory(GiB)": 91.52, "step": 82105, "token_acc": 0.755525965379494, "train_speed(iter/s)": 0.13545 }, { "epoch": 1.0654363559099358, "grad_norm": 0.6667754054069519, "learning_rate": 4.8724101830052186e-05, "loss": 0.7451632499694825, "memory(GiB)": 91.52, "step": 82110, "token_acc": 0.7659706429605128, "train_speed(iter/s)": 0.135449 }, { "epoch": 1.0655012343115915, "grad_norm": 0.616245448589325, "learning_rate": 4.8718739831699474e-05, "loss": 0.7337367534637451, "memory(GiB)": 91.52, "step": 82115, "token_acc": 0.7866392924826279, "train_speed(iter/s)": 0.135448 }, { "epoch": 1.0655661127132472, "grad_norm": 0.7368277907371521, "learning_rate": 4.871337784809144e-05, "loss": 0.7375407695770264, "memory(GiB)": 91.52, "step": 82120, "token_acc": 0.784996794186792, "train_speed(iter/s)": 0.135447 }, { "epoch": 1.065630991114903, "grad_norm": 0.6783962845802307, "learning_rate": 4.870801587928978e-05, "loss": 0.7752482891082764, "memory(GiB)": 91.52, "step": 82125, "token_acc": 0.7951552536734128, "train_speed(iter/s)": 0.135446 }, { "epoch": 1.0656958695165586, "grad_norm": 0.7902318239212036, "learning_rate": 4.870265392535618e-05, "loss": 0.789669132232666, "memory(GiB)": 91.52, "step": 82130, "token_acc": 0.7685482744166551, "train_speed(iter/s)": 0.135445 }, { "epoch": 1.0657607479182143, "grad_norm": 0.7555215358734131, "learning_rate": 4.869729198635234e-05, "loss": 0.7681965351104736, "memory(GiB)": 91.52, "step": 82135, "token_acc": 0.7700142824926708, "train_speed(iter/s)": 0.135444 }, { "epoch": 1.06582562631987, "grad_norm": 0.6471312046051025, "learning_rate": 4.8691930062339994e-05, "loss": 0.7068415641784668, "memory(GiB)": 91.52, "step": 82140, "token_acc": 0.7789437135912967, "train_speed(iter/s)": 0.135444 }, { "epoch": 1.0658905047215257, "grad_norm": 0.6807990074157715, "learning_rate": 4.8686568153380836e-05, "loss": 0.761293601989746, "memory(GiB)": 91.52, "step": 82145, "token_acc": 0.7803489625437887, "train_speed(iter/s)": 0.135443 }, { "epoch": 1.0659553831231814, "grad_norm": 0.7165892124176025, "learning_rate": 4.868120625953655e-05, "loss": 0.7288862228393554, "memory(GiB)": 91.52, "step": 82150, "token_acc": 0.7837688563636991, "train_speed(iter/s)": 0.135442 }, { "epoch": 1.066020261524837, "grad_norm": 0.7208656668663025, "learning_rate": 4.867584438086887e-05, "loss": 0.7681341648101807, "memory(GiB)": 91.52, "step": 82155, "token_acc": 0.7897550111358574, "train_speed(iter/s)": 0.135441 }, { "epoch": 1.0660851399264928, "grad_norm": 0.7819809913635254, "learning_rate": 4.8670482517439504e-05, "loss": 0.7746835708618164, "memory(GiB)": 91.52, "step": 82160, "token_acc": 0.7748618304424901, "train_speed(iter/s)": 0.13544 }, { "epoch": 1.0661500183281485, "grad_norm": 0.7115274667739868, "learning_rate": 4.8665120669310125e-05, "loss": 0.7395572662353516, "memory(GiB)": 91.52, "step": 82165, "token_acc": 0.7911278140083269, "train_speed(iter/s)": 0.135439 }, { "epoch": 1.0662148967298042, "grad_norm": 0.6841863393783569, "learning_rate": 4.865975883654244e-05, "loss": 0.7815153121948242, "memory(GiB)": 91.52, "step": 82170, "token_acc": 0.7763718177150968, "train_speed(iter/s)": 0.135438 }, { "epoch": 1.0662797751314599, "grad_norm": 0.7001606225967407, "learning_rate": 4.865439701919817e-05, "loss": 0.751060962677002, "memory(GiB)": 91.52, "step": 82175, "token_acc": 0.7756750241080038, "train_speed(iter/s)": 0.135437 }, { "epoch": 1.0663446535331156, "grad_norm": 0.635824978351593, "learning_rate": 4.864903521733902e-05, "loss": 0.7575849533081055, "memory(GiB)": 91.52, "step": 82180, "token_acc": 0.7830292039744607, "train_speed(iter/s)": 0.135436 }, { "epoch": 1.0664095319347713, "grad_norm": 0.693232536315918, "learning_rate": 4.864367343102668e-05, "loss": 0.8123023033142089, "memory(GiB)": 91.52, "step": 82185, "token_acc": 0.7584507042253521, "train_speed(iter/s)": 0.135435 }, { "epoch": 1.066474410336427, "grad_norm": 0.7339088320732117, "learning_rate": 4.863831166032285e-05, "loss": 0.7316183567047119, "memory(GiB)": 91.52, "step": 82190, "token_acc": 0.798215343860016, "train_speed(iter/s)": 0.135434 }, { "epoch": 1.0665392887380827, "grad_norm": 0.6934416890144348, "learning_rate": 4.863294990528927e-05, "loss": 0.7652363777160645, "memory(GiB)": 91.52, "step": 82195, "token_acc": 0.7892636272112757, "train_speed(iter/s)": 0.135433 }, { "epoch": 1.0666041671397384, "grad_norm": 0.6320356726646423, "learning_rate": 4.86275881659876e-05, "loss": 0.7795991897583008, "memory(GiB)": 91.52, "step": 82200, "token_acc": 0.772513440860215, "train_speed(iter/s)": 0.135432 }, { "epoch": 1.066669045541394, "grad_norm": 0.7090413570404053, "learning_rate": 4.8622226442479544e-05, "loss": 0.7735435009002686, "memory(GiB)": 91.52, "step": 82205, "token_acc": 0.7718379718930835, "train_speed(iter/s)": 0.135432 }, { "epoch": 1.0667339239430498, "grad_norm": 0.7468532919883728, "learning_rate": 4.861686473482683e-05, "loss": 0.7725919723510742, "memory(GiB)": 91.52, "step": 82210, "token_acc": 0.7811950218283704, "train_speed(iter/s)": 0.135431 }, { "epoch": 1.0667988023447055, "grad_norm": 0.7681126594543457, "learning_rate": 4.861150304309115e-05, "loss": 0.7843163490295411, "memory(GiB)": 91.52, "step": 82215, "token_acc": 0.7778451226468911, "train_speed(iter/s)": 0.13543 }, { "epoch": 1.0668636807463612, "grad_norm": 0.7355741858482361, "learning_rate": 4.860614136733418e-05, "loss": 0.7415285110473633, "memory(GiB)": 91.52, "step": 82220, "token_acc": 0.7862720920674064, "train_speed(iter/s)": 0.135429 }, { "epoch": 1.0669285591480169, "grad_norm": 0.7087995409965515, "learning_rate": 4.860077970761767e-05, "loss": 0.7420852661132813, "memory(GiB)": 91.52, "step": 82225, "token_acc": 0.7827611789303298, "train_speed(iter/s)": 0.135429 }, { "epoch": 1.0669934375496726, "grad_norm": 0.6628847122192383, "learning_rate": 4.859541806400331e-05, "loss": 0.7692501544952393, "memory(GiB)": 91.52, "step": 82230, "token_acc": 0.7778940392731332, "train_speed(iter/s)": 0.135428 }, { "epoch": 1.0670583159513283, "grad_norm": 0.755686342716217, "learning_rate": 4.8590056436552765e-05, "loss": 0.7602423191070556, "memory(GiB)": 91.52, "step": 82235, "token_acc": 0.7738751723273594, "train_speed(iter/s)": 0.135427 }, { "epoch": 1.067123194352984, "grad_norm": 0.7787958383560181, "learning_rate": 4.858469482532776e-05, "loss": 0.7593390941619873, "memory(GiB)": 91.52, "step": 82240, "token_acc": 0.7713849707168192, "train_speed(iter/s)": 0.135426 }, { "epoch": 1.0671880727546397, "grad_norm": 0.7629860043525696, "learning_rate": 4.857933323039e-05, "loss": 0.7834041595458985, "memory(GiB)": 91.52, "step": 82245, "token_acc": 0.7934596958966882, "train_speed(iter/s)": 0.135425 }, { "epoch": 1.0672529511562954, "grad_norm": 0.693566083908081, "learning_rate": 4.8573971651801184e-05, "loss": 0.7888115406036377, "memory(GiB)": 91.52, "step": 82250, "token_acc": 0.767194332395512, "train_speed(iter/s)": 0.135425 }, { "epoch": 1.067317829557951, "grad_norm": 0.7721985578536987, "learning_rate": 4.8568610089623e-05, "loss": 0.7272426605224609, "memory(GiB)": 91.52, "step": 82255, "token_acc": 0.7859676027988115, "train_speed(iter/s)": 0.135424 }, { "epoch": 1.0673827079596068, "grad_norm": 0.7000373005867004, "learning_rate": 4.856324854391717e-05, "loss": 0.7377100944519043, "memory(GiB)": 91.52, "step": 82260, "token_acc": 0.7927676838948668, "train_speed(iter/s)": 0.135423 }, { "epoch": 1.0674475863612625, "grad_norm": 0.7009446024894714, "learning_rate": 4.8557887014745396e-05, "loss": 0.7266903877258301, "memory(GiB)": 91.52, "step": 82265, "token_acc": 0.7914022444516152, "train_speed(iter/s)": 0.135422 }, { "epoch": 1.0675124647629182, "grad_norm": 0.6609313488006592, "learning_rate": 4.855252550216935e-05, "loss": 0.7527182102203369, "memory(GiB)": 91.52, "step": 82270, "token_acc": 0.771978325776844, "train_speed(iter/s)": 0.135421 }, { "epoch": 1.0675773431645739, "grad_norm": 0.721794068813324, "learning_rate": 4.854716400625074e-05, "loss": 0.7729992866516113, "memory(GiB)": 91.52, "step": 82275, "token_acc": 0.7743697779047347, "train_speed(iter/s)": 0.135419 }, { "epoch": 1.0676422215662296, "grad_norm": 0.7279667258262634, "learning_rate": 4.854180252705128e-05, "loss": 0.7544348239898682, "memory(GiB)": 91.52, "step": 82280, "token_acc": 0.7926353682315884, "train_speed(iter/s)": 0.135419 }, { "epoch": 1.0677070999678853, "grad_norm": 0.7630489468574524, "learning_rate": 4.8536441064632656e-05, "loss": 0.7771929264068603, "memory(GiB)": 91.52, "step": 82285, "token_acc": 0.784173110501372, "train_speed(iter/s)": 0.135418 }, { "epoch": 1.067771978369541, "grad_norm": 0.6808295845985413, "learning_rate": 4.853107961905658e-05, "loss": 0.7491730690002442, "memory(GiB)": 91.52, "step": 82290, "token_acc": 0.7947085133803351, "train_speed(iter/s)": 0.135417 }, { "epoch": 1.0678368567711967, "grad_norm": 0.7014618515968323, "learning_rate": 4.852571819038475e-05, "loss": 0.7274304389953613, "memory(GiB)": 91.52, "step": 82295, "token_acc": 0.8065317319098458, "train_speed(iter/s)": 0.135416 }, { "epoch": 1.0679017351728524, "grad_norm": 0.6466827988624573, "learning_rate": 4.852035677867885e-05, "loss": 0.7330203056335449, "memory(GiB)": 91.52, "step": 82300, "token_acc": 0.7907733300285041, "train_speed(iter/s)": 0.135415 }, { "epoch": 1.067966613574508, "grad_norm": 0.7341547608375549, "learning_rate": 4.8514995384000617e-05, "loss": 0.7291416168212891, "memory(GiB)": 91.52, "step": 82305, "token_acc": 0.792582923318941, "train_speed(iter/s)": 0.135414 }, { "epoch": 1.0680314919761638, "grad_norm": 0.6875665783882141, "learning_rate": 4.8509634006411694e-05, "loss": 0.7809195995330811, "memory(GiB)": 91.52, "step": 82310, "token_acc": 0.7585761101590451, "train_speed(iter/s)": 0.135414 }, { "epoch": 1.0680963703778195, "grad_norm": 0.6079655885696411, "learning_rate": 4.850427264597381e-05, "loss": 0.7832114696502686, "memory(GiB)": 91.52, "step": 82315, "token_acc": 0.7609939969286612, "train_speed(iter/s)": 0.135413 }, { "epoch": 1.0681612487794752, "grad_norm": 0.7338085174560547, "learning_rate": 4.849891130274866e-05, "loss": 0.7530120372772217, "memory(GiB)": 91.52, "step": 82320, "token_acc": 0.7659605466359742, "train_speed(iter/s)": 0.135412 }, { "epoch": 1.0682261271811309, "grad_norm": 0.6821649670600891, "learning_rate": 4.849354997679794e-05, "loss": 0.771055030822754, "memory(GiB)": 91.52, "step": 82325, "token_acc": 0.7932366807273759, "train_speed(iter/s)": 0.135411 }, { "epoch": 1.0682910055827866, "grad_norm": 0.7309678196907043, "learning_rate": 4.848818866818336e-05, "loss": 0.7680126190185547, "memory(GiB)": 91.52, "step": 82330, "token_acc": 0.7704215753091228, "train_speed(iter/s)": 0.13541 }, { "epoch": 1.0683558839844423, "grad_norm": 0.8077152967453003, "learning_rate": 4.8482827376966595e-05, "loss": 0.741881275177002, "memory(GiB)": 91.52, "step": 82335, "token_acc": 0.7957237103568874, "train_speed(iter/s)": 0.135409 }, { "epoch": 1.068420762386098, "grad_norm": 0.7195257544517517, "learning_rate": 4.8477466103209384e-05, "loss": 0.7478716850280762, "memory(GiB)": 91.52, "step": 82340, "token_acc": 0.7977855477855478, "train_speed(iter/s)": 0.135408 }, { "epoch": 1.0684856407877537, "grad_norm": 0.7597070336341858, "learning_rate": 4.847210484697337e-05, "loss": 0.736662769317627, "memory(GiB)": 91.52, "step": 82345, "token_acc": 0.7836120745904736, "train_speed(iter/s)": 0.135407 }, { "epoch": 1.0685505191894094, "grad_norm": 0.7017663717269897, "learning_rate": 4.846674360832028e-05, "loss": 0.734294319152832, "memory(GiB)": 91.52, "step": 82350, "token_acc": 0.773857224629978, "train_speed(iter/s)": 0.135406 }, { "epoch": 1.0686153975910648, "grad_norm": 0.7408987283706665, "learning_rate": 4.846138238731179e-05, "loss": 0.7748795032501221, "memory(GiB)": 91.52, "step": 82355, "token_acc": 0.7959910223666898, "train_speed(iter/s)": 0.135406 }, { "epoch": 1.0686802759927208, "grad_norm": 0.7096872925758362, "learning_rate": 4.845602118400963e-05, "loss": 0.7236101150512695, "memory(GiB)": 91.52, "step": 82360, "token_acc": 0.7968401486988848, "train_speed(iter/s)": 0.135404 }, { "epoch": 1.0687451543943762, "grad_norm": 0.6726560592651367, "learning_rate": 4.845065999847548e-05, "loss": 0.7552049636840821, "memory(GiB)": 91.52, "step": 82365, "token_acc": 0.7911553839790432, "train_speed(iter/s)": 0.135403 }, { "epoch": 1.0688100327960322, "grad_norm": 0.7044245600700378, "learning_rate": 4.844529883077102e-05, "loss": 0.7349875450134278, "memory(GiB)": 91.52, "step": 82370, "token_acc": 0.7949130494317403, "train_speed(iter/s)": 0.135402 }, { "epoch": 1.0688749111976876, "grad_norm": 0.6617008447647095, "learning_rate": 4.8439937680958e-05, "loss": 0.7970777988433838, "memory(GiB)": 91.52, "step": 82375, "token_acc": 0.8000846960714053, "train_speed(iter/s)": 0.135402 }, { "epoch": 1.0689397895993435, "grad_norm": 0.7489480972290039, "learning_rate": 4.843457654909804e-05, "loss": 0.7565948009490967, "memory(GiB)": 91.52, "step": 82380, "token_acc": 0.7937586551157773, "train_speed(iter/s)": 0.135401 }, { "epoch": 1.069004668000999, "grad_norm": 0.7037931680679321, "learning_rate": 4.8429215435252876e-05, "loss": 0.7717677116394043, "memory(GiB)": 91.52, "step": 82385, "token_acc": 0.7651238861525737, "train_speed(iter/s)": 0.1354 }, { "epoch": 1.0690695464026547, "grad_norm": 0.6938372850418091, "learning_rate": 4.8423854339484195e-05, "loss": 0.7536767005920411, "memory(GiB)": 91.52, "step": 82390, "token_acc": 0.7880320704724545, "train_speed(iter/s)": 0.135399 }, { "epoch": 1.0691344248043104, "grad_norm": 0.7510644793510437, "learning_rate": 4.8418493261853715e-05, "loss": 0.7650139331817627, "memory(GiB)": 91.52, "step": 82395, "token_acc": 0.787435488062971, "train_speed(iter/s)": 0.135398 }, { "epoch": 1.0691993032059661, "grad_norm": 0.6804237961769104, "learning_rate": 4.841313220242311e-05, "loss": 0.7717679977416992, "memory(GiB)": 91.52, "step": 82400, "token_acc": 0.789530340483215, "train_speed(iter/s)": 0.135397 }, { "epoch": 1.0692641816076218, "grad_norm": 0.7613361477851868, "learning_rate": 4.8407771161254055e-05, "loss": 0.7820942878723145, "memory(GiB)": 91.52, "step": 82405, "token_acc": 0.7650629044575746, "train_speed(iter/s)": 0.135396 }, { "epoch": 1.0693290600092775, "grad_norm": 0.7518704533576965, "learning_rate": 4.840241013840831e-05, "loss": 0.7290494918823243, "memory(GiB)": 91.52, "step": 82410, "token_acc": 0.767698327997154, "train_speed(iter/s)": 0.135395 }, { "epoch": 1.0693939384109332, "grad_norm": 0.7309831380844116, "learning_rate": 4.839704913394749e-05, "loss": 0.7785884857177734, "memory(GiB)": 91.52, "step": 82415, "token_acc": 0.7564897458608707, "train_speed(iter/s)": 0.135395 }, { "epoch": 1.069458816812589, "grad_norm": 0.7486156821250916, "learning_rate": 4.839168814793334e-05, "loss": 0.75367431640625, "memory(GiB)": 91.52, "step": 82420, "token_acc": 0.773873417721519, "train_speed(iter/s)": 0.135394 }, { "epoch": 1.0695236952142446, "grad_norm": 0.6420561075210571, "learning_rate": 4.838632718042753e-05, "loss": 0.7256547451019287, "memory(GiB)": 91.52, "step": 82425, "token_acc": 0.7630107526881721, "train_speed(iter/s)": 0.135393 }, { "epoch": 1.0695885736159003, "grad_norm": 0.7370595932006836, "learning_rate": 4.838096623149176e-05, "loss": 0.7204349517822266, "memory(GiB)": 91.52, "step": 82430, "token_acc": 0.7861835930167074, "train_speed(iter/s)": 0.135392 }, { "epoch": 1.069653452017556, "grad_norm": 0.6817259192466736, "learning_rate": 4.8375605301187743e-05, "loss": 0.7326266288757324, "memory(GiB)": 91.52, "step": 82435, "token_acc": 0.7974943374858438, "train_speed(iter/s)": 0.135391 }, { "epoch": 1.0697183304192117, "grad_norm": 0.6843574047088623, "learning_rate": 4.837024438957714e-05, "loss": 0.8110537528991699, "memory(GiB)": 91.52, "step": 82440, "token_acc": 0.7657275463214764, "train_speed(iter/s)": 0.13539 }, { "epoch": 1.0697832088208674, "grad_norm": 0.7708745002746582, "learning_rate": 4.836488349672168e-05, "loss": 0.7681615829467774, "memory(GiB)": 91.52, "step": 82445, "token_acc": 0.7755676487145806, "train_speed(iter/s)": 0.135389 }, { "epoch": 1.0698480872225231, "grad_norm": 0.7649316787719727, "learning_rate": 4.835952262268302e-05, "loss": 0.7465885162353516, "memory(GiB)": 91.52, "step": 82450, "token_acc": 0.7882301378595904, "train_speed(iter/s)": 0.135388 }, { "epoch": 1.0699129656241788, "grad_norm": 0.6972328424453735, "learning_rate": 4.835416176752287e-05, "loss": 0.765083122253418, "memory(GiB)": 91.52, "step": 82455, "token_acc": 0.7809175854376151, "train_speed(iter/s)": 0.135387 }, { "epoch": 1.0699778440258345, "grad_norm": 0.7898921966552734, "learning_rate": 4.8348800931302914e-05, "loss": 0.7913748264312744, "memory(GiB)": 91.52, "step": 82460, "token_acc": 0.7774269642533116, "train_speed(iter/s)": 0.135386 }, { "epoch": 1.0700427224274902, "grad_norm": 0.7088072896003723, "learning_rate": 4.834344011408486e-05, "loss": 0.7536994457244873, "memory(GiB)": 91.52, "step": 82465, "token_acc": 0.7637039566025569, "train_speed(iter/s)": 0.135385 }, { "epoch": 1.070107600829146, "grad_norm": 0.7216475009918213, "learning_rate": 4.833807931593039e-05, "loss": 0.7612215995788574, "memory(GiB)": 91.52, "step": 82470, "token_acc": 0.7863977037155159, "train_speed(iter/s)": 0.135384 }, { "epoch": 1.0701724792308016, "grad_norm": 0.7582783699035645, "learning_rate": 4.8332718536901186e-05, "loss": 0.7793071746826172, "memory(GiB)": 91.52, "step": 82475, "token_acc": 0.785727920590481, "train_speed(iter/s)": 0.135383 }, { "epoch": 1.0702373576324573, "grad_norm": 0.6484370827674866, "learning_rate": 4.8327357777058956e-05, "loss": 0.7602977275848388, "memory(GiB)": 91.52, "step": 82480, "token_acc": 0.7762783641994817, "train_speed(iter/s)": 0.135382 }, { "epoch": 1.070302236034113, "grad_norm": 0.7325794100761414, "learning_rate": 4.83219970364654e-05, "loss": 0.708824634552002, "memory(GiB)": 91.52, "step": 82485, "token_acc": 0.7958036449568548, "train_speed(iter/s)": 0.135381 }, { "epoch": 1.0703671144357687, "grad_norm": 0.7224735617637634, "learning_rate": 4.831663631518218e-05, "loss": 0.7442049026489258, "memory(GiB)": 91.52, "step": 82490, "token_acc": 0.7613947878211923, "train_speed(iter/s)": 0.13538 }, { "epoch": 1.0704319928374244, "grad_norm": 0.7387465238571167, "learning_rate": 4.8311275613270994e-05, "loss": 0.764687442779541, "memory(GiB)": 91.52, "step": 82495, "token_acc": 0.7774211030975111, "train_speed(iter/s)": 0.135379 }, { "epoch": 1.07049687123908, "grad_norm": 0.7977840304374695, "learning_rate": 4.830591493079355e-05, "loss": 0.7142242431640625, "memory(GiB)": 91.52, "step": 82500, "token_acc": 0.7821950256612712, "train_speed(iter/s)": 0.135378 }, { "epoch": 1.0705617496407358, "grad_norm": 0.7470217347145081, "learning_rate": 4.830055426781153e-05, "loss": 0.7995133399963379, "memory(GiB)": 91.52, "step": 82505, "token_acc": 0.7774358109532442, "train_speed(iter/s)": 0.135378 }, { "epoch": 1.0706266280423915, "grad_norm": 0.6667551398277283, "learning_rate": 4.829519362438661e-05, "loss": 0.797426176071167, "memory(GiB)": 91.52, "step": 82510, "token_acc": 0.7602899301061351, "train_speed(iter/s)": 0.135377 }, { "epoch": 1.0706915064440472, "grad_norm": 0.6198402047157288, "learning_rate": 4.82898330005805e-05, "loss": 0.7764878749847413, "memory(GiB)": 91.52, "step": 82515, "token_acc": 0.7807631214302613, "train_speed(iter/s)": 0.135376 }, { "epoch": 1.070756384845703, "grad_norm": 0.700817883014679, "learning_rate": 4.8284472396454894e-05, "loss": 0.7408699989318848, "memory(GiB)": 91.52, "step": 82520, "token_acc": 0.7757417651496102, "train_speed(iter/s)": 0.135375 }, { "epoch": 1.0708212632473586, "grad_norm": 0.6175437569618225, "learning_rate": 4.827911181207145e-05, "loss": 0.7733596801757813, "memory(GiB)": 91.52, "step": 82525, "token_acc": 0.7801620894444992, "train_speed(iter/s)": 0.135374 }, { "epoch": 1.0708861416490143, "grad_norm": 0.7392457723617554, "learning_rate": 4.8273751247491885e-05, "loss": 0.7781433582305908, "memory(GiB)": 91.52, "step": 82530, "token_acc": 0.781104835745079, "train_speed(iter/s)": 0.135373 }, { "epoch": 1.07095102005067, "grad_norm": 0.6954437494277954, "learning_rate": 4.8268390702777876e-05, "loss": 0.7391480922698974, "memory(GiB)": 91.52, "step": 82535, "token_acc": 0.8013876011126386, "train_speed(iter/s)": 0.135372 }, { "epoch": 1.0710158984523257, "grad_norm": 0.7351350784301758, "learning_rate": 4.8263030177991117e-05, "loss": 0.765538215637207, "memory(GiB)": 91.52, "step": 82540, "token_acc": 0.792162228537347, "train_speed(iter/s)": 0.135371 }, { "epoch": 1.0710807768539814, "grad_norm": 0.7556995749473572, "learning_rate": 4.8257669673193284e-05, "loss": 0.7565276622772217, "memory(GiB)": 91.52, "step": 82545, "token_acc": 0.7875286204922725, "train_speed(iter/s)": 0.13537 }, { "epoch": 1.071145655255637, "grad_norm": 0.665931761264801, "learning_rate": 4.825230918844609e-05, "loss": 0.7420566558837891, "memory(GiB)": 91.52, "step": 82550, "token_acc": 0.7856700472245878, "train_speed(iter/s)": 0.135369 }, { "epoch": 1.0712105336572928, "grad_norm": 0.6165059804916382, "learning_rate": 4.824694872381122e-05, "loss": 0.7937978744506836, "memory(GiB)": 91.52, "step": 82555, "token_acc": 0.7594543111658917, "train_speed(iter/s)": 0.135368 }, { "epoch": 1.0712754120589485, "grad_norm": 0.7265217900276184, "learning_rate": 4.8241588279350344e-05, "loss": 0.7404017925262452, "memory(GiB)": 91.52, "step": 82560, "token_acc": 0.8049014712096185, "train_speed(iter/s)": 0.135368 }, { "epoch": 1.0713402904606042, "grad_norm": 0.8597193956375122, "learning_rate": 4.823622785512514e-05, "loss": 0.7994726657867431, "memory(GiB)": 91.52, "step": 82565, "token_acc": 0.7682582764901253, "train_speed(iter/s)": 0.135367 }, { "epoch": 1.07140516886226, "grad_norm": 0.7455717325210571, "learning_rate": 4.823086745119732e-05, "loss": 0.759360408782959, "memory(GiB)": 91.52, "step": 82570, "token_acc": 0.7870419709745166, "train_speed(iter/s)": 0.135366 }, { "epoch": 1.0714700472639156, "grad_norm": 0.7238399982452393, "learning_rate": 4.8225507067628574e-05, "loss": 0.7630640029907226, "memory(GiB)": 91.52, "step": 82575, "token_acc": 0.7840993984193764, "train_speed(iter/s)": 0.135365 }, { "epoch": 1.0715349256655713, "grad_norm": 0.724419891834259, "learning_rate": 4.8220146704480563e-05, "loss": 0.7187810420989991, "memory(GiB)": 91.52, "step": 82580, "token_acc": 0.7980943409897834, "train_speed(iter/s)": 0.135364 }, { "epoch": 1.071599804067227, "grad_norm": 0.625848650932312, "learning_rate": 4.8214786361814995e-05, "loss": 0.7669834613800048, "memory(GiB)": 91.52, "step": 82585, "token_acc": 0.7823543178095015, "train_speed(iter/s)": 0.135364 }, { "epoch": 1.0716646824688827, "grad_norm": 0.751556396484375, "learning_rate": 4.820942603969357e-05, "loss": 0.7628122329711914, "memory(GiB)": 91.52, "step": 82590, "token_acc": 0.7755457248531438, "train_speed(iter/s)": 0.135363 }, { "epoch": 1.0717295608705384, "grad_norm": 0.7440071105957031, "learning_rate": 4.820406573817794e-05, "loss": 0.7924665451049805, "memory(GiB)": 91.52, "step": 82595, "token_acc": 0.7550919026328863, "train_speed(iter/s)": 0.135362 }, { "epoch": 1.071794439272194, "grad_norm": 0.5985442996025085, "learning_rate": 4.819870545732979e-05, "loss": 0.7669013023376465, "memory(GiB)": 91.52, "step": 82600, "token_acc": 0.7873201700739705, "train_speed(iter/s)": 0.135361 }, { "epoch": 1.0718593176738498, "grad_norm": 0.7115954756736755, "learning_rate": 4.819334519721084e-05, "loss": 0.7839033126831054, "memory(GiB)": 91.52, "step": 82605, "token_acc": 0.7583820590629857, "train_speed(iter/s)": 0.135361 }, { "epoch": 1.0719241960755055, "grad_norm": 0.682817816734314, "learning_rate": 4.8187984957882756e-05, "loss": 0.7598296165466308, "memory(GiB)": 91.52, "step": 82610, "token_acc": 0.7658651677596198, "train_speed(iter/s)": 0.13536 }, { "epoch": 1.0719890744771612, "grad_norm": 0.7062451243400574, "learning_rate": 4.818262473940721e-05, "loss": 0.7621622562408448, "memory(GiB)": 91.52, "step": 82615, "token_acc": 0.767901146387629, "train_speed(iter/s)": 0.135359 }, { "epoch": 1.0720539528788169, "grad_norm": 0.8223036527633667, "learning_rate": 4.817726454184591e-05, "loss": 0.731431531906128, "memory(GiB)": 91.52, "step": 82620, "token_acc": 0.7779797111984932, "train_speed(iter/s)": 0.135358 }, { "epoch": 1.0721188312804726, "grad_norm": 0.7438725233078003, "learning_rate": 4.817190436526055e-05, "loss": 0.7444373607635498, "memory(GiB)": 91.52, "step": 82625, "token_acc": 0.7793781074895431, "train_speed(iter/s)": 0.135357 }, { "epoch": 1.0721837096821283, "grad_norm": 0.7349479794502258, "learning_rate": 4.816654420971278e-05, "loss": 0.7369973182678222, "memory(GiB)": 91.52, "step": 82630, "token_acc": 0.7950461712837951, "train_speed(iter/s)": 0.135356 }, { "epoch": 1.072248588083784, "grad_norm": 0.7291929721832275, "learning_rate": 4.81611840752643e-05, "loss": 0.7630111694335937, "memory(GiB)": 91.52, "step": 82635, "token_acc": 0.7812281437963352, "train_speed(iter/s)": 0.135356 }, { "epoch": 1.0723134664854397, "grad_norm": 0.7988840937614441, "learning_rate": 4.8155823961976796e-05, "loss": 0.7431179523468018, "memory(GiB)": 91.52, "step": 82640, "token_acc": 0.7928947882100332, "train_speed(iter/s)": 0.135355 }, { "epoch": 1.0723783448870954, "grad_norm": 0.6513800024986267, "learning_rate": 4.8150463869911956e-05, "loss": 0.7897987365722656, "memory(GiB)": 91.52, "step": 82645, "token_acc": 0.7756999582114501, "train_speed(iter/s)": 0.135354 }, { "epoch": 1.072443223288751, "grad_norm": 0.7031988501548767, "learning_rate": 4.814510379913145e-05, "loss": 0.7734462738037109, "memory(GiB)": 91.52, "step": 82650, "token_acc": 0.7790852803338693, "train_speed(iter/s)": 0.135353 }, { "epoch": 1.0725081016904068, "grad_norm": 0.7239853143692017, "learning_rate": 4.8139743749696976e-05, "loss": 0.7762602806091309, "memory(GiB)": 91.52, "step": 82655, "token_acc": 0.7710052572885137, "train_speed(iter/s)": 0.135352 }, { "epoch": 1.0725729800920625, "grad_norm": 0.6528624296188354, "learning_rate": 4.81343837216702e-05, "loss": 0.7495036125183105, "memory(GiB)": 91.52, "step": 82660, "token_acc": 0.7851695765603051, "train_speed(iter/s)": 0.135351 }, { "epoch": 1.0726378584937182, "grad_norm": 0.6968725919723511, "learning_rate": 4.8129023715112854e-05, "loss": 0.7621149539947509, "memory(GiB)": 91.52, "step": 82665, "token_acc": 0.7858578653871093, "train_speed(iter/s)": 0.13535 }, { "epoch": 1.0727027368953739, "grad_norm": 0.6599772572517395, "learning_rate": 4.812366373008655e-05, "loss": 0.7096057415008545, "memory(GiB)": 91.52, "step": 82670, "token_acc": 0.7889985219510246, "train_speed(iter/s)": 0.135349 }, { "epoch": 1.0727676152970296, "grad_norm": 0.6554843187332153, "learning_rate": 4.811830376665301e-05, "loss": 0.7422891139984131, "memory(GiB)": 91.52, "step": 82675, "token_acc": 0.7875406013158991, "train_speed(iter/s)": 0.135348 }, { "epoch": 1.0728324936986853, "grad_norm": 0.7219289541244507, "learning_rate": 4.811294382487391e-05, "loss": 0.7577472686767578, "memory(GiB)": 91.52, "step": 82680, "token_acc": 0.8076719823432497, "train_speed(iter/s)": 0.135347 }, { "epoch": 1.072897372100341, "grad_norm": 0.7289272546768188, "learning_rate": 4.810758390481092e-05, "loss": 0.7387759208679199, "memory(GiB)": 91.52, "step": 82685, "token_acc": 0.793841692255203, "train_speed(iter/s)": 0.135346 }, { "epoch": 1.0729622505019967, "grad_norm": 0.7720587849617004, "learning_rate": 4.810222400652574e-05, "loss": 0.7723869800567627, "memory(GiB)": 91.52, "step": 82690, "token_acc": 0.7908091908091908, "train_speed(iter/s)": 0.135345 }, { "epoch": 1.0730271289036524, "grad_norm": 0.7697727084159851, "learning_rate": 4.809686413008004e-05, "loss": 0.7502667427062988, "memory(GiB)": 91.52, "step": 82695, "token_acc": 0.788639129501038, "train_speed(iter/s)": 0.135344 }, { "epoch": 1.073092007305308, "grad_norm": 0.7147748470306396, "learning_rate": 4.809150427553554e-05, "loss": 0.7823028564453125, "memory(GiB)": 91.52, "step": 82700, "token_acc": 0.7595108695652174, "train_speed(iter/s)": 0.135344 }, { "epoch": 1.0731568857069638, "grad_norm": 0.6096335053443909, "learning_rate": 4.808614444295385e-05, "loss": 0.7478387832641602, "memory(GiB)": 91.52, "step": 82705, "token_acc": 0.7830706179066835, "train_speed(iter/s)": 0.135343 }, { "epoch": 1.0732217641086195, "grad_norm": 0.6988815665245056, "learning_rate": 4.8080784632396694e-05, "loss": 0.7656808853149414, "memory(GiB)": 91.52, "step": 82710, "token_acc": 0.7899372247727912, "train_speed(iter/s)": 0.135342 }, { "epoch": 1.0732866425102752, "grad_norm": 0.6307553648948669, "learning_rate": 4.807542484392573e-05, "loss": 0.7694917678833008, "memory(GiB)": 91.52, "step": 82715, "token_acc": 0.7906837064967133, "train_speed(iter/s)": 0.135341 }, { "epoch": 1.0733515209119309, "grad_norm": 0.7975422739982605, "learning_rate": 4.807006507760267e-05, "loss": 0.7446368217468262, "memory(GiB)": 91.52, "step": 82720, "token_acc": 0.8059231253938248, "train_speed(iter/s)": 0.13534 }, { "epoch": 1.0734163993135866, "grad_norm": 0.719519853591919, "learning_rate": 4.806470533348918e-05, "loss": 0.7688235759735107, "memory(GiB)": 91.52, "step": 82725, "token_acc": 0.7682275777913362, "train_speed(iter/s)": 0.135339 }, { "epoch": 1.0734812777152423, "grad_norm": 0.6767703294754028, "learning_rate": 4.8059345611646915e-05, "loss": 0.7717108726501465, "memory(GiB)": 91.52, "step": 82730, "token_acc": 0.7764971893351758, "train_speed(iter/s)": 0.135337 }, { "epoch": 1.073546156116898, "grad_norm": 0.7492085695266724, "learning_rate": 4.8053985912137624e-05, "loss": 0.7646235466003418, "memory(GiB)": 91.52, "step": 82735, "token_acc": 0.7851882160392799, "train_speed(iter/s)": 0.135336 }, { "epoch": 1.0736110345185537, "grad_norm": 0.675675094127655, "learning_rate": 4.8048626235022895e-05, "loss": 0.7599539756774902, "memory(GiB)": 91.52, "step": 82740, "token_acc": 0.7638216560509554, "train_speed(iter/s)": 0.135335 }, { "epoch": 1.0736759129202094, "grad_norm": 0.7909488081932068, "learning_rate": 4.804326658036446e-05, "loss": 0.7741461277008057, "memory(GiB)": 91.52, "step": 82745, "token_acc": 0.7717314724729091, "train_speed(iter/s)": 0.135335 }, { "epoch": 1.073740791321865, "grad_norm": 0.7222229838371277, "learning_rate": 4.8037906948223975e-05, "loss": 0.7197819709777832, "memory(GiB)": 91.52, "step": 82750, "token_acc": 0.7817255388005233, "train_speed(iter/s)": 0.135333 }, { "epoch": 1.0738056697235208, "grad_norm": 0.7197990417480469, "learning_rate": 4.8032547338663146e-05, "loss": 0.7944153785705567, "memory(GiB)": 91.52, "step": 82755, "token_acc": 0.7733819073235504, "train_speed(iter/s)": 0.135332 }, { "epoch": 1.0738705481251765, "grad_norm": 0.7561357021331787, "learning_rate": 4.8027187751743636e-05, "loss": 0.7710107326507568, "memory(GiB)": 91.52, "step": 82760, "token_acc": 0.7704587347407227, "train_speed(iter/s)": 0.135331 }, { "epoch": 1.0739354265268322, "grad_norm": 0.652172863483429, "learning_rate": 4.802182818752711e-05, "loss": 0.6985424995422364, "memory(GiB)": 91.52, "step": 82765, "token_acc": 0.7967717386950774, "train_speed(iter/s)": 0.13533 }, { "epoch": 1.0740003049284879, "grad_norm": 0.6212412118911743, "learning_rate": 4.801646864607529e-05, "loss": 0.7338382720947265, "memory(GiB)": 91.52, "step": 82770, "token_acc": 0.7920688272842209, "train_speed(iter/s)": 0.135329 }, { "epoch": 1.0740651833301436, "grad_norm": 0.7754257321357727, "learning_rate": 4.801110912744979e-05, "loss": 0.7453077793121338, "memory(GiB)": 91.52, "step": 82775, "token_acc": 0.7686063402546356, "train_speed(iter/s)": 0.135327 }, { "epoch": 1.0741300617317993, "grad_norm": 0.709513783454895, "learning_rate": 4.800574963171233e-05, "loss": 0.7933843612670899, "memory(GiB)": 91.52, "step": 82780, "token_acc": 0.7632096785948546, "train_speed(iter/s)": 0.135326 }, { "epoch": 1.074194940133455, "grad_norm": 0.6728067398071289, "learning_rate": 4.8000390158924565e-05, "loss": 0.7383389949798584, "memory(GiB)": 91.52, "step": 82785, "token_acc": 0.7754726511762159, "train_speed(iter/s)": 0.135325 }, { "epoch": 1.0742598185351107, "grad_norm": 0.7515227198600769, "learning_rate": 4.799503070914819e-05, "loss": 0.7837296485900879, "memory(GiB)": 91.52, "step": 82790, "token_acc": 0.7814049298233363, "train_speed(iter/s)": 0.135325 }, { "epoch": 1.0743246969367664, "grad_norm": 0.6474741697311401, "learning_rate": 4.798967128244487e-05, "loss": 0.7555439472198486, "memory(GiB)": 91.52, "step": 82795, "token_acc": 0.7672711400341257, "train_speed(iter/s)": 0.135325 }, { "epoch": 1.074389575338422, "grad_norm": 0.6883537769317627, "learning_rate": 4.798431187887627e-05, "loss": 0.7432021141052246, "memory(GiB)": 91.52, "step": 82800, "token_acc": 0.7748858116435873, "train_speed(iter/s)": 0.135324 }, { "epoch": 1.0744544537400778, "grad_norm": 0.7002223134040833, "learning_rate": 4.797895249850412e-05, "loss": 0.798069953918457, "memory(GiB)": 91.52, "step": 82805, "token_acc": 0.7687462144155057, "train_speed(iter/s)": 0.135323 }, { "epoch": 1.0745193321417335, "grad_norm": 0.7135321497917175, "learning_rate": 4.797359314139002e-05, "loss": 0.7610281467437744, "memory(GiB)": 91.52, "step": 82810, "token_acc": 0.7765485928435011, "train_speed(iter/s)": 0.135322 }, { "epoch": 1.0745842105433892, "grad_norm": 0.6868124604225159, "learning_rate": 4.7968233807595686e-05, "loss": 0.7702689170837402, "memory(GiB)": 91.52, "step": 82815, "token_acc": 0.7758662976069514, "train_speed(iter/s)": 0.135321 }, { "epoch": 1.0746490889450449, "grad_norm": 0.7007591724395752, "learning_rate": 4.7962874497182783e-05, "loss": 0.7451913356781006, "memory(GiB)": 91.52, "step": 82820, "token_acc": 0.7898915056893357, "train_speed(iter/s)": 0.135321 }, { "epoch": 1.0747139673467005, "grad_norm": 0.7256997227668762, "learning_rate": 4.7957515210212995e-05, "loss": 0.7702065467834472, "memory(GiB)": 91.52, "step": 82825, "token_acc": 0.7709363186972957, "train_speed(iter/s)": 0.13532 }, { "epoch": 1.074778845748356, "grad_norm": 0.7058289647102356, "learning_rate": 4.795215594674799e-05, "loss": 0.746701717376709, "memory(GiB)": 91.52, "step": 82830, "token_acc": 0.7852946309115235, "train_speed(iter/s)": 0.135319 }, { "epoch": 1.074843724150012, "grad_norm": 0.725405752658844, "learning_rate": 4.794679670684944e-05, "loss": 0.7884605407714844, "memory(GiB)": 91.52, "step": 82835, "token_acc": 0.7700797057020233, "train_speed(iter/s)": 0.135318 }, { "epoch": 1.0749086025516674, "grad_norm": 0.6344432830810547, "learning_rate": 4.7941437490579016e-05, "loss": 0.7867835998535156, "memory(GiB)": 91.52, "step": 82840, "token_acc": 0.7757936507936508, "train_speed(iter/s)": 0.135317 }, { "epoch": 1.0749734809533233, "grad_norm": 0.7279828190803528, "learning_rate": 4.793607829799842e-05, "loss": 0.7675647735595703, "memory(GiB)": 91.52, "step": 82845, "token_acc": 0.7850635008282717, "train_speed(iter/s)": 0.135316 }, { "epoch": 1.0750383593549788, "grad_norm": 0.6952180862426758, "learning_rate": 4.793071912916929e-05, "loss": 0.7763325214385987, "memory(GiB)": 91.52, "step": 82850, "token_acc": 0.7673489765351972, "train_speed(iter/s)": 0.135315 }, { "epoch": 1.0751032377566347, "grad_norm": 0.6525517702102661, "learning_rate": 4.79253599841533e-05, "loss": 0.7413742065429687, "memory(GiB)": 91.52, "step": 82855, "token_acc": 0.7802319345756804, "train_speed(iter/s)": 0.135314 }, { "epoch": 1.0751681161582902, "grad_norm": 0.7299375534057617, "learning_rate": 4.792000086301214e-05, "loss": 0.7636889457702637, "memory(GiB)": 91.52, "step": 82860, "token_acc": 0.7862457351913512, "train_speed(iter/s)": 0.135313 }, { "epoch": 1.075232994559946, "grad_norm": 0.7307923436164856, "learning_rate": 4.791464176580748e-05, "loss": 0.757465934753418, "memory(GiB)": 91.52, "step": 82865, "token_acc": 0.7873116803352722, "train_speed(iter/s)": 0.135312 }, { "epoch": 1.0752978729616016, "grad_norm": 0.7628744840621948, "learning_rate": 4.7909282692600975e-05, "loss": 0.7884376049041748, "memory(GiB)": 91.52, "step": 82870, "token_acc": 0.7724985839469564, "train_speed(iter/s)": 0.135311 }, { "epoch": 1.0753627513632573, "grad_norm": 0.6565303802490234, "learning_rate": 4.790392364345432e-05, "loss": 0.7704571723937989, "memory(GiB)": 91.52, "step": 82875, "token_acc": 0.771072337600059, "train_speed(iter/s)": 0.13531 }, { "epoch": 1.075427629764913, "grad_norm": 0.701492965221405, "learning_rate": 4.7898564618429194e-05, "loss": 0.7334935188293457, "memory(GiB)": 91.52, "step": 82880, "token_acc": 0.7863691150151576, "train_speed(iter/s)": 0.135309 }, { "epoch": 1.0754925081665687, "grad_norm": 0.7109856605529785, "learning_rate": 4.789320561758723e-05, "loss": 0.7777868270874023, "memory(GiB)": 91.52, "step": 82885, "token_acc": 0.7706815529693758, "train_speed(iter/s)": 0.135308 }, { "epoch": 1.0755573865682244, "grad_norm": 0.6032099723815918, "learning_rate": 4.788784664099012e-05, "loss": 0.7230215072631836, "memory(GiB)": 91.52, "step": 82890, "token_acc": 0.7898921067575241, "train_speed(iter/s)": 0.135307 }, { "epoch": 1.0756222649698801, "grad_norm": 0.7762495875358582, "learning_rate": 4.7882487688699544e-05, "loss": 0.8250049591064453, "memory(GiB)": 91.52, "step": 82895, "token_acc": 0.7585216090687239, "train_speed(iter/s)": 0.135306 }, { "epoch": 1.0756871433715358, "grad_norm": 0.6876356601715088, "learning_rate": 4.787712876077716e-05, "loss": 0.7725993156433105, "memory(GiB)": 91.52, "step": 82900, "token_acc": 0.7691235892434165, "train_speed(iter/s)": 0.135306 }, { "epoch": 1.0757520217731915, "grad_norm": 0.6848890781402588, "learning_rate": 4.7871769857284634e-05, "loss": 0.7520339012145996, "memory(GiB)": 91.52, "step": 82905, "token_acc": 0.7894094100885376, "train_speed(iter/s)": 0.135305 }, { "epoch": 1.0758169001748472, "grad_norm": 0.7376539707183838, "learning_rate": 4.786641097828365e-05, "loss": 0.7992062568664551, "memory(GiB)": 91.52, "step": 82910, "token_acc": 0.777155192210536, "train_speed(iter/s)": 0.135304 }, { "epoch": 1.075881778576503, "grad_norm": 0.7473198175430298, "learning_rate": 4.786105212383589e-05, "loss": 0.7378164291381836, "memory(GiB)": 91.52, "step": 82915, "token_acc": 0.7954504226680695, "train_speed(iter/s)": 0.135303 }, { "epoch": 1.0759466569781586, "grad_norm": 0.7074165940284729, "learning_rate": 4.7855693294002985e-05, "loss": 0.7766834259033203, "memory(GiB)": 91.52, "step": 82920, "token_acc": 0.783953228970345, "train_speed(iter/s)": 0.135303 }, { "epoch": 1.0760115353798143, "grad_norm": 0.6812049150466919, "learning_rate": 4.785033448884662e-05, "loss": 0.7512641429901123, "memory(GiB)": 91.52, "step": 82925, "token_acc": 0.7908148574719263, "train_speed(iter/s)": 0.135302 }, { "epoch": 1.07607641378147, "grad_norm": 0.6394482254981995, "learning_rate": 4.784497570842847e-05, "loss": 0.7678595542907715, "memory(GiB)": 91.52, "step": 82930, "token_acc": 0.7804427333974976, "train_speed(iter/s)": 0.135301 }, { "epoch": 1.0761412921831257, "grad_norm": 0.6443459391593933, "learning_rate": 4.7839616952810204e-05, "loss": 0.7476398944854736, "memory(GiB)": 91.52, "step": 82935, "token_acc": 0.8053220863549474, "train_speed(iter/s)": 0.135301 }, { "epoch": 1.0762061705847814, "grad_norm": 0.6699233651161194, "learning_rate": 4.783425822205348e-05, "loss": 0.7621350288391113, "memory(GiB)": 91.52, "step": 82940, "token_acc": 0.7754230104099555, "train_speed(iter/s)": 0.1353 }, { "epoch": 1.076271048986437, "grad_norm": 0.7716707587242126, "learning_rate": 4.782889951621998e-05, "loss": 0.7572917461395263, "memory(GiB)": 91.52, "step": 82945, "token_acc": 0.7760831889081455, "train_speed(iter/s)": 0.135299 }, { "epoch": 1.0763359273880928, "grad_norm": 0.7966146469116211, "learning_rate": 4.7823540835371374e-05, "loss": 0.7968791961669922, "memory(GiB)": 91.52, "step": 82950, "token_acc": 0.7755342233991688, "train_speed(iter/s)": 0.135298 }, { "epoch": 1.0764008057897485, "grad_norm": 0.723812460899353, "learning_rate": 4.781818217956931e-05, "loss": 0.7569674015045166, "memory(GiB)": 91.52, "step": 82955, "token_acc": 0.7714544084400904, "train_speed(iter/s)": 0.135297 }, { "epoch": 1.0764656841914042, "grad_norm": 0.6849873661994934, "learning_rate": 4.781282354887546e-05, "loss": 0.7495169162750244, "memory(GiB)": 91.52, "step": 82960, "token_acc": 0.793596340766152, "train_speed(iter/s)": 0.135296 }, { "epoch": 1.07653056259306, "grad_norm": 0.7182273864746094, "learning_rate": 4.78074649433515e-05, "loss": 0.756880521774292, "memory(GiB)": 91.52, "step": 82965, "token_acc": 0.7609484369846446, "train_speed(iter/s)": 0.135295 }, { "epoch": 1.0765954409947156, "grad_norm": 0.6263728141784668, "learning_rate": 4.7802106363059095e-05, "loss": 0.7231085300445557, "memory(GiB)": 91.52, "step": 82970, "token_acc": 0.7955450776359646, "train_speed(iter/s)": 0.135294 }, { "epoch": 1.0766603193963713, "grad_norm": 0.7364614605903625, "learning_rate": 4.7796747808059895e-05, "loss": 0.7802066802978516, "memory(GiB)": 91.52, "step": 82975, "token_acc": 0.7701325736824806, "train_speed(iter/s)": 0.135294 }, { "epoch": 1.076725197798027, "grad_norm": 0.6228137612342834, "learning_rate": 4.779138927841559e-05, "loss": 0.7398893356323242, "memory(GiB)": 91.52, "step": 82980, "token_acc": 0.7940225082552733, "train_speed(iter/s)": 0.135292 }, { "epoch": 1.0767900761996827, "grad_norm": 0.6452608108520508, "learning_rate": 4.778603077418785e-05, "loss": 0.7970165729522705, "memory(GiB)": 91.52, "step": 82985, "token_acc": 0.7704187502126068, "train_speed(iter/s)": 0.135292 }, { "epoch": 1.0768549546013384, "grad_norm": 0.6475602388381958, "learning_rate": 4.7780672295438314e-05, "loss": 0.7401224136352539, "memory(GiB)": 91.52, "step": 82990, "token_acc": 0.7858076949549178, "train_speed(iter/s)": 0.135291 }, { "epoch": 1.076919833002994, "grad_norm": 0.7089309692382812, "learning_rate": 4.777531384222865e-05, "loss": 0.738829517364502, "memory(GiB)": 91.52, "step": 82995, "token_acc": 0.8061866255936336, "train_speed(iter/s)": 0.13529 }, { "epoch": 1.0769847114046498, "grad_norm": 0.7303377389907837, "learning_rate": 4.7769955414620536e-05, "loss": 0.8126421928405761, "memory(GiB)": 91.52, "step": 83000, "token_acc": 0.7543597689815467, "train_speed(iter/s)": 0.13529 }, { "epoch": 1.0770495898063055, "grad_norm": 0.7190698385238647, "learning_rate": 4.776459701267563e-05, "loss": 0.7704753875732422, "memory(GiB)": 91.52, "step": 83005, "token_acc": 0.7803859297870137, "train_speed(iter/s)": 0.135289 }, { "epoch": 1.0771144682079612, "grad_norm": 0.6960969567298889, "learning_rate": 4.7759238636455595e-05, "loss": 0.7837771415710449, "memory(GiB)": 91.52, "step": 83010, "token_acc": 0.7719113912967341, "train_speed(iter/s)": 0.135289 }, { "epoch": 1.077179346609617, "grad_norm": 0.6340808868408203, "learning_rate": 4.7753880286022103e-05, "loss": 0.7322557449340821, "memory(GiB)": 91.52, "step": 83015, "token_acc": 0.7795531557343177, "train_speed(iter/s)": 0.135288 }, { "epoch": 1.0772442250112726, "grad_norm": 0.673504114151001, "learning_rate": 4.7748521961436805e-05, "loss": 0.7614874362945556, "memory(GiB)": 91.52, "step": 83020, "token_acc": 0.780099106815123, "train_speed(iter/s)": 0.135287 }, { "epoch": 1.0773091034129283, "grad_norm": 0.7455098628997803, "learning_rate": 4.77431636627614e-05, "loss": 0.7782117843627929, "memory(GiB)": 91.52, "step": 83025, "token_acc": 0.7816091954022989, "train_speed(iter/s)": 0.135285 }, { "epoch": 1.077373981814584, "grad_norm": 0.6632198691368103, "learning_rate": 4.7737805390057496e-05, "loss": 0.8212532997131348, "memory(GiB)": 91.52, "step": 83030, "token_acc": 0.7726032735775527, "train_speed(iter/s)": 0.135284 }, { "epoch": 1.0774388602162397, "grad_norm": 0.7387703657150269, "learning_rate": 4.773244714338678e-05, "loss": 0.7499274253845215, "memory(GiB)": 91.52, "step": 83035, "token_acc": 0.7666163141993958, "train_speed(iter/s)": 0.135283 }, { "epoch": 1.0775037386178954, "grad_norm": 0.7379312515258789, "learning_rate": 4.7727088922810933e-05, "loss": 0.8006000518798828, "memory(GiB)": 91.52, "step": 83040, "token_acc": 0.7898963823455595, "train_speed(iter/s)": 0.135282 }, { "epoch": 1.077568617019551, "grad_norm": 0.6804154515266418, "learning_rate": 4.772173072839158e-05, "loss": 0.7299797058105468, "memory(GiB)": 91.52, "step": 83045, "token_acc": 0.7748003906025619, "train_speed(iter/s)": 0.135281 }, { "epoch": 1.0776334954212068, "grad_norm": 0.6646080613136292, "learning_rate": 4.771637256019041e-05, "loss": 0.7773744583129882, "memory(GiB)": 91.52, "step": 83050, "token_acc": 0.776952608723016, "train_speed(iter/s)": 0.13528 }, { "epoch": 1.0776983738228625, "grad_norm": 0.6485567092895508, "learning_rate": 4.7711014418269073e-05, "loss": 0.7373104572296143, "memory(GiB)": 91.52, "step": 83055, "token_acc": 0.7915792023252058, "train_speed(iter/s)": 0.135279 }, { "epoch": 1.0777632522245182, "grad_norm": 0.8077675700187683, "learning_rate": 4.770565630268927e-05, "loss": 0.7672001838684082, "memory(GiB)": 91.52, "step": 83060, "token_acc": 0.7753466527508573, "train_speed(iter/s)": 0.135279 }, { "epoch": 1.0778281306261739, "grad_norm": 0.6547600030899048, "learning_rate": 4.770029821351259e-05, "loss": 0.7641244888305664, "memory(GiB)": 91.52, "step": 83065, "token_acc": 0.7831071765474203, "train_speed(iter/s)": 0.135278 }, { "epoch": 1.0778930090278296, "grad_norm": 0.654001772403717, "learning_rate": 4.769494015080074e-05, "loss": 0.7546995162963868, "memory(GiB)": 91.52, "step": 83070, "token_acc": 0.7823860329776916, "train_speed(iter/s)": 0.135277 }, { "epoch": 1.0779578874294853, "grad_norm": 0.7347380518913269, "learning_rate": 4.7689582114615364e-05, "loss": 0.7471878528594971, "memory(GiB)": 91.52, "step": 83075, "token_acc": 0.7776749054021416, "train_speed(iter/s)": 0.135277 }, { "epoch": 1.078022765831141, "grad_norm": 0.6359380483627319, "learning_rate": 4.7684224105018125e-05, "loss": 0.7678315162658691, "memory(GiB)": 91.52, "step": 83080, "token_acc": 0.7923475328220982, "train_speed(iter/s)": 0.135275 }, { "epoch": 1.0780876442327967, "grad_norm": 0.6953085660934448, "learning_rate": 4.7678866122070704e-05, "loss": 0.7206617832183838, "memory(GiB)": 91.52, "step": 83085, "token_acc": 0.7905537006121314, "train_speed(iter/s)": 0.135275 }, { "epoch": 1.0781525226344524, "grad_norm": 0.6891794800758362, "learning_rate": 4.7673508165834725e-05, "loss": 0.7836848258972168, "memory(GiB)": 91.52, "step": 83090, "token_acc": 0.7751351790868806, "train_speed(iter/s)": 0.135274 }, { "epoch": 1.078217401036108, "grad_norm": 0.6852463483810425, "learning_rate": 4.766815023637189e-05, "loss": 0.7277856826782226, "memory(GiB)": 91.52, "step": 83095, "token_acc": 0.7930671912627616, "train_speed(iter/s)": 0.135273 }, { "epoch": 1.0782822794377638, "grad_norm": 0.6576258540153503, "learning_rate": 4.7662792333743814e-05, "loss": 0.7956864356994628, "memory(GiB)": 91.52, "step": 83100, "token_acc": 0.7851789591650907, "train_speed(iter/s)": 0.135272 }, { "epoch": 1.0783471578394195, "grad_norm": 0.6855029463768005, "learning_rate": 4.765743445801217e-05, "loss": 0.7424986839294434, "memory(GiB)": 91.52, "step": 83105, "token_acc": 0.7919314811452092, "train_speed(iter/s)": 0.135271 }, { "epoch": 1.0784120362410752, "grad_norm": 0.7202500700950623, "learning_rate": 4.765207660923862e-05, "loss": 0.7359533309936523, "memory(GiB)": 91.52, "step": 83110, "token_acc": 0.7897187133741372, "train_speed(iter/s)": 0.13527 }, { "epoch": 1.0784769146427309, "grad_norm": 0.7456221580505371, "learning_rate": 4.7646718787484825e-05, "loss": 0.7652721405029297, "memory(GiB)": 91.52, "step": 83115, "token_acc": 0.7767266883377771, "train_speed(iter/s)": 0.135269 }, { "epoch": 1.0785417930443866, "grad_norm": 0.6818666458129883, "learning_rate": 4.764136099281243e-05, "loss": 0.760117769241333, "memory(GiB)": 91.52, "step": 83120, "token_acc": 0.7843337123142666, "train_speed(iter/s)": 0.135268 }, { "epoch": 1.0786066714460423, "grad_norm": 0.6740840673446655, "learning_rate": 4.763600322528311e-05, "loss": 0.7308990001678467, "memory(GiB)": 91.52, "step": 83125, "token_acc": 0.789827255278311, "train_speed(iter/s)": 0.135267 }, { "epoch": 1.078671549847698, "grad_norm": 0.7455942034721375, "learning_rate": 4.763064548495853e-05, "loss": 0.7602549076080323, "memory(GiB)": 91.52, "step": 83130, "token_acc": 0.7637413231335883, "train_speed(iter/s)": 0.135267 }, { "epoch": 1.0787364282493537, "grad_norm": 0.6712652444839478, "learning_rate": 4.762528777190031e-05, "loss": 0.8091066360473633, "memory(GiB)": 91.52, "step": 83135, "token_acc": 0.7782203814598887, "train_speed(iter/s)": 0.135266 }, { "epoch": 1.0788013066510094, "grad_norm": 0.7131730318069458, "learning_rate": 4.761993008617012e-05, "loss": 0.7403750896453858, "memory(GiB)": 91.52, "step": 83140, "token_acc": 0.7781402376210426, "train_speed(iter/s)": 0.135265 }, { "epoch": 1.078866185052665, "grad_norm": 0.7081881761550903, "learning_rate": 4.7614572427829626e-05, "loss": 0.7340776443481445, "memory(GiB)": 91.52, "step": 83145, "token_acc": 0.7770000752615338, "train_speed(iter/s)": 0.135264 }, { "epoch": 1.0789310634543208, "grad_norm": 0.6954560875892639, "learning_rate": 4.760921479694048e-05, "loss": 0.7725661277770997, "memory(GiB)": 91.52, "step": 83150, "token_acc": 0.7770988677161005, "train_speed(iter/s)": 0.135263 }, { "epoch": 1.0789959418559765, "grad_norm": 0.7346979379653931, "learning_rate": 4.760385719356434e-05, "loss": 0.767918872833252, "memory(GiB)": 91.52, "step": 83155, "token_acc": 0.7706460972330338, "train_speed(iter/s)": 0.135262 }, { "epoch": 1.0790608202576322, "grad_norm": 0.7718926668167114, "learning_rate": 4.7598499617762845e-05, "loss": 0.7753671646118164, "memory(GiB)": 91.52, "step": 83160, "token_acc": 0.7851300864999495, "train_speed(iter/s)": 0.135261 }, { "epoch": 1.0791256986592879, "grad_norm": 0.6902020573616028, "learning_rate": 4.7593142069597694e-05, "loss": 0.759905481338501, "memory(GiB)": 91.52, "step": 83165, "token_acc": 0.7792099035135627, "train_speed(iter/s)": 0.135261 }, { "epoch": 1.0791905770609436, "grad_norm": 0.6748086810112, "learning_rate": 4.758778454913048e-05, "loss": 0.7789921760559082, "memory(GiB)": 91.52, "step": 83170, "token_acc": 0.7775968965397804, "train_speed(iter/s)": 0.13526 }, { "epoch": 1.0792554554625993, "grad_norm": 0.7203012704849243, "learning_rate": 4.75824270564229e-05, "loss": 0.80296630859375, "memory(GiB)": 91.52, "step": 83175, "token_acc": 0.7810936446541996, "train_speed(iter/s)": 0.135259 }, { "epoch": 1.079320333864255, "grad_norm": 0.7397326231002808, "learning_rate": 4.757706959153657e-05, "loss": 0.7562620162963867, "memory(GiB)": 91.52, "step": 83180, "token_acc": 0.7878954272291876, "train_speed(iter/s)": 0.135258 }, { "epoch": 1.0793852122659107, "grad_norm": 0.7271507978439331, "learning_rate": 4.757171215453319e-05, "loss": 0.7861420631408691, "memory(GiB)": 91.52, "step": 83185, "token_acc": 0.7622636994496291, "train_speed(iter/s)": 0.135258 }, { "epoch": 1.0794500906675664, "grad_norm": 0.6968560814857483, "learning_rate": 4.756635474547438e-05, "loss": 0.7490630149841309, "memory(GiB)": 91.52, "step": 83190, "token_acc": 0.7896485082739462, "train_speed(iter/s)": 0.135257 }, { "epoch": 1.079514969069222, "grad_norm": 0.7142585515975952, "learning_rate": 4.75609973644218e-05, "loss": 0.754229736328125, "memory(GiB)": 91.52, "step": 83195, "token_acc": 0.768037717444318, "train_speed(iter/s)": 0.135256 }, { "epoch": 1.0795798474708778, "grad_norm": 0.7098037004470825, "learning_rate": 4.755564001143711e-05, "loss": 0.7715321063995362, "memory(GiB)": 91.52, "step": 83200, "token_acc": 0.776877649909146, "train_speed(iter/s)": 0.135254 }, { "epoch": 1.0796447258725335, "grad_norm": 0.7171606421470642, "learning_rate": 4.755028268658198e-05, "loss": 0.7910477161407471, "memory(GiB)": 91.52, "step": 83205, "token_acc": 0.794078628053713, "train_speed(iter/s)": 0.135254 }, { "epoch": 1.0797096042741892, "grad_norm": 0.7251754403114319, "learning_rate": 4.754492538991802e-05, "loss": 0.783055591583252, "memory(GiB)": 91.52, "step": 83210, "token_acc": 0.774784694639978, "train_speed(iter/s)": 0.135253 }, { "epoch": 1.0797744826758449, "grad_norm": 0.7688173055648804, "learning_rate": 4.753956812150689e-05, "loss": 0.7410563945770263, "memory(GiB)": 91.52, "step": 83215, "token_acc": 0.7921842288904396, "train_speed(iter/s)": 0.135252 }, { "epoch": 1.0798393610775006, "grad_norm": 0.7120867371559143, "learning_rate": 4.7534210881410264e-05, "loss": 0.7613716125488281, "memory(GiB)": 91.52, "step": 83220, "token_acc": 0.7730419057121477, "train_speed(iter/s)": 0.135251 }, { "epoch": 1.0799042394791563, "grad_norm": 0.7753331661224365, "learning_rate": 4.752885366968978e-05, "loss": 0.7571702003479004, "memory(GiB)": 91.52, "step": 83225, "token_acc": 0.7967522262964903, "train_speed(iter/s)": 0.13525 }, { "epoch": 1.079969117880812, "grad_norm": 0.7575929164886475, "learning_rate": 4.752349648640707e-05, "loss": 0.7552204132080078, "memory(GiB)": 91.52, "step": 83230, "token_acc": 0.7988113032956082, "train_speed(iter/s)": 0.135249 }, { "epoch": 1.0800339962824677, "grad_norm": 0.6597808003425598, "learning_rate": 4.751813933162383e-05, "loss": 0.776644515991211, "memory(GiB)": 91.52, "step": 83235, "token_acc": 0.7972123444246888, "train_speed(iter/s)": 0.135248 }, { "epoch": 1.0800988746841234, "grad_norm": 0.699000358581543, "learning_rate": 4.751278220540168e-05, "loss": 0.7749477863311768, "memory(GiB)": 91.52, "step": 83240, "token_acc": 0.7793578700078309, "train_speed(iter/s)": 0.135247 }, { "epoch": 1.080163753085779, "grad_norm": 0.6519379019737244, "learning_rate": 4.750742510780227e-05, "loss": 0.7317989826202392, "memory(GiB)": 91.52, "step": 83245, "token_acc": 0.7964878974845753, "train_speed(iter/s)": 0.135246 }, { "epoch": 1.0802286314874348, "grad_norm": 0.6398550868034363, "learning_rate": 4.7502068038887235e-05, "loss": 0.7389514446258545, "memory(GiB)": 91.52, "step": 83250, "token_acc": 0.802097830883601, "train_speed(iter/s)": 0.135245 }, { "epoch": 1.0802935098890905, "grad_norm": 0.6968494057655334, "learning_rate": 4.7496710998718255e-05, "loss": 0.7750981330871582, "memory(GiB)": 91.52, "step": 83255, "token_acc": 0.7780078301460623, "train_speed(iter/s)": 0.135244 }, { "epoch": 1.0803583882907462, "grad_norm": 0.7190625071525574, "learning_rate": 4.749135398735696e-05, "loss": 0.7736427307128906, "memory(GiB)": 91.52, "step": 83260, "token_acc": 0.7864293659621802, "train_speed(iter/s)": 0.135243 }, { "epoch": 1.0804232666924019, "grad_norm": 0.8494031429290771, "learning_rate": 4.7485997004864996e-05, "loss": 0.7905458927154541, "memory(GiB)": 91.52, "step": 83265, "token_acc": 0.7712118818138727, "train_speed(iter/s)": 0.135242 }, { "epoch": 1.0804881450940576, "grad_norm": 0.6788533329963684, "learning_rate": 4.7480640051304023e-05, "loss": 0.7509798049926758, "memory(GiB)": 91.52, "step": 83270, "token_acc": 0.7764664777359369, "train_speed(iter/s)": 0.135241 }, { "epoch": 1.0805530234957132, "grad_norm": 0.7196401357650757, "learning_rate": 4.7475283126735695e-05, "loss": 0.7984983444213867, "memory(GiB)": 91.52, "step": 83275, "token_acc": 0.756443595905066, "train_speed(iter/s)": 0.135241 }, { "epoch": 1.080617901897369, "grad_norm": 0.6976978778839111, "learning_rate": 4.746992623122163e-05, "loss": 0.8089662551879883, "memory(GiB)": 91.52, "step": 83280, "token_acc": 0.7611724949690549, "train_speed(iter/s)": 0.13524 }, { "epoch": 1.0806827802990246, "grad_norm": 0.7698386311531067, "learning_rate": 4.746456936482349e-05, "loss": 0.7956310749053955, "memory(GiB)": 91.52, "step": 83285, "token_acc": 0.7762965482060874, "train_speed(iter/s)": 0.13524 }, { "epoch": 1.0807476587006803, "grad_norm": 0.7265867590904236, "learning_rate": 4.745921252760293e-05, "loss": 0.7166205406188965, "memory(GiB)": 91.52, "step": 83290, "token_acc": 0.7901821324115926, "train_speed(iter/s)": 0.135239 }, { "epoch": 1.080812537102336, "grad_norm": 0.6904247403144836, "learning_rate": 4.745385571962159e-05, "loss": 0.7390258312225342, "memory(GiB)": 91.52, "step": 83295, "token_acc": 0.7890941385435168, "train_speed(iter/s)": 0.135238 }, { "epoch": 1.0808774155039917, "grad_norm": 0.6649503111839294, "learning_rate": 4.74484989409411e-05, "loss": 0.7963851928710938, "memory(GiB)": 91.52, "step": 83300, "token_acc": 0.7775432359367478, "train_speed(iter/s)": 0.135238 }, { "epoch": 1.0809422939056474, "grad_norm": 0.6917295455932617, "learning_rate": 4.744314219162313e-05, "loss": 0.7300123691558837, "memory(GiB)": 91.52, "step": 83305, "token_acc": 0.770314437056238, "train_speed(iter/s)": 0.135237 }, { "epoch": 1.0810071723073031, "grad_norm": 0.7321677803993225, "learning_rate": 4.743778547172933e-05, "loss": 0.719694995880127, "memory(GiB)": 91.52, "step": 83310, "token_acc": 0.8075082083943194, "train_speed(iter/s)": 0.135236 }, { "epoch": 1.0810720507089586, "grad_norm": 0.7569814920425415, "learning_rate": 4.743242878132132e-05, "loss": 0.7389307022094727, "memory(GiB)": 91.52, "step": 83315, "token_acc": 0.7684391080617495, "train_speed(iter/s)": 0.135235 }, { "epoch": 1.0811369291106145, "grad_norm": 0.7113156318664551, "learning_rate": 4.742707212046074e-05, "loss": 0.7417683601379395, "memory(GiB)": 91.52, "step": 83320, "token_acc": 0.7794518061354148, "train_speed(iter/s)": 0.135235 }, { "epoch": 1.08120180751227, "grad_norm": 0.635827898979187, "learning_rate": 4.7421715489209254e-05, "loss": 0.7715805053710938, "memory(GiB)": 91.52, "step": 83325, "token_acc": 0.7885190002460717, "train_speed(iter/s)": 0.135233 }, { "epoch": 1.081266685913926, "grad_norm": 0.7391765117645264, "learning_rate": 4.7416358887628506e-05, "loss": 0.7623776912689209, "memory(GiB)": 91.52, "step": 83330, "token_acc": 0.7733982765080555, "train_speed(iter/s)": 0.135233 }, { "epoch": 1.0813315643155814, "grad_norm": 0.7403684854507446, "learning_rate": 4.7411002315780126e-05, "loss": 0.7857481002807617, "memory(GiB)": 91.52, "step": 83335, "token_acc": 0.7949248724080088, "train_speed(iter/s)": 0.135232 }, { "epoch": 1.0813964427172371, "grad_norm": 0.7149226069450378, "learning_rate": 4.740564577372577e-05, "loss": 0.7275121688842774, "memory(GiB)": 91.52, "step": 83340, "token_acc": 0.7920129805622703, "train_speed(iter/s)": 0.135231 }, { "epoch": 1.0814613211188928, "grad_norm": 0.6681123971939087, "learning_rate": 4.7400289261527085e-05, "loss": 0.7240047454833984, "memory(GiB)": 91.52, "step": 83345, "token_acc": 0.773465829846583, "train_speed(iter/s)": 0.13523 }, { "epoch": 1.0815261995205485, "grad_norm": 0.6967262625694275, "learning_rate": 4.73949327792457e-05, "loss": 0.788053560256958, "memory(GiB)": 91.52, "step": 83350, "token_acc": 0.7744022397094431, "train_speed(iter/s)": 0.135229 }, { "epoch": 1.0815910779222042, "grad_norm": 0.7405956983566284, "learning_rate": 4.7389576326943244e-05, "loss": 0.7503618240356446, "memory(GiB)": 91.52, "step": 83355, "token_acc": 0.7764540236109607, "train_speed(iter/s)": 0.135229 }, { "epoch": 1.08165595632386, "grad_norm": 0.6832312345504761, "learning_rate": 4.738421990468139e-05, "loss": 0.7741158485412598, "memory(GiB)": 91.52, "step": 83360, "token_acc": 0.7715371149763492, "train_speed(iter/s)": 0.135228 }, { "epoch": 1.0817208347255156, "grad_norm": 0.7331925630569458, "learning_rate": 4.737886351252177e-05, "loss": 0.7469449043273926, "memory(GiB)": 91.52, "step": 83365, "token_acc": 0.7796904550542283, "train_speed(iter/s)": 0.135228 }, { "epoch": 1.0817857131271713, "grad_norm": 0.6996310353279114, "learning_rate": 4.7373507150526e-05, "loss": 0.7697345733642578, "memory(GiB)": 91.52, "step": 83370, "token_acc": 0.7746317842491088, "train_speed(iter/s)": 0.135226 }, { "epoch": 1.081850591528827, "grad_norm": 0.7123005986213684, "learning_rate": 4.736815081875575e-05, "loss": 0.7796341419219971, "memory(GiB)": 91.52, "step": 83375, "token_acc": 0.7716955941255007, "train_speed(iter/s)": 0.135226 }, { "epoch": 1.0819154699304827, "grad_norm": 0.6014803051948547, "learning_rate": 4.736279451727265e-05, "loss": 0.7168768405914306, "memory(GiB)": 91.52, "step": 83380, "token_acc": 0.7694180898165718, "train_speed(iter/s)": 0.135225 }, { "epoch": 1.0819803483321384, "grad_norm": 0.7066051363945007, "learning_rate": 4.735743824613837e-05, "loss": 0.7755998611450196, "memory(GiB)": 91.52, "step": 83385, "token_acc": 0.79563618682955, "train_speed(iter/s)": 0.135223 }, { "epoch": 1.082045226733794, "grad_norm": 0.6615769267082214, "learning_rate": 4.7352082005414486e-05, "loss": 0.7818305969238282, "memory(GiB)": 91.52, "step": 83390, "token_acc": 0.7911148111708166, "train_speed(iter/s)": 0.135223 }, { "epoch": 1.0821101051354498, "grad_norm": 0.6516226530075073, "learning_rate": 4.734672579516268e-05, "loss": 0.755506181716919, "memory(GiB)": 91.52, "step": 83395, "token_acc": 0.7792390150347615, "train_speed(iter/s)": 0.135221 }, { "epoch": 1.0821749835371055, "grad_norm": 0.7429364323616028, "learning_rate": 4.734136961544459e-05, "loss": 0.7953577041625977, "memory(GiB)": 91.52, "step": 83400, "token_acc": 0.7872958993948819, "train_speed(iter/s)": 0.135221 }, { "epoch": 1.0822398619387612, "grad_norm": 0.6428013443946838, "learning_rate": 4.733601346632183e-05, "loss": 0.7268950462341308, "memory(GiB)": 91.52, "step": 83405, "token_acc": 0.7818920065450579, "train_speed(iter/s)": 0.135219 }, { "epoch": 1.082304740340417, "grad_norm": 0.7201273441314697, "learning_rate": 4.7330657347856075e-05, "loss": 0.783472728729248, "memory(GiB)": 91.52, "step": 83410, "token_acc": 0.7842869554424724, "train_speed(iter/s)": 0.135218 }, { "epoch": 1.0823696187420726, "grad_norm": 0.7081345319747925, "learning_rate": 4.7325301260108926e-05, "loss": 0.7267887115478515, "memory(GiB)": 91.52, "step": 83415, "token_acc": 0.784839802031779, "train_speed(iter/s)": 0.135217 }, { "epoch": 1.0824344971437283, "grad_norm": 0.737558901309967, "learning_rate": 4.7319945203142066e-05, "loss": 0.742273998260498, "memory(GiB)": 91.52, "step": 83420, "token_acc": 0.7947824922139245, "train_speed(iter/s)": 0.135216 }, { "epoch": 1.082499375545384, "grad_norm": 0.6764589548110962, "learning_rate": 4.731458917701708e-05, "loss": 0.7954602241516113, "memory(GiB)": 91.52, "step": 83425, "token_acc": 0.7634908664320429, "train_speed(iter/s)": 0.135216 }, { "epoch": 1.0825642539470397, "grad_norm": 0.8266412615776062, "learning_rate": 4.7309233181795633e-05, "loss": 0.7599564552307129, "memory(GiB)": 91.52, "step": 83430, "token_acc": 0.8022729078915998, "train_speed(iter/s)": 0.135215 }, { "epoch": 1.0826291323486954, "grad_norm": 0.6546744704246521, "learning_rate": 4.730387721753936e-05, "loss": 0.7733141422271729, "memory(GiB)": 91.52, "step": 83435, "token_acc": 0.7687471196260451, "train_speed(iter/s)": 0.135214 }, { "epoch": 1.082694010750351, "grad_norm": 0.6716168522834778, "learning_rate": 4.729852128430989e-05, "loss": 0.7631056785583497, "memory(GiB)": 91.52, "step": 83440, "token_acc": 0.7841536338546458, "train_speed(iter/s)": 0.135213 }, { "epoch": 1.0827588891520068, "grad_norm": 0.6900060772895813, "learning_rate": 4.7293165382168866e-05, "loss": 0.7536832332611084, "memory(GiB)": 91.52, "step": 83445, "token_acc": 0.8024990437332653, "train_speed(iter/s)": 0.135212 }, { "epoch": 1.0828237675536625, "grad_norm": 0.8342097401618958, "learning_rate": 4.728780951117791e-05, "loss": 0.7407572746276856, "memory(GiB)": 91.52, "step": 83450, "token_acc": 0.7755731156651615, "train_speed(iter/s)": 0.135211 }, { "epoch": 1.0828886459553182, "grad_norm": 0.7173689603805542, "learning_rate": 4.7282453671398705e-05, "loss": 0.7506669044494629, "memory(GiB)": 91.52, "step": 83455, "token_acc": 0.7812225302589763, "train_speed(iter/s)": 0.13521 }, { "epoch": 1.082953524356974, "grad_norm": 0.7111877799034119, "learning_rate": 4.727709786289281e-05, "loss": 0.723011589050293, "memory(GiB)": 91.52, "step": 83460, "token_acc": 0.7905207501236259, "train_speed(iter/s)": 0.13521 }, { "epoch": 1.0830184027586296, "grad_norm": 0.6831153035163879, "learning_rate": 4.7271742085721914e-05, "loss": 0.7497485160827637, "memory(GiB)": 91.52, "step": 83465, "token_acc": 0.7993628128358667, "train_speed(iter/s)": 0.135209 }, { "epoch": 1.0830832811602853, "grad_norm": 0.7293565273284912, "learning_rate": 4.7266386339947634e-05, "loss": 0.7554259300231934, "memory(GiB)": 91.52, "step": 83470, "token_acc": 0.8005955623343138, "train_speed(iter/s)": 0.135208 }, { "epoch": 1.083148159561941, "grad_norm": 0.7481986284255981, "learning_rate": 4.726103062563159e-05, "loss": 0.7909241199493409, "memory(GiB)": 91.52, "step": 83475, "token_acc": 0.773542138883165, "train_speed(iter/s)": 0.135207 }, { "epoch": 1.0832130379635967, "grad_norm": 0.6863642930984497, "learning_rate": 4.7255674942835434e-05, "loss": 0.7868064880371094, "memory(GiB)": 91.52, "step": 83480, "token_acc": 0.7772421080438515, "train_speed(iter/s)": 0.135206 }, { "epoch": 1.0832779163652524, "grad_norm": 0.6255150437355042, "learning_rate": 4.7250319291620795e-05, "loss": 0.7225351333618164, "memory(GiB)": 91.52, "step": 83485, "token_acc": 0.8023959128545423, "train_speed(iter/s)": 0.135205 }, { "epoch": 1.083342794766908, "grad_norm": 0.6449629068374634, "learning_rate": 4.7244963672049335e-05, "loss": 0.7906980991363526, "memory(GiB)": 91.52, "step": 83490, "token_acc": 0.7847708740094177, "train_speed(iter/s)": 0.135204 }, { "epoch": 1.0834076731685638, "grad_norm": 0.7083454132080078, "learning_rate": 4.7239608084182616e-05, "loss": 0.7691605091094971, "memory(GiB)": 91.52, "step": 83495, "token_acc": 0.7888702556555666, "train_speed(iter/s)": 0.135203 }, { "epoch": 1.0834725515702195, "grad_norm": 0.6290249228477478, "learning_rate": 4.723425252808233e-05, "loss": 0.7780146598815918, "memory(GiB)": 91.52, "step": 83500, "token_acc": 0.7514537610204465, "train_speed(iter/s)": 0.135202 }, { "epoch": 1.0835374299718752, "grad_norm": 0.7875682711601257, "learning_rate": 4.722889700381008e-05, "loss": 0.731234359741211, "memory(GiB)": 91.52, "step": 83505, "token_acc": 0.7913109811471539, "train_speed(iter/s)": 0.135201 }, { "epoch": 1.0836023083735309, "grad_norm": 0.6306434869766235, "learning_rate": 4.72235415114275e-05, "loss": 0.7732605457305908, "memory(GiB)": 91.52, "step": 83510, "token_acc": 0.7634875563721396, "train_speed(iter/s)": 0.1352 }, { "epoch": 1.0836671867751866, "grad_norm": 0.6980441808700562, "learning_rate": 4.721818605099623e-05, "loss": 0.7148558139801026, "memory(GiB)": 91.52, "step": 83515, "token_acc": 0.8013182674199624, "train_speed(iter/s)": 0.135199 }, { "epoch": 1.0837320651768423, "grad_norm": 0.6480110883712769, "learning_rate": 4.721283062257789e-05, "loss": 0.7411828994750976, "memory(GiB)": 91.52, "step": 83520, "token_acc": 0.7733733358334896, "train_speed(iter/s)": 0.135198 }, { "epoch": 1.083796943578498, "grad_norm": 0.7291632294654846, "learning_rate": 4.720747522623415e-05, "loss": 0.7711062431335449, "memory(GiB)": 91.52, "step": 83525, "token_acc": 0.7973455416019153, "train_speed(iter/s)": 0.135198 }, { "epoch": 1.0838618219801537, "grad_norm": 0.7331230640411377, "learning_rate": 4.720211986202657e-05, "loss": 0.7499387741088868, "memory(GiB)": 91.52, "step": 83530, "token_acc": 0.763777833554448, "train_speed(iter/s)": 0.135196 }, { "epoch": 1.0839267003818094, "grad_norm": 0.6831837296485901, "learning_rate": 4.719676453001682e-05, "loss": 0.7300036430358887, "memory(GiB)": 91.52, "step": 83535, "token_acc": 0.8082451526087222, "train_speed(iter/s)": 0.135196 }, { "epoch": 1.083991578783465, "grad_norm": 0.759226381778717, "learning_rate": 4.719140923026652e-05, "loss": 0.8010117530822753, "memory(GiB)": 91.52, "step": 83540, "token_acc": 0.793350471293916, "train_speed(iter/s)": 0.135195 }, { "epoch": 1.0840564571851208, "grad_norm": 0.8273898959159851, "learning_rate": 4.718605396283731e-05, "loss": 0.782051944732666, "memory(GiB)": 91.52, "step": 83545, "token_acc": 0.8052038098018465, "train_speed(iter/s)": 0.135194 }, { "epoch": 1.0841213355867765, "grad_norm": 0.7223582863807678, "learning_rate": 4.7180698727790815e-05, "loss": 0.7275089263916016, "memory(GiB)": 91.52, "step": 83550, "token_acc": 0.7890661118816459, "train_speed(iter/s)": 0.135193 }, { "epoch": 1.0841862139884322, "grad_norm": 0.7298301458358765, "learning_rate": 4.717534352518864e-05, "loss": 0.7863505363464356, "memory(GiB)": 91.52, "step": 83555, "token_acc": 0.7856500224921278, "train_speed(iter/s)": 0.135192 }, { "epoch": 1.0842510923900879, "grad_norm": 0.6580654382705688, "learning_rate": 4.7169988355092445e-05, "loss": 0.7450907707214356, "memory(GiB)": 91.52, "step": 83560, "token_acc": 0.7791589258571188, "train_speed(iter/s)": 0.135191 }, { "epoch": 1.0843159707917436, "grad_norm": 0.722905158996582, "learning_rate": 4.7164633217563844e-05, "loss": 0.7533368110656739, "memory(GiB)": 91.52, "step": 83565, "token_acc": 0.7810820217332457, "train_speed(iter/s)": 0.13519 }, { "epoch": 1.0843808491933993, "grad_norm": 0.7290173768997192, "learning_rate": 4.715927811266446e-05, "loss": 0.7489521026611328, "memory(GiB)": 91.52, "step": 83570, "token_acc": 0.7813784246575343, "train_speed(iter/s)": 0.135189 }, { "epoch": 1.084445727595055, "grad_norm": 0.7235912680625916, "learning_rate": 4.7153923040455905e-05, "loss": 0.8029786109924316, "memory(GiB)": 91.52, "step": 83575, "token_acc": 0.7613906609792005, "train_speed(iter/s)": 0.135188 }, { "epoch": 1.0845106059967107, "grad_norm": 0.8222890496253967, "learning_rate": 4.7148568000999835e-05, "loss": 0.7541969299316407, "memory(GiB)": 91.52, "step": 83580, "token_acc": 0.7959134532695814, "train_speed(iter/s)": 0.135187 }, { "epoch": 1.0845754843983664, "grad_norm": 0.7217019200325012, "learning_rate": 4.714321299435786e-05, "loss": 0.8004205703735352, "memory(GiB)": 91.52, "step": 83585, "token_acc": 0.7756124361709004, "train_speed(iter/s)": 0.135186 }, { "epoch": 1.084640362800022, "grad_norm": 0.7833576798439026, "learning_rate": 4.7137858020591594e-05, "loss": 0.759060001373291, "memory(GiB)": 91.52, "step": 83590, "token_acc": 0.7777281923122694, "train_speed(iter/s)": 0.135185 }, { "epoch": 1.0847052412016778, "grad_norm": 0.7132999300956726, "learning_rate": 4.7132503079762685e-05, "loss": 0.7826159477233887, "memory(GiB)": 91.52, "step": 83595, "token_acc": 0.7925179949001001, "train_speed(iter/s)": 0.135185 }, { "epoch": 1.0847701196033335, "grad_norm": 0.7160765528678894, "learning_rate": 4.7127148171932756e-05, "loss": 0.7350916862487793, "memory(GiB)": 91.52, "step": 83600, "token_acc": 0.771836950767602, "train_speed(iter/s)": 0.135184 }, { "epoch": 1.0848349980049892, "grad_norm": 0.7485979199409485, "learning_rate": 4.712179329716341e-05, "loss": 0.7831048011779785, "memory(GiB)": 91.52, "step": 83605, "token_acc": 0.776040384141837, "train_speed(iter/s)": 0.135183 }, { "epoch": 1.0848998764066449, "grad_norm": 0.724871814250946, "learning_rate": 4.711643845551628e-05, "loss": 0.7410794258117676, "memory(GiB)": 91.52, "step": 83610, "token_acc": 0.7820463444380256, "train_speed(iter/s)": 0.135182 }, { "epoch": 1.0849647548083006, "grad_norm": 0.6884841322898865, "learning_rate": 4.711108364705299e-05, "loss": 0.7524913787841797, "memory(GiB)": 91.52, "step": 83615, "token_acc": 0.7982398719906902, "train_speed(iter/s)": 0.135181 }, { "epoch": 1.0850296332099563, "grad_norm": 0.734216034412384, "learning_rate": 4.7105728871835165e-05, "loss": 0.751708984375, "memory(GiB)": 91.52, "step": 83620, "token_acc": 0.7832067954596708, "train_speed(iter/s)": 0.13518 }, { "epoch": 1.085094511611612, "grad_norm": 0.736054003238678, "learning_rate": 4.710037412992442e-05, "loss": 0.744025993347168, "memory(GiB)": 91.52, "step": 83625, "token_acc": 0.7898412469851327, "train_speed(iter/s)": 0.13518 }, { "epoch": 1.0851593900132677, "grad_norm": 0.8289721012115479, "learning_rate": 4.7095019421382385e-05, "loss": 0.8063970565795898, "memory(GiB)": 91.52, "step": 83630, "token_acc": 0.7596766801383276, "train_speed(iter/s)": 0.135179 }, { "epoch": 1.0852242684149234, "grad_norm": 0.6667943000793457, "learning_rate": 4.708966474627069e-05, "loss": 0.7354522705078125, "memory(GiB)": 91.52, "step": 83635, "token_acc": 0.7623645707178162, "train_speed(iter/s)": 0.135177 }, { "epoch": 1.085289146816579, "grad_norm": 0.7124769687652588, "learning_rate": 4.708431010465095e-05, "loss": 0.7841091156005859, "memory(GiB)": 91.52, "step": 83640, "token_acc": 0.7627006890185607, "train_speed(iter/s)": 0.135176 }, { "epoch": 1.0853540252182348, "grad_norm": 0.7094170451164246, "learning_rate": 4.707895549658476e-05, "loss": 0.7619647979736328, "memory(GiB)": 91.52, "step": 83645, "token_acc": 0.7787181826762735, "train_speed(iter/s)": 0.135175 }, { "epoch": 1.0854189036198905, "grad_norm": 0.7353459000587463, "learning_rate": 4.707360092213378e-05, "loss": 0.7462317943572998, "memory(GiB)": 91.52, "step": 83650, "token_acc": 0.7774250770236206, "train_speed(iter/s)": 0.135174 }, { "epoch": 1.0854837820215462, "grad_norm": 0.7058506608009338, "learning_rate": 4.70682463813596e-05, "loss": 0.7536012172698975, "memory(GiB)": 91.52, "step": 83655, "token_acc": 0.787077250693491, "train_speed(iter/s)": 0.135173 }, { "epoch": 1.0855486604232019, "grad_norm": 0.7293525338172913, "learning_rate": 4.706289187432385e-05, "loss": 0.7905006408691406, "memory(GiB)": 91.52, "step": 83660, "token_acc": 0.7607102702283566, "train_speed(iter/s)": 0.135172 }, { "epoch": 1.0856135388248576, "grad_norm": 0.6745395064353943, "learning_rate": 4.705753740108816e-05, "loss": 0.7390915393829346, "memory(GiB)": 91.52, "step": 83665, "token_acc": 0.7910206336055549, "train_speed(iter/s)": 0.135171 }, { "epoch": 1.0856784172265133, "grad_norm": 0.6300926804542542, "learning_rate": 4.705218296171415e-05, "loss": 0.7320192813873291, "memory(GiB)": 91.52, "step": 83670, "token_acc": 0.7856638811513463, "train_speed(iter/s)": 0.135171 }, { "epoch": 1.085743295628169, "grad_norm": 0.7020618319511414, "learning_rate": 4.704682855626342e-05, "loss": 0.7698693752288819, "memory(GiB)": 91.52, "step": 83675, "token_acc": 0.779830548364321, "train_speed(iter/s)": 0.13517 }, { "epoch": 1.0858081740298247, "grad_norm": 0.7328271865844727, "learning_rate": 4.704147418479759e-05, "loss": 0.7888669967651367, "memory(GiB)": 91.52, "step": 83680, "token_acc": 0.7711040224072201, "train_speed(iter/s)": 0.135169 }, { "epoch": 1.0858730524314804, "grad_norm": 0.7174594402313232, "learning_rate": 4.7036119847378286e-05, "loss": 0.7295586109161377, "memory(GiB)": 91.52, "step": 83685, "token_acc": 0.7952402004126142, "train_speed(iter/s)": 0.135168 }, { "epoch": 1.085937930833136, "grad_norm": 0.7265975475311279, "learning_rate": 4.7030765544067126e-05, "loss": 0.7797798156738281, "memory(GiB)": 91.52, "step": 83690, "token_acc": 0.7689976513372225, "train_speed(iter/s)": 0.135168 }, { "epoch": 1.0860028092347918, "grad_norm": 0.6717119216918945, "learning_rate": 4.702541127492572e-05, "loss": 0.6765039443969727, "memory(GiB)": 91.52, "step": 83695, "token_acc": 0.806265570078149, "train_speed(iter/s)": 0.135167 }, { "epoch": 1.0860676876364475, "grad_norm": 0.6887841820716858, "learning_rate": 4.702005704001569e-05, "loss": 0.7519527435302734, "memory(GiB)": 91.52, "step": 83700, "token_acc": 0.785988419068139, "train_speed(iter/s)": 0.135165 }, { "epoch": 1.0861325660381032, "grad_norm": 0.7458674907684326, "learning_rate": 4.701470283939866e-05, "loss": 0.7486268997192382, "memory(GiB)": 91.52, "step": 83705, "token_acc": 0.7837936711176148, "train_speed(iter/s)": 0.135164 }, { "epoch": 1.0861974444397589, "grad_norm": 0.7211663126945496, "learning_rate": 4.700934867313625e-05, "loss": 0.7909507274627685, "memory(GiB)": 91.52, "step": 83710, "token_acc": 0.7729946234619132, "train_speed(iter/s)": 0.135163 }, { "epoch": 1.0862623228414146, "grad_norm": 0.7114121913909912, "learning_rate": 4.7003994541290045e-05, "loss": 0.7460545063018799, "memory(GiB)": 91.52, "step": 83715, "token_acc": 0.7946758039008962, "train_speed(iter/s)": 0.135162 }, { "epoch": 1.0863272012430703, "grad_norm": 0.7645632028579712, "learning_rate": 4.699864044392168e-05, "loss": 0.763819694519043, "memory(GiB)": 91.52, "step": 83720, "token_acc": 0.7849283936365932, "train_speed(iter/s)": 0.135162 }, { "epoch": 1.086392079644726, "grad_norm": 0.6517189145088196, "learning_rate": 4.6993286381092765e-05, "loss": 0.7342618942260742, "memory(GiB)": 91.52, "step": 83725, "token_acc": 0.7935837906289573, "train_speed(iter/s)": 0.13516 }, { "epoch": 1.0864569580463816, "grad_norm": 0.6768571734428406, "learning_rate": 4.698793235286491e-05, "loss": 0.7300808906555176, "memory(GiB)": 91.52, "step": 83730, "token_acc": 0.8142792879634985, "train_speed(iter/s)": 0.135159 }, { "epoch": 1.0865218364480373, "grad_norm": 0.7075420022010803, "learning_rate": 4.698257835929975e-05, "loss": 0.7163904190063477, "memory(GiB)": 91.52, "step": 83735, "token_acc": 0.799363488624052, "train_speed(iter/s)": 0.135159 }, { "epoch": 1.086586714849693, "grad_norm": 0.7672603726387024, "learning_rate": 4.697722440045888e-05, "loss": 0.8100704193115235, "memory(GiB)": 91.52, "step": 83740, "token_acc": 0.7656610061332293, "train_speed(iter/s)": 0.135158 }, { "epoch": 1.0866515932513487, "grad_norm": 0.7139580845832825, "learning_rate": 4.697187047640393e-05, "loss": 0.772024393081665, "memory(GiB)": 91.52, "step": 83745, "token_acc": 0.7853281607733401, "train_speed(iter/s)": 0.135157 }, { "epoch": 1.0867164716530044, "grad_norm": 0.688134491443634, "learning_rate": 4.696651658719648e-05, "loss": 0.7602095603942871, "memory(GiB)": 91.52, "step": 83750, "token_acc": 0.7833230642842473, "train_speed(iter/s)": 0.135156 }, { "epoch": 1.0867813500546601, "grad_norm": 0.7190819382667542, "learning_rate": 4.696116273289817e-05, "loss": 0.7541271209716797, "memory(GiB)": 91.52, "step": 83755, "token_acc": 0.7785677847213366, "train_speed(iter/s)": 0.135156 }, { "epoch": 1.0868462284563158, "grad_norm": 0.6998558044433594, "learning_rate": 4.69558089135706e-05, "loss": 0.7775677680969239, "memory(GiB)": 91.52, "step": 83760, "token_acc": 0.7738757533611498, "train_speed(iter/s)": 0.135155 }, { "epoch": 1.0869111068579715, "grad_norm": 0.7229076623916626, "learning_rate": 4.695045512927538e-05, "loss": 0.8038585662841797, "memory(GiB)": 91.52, "step": 83765, "token_acc": 0.7775997976647135, "train_speed(iter/s)": 0.135154 }, { "epoch": 1.0869759852596272, "grad_norm": 0.6884693503379822, "learning_rate": 4.6945101380074134e-05, "loss": 0.7819883346557617, "memory(GiB)": 91.52, "step": 83770, "token_acc": 0.7869607884968213, "train_speed(iter/s)": 0.135154 }, { "epoch": 1.087040863661283, "grad_norm": 0.7569713592529297, "learning_rate": 4.693974766602847e-05, "loss": 0.7653257369995117, "memory(GiB)": 91.52, "step": 83775, "token_acc": 0.7969122407598449, "train_speed(iter/s)": 0.135152 }, { "epoch": 1.0871057420629386, "grad_norm": 0.7255074381828308, "learning_rate": 4.6934393987199997e-05, "loss": 0.7393169403076172, "memory(GiB)": 91.52, "step": 83780, "token_acc": 0.7897370832452245, "train_speed(iter/s)": 0.135152 }, { "epoch": 1.0871706204645943, "grad_norm": 0.7741292119026184, "learning_rate": 4.6929040343650306e-05, "loss": 0.769505786895752, "memory(GiB)": 91.52, "step": 83785, "token_acc": 0.7876995381249493, "train_speed(iter/s)": 0.135151 }, { "epoch": 1.0872354988662498, "grad_norm": 0.6508811116218567, "learning_rate": 4.6923686735441026e-05, "loss": 0.7337292194366455, "memory(GiB)": 91.52, "step": 83790, "token_acc": 0.7879022485525784, "train_speed(iter/s)": 0.13515 }, { "epoch": 1.0873003772679057, "grad_norm": 0.7178980708122253, "learning_rate": 4.691833316263376e-05, "loss": 0.7814337730407714, "memory(GiB)": 91.52, "step": 83795, "token_acc": 0.7800699643639455, "train_speed(iter/s)": 0.13515 }, { "epoch": 1.0873652556695612, "grad_norm": 0.8196853399276733, "learning_rate": 4.691297962529011e-05, "loss": 0.7653525352478028, "memory(GiB)": 91.52, "step": 83800, "token_acc": 0.7876604940887326, "train_speed(iter/s)": 0.135149 }, { "epoch": 1.0874301340712171, "grad_norm": 0.7036645412445068, "learning_rate": 4.6907626123471704e-05, "loss": 0.7823380947113037, "memory(GiB)": 91.52, "step": 83805, "token_acc": 0.7700540074881548, "train_speed(iter/s)": 0.135148 }, { "epoch": 1.0874950124728726, "grad_norm": 0.674222469329834, "learning_rate": 4.690227265724012e-05, "loss": 0.7187233924865722, "memory(GiB)": 91.52, "step": 83810, "token_acc": 0.7906431918216669, "train_speed(iter/s)": 0.135147 }, { "epoch": 1.0875598908745283, "grad_norm": 0.6127725839614868, "learning_rate": 4.689691922665702e-05, "loss": 0.7175263404846192, "memory(GiB)": 91.52, "step": 83815, "token_acc": 0.7815772514902529, "train_speed(iter/s)": 0.135146 }, { "epoch": 1.087624769276184, "grad_norm": 0.7436112761497498, "learning_rate": 4.689156583178394e-05, "loss": 0.7723641395568848, "memory(GiB)": 91.52, "step": 83820, "token_acc": 0.7707497247576092, "train_speed(iter/s)": 0.135146 }, { "epoch": 1.0876896476778397, "grad_norm": 0.6778929829597473, "learning_rate": 4.688621247268254e-05, "loss": 0.7529726982116699, "memory(GiB)": 91.52, "step": 83825, "token_acc": 0.7811431544522075, "train_speed(iter/s)": 0.135145 }, { "epoch": 1.0877545260794954, "grad_norm": 0.644203245639801, "learning_rate": 4.68808591494144e-05, "loss": 0.7415919780731202, "memory(GiB)": 91.52, "step": 83830, "token_acc": 0.7743813106315581, "train_speed(iter/s)": 0.135144 }, { "epoch": 1.087819404481151, "grad_norm": 0.6635117530822754, "learning_rate": 4.687550586204111e-05, "loss": 0.746954870223999, "memory(GiB)": 91.52, "step": 83835, "token_acc": 0.7878140575729834, "train_speed(iter/s)": 0.135143 }, { "epoch": 1.0878842828828068, "grad_norm": 0.686952531337738, "learning_rate": 4.687015261062432e-05, "loss": 0.7637430667877197, "memory(GiB)": 91.52, "step": 83840, "token_acc": 0.7824240967716094, "train_speed(iter/s)": 0.135143 }, { "epoch": 1.0879491612844625, "grad_norm": 0.7204899787902832, "learning_rate": 4.68647993952256e-05, "loss": 0.7925768375396729, "memory(GiB)": 91.52, "step": 83845, "token_acc": 0.7717854851492622, "train_speed(iter/s)": 0.135142 }, { "epoch": 1.0880140396861182, "grad_norm": 0.758210301399231, "learning_rate": 4.6859446215906604e-05, "loss": 0.755066967010498, "memory(GiB)": 91.52, "step": 83850, "token_acc": 0.7890849110728628, "train_speed(iter/s)": 0.135141 }, { "epoch": 1.088078918087774, "grad_norm": 0.7250353693962097, "learning_rate": 4.685409307272886e-05, "loss": 0.7666645050048828, "memory(GiB)": 91.52, "step": 83855, "token_acc": 0.7749460733402572, "train_speed(iter/s)": 0.13514 }, { "epoch": 1.0881437964894296, "grad_norm": 0.7685105800628662, "learning_rate": 4.6848739965754025e-05, "loss": 0.7490846633911132, "memory(GiB)": 91.52, "step": 83860, "token_acc": 0.8019788374330081, "train_speed(iter/s)": 0.135139 }, { "epoch": 1.0882086748910853, "grad_norm": 0.7402207851409912, "learning_rate": 4.6843386895043684e-05, "loss": 0.7285849571228027, "memory(GiB)": 91.52, "step": 83865, "token_acc": 0.7930837285968985, "train_speed(iter/s)": 0.135138 }, { "epoch": 1.088273553292741, "grad_norm": 0.6523149013519287, "learning_rate": 4.6838033860659435e-05, "loss": 0.7916749000549317, "memory(GiB)": 91.52, "step": 83870, "token_acc": 0.7896935933147632, "train_speed(iter/s)": 0.135137 }, { "epoch": 1.0883384316943967, "grad_norm": 0.6795504689216614, "learning_rate": 4.68326808626629e-05, "loss": 0.786933422088623, "memory(GiB)": 91.52, "step": 83875, "token_acc": 0.787680983299191, "train_speed(iter/s)": 0.135136 }, { "epoch": 1.0884033100960524, "grad_norm": 0.6936811804771423, "learning_rate": 4.682732790111566e-05, "loss": 0.7842128276824951, "memory(GiB)": 91.52, "step": 83880, "token_acc": 0.7709794800615223, "train_speed(iter/s)": 0.135135 }, { "epoch": 1.088468188497708, "grad_norm": 0.6651667356491089, "learning_rate": 4.6821974976079334e-05, "loss": 0.7762865543365478, "memory(GiB)": 91.52, "step": 83885, "token_acc": 0.7838186124268598, "train_speed(iter/s)": 0.135134 }, { "epoch": 1.0885330668993638, "grad_norm": 0.6079820990562439, "learning_rate": 4.6816622087615525e-05, "loss": 0.7461236953735352, "memory(GiB)": 91.52, "step": 83890, "token_acc": 0.754486631668905, "train_speed(iter/s)": 0.135133 }, { "epoch": 1.0885979453010195, "grad_norm": 0.7016441226005554, "learning_rate": 4.6811269235785816e-05, "loss": 0.7741233348846436, "memory(GiB)": 91.52, "step": 83895, "token_acc": 0.7713566948942352, "train_speed(iter/s)": 0.135132 }, { "epoch": 1.0886628237026752, "grad_norm": 0.7102477550506592, "learning_rate": 4.680591642065182e-05, "loss": 0.7626290798187256, "memory(GiB)": 91.52, "step": 83900, "token_acc": 0.7749874286390392, "train_speed(iter/s)": 0.135131 }, { "epoch": 1.088727702104331, "grad_norm": 0.7429402470588684, "learning_rate": 4.680056364227511e-05, "loss": 0.749307107925415, "memory(GiB)": 91.52, "step": 83905, "token_acc": 0.7796418827808962, "train_speed(iter/s)": 0.13513 }, { "epoch": 1.0887925805059866, "grad_norm": 0.7637196183204651, "learning_rate": 4.679521090071733e-05, "loss": 0.7717360019683838, "memory(GiB)": 91.52, "step": 83910, "token_acc": 0.7740050097411634, "train_speed(iter/s)": 0.13513 }, { "epoch": 1.0888574589076423, "grad_norm": 0.7249695658683777, "learning_rate": 4.678985819604004e-05, "loss": 0.7658113002777099, "memory(GiB)": 91.52, "step": 83915, "token_acc": 0.7896346531032878, "train_speed(iter/s)": 0.135129 }, { "epoch": 1.088922337309298, "grad_norm": 0.7679235339164734, "learning_rate": 4.678450552830487e-05, "loss": 0.8000114440917969, "memory(GiB)": 91.52, "step": 83920, "token_acc": 0.7906960597118555, "train_speed(iter/s)": 0.135129 }, { "epoch": 1.0889872157109537, "grad_norm": 0.6583437323570251, "learning_rate": 4.677915289757341e-05, "loss": 0.7047823429107666, "memory(GiB)": 91.52, "step": 83925, "token_acc": 0.7898036686729722, "train_speed(iter/s)": 0.135128 }, { "epoch": 1.0890520941126094, "grad_norm": 0.6828504800796509, "learning_rate": 4.677380030390724e-05, "loss": 0.7499141216278076, "memory(GiB)": 91.52, "step": 83930, "token_acc": 0.7844724117828606, "train_speed(iter/s)": 0.135127 }, { "epoch": 1.089116972514265, "grad_norm": 0.7467051148414612, "learning_rate": 4.6768447747367974e-05, "loss": 0.7573147773742676, "memory(GiB)": 91.52, "step": 83935, "token_acc": 0.8021763285949653, "train_speed(iter/s)": 0.135126 }, { "epoch": 1.0891818509159208, "grad_norm": 0.7259759306907654, "learning_rate": 4.676309522801718e-05, "loss": 0.7792178153991699, "memory(GiB)": 91.52, "step": 83940, "token_acc": 0.7732302359685376, "train_speed(iter/s)": 0.135125 }, { "epoch": 1.0892467293175765, "grad_norm": 0.6852993369102478, "learning_rate": 4.67577427459165e-05, "loss": 0.762493371963501, "memory(GiB)": 91.52, "step": 83945, "token_acc": 0.7719379568550544, "train_speed(iter/s)": 0.135124 }, { "epoch": 1.0893116077192322, "grad_norm": 0.7078462839126587, "learning_rate": 4.67523903011275e-05, "loss": 0.7962488174438477, "memory(GiB)": 91.52, "step": 83950, "token_acc": 0.7963904336256014, "train_speed(iter/s)": 0.135123 }, { "epoch": 1.0893764861208879, "grad_norm": 0.681954026222229, "learning_rate": 4.674703789371179e-05, "loss": 0.7473438262939454, "memory(GiB)": 91.52, "step": 83955, "token_acc": 0.7801104972375691, "train_speed(iter/s)": 0.135122 }, { "epoch": 1.0894413645225436, "grad_norm": 0.7322277426719666, "learning_rate": 4.674168552373096e-05, "loss": 0.7994722366333008, "memory(GiB)": 91.52, "step": 83960, "token_acc": 0.7807299270072993, "train_speed(iter/s)": 0.135121 }, { "epoch": 1.0895062429241993, "grad_norm": 0.6879755258560181, "learning_rate": 4.67363331912466e-05, "loss": 0.7573498249053955, "memory(GiB)": 91.52, "step": 83965, "token_acc": 0.7914377130138416, "train_speed(iter/s)": 0.13512 }, { "epoch": 1.089571121325855, "grad_norm": 0.7503846883773804, "learning_rate": 4.67309808963203e-05, "loss": 0.7776905059814453, "memory(GiB)": 91.52, "step": 83970, "token_acc": 0.7745038913299446, "train_speed(iter/s)": 0.135119 }, { "epoch": 1.0896359997275107, "grad_norm": 0.7703549861907959, "learning_rate": 4.672562863901367e-05, "loss": 0.7182627201080323, "memory(GiB)": 91.52, "step": 83975, "token_acc": 0.8141217115917515, "train_speed(iter/s)": 0.135118 }, { "epoch": 1.0897008781291664, "grad_norm": 0.6530188918113708, "learning_rate": 4.6720276419388294e-05, "loss": 0.7930714607238769, "memory(GiB)": 91.52, "step": 83980, "token_acc": 0.8027664479517646, "train_speed(iter/s)": 0.135117 }, { "epoch": 1.089765756530822, "grad_norm": 0.6159954071044922, "learning_rate": 4.6714924237505756e-05, "loss": 0.7548874378204345, "memory(GiB)": 91.52, "step": 83985, "token_acc": 0.7747670300196005, "train_speed(iter/s)": 0.135117 }, { "epoch": 1.0898306349324778, "grad_norm": 0.7659125924110413, "learning_rate": 4.670957209342767e-05, "loss": 0.7847798347473145, "memory(GiB)": 91.52, "step": 83990, "token_acc": 0.7798498498498498, "train_speed(iter/s)": 0.135115 }, { "epoch": 1.0898955133341335, "grad_norm": 0.7412845492362976, "learning_rate": 4.6704219987215624e-05, "loss": 0.7724013805389405, "memory(GiB)": 91.52, "step": 83995, "token_acc": 0.7771988051775639, "train_speed(iter/s)": 0.135114 }, { "epoch": 1.0899603917357892, "grad_norm": 0.7644646763801575, "learning_rate": 4.6698867918931195e-05, "loss": 0.7448330879211426, "memory(GiB)": 91.52, "step": 84000, "token_acc": 0.8062532238225608, "train_speed(iter/s)": 0.135113 }, { "epoch": 1.0900252701374449, "grad_norm": 0.8631498217582703, "learning_rate": 4.6693515888635964e-05, "loss": 0.7430645942687988, "memory(GiB)": 91.52, "step": 84005, "token_acc": 0.7702149817749014, "train_speed(iter/s)": 0.135112 }, { "epoch": 1.0900901485391006, "grad_norm": 0.6160331964492798, "learning_rate": 4.668816389639156e-05, "loss": 0.7330048084259033, "memory(GiB)": 91.52, "step": 84010, "token_acc": 0.7778720567289917, "train_speed(iter/s)": 0.135111 }, { "epoch": 1.0901550269407563, "grad_norm": 0.7008458375930786, "learning_rate": 4.6682811942259545e-05, "loss": 0.7663430213928223, "memory(GiB)": 91.52, "step": 84015, "token_acc": 0.7881984036488028, "train_speed(iter/s)": 0.13511 }, { "epoch": 1.090219905342412, "grad_norm": 0.7504372000694275, "learning_rate": 4.667746002630151e-05, "loss": 0.7670349597930908, "memory(GiB)": 91.52, "step": 84020, "token_acc": 0.7893894266567386, "train_speed(iter/s)": 0.13511 }, { "epoch": 1.0902847837440677, "grad_norm": 0.6084667444229126, "learning_rate": 4.667210814857906e-05, "loss": 0.7393409729003906, "memory(GiB)": 91.52, "step": 84025, "token_acc": 0.7879533108207312, "train_speed(iter/s)": 0.135109 }, { "epoch": 1.0903496621457234, "grad_norm": 0.6423932313919067, "learning_rate": 4.666675630915378e-05, "loss": 0.7544540882110595, "memory(GiB)": 91.52, "step": 84030, "token_acc": 0.7933240223463687, "train_speed(iter/s)": 0.135108 }, { "epoch": 1.090414540547379, "grad_norm": 0.6262097358703613, "learning_rate": 4.6661404508087256e-05, "loss": 0.7403905868530274, "memory(GiB)": 91.52, "step": 84035, "token_acc": 0.7840035532474632, "train_speed(iter/s)": 0.135107 }, { "epoch": 1.0904794189490348, "grad_norm": 0.7401875257492065, "learning_rate": 4.665605274544106e-05, "loss": 0.763048505783081, "memory(GiB)": 91.52, "step": 84040, "token_acc": 0.7897993105146516, "train_speed(iter/s)": 0.135106 }, { "epoch": 1.0905442973506905, "grad_norm": 0.6878225207328796, "learning_rate": 4.6650701021276797e-05, "loss": 0.808993911743164, "memory(GiB)": 91.52, "step": 84045, "token_acc": 0.7782663572330116, "train_speed(iter/s)": 0.135106 }, { "epoch": 1.0906091757523462, "grad_norm": 0.7275418639183044, "learning_rate": 4.6645349335656056e-05, "loss": 0.7626148700714112, "memory(GiB)": 91.52, "step": 84050, "token_acc": 0.7813585032867015, "train_speed(iter/s)": 0.135105 }, { "epoch": 1.0906740541540019, "grad_norm": 0.7432764768600464, "learning_rate": 4.6639997688640414e-05, "loss": 0.7607407093048095, "memory(GiB)": 91.52, "step": 84055, "token_acc": 0.7800565986063354, "train_speed(iter/s)": 0.135104 }, { "epoch": 1.0907389325556576, "grad_norm": 0.6248734593391418, "learning_rate": 4.663464608029147e-05, "loss": 0.7746450424194335, "memory(GiB)": 91.52, "step": 84060, "token_acc": 0.7972940653934196, "train_speed(iter/s)": 0.135103 }, { "epoch": 1.0908038109573133, "grad_norm": 0.7805988788604736, "learning_rate": 4.66292945106708e-05, "loss": 0.7578133106231689, "memory(GiB)": 91.52, "step": 84065, "token_acc": 0.7784794668713583, "train_speed(iter/s)": 0.135103 }, { "epoch": 1.090868689358969, "grad_norm": 0.7150788903236389, "learning_rate": 4.6623942979839996e-05, "loss": 0.7486633777618408, "memory(GiB)": 91.52, "step": 84070, "token_acc": 0.7808126006801503, "train_speed(iter/s)": 0.135102 }, { "epoch": 1.0909335677606247, "grad_norm": 0.7011120319366455, "learning_rate": 4.661859148786062e-05, "loss": 0.7339225292205811, "memory(GiB)": 91.52, "step": 84075, "token_acc": 0.7743091425619835, "train_speed(iter/s)": 0.135101 }, { "epoch": 1.0909984461622804, "grad_norm": 0.7313262224197388, "learning_rate": 4.661324003479429e-05, "loss": 0.7919234752655029, "memory(GiB)": 91.52, "step": 84080, "token_acc": 0.7725975601055861, "train_speed(iter/s)": 0.1351 }, { "epoch": 1.091063324563936, "grad_norm": 0.7538678050041199, "learning_rate": 4.660788862070258e-05, "loss": 0.7594001770019532, "memory(GiB)": 91.52, "step": 84085, "token_acc": 0.7758845273719416, "train_speed(iter/s)": 0.135099 }, { "epoch": 1.0911282029655918, "grad_norm": 0.7158052921295166, "learning_rate": 4.660253724564706e-05, "loss": 0.7731870174407959, "memory(GiB)": 91.52, "step": 84090, "token_acc": 0.753636232718894, "train_speed(iter/s)": 0.135099 }, { "epoch": 1.0911930813672475, "grad_norm": 0.7507984638214111, "learning_rate": 4.6597185909689323e-05, "loss": 0.7439030170440674, "memory(GiB)": 91.52, "step": 84095, "token_acc": 0.7733274159547321, "train_speed(iter/s)": 0.135098 }, { "epoch": 1.0912579597689032, "grad_norm": 0.7354370951652527, "learning_rate": 4.659183461289095e-05, "loss": 0.7507080078125, "memory(GiB)": 91.52, "step": 84100, "token_acc": 0.7858933173297561, "train_speed(iter/s)": 0.135097 }, { "epoch": 1.0913228381705589, "grad_norm": 0.7866997718811035, "learning_rate": 4.658648335531355e-05, "loss": 0.7697052955627441, "memory(GiB)": 91.52, "step": 84105, "token_acc": 0.7790074693989285, "train_speed(iter/s)": 0.135096 }, { "epoch": 1.0913877165722146, "grad_norm": 0.6959559917449951, "learning_rate": 4.6581132137018646e-05, "loss": 0.7420958995819091, "memory(GiB)": 91.52, "step": 84110, "token_acc": 0.7817781043350478, "train_speed(iter/s)": 0.135096 }, { "epoch": 1.0914525949738703, "grad_norm": 0.7559289932250977, "learning_rate": 4.657578095806787e-05, "loss": 0.7554577827453614, "memory(GiB)": 91.52, "step": 84115, "token_acc": 0.7760673441325419, "train_speed(iter/s)": 0.135094 }, { "epoch": 1.091517473375526, "grad_norm": 0.6574306488037109, "learning_rate": 4.6570429818522784e-05, "loss": 0.7469432353973389, "memory(GiB)": 91.52, "step": 84120, "token_acc": 0.7848101265822784, "train_speed(iter/s)": 0.135093 }, { "epoch": 1.0915823517771817, "grad_norm": 0.7424772381782532, "learning_rate": 4.656507871844496e-05, "loss": 0.745945405960083, "memory(GiB)": 91.52, "step": 84125, "token_acc": 0.7918689320388349, "train_speed(iter/s)": 0.135093 }, { "epoch": 1.0916472301788374, "grad_norm": 0.7011612057685852, "learning_rate": 4.6559727657896e-05, "loss": 0.7675946235656739, "memory(GiB)": 91.52, "step": 84130, "token_acc": 0.79657440298245, "train_speed(iter/s)": 0.135092 }, { "epoch": 1.091712108580493, "grad_norm": 0.621044397354126, "learning_rate": 4.6554376636937474e-05, "loss": 0.7687313079833984, "memory(GiB)": 91.52, "step": 84135, "token_acc": 0.7866343230038607, "train_speed(iter/s)": 0.135091 }, { "epoch": 1.0917769869821488, "grad_norm": 0.6718212962150574, "learning_rate": 4.6549025655630965e-05, "loss": 0.7846868515014649, "memory(GiB)": 91.52, "step": 84140, "token_acc": 0.7891802392376833, "train_speed(iter/s)": 0.13509 }, { "epoch": 1.0918418653838045, "grad_norm": 0.6572864651679993, "learning_rate": 4.654367471403803e-05, "loss": 0.7577505111694336, "memory(GiB)": 91.52, "step": 84145, "token_acc": 0.7765650741350906, "train_speed(iter/s)": 0.135089 }, { "epoch": 1.0919067437854602, "grad_norm": 0.6612402200698853, "learning_rate": 4.6538323812220284e-05, "loss": 0.7672102451324463, "memory(GiB)": 91.52, "step": 84150, "token_acc": 0.782562265320886, "train_speed(iter/s)": 0.135088 }, { "epoch": 1.0919716221871159, "grad_norm": 0.7258751392364502, "learning_rate": 4.653297295023927e-05, "loss": 0.7514798164367675, "memory(GiB)": 91.52, "step": 84155, "token_acc": 0.7814717839178668, "train_speed(iter/s)": 0.135087 }, { "epoch": 1.0920365005887716, "grad_norm": 0.6925880908966064, "learning_rate": 4.652762212815658e-05, "loss": 0.7715526580810547, "memory(GiB)": 91.52, "step": 84160, "token_acc": 0.7709051724137931, "train_speed(iter/s)": 0.135087 }, { "epoch": 1.0921013789904273, "grad_norm": 0.7288771271705627, "learning_rate": 4.652227134603379e-05, "loss": 0.7479022979736328, "memory(GiB)": 91.52, "step": 84165, "token_acc": 0.7791706169458197, "train_speed(iter/s)": 0.135086 }, { "epoch": 1.092166257392083, "grad_norm": 0.7066553235054016, "learning_rate": 4.651692060393248e-05, "loss": 0.768787145614624, "memory(GiB)": 91.52, "step": 84170, "token_acc": 0.7790658413055712, "train_speed(iter/s)": 0.135085 }, { "epoch": 1.0922311357937387, "grad_norm": 0.6618536710739136, "learning_rate": 4.651156990191425e-05, "loss": 0.7277009010314941, "memory(GiB)": 91.52, "step": 84175, "token_acc": 0.7988547131639406, "train_speed(iter/s)": 0.135084 }, { "epoch": 1.0922960141953943, "grad_norm": 0.6647579073905945, "learning_rate": 4.650621924004061e-05, "loss": 0.7517653465270996, "memory(GiB)": 91.52, "step": 84180, "token_acc": 0.7820472440944882, "train_speed(iter/s)": 0.135083 }, { "epoch": 1.09236089259705, "grad_norm": 0.7398457527160645, "learning_rate": 4.65008686183732e-05, "loss": 0.7406118869781494, "memory(GiB)": 91.52, "step": 84185, "token_acc": 0.7650933862334843, "train_speed(iter/s)": 0.135082 }, { "epoch": 1.0924257709987057, "grad_norm": 0.6976025700569153, "learning_rate": 4.649551803697355e-05, "loss": 0.7438020706176758, "memory(GiB)": 91.52, "step": 84190, "token_acc": 0.7817650584357207, "train_speed(iter/s)": 0.135081 }, { "epoch": 1.0924906494003614, "grad_norm": 0.6453362703323364, "learning_rate": 4.649016749590325e-05, "loss": 0.775240421295166, "memory(GiB)": 91.52, "step": 84195, "token_acc": 0.7854962941432931, "train_speed(iter/s)": 0.13508 }, { "epoch": 1.0925555278020171, "grad_norm": 0.7100245356559753, "learning_rate": 4.64848169952239e-05, "loss": 0.7666431427001953, "memory(GiB)": 91.52, "step": 84200, "token_acc": 0.7785570365778053, "train_speed(iter/s)": 0.13508 }, { "epoch": 1.0926204062036728, "grad_norm": 0.7103404998779297, "learning_rate": 4.647946653499702e-05, "loss": 0.7499970436096192, "memory(GiB)": 91.52, "step": 84205, "token_acc": 0.7705223880597015, "train_speed(iter/s)": 0.135079 }, { "epoch": 1.0926852846053285, "grad_norm": 0.6954526305198669, "learning_rate": 4.6474116115284254e-05, "loss": 0.7763428211212158, "memory(GiB)": 91.52, "step": 84210, "token_acc": 0.7782711066263034, "train_speed(iter/s)": 0.135078 }, { "epoch": 1.0927501630069842, "grad_norm": 0.7130894660949707, "learning_rate": 4.64687657361471e-05, "loss": 0.747838020324707, "memory(GiB)": 91.52, "step": 84215, "token_acc": 0.7708315516405827, "train_speed(iter/s)": 0.135077 }, { "epoch": 1.09281504140864, "grad_norm": 0.6716338396072388, "learning_rate": 4.646341539764717e-05, "loss": 0.7648233413696289, "memory(GiB)": 91.52, "step": 84220, "token_acc": 0.7862896703438905, "train_speed(iter/s)": 0.135076 }, { "epoch": 1.0928799198102956, "grad_norm": 0.6761428713798523, "learning_rate": 4.645806509984604e-05, "loss": 0.7400104522705078, "memory(GiB)": 91.52, "step": 84225, "token_acc": 0.7862945492662474, "train_speed(iter/s)": 0.135076 }, { "epoch": 1.0929447982119513, "grad_norm": 0.6993959546089172, "learning_rate": 4.6452714842805245e-05, "loss": 0.7832282066345215, "memory(GiB)": 91.52, "step": 84230, "token_acc": 0.7792662505708632, "train_speed(iter/s)": 0.135075 }, { "epoch": 1.093009676613607, "grad_norm": 0.7966716289520264, "learning_rate": 4.6447364626586395e-05, "loss": 0.7668935775756835, "memory(GiB)": 91.52, "step": 84235, "token_acc": 0.7732991284790554, "train_speed(iter/s)": 0.135074 }, { "epoch": 1.0930745550152627, "grad_norm": 0.6924362182617188, "learning_rate": 4.6442014451251035e-05, "loss": 0.8123237609863281, "memory(GiB)": 91.52, "step": 84240, "token_acc": 0.7603571307788562, "train_speed(iter/s)": 0.135074 }, { "epoch": 1.0931394334169184, "grad_norm": 0.6599434018135071, "learning_rate": 4.6436664316860745e-05, "loss": 0.7602365970611572, "memory(GiB)": 91.52, "step": 84245, "token_acc": 0.7828261482107636, "train_speed(iter/s)": 0.135073 }, { "epoch": 1.0932043118185741, "grad_norm": 0.7505150437355042, "learning_rate": 4.6431314223477116e-05, "loss": 0.7563220977783203, "memory(GiB)": 91.52, "step": 84250, "token_acc": 0.794168872788073, "train_speed(iter/s)": 0.135072 }, { "epoch": 1.0932691902202298, "grad_norm": 0.7125751972198486, "learning_rate": 4.6425964171161676e-05, "loss": 0.7446544647216797, "memory(GiB)": 91.52, "step": 84255, "token_acc": 0.7889708925369017, "train_speed(iter/s)": 0.135071 }, { "epoch": 1.0933340686218855, "grad_norm": 0.7540298104286194, "learning_rate": 4.642061415997601e-05, "loss": 0.7923094272613526, "memory(GiB)": 91.52, "step": 84260, "token_acc": 0.7794072191503245, "train_speed(iter/s)": 0.13507 }, { "epoch": 1.093398947023541, "grad_norm": 0.6545841097831726, "learning_rate": 4.6415264189981674e-05, "loss": 0.7519543647766114, "memory(GiB)": 91.52, "step": 84265, "token_acc": 0.7662115432604898, "train_speed(iter/s)": 0.135069 }, { "epoch": 1.093463825425197, "grad_norm": 0.7813401222229004, "learning_rate": 4.6409914261240264e-05, "loss": 0.7880143642425537, "memory(GiB)": 91.52, "step": 84270, "token_acc": 0.7698823011769882, "train_speed(iter/s)": 0.135068 }, { "epoch": 1.0935287038268524, "grad_norm": 0.7624314427375793, "learning_rate": 4.640456437381331e-05, "loss": 0.7728432178497314, "memory(GiB)": 91.52, "step": 84275, "token_acc": 0.7855212539344418, "train_speed(iter/s)": 0.135067 }, { "epoch": 1.0935935822285083, "grad_norm": 0.6449415683746338, "learning_rate": 4.639921452776241e-05, "loss": 0.7622962951660156, "memory(GiB)": 91.52, "step": 84280, "token_acc": 0.7621552850560485, "train_speed(iter/s)": 0.135066 }, { "epoch": 1.0936584606301638, "grad_norm": 0.6997420787811279, "learning_rate": 4.639386472314914e-05, "loss": 0.7539342880249024, "memory(GiB)": 91.52, "step": 84285, "token_acc": 0.7736157395539606, "train_speed(iter/s)": 0.135066 }, { "epoch": 1.0937233390318195, "grad_norm": 0.6577394008636475, "learning_rate": 4.6388514960035026e-05, "loss": 0.7817493438720703, "memory(GiB)": 91.52, "step": 84290, "token_acc": 0.7831436372017161, "train_speed(iter/s)": 0.135065 }, { "epoch": 1.0937882174334752, "grad_norm": 0.7450742125511169, "learning_rate": 4.638316523848164e-05, "loss": 0.7327295303344726, "memory(GiB)": 91.52, "step": 84295, "token_acc": 0.7904433116086984, "train_speed(iter/s)": 0.135064 }, { "epoch": 1.093853095835131, "grad_norm": 0.7092449069023132, "learning_rate": 4.637781555855055e-05, "loss": 0.7614747524261475, "memory(GiB)": 91.52, "step": 84300, "token_acc": 0.7849251487219716, "train_speed(iter/s)": 0.135063 }, { "epoch": 1.0939179742367866, "grad_norm": 0.7029449939727783, "learning_rate": 4.6372465920303345e-05, "loss": 0.7572274208068848, "memory(GiB)": 91.52, "step": 84305, "token_acc": 0.7892429724464236, "train_speed(iter/s)": 0.135063 }, { "epoch": 1.0939828526384423, "grad_norm": 0.7320541143417358, "learning_rate": 4.636711632380155e-05, "loss": 0.781137228012085, "memory(GiB)": 91.52, "step": 84310, "token_acc": 0.7743412709595604, "train_speed(iter/s)": 0.135062 }, { "epoch": 1.094047731040098, "grad_norm": 0.7429816722869873, "learning_rate": 4.636176676910676e-05, "loss": 0.7189790725708007, "memory(GiB)": 91.52, "step": 84315, "token_acc": 0.7926090862924333, "train_speed(iter/s)": 0.135062 }, { "epoch": 1.0941126094417537, "grad_norm": 0.7181299924850464, "learning_rate": 4.6356417256280534e-05, "loss": 0.7761359214782715, "memory(GiB)": 91.52, "step": 84320, "token_acc": 0.7888994654755803, "train_speed(iter/s)": 0.135061 }, { "epoch": 1.0941774878434094, "grad_norm": 0.6437972784042358, "learning_rate": 4.635106778538441e-05, "loss": 0.7400152206420898, "memory(GiB)": 91.52, "step": 84325, "token_acc": 0.7893419429732279, "train_speed(iter/s)": 0.13506 }, { "epoch": 1.094242366245065, "grad_norm": 0.7104406356811523, "learning_rate": 4.6345718356479964e-05, "loss": 0.7827917098999023, "memory(GiB)": 91.52, "step": 84330, "token_acc": 0.7812963130992348, "train_speed(iter/s)": 0.135058 }, { "epoch": 1.0943072446467208, "grad_norm": 0.6394675970077515, "learning_rate": 4.634036896962874e-05, "loss": 0.7550418853759766, "memory(GiB)": 91.52, "step": 84335, "token_acc": 0.7971511192031702, "train_speed(iter/s)": 0.135058 }, { "epoch": 1.0943721230483765, "grad_norm": 0.719878077507019, "learning_rate": 4.633501962489234e-05, "loss": 0.7814511299133301, "memory(GiB)": 91.52, "step": 84340, "token_acc": 0.7968880244512364, "train_speed(iter/s)": 0.135057 }, { "epoch": 1.0944370014500322, "grad_norm": 0.6810106635093689, "learning_rate": 4.632967032233227e-05, "loss": 0.7665788173675537, "memory(GiB)": 91.52, "step": 84345, "token_acc": 0.7841017245974036, "train_speed(iter/s)": 0.135056 }, { "epoch": 1.094501879851688, "grad_norm": 0.6821208596229553, "learning_rate": 4.632432106201015e-05, "loss": 0.7484292030334473, "memory(GiB)": 91.52, "step": 84350, "token_acc": 0.7795247854489561, "train_speed(iter/s)": 0.135055 }, { "epoch": 1.0945667582533436, "grad_norm": 0.6049498319625854, "learning_rate": 4.63189718439875e-05, "loss": 0.7571041107177734, "memory(GiB)": 91.52, "step": 84355, "token_acc": 0.7821654852267738, "train_speed(iter/s)": 0.135054 }, { "epoch": 1.0946316366549993, "grad_norm": 0.6366910338401794, "learning_rate": 4.631362266832588e-05, "loss": 0.7213627338409424, "memory(GiB)": 91.52, "step": 84360, "token_acc": 0.7914941663530297, "train_speed(iter/s)": 0.135053 }, { "epoch": 1.094696515056655, "grad_norm": 0.7274661660194397, "learning_rate": 4.6308273535086845e-05, "loss": 0.7480487823486328, "memory(GiB)": 91.52, "step": 84365, "token_acc": 0.789068247717142, "train_speed(iter/s)": 0.135053 }, { "epoch": 1.0947613934583107, "grad_norm": 0.6565707921981812, "learning_rate": 4.630292444433196e-05, "loss": 0.7271919250488281, "memory(GiB)": 91.52, "step": 84370, "token_acc": 0.7823880811861235, "train_speed(iter/s)": 0.135052 }, { "epoch": 1.0948262718599664, "grad_norm": 0.7538491487503052, "learning_rate": 4.629757539612279e-05, "loss": 0.7969986438751221, "memory(GiB)": 91.52, "step": 84375, "token_acc": 0.7772437647915529, "train_speed(iter/s)": 0.135051 }, { "epoch": 1.094891150261622, "grad_norm": 0.65606689453125, "learning_rate": 4.6292226390520876e-05, "loss": 0.7728622436523438, "memory(GiB)": 91.52, "step": 84380, "token_acc": 0.7735652691049658, "train_speed(iter/s)": 0.135051 }, { "epoch": 1.0949560286632778, "grad_norm": 0.6845371127128601, "learning_rate": 4.628687742758779e-05, "loss": 0.7529057025909424, "memory(GiB)": 91.52, "step": 84385, "token_acc": 0.7772146153264241, "train_speed(iter/s)": 0.13505 }, { "epoch": 1.0950209070649335, "grad_norm": 0.7396934628486633, "learning_rate": 4.62815285073851e-05, "loss": 0.757783031463623, "memory(GiB)": 91.52, "step": 84390, "token_acc": 0.7996702758098293, "train_speed(iter/s)": 0.135049 }, { "epoch": 1.0950857854665892, "grad_norm": 0.7510380148887634, "learning_rate": 4.6276179629974315e-05, "loss": 0.7769976139068604, "memory(GiB)": 91.52, "step": 84395, "token_acc": 0.7880369809516219, "train_speed(iter/s)": 0.135048 }, { "epoch": 1.095150663868245, "grad_norm": 0.7115552425384521, "learning_rate": 4.627083079541701e-05, "loss": 0.797988748550415, "memory(GiB)": 91.52, "step": 84400, "token_acc": 0.7751279578666271, "train_speed(iter/s)": 0.135048 }, { "epoch": 1.0952155422699006, "grad_norm": 0.6719304919242859, "learning_rate": 4.6265482003774755e-05, "loss": 0.7521274566650391, "memory(GiB)": 91.52, "step": 84405, "token_acc": 0.7911179729361547, "train_speed(iter/s)": 0.135047 }, { "epoch": 1.0952804206715563, "grad_norm": 0.750251829624176, "learning_rate": 4.6260133255109096e-05, "loss": 0.7395687103271484, "memory(GiB)": 91.52, "step": 84410, "token_acc": 0.7807909985177199, "train_speed(iter/s)": 0.135046 }, { "epoch": 1.095345299073212, "grad_norm": 0.6800493597984314, "learning_rate": 4.625478454948157e-05, "loss": 0.7639243125915527, "memory(GiB)": 91.52, "step": 84415, "token_acc": 0.7847258829731154, "train_speed(iter/s)": 0.135044 }, { "epoch": 1.0954101774748677, "grad_norm": 0.7540640234947205, "learning_rate": 4.6249435886953746e-05, "loss": 0.8023227691650391, "memory(GiB)": 91.52, "step": 84420, "token_acc": 0.7842770911181374, "train_speed(iter/s)": 0.135043 }, { "epoch": 1.0954750558765234, "grad_norm": 0.7298712730407715, "learning_rate": 4.624408726758718e-05, "loss": 0.7279042720794677, "memory(GiB)": 91.52, "step": 84425, "token_acc": 0.7757916241062308, "train_speed(iter/s)": 0.135042 }, { "epoch": 1.095539934278179, "grad_norm": 0.6819044947624207, "learning_rate": 4.623873869144343e-05, "loss": 0.764582633972168, "memory(GiB)": 91.52, "step": 84430, "token_acc": 0.7599336527828972, "train_speed(iter/s)": 0.135041 }, { "epoch": 1.0956048126798348, "grad_norm": 0.7929708361625671, "learning_rate": 4.6233390158584e-05, "loss": 0.769041919708252, "memory(GiB)": 91.52, "step": 84435, "token_acc": 0.7790348912259387, "train_speed(iter/s)": 0.13504 }, { "epoch": 1.0956696910814905, "grad_norm": 0.6929662227630615, "learning_rate": 4.6228041669070496e-05, "loss": 0.7326408386230469, "memory(GiB)": 91.52, "step": 84440, "token_acc": 0.7777002789956158, "train_speed(iter/s)": 0.135039 }, { "epoch": 1.0957345694831462, "grad_norm": 0.672804057598114, "learning_rate": 4.6222693222964437e-05, "loss": 0.7612014293670655, "memory(GiB)": 91.52, "step": 84445, "token_acc": 0.7737040896066684, "train_speed(iter/s)": 0.135038 }, { "epoch": 1.0957994478848019, "grad_norm": 0.6591919660568237, "learning_rate": 4.6217344820327377e-05, "loss": 0.7330706596374512, "memory(GiB)": 91.52, "step": 84450, "token_acc": 0.7799232324416185, "train_speed(iter/s)": 0.135037 }, { "epoch": 1.0958643262864576, "grad_norm": 0.7470768094062805, "learning_rate": 4.621199646122087e-05, "loss": 0.8024433135986329, "memory(GiB)": 91.52, "step": 84455, "token_acc": 0.7789418024389437, "train_speed(iter/s)": 0.135036 }, { "epoch": 1.0959292046881133, "grad_norm": 0.6745967268943787, "learning_rate": 4.6206648145706476e-05, "loss": 0.765513801574707, "memory(GiB)": 91.52, "step": 84460, "token_acc": 0.7775774938037145, "train_speed(iter/s)": 0.135035 }, { "epoch": 1.095994083089769, "grad_norm": 0.7567977905273438, "learning_rate": 4.620129987384574e-05, "loss": 0.7789024353027344, "memory(GiB)": 91.52, "step": 84465, "token_acc": 0.7686972132685952, "train_speed(iter/s)": 0.135034 }, { "epoch": 1.0960589614914247, "grad_norm": 0.6914271712303162, "learning_rate": 4.619595164570018e-05, "loss": 0.7609422206878662, "memory(GiB)": 91.52, "step": 84470, "token_acc": 0.7886529409651108, "train_speed(iter/s)": 0.135033 }, { "epoch": 1.0961238398930804, "grad_norm": 0.79710853099823, "learning_rate": 4.6190603461331364e-05, "loss": 0.7860711097717286, "memory(GiB)": 91.52, "step": 84475, "token_acc": 0.7781149589432346, "train_speed(iter/s)": 0.135033 }, { "epoch": 1.096188718294736, "grad_norm": 0.6984984874725342, "learning_rate": 4.6185255320800844e-05, "loss": 0.7293886661529541, "memory(GiB)": 91.52, "step": 84480, "token_acc": 0.7932163473965363, "train_speed(iter/s)": 0.135031 }, { "epoch": 1.0962535966963918, "grad_norm": 0.7079982757568359, "learning_rate": 4.617990722417016e-05, "loss": 0.7866009712219239, "memory(GiB)": 91.52, "step": 84485, "token_acc": 0.7708104028428567, "train_speed(iter/s)": 0.135031 }, { "epoch": 1.0963184750980475, "grad_norm": 0.7693729996681213, "learning_rate": 4.617455917150085e-05, "loss": 0.7618385314941406, "memory(GiB)": 91.52, "step": 84490, "token_acc": 0.791829521122931, "train_speed(iter/s)": 0.13503 }, { "epoch": 1.0963833534997032, "grad_norm": 0.7574706077575684, "learning_rate": 4.616921116285447e-05, "loss": 0.7460168361663818, "memory(GiB)": 91.52, "step": 84495, "token_acc": 0.7930971120570851, "train_speed(iter/s)": 0.135029 }, { "epoch": 1.0964482319013589, "grad_norm": 0.6964578628540039, "learning_rate": 4.616386319829259e-05, "loss": 0.8013301849365234, "memory(GiB)": 91.52, "step": 84500, "token_acc": 0.7834944751381215, "train_speed(iter/s)": 0.135028 }, { "epoch": 1.0965131103030146, "grad_norm": 0.6546041965484619, "learning_rate": 4.615851527787669e-05, "loss": 0.7837608814239502, "memory(GiB)": 91.52, "step": 84505, "token_acc": 0.773590446358653, "train_speed(iter/s)": 0.135027 }, { "epoch": 1.0965779887046703, "grad_norm": 0.7606578469276428, "learning_rate": 4.6153167401668364e-05, "loss": 0.7348914623260498, "memory(GiB)": 91.52, "step": 84510, "token_acc": 0.7952076997627243, "train_speed(iter/s)": 0.135026 }, { "epoch": 1.096642867106326, "grad_norm": 0.6985248327255249, "learning_rate": 4.614781956972914e-05, "loss": 0.7451489925384521, "memory(GiB)": 91.52, "step": 84515, "token_acc": 0.7936051865767857, "train_speed(iter/s)": 0.135025 }, { "epoch": 1.0967077455079817, "grad_norm": 0.7139398455619812, "learning_rate": 4.614247178212054e-05, "loss": 0.7598280429840087, "memory(GiB)": 91.52, "step": 84520, "token_acc": 0.7549675052084953, "train_speed(iter/s)": 0.135025 }, { "epoch": 1.0967726239096374, "grad_norm": 0.7874484658241272, "learning_rate": 4.6137124038904143e-05, "loss": 0.7870138168334961, "memory(GiB)": 91.52, "step": 84525, "token_acc": 0.7696818415943579, "train_speed(iter/s)": 0.135024 }, { "epoch": 1.096837502311293, "grad_norm": 0.6708924174308777, "learning_rate": 4.6131776340141474e-05, "loss": 0.7675177097320557, "memory(GiB)": 91.52, "step": 84530, "token_acc": 0.7919781163523013, "train_speed(iter/s)": 0.135023 }, { "epoch": 1.0969023807129488, "grad_norm": 0.7299258708953857, "learning_rate": 4.612642868589409e-05, "loss": 0.7693673133850097, "memory(GiB)": 91.52, "step": 84535, "token_acc": 0.7976135842129417, "train_speed(iter/s)": 0.135022 }, { "epoch": 1.0969672591146045, "grad_norm": 0.6766162514686584, "learning_rate": 4.612108107622349e-05, "loss": 0.7308254718780518, "memory(GiB)": 91.52, "step": 84540, "token_acc": 0.789642496756523, "train_speed(iter/s)": 0.135021 }, { "epoch": 1.0970321375162602, "grad_norm": 0.729887843132019, "learning_rate": 4.611573351119125e-05, "loss": 0.7558480262756347, "memory(GiB)": 91.52, "step": 84545, "token_acc": 0.7790677525558829, "train_speed(iter/s)": 0.13502 }, { "epoch": 1.0970970159179159, "grad_norm": 0.691628098487854, "learning_rate": 4.6110385990858895e-05, "loss": 0.7952881336212159, "memory(GiB)": 91.52, "step": 84550, "token_acc": 0.7615, "train_speed(iter/s)": 0.135019 }, { "epoch": 1.0971618943195716, "grad_norm": 0.6617155075073242, "learning_rate": 4.6105038515287964e-05, "loss": 0.777569580078125, "memory(GiB)": 91.52, "step": 84555, "token_acc": 0.7753526659152236, "train_speed(iter/s)": 0.135019 }, { "epoch": 1.0972267727212273, "grad_norm": 0.7077001929283142, "learning_rate": 4.609969108454001e-05, "loss": 0.7694566249847412, "memory(GiB)": 91.52, "step": 84560, "token_acc": 0.7770023846261748, "train_speed(iter/s)": 0.135018 }, { "epoch": 1.097291651122883, "grad_norm": 0.6916837096214294, "learning_rate": 4.609434369867655e-05, "loss": 0.7772787094116211, "memory(GiB)": 91.52, "step": 84565, "token_acc": 0.7799690926484657, "train_speed(iter/s)": 0.135017 }, { "epoch": 1.0973565295245387, "grad_norm": 0.7728767395019531, "learning_rate": 4.608899635775915e-05, "loss": 0.741086483001709, "memory(GiB)": 91.52, "step": 84570, "token_acc": 0.76958725078349, "train_speed(iter/s)": 0.135016 }, { "epoch": 1.0974214079261944, "grad_norm": 0.7238029837608337, "learning_rate": 4.60836490618493e-05, "loss": 0.7339975357055664, "memory(GiB)": 91.52, "step": 84575, "token_acc": 0.7932896087092313, "train_speed(iter/s)": 0.135016 }, { "epoch": 1.09748628632785, "grad_norm": 0.790522575378418, "learning_rate": 4.607830181100859e-05, "loss": 0.760168981552124, "memory(GiB)": 91.52, "step": 84580, "token_acc": 0.7705498812799155, "train_speed(iter/s)": 0.135015 }, { "epoch": 1.0975511647295058, "grad_norm": 0.6727147102355957, "learning_rate": 4.607295460529852e-05, "loss": 0.7983697891235352, "memory(GiB)": 91.52, "step": 84585, "token_acc": 0.7712997048386562, "train_speed(iter/s)": 0.135014 }, { "epoch": 1.0976160431311615, "grad_norm": 0.7696892023086548, "learning_rate": 4.6067607444780624e-05, "loss": 0.8146653175354004, "memory(GiB)": 91.52, "step": 84590, "token_acc": 0.7817975675118533, "train_speed(iter/s)": 0.135013 }, { "epoch": 1.0976809215328172, "grad_norm": 0.6829110980033875, "learning_rate": 4.606226032951646e-05, "loss": 0.7550577640533447, "memory(GiB)": 91.52, "step": 84595, "token_acc": 0.7900426191398683, "train_speed(iter/s)": 0.135013 }, { "epoch": 1.0977457999344729, "grad_norm": 0.6685484647750854, "learning_rate": 4.605691325956755e-05, "loss": 0.7912031173706054, "memory(GiB)": 91.52, "step": 84600, "token_acc": 0.7771420879046914, "train_speed(iter/s)": 0.135012 }, { "epoch": 1.0978106783361286, "grad_norm": 0.7241277098655701, "learning_rate": 4.6051566234995425e-05, "loss": 0.7529905319213868, "memory(GiB)": 91.52, "step": 84605, "token_acc": 0.7832053523910895, "train_speed(iter/s)": 0.135011 }, { "epoch": 1.0978755567377843, "grad_norm": 0.7711803913116455, "learning_rate": 4.6046219255861635e-05, "loss": 0.7761192321777344, "memory(GiB)": 91.52, "step": 84610, "token_acc": 0.7662258214868329, "train_speed(iter/s)": 0.13501 }, { "epoch": 1.09794043513944, "grad_norm": 0.6841062903404236, "learning_rate": 4.604087232222769e-05, "loss": 0.7821334838867188, "memory(GiB)": 91.52, "step": 84615, "token_acc": 0.7960715426381347, "train_speed(iter/s)": 0.135009 }, { "epoch": 1.0980053135410957, "grad_norm": 0.6572237610816956, "learning_rate": 4.603552543415514e-05, "loss": 0.7942693710327149, "memory(GiB)": 91.52, "step": 84620, "token_acc": 0.7813320601575555, "train_speed(iter/s)": 0.135008 }, { "epoch": 1.0980701919427514, "grad_norm": 0.8155021667480469, "learning_rate": 4.603017859170549e-05, "loss": 0.7565943717956543, "memory(GiB)": 91.52, "step": 84625, "token_acc": 0.7824032838942029, "train_speed(iter/s)": 0.135007 }, { "epoch": 1.098135070344407, "grad_norm": 0.6995106339454651, "learning_rate": 4.602483179494029e-05, "loss": 0.807716178894043, "memory(GiB)": 91.52, "step": 84630, "token_acc": 0.7785697553793126, "train_speed(iter/s)": 0.135006 }, { "epoch": 1.0981999487460627, "grad_norm": 0.7417916059494019, "learning_rate": 4.601948504392107e-05, "loss": 0.7241856575012207, "memory(GiB)": 91.52, "step": 84635, "token_acc": 0.78389220325036, "train_speed(iter/s)": 0.135005 }, { "epoch": 1.0982648271477184, "grad_norm": 0.6756774187088013, "learning_rate": 4.6014138338709374e-05, "loss": 0.7252121925354004, "memory(GiB)": 91.52, "step": 84640, "token_acc": 0.7816199489043215, "train_speed(iter/s)": 0.135004 }, { "epoch": 1.0983297055493741, "grad_norm": 0.7309080958366394, "learning_rate": 4.600879167936673e-05, "loss": 0.7561769485473633, "memory(GiB)": 91.52, "step": 84645, "token_acc": 0.784945815891397, "train_speed(iter/s)": 0.135003 }, { "epoch": 1.0983945839510298, "grad_norm": 0.712455153465271, "learning_rate": 4.600344506595463e-05, "loss": 0.7948296546936036, "memory(GiB)": 91.52, "step": 84650, "token_acc": 0.7886025954485613, "train_speed(iter/s)": 0.135002 }, { "epoch": 1.0984594623526855, "grad_norm": 0.6657128930091858, "learning_rate": 4.5998098498534635e-05, "loss": 0.7317332267761231, "memory(GiB)": 91.52, "step": 84655, "token_acc": 0.7900937641604796, "train_speed(iter/s)": 0.135002 }, { "epoch": 1.0985243407543412, "grad_norm": 0.7465822100639343, "learning_rate": 4.5992751977168254e-05, "loss": 0.7475512981414795, "memory(GiB)": 91.52, "step": 84660, "token_acc": 0.7733286293878991, "train_speed(iter/s)": 0.135001 }, { "epoch": 1.098589219155997, "grad_norm": 0.7049117684364319, "learning_rate": 4.598740550191704e-05, "loss": 0.7266259670257569, "memory(GiB)": 91.52, "step": 84665, "token_acc": 0.8108388771702093, "train_speed(iter/s)": 0.134999 }, { "epoch": 1.0986540975576526, "grad_norm": 0.677119255065918, "learning_rate": 4.5982059072842485e-05, "loss": 0.7823940277099609, "memory(GiB)": 91.52, "step": 84670, "token_acc": 0.7772768828221053, "train_speed(iter/s)": 0.134998 }, { "epoch": 1.0987189759593083, "grad_norm": 0.6902795433998108, "learning_rate": 4.5976712690006154e-05, "loss": 0.7672336101531982, "memory(GiB)": 91.52, "step": 84675, "token_acc": 0.7788133749184014, "train_speed(iter/s)": 0.134997 }, { "epoch": 1.098783854360964, "grad_norm": 0.6779073476791382, "learning_rate": 4.5971366353469565e-05, "loss": 0.7559645652770997, "memory(GiB)": 91.52, "step": 84680, "token_acc": 0.7783952125061065, "train_speed(iter/s)": 0.134996 }, { "epoch": 1.0988487327626197, "grad_norm": 0.6167421936988831, "learning_rate": 4.596602006329422e-05, "loss": 0.7508533477783204, "memory(GiB)": 91.52, "step": 84685, "token_acc": 0.7854191059990357, "train_speed(iter/s)": 0.134996 }, { "epoch": 1.0989136111642754, "grad_norm": 0.7177979946136475, "learning_rate": 4.5960673819541654e-05, "loss": 0.8003675460815429, "memory(GiB)": 91.52, "step": 84690, "token_acc": 0.748094607650195, "train_speed(iter/s)": 0.134995 }, { "epoch": 1.0989784895659311, "grad_norm": 0.6941503286361694, "learning_rate": 4.595532762227339e-05, "loss": 0.7400112152099609, "memory(GiB)": 91.52, "step": 84695, "token_acc": 0.7875249299072173, "train_speed(iter/s)": 0.134994 }, { "epoch": 1.0990433679675868, "grad_norm": 0.6358364224433899, "learning_rate": 4.594998147155096e-05, "loss": 0.764831829071045, "memory(GiB)": 91.52, "step": 84700, "token_acc": 0.781623610066048, "train_speed(iter/s)": 0.134993 }, { "epoch": 1.0991082463692425, "grad_norm": 0.7327666878700256, "learning_rate": 4.5944635367435875e-05, "loss": 0.8036989212036133, "memory(GiB)": 91.52, "step": 84705, "token_acc": 0.7639714228483675, "train_speed(iter/s)": 0.134993 }, { "epoch": 1.0991731247708982, "grad_norm": 0.707204282283783, "learning_rate": 4.593928930998967e-05, "loss": 0.7782377243041992, "memory(GiB)": 91.52, "step": 84710, "token_acc": 0.7746941549614862, "train_speed(iter/s)": 0.134992 }, { "epoch": 1.099238003172554, "grad_norm": 0.6826962828636169, "learning_rate": 4.593394329927388e-05, "loss": 0.7208170413970947, "memory(GiB)": 91.52, "step": 84715, "token_acc": 0.7893530576101677, "train_speed(iter/s)": 0.134991 }, { "epoch": 1.0993028815742096, "grad_norm": 0.6192836165428162, "learning_rate": 4.592859733534999e-05, "loss": 0.7725516319274902, "memory(GiB)": 91.52, "step": 84720, "token_acc": 0.8009624282805848, "train_speed(iter/s)": 0.13499 }, { "epoch": 1.0993677599758653, "grad_norm": 0.7756803631782532, "learning_rate": 4.592325141827955e-05, "loss": 0.7825428962707519, "memory(GiB)": 91.52, "step": 84725, "token_acc": 0.762867309879306, "train_speed(iter/s)": 0.134989 }, { "epoch": 1.099432638377521, "grad_norm": 0.7016569972038269, "learning_rate": 4.591790554812405e-05, "loss": 0.7537127017974854, "memory(GiB)": 91.52, "step": 84730, "token_acc": 0.8081463224802179, "train_speed(iter/s)": 0.134988 }, { "epoch": 1.0994975167791767, "grad_norm": 0.7077255845069885, "learning_rate": 4.5912559724945046e-05, "loss": 0.7488509178161621, "memory(GiB)": 91.52, "step": 84735, "token_acc": 0.7850738936422811, "train_speed(iter/s)": 0.134988 }, { "epoch": 1.0995623951808322, "grad_norm": 0.6189393401145935, "learning_rate": 4.590721394880403e-05, "loss": 0.7178318977355957, "memory(GiB)": 91.52, "step": 84740, "token_acc": 0.7946518668012109, "train_speed(iter/s)": 0.134987 }, { "epoch": 1.0996272735824881, "grad_norm": 0.6485428214073181, "learning_rate": 4.590186821976255e-05, "loss": 0.7458558082580566, "memory(GiB)": 91.52, "step": 84745, "token_acc": 0.7814771784232365, "train_speed(iter/s)": 0.134986 }, { "epoch": 1.0996921519841436, "grad_norm": 0.74627685546875, "learning_rate": 4.5896522537882114e-05, "loss": 0.7546528816223145, "memory(GiB)": 91.52, "step": 84750, "token_acc": 0.7783493625587117, "train_speed(iter/s)": 0.134985 }, { "epoch": 1.0997570303857995, "grad_norm": 0.7671088576316833, "learning_rate": 4.589117690322421e-05, "loss": 0.771027660369873, "memory(GiB)": 91.52, "step": 84755, "token_acc": 0.7855256207078711, "train_speed(iter/s)": 0.134984 }, { "epoch": 1.099821908787455, "grad_norm": 0.7062210440635681, "learning_rate": 4.588583131585039e-05, "loss": 0.7743979454040527, "memory(GiB)": 91.52, "step": 84760, "token_acc": 0.7804005722460658, "train_speed(iter/s)": 0.134983 }, { "epoch": 1.099886787189111, "grad_norm": 0.6949678063392639, "learning_rate": 4.588048577582214e-05, "loss": 0.7561665534973144, "memory(GiB)": 91.52, "step": 84765, "token_acc": 0.772996503246985, "train_speed(iter/s)": 0.134982 }, { "epoch": 1.0999516655907664, "grad_norm": 0.7065957188606262, "learning_rate": 4.587514028320102e-05, "loss": 0.8045732498168945, "memory(GiB)": 91.52, "step": 84770, "token_acc": 0.7518713908889998, "train_speed(iter/s)": 0.134981 }, { "epoch": 1.100016543992422, "grad_norm": 0.6951283812522888, "learning_rate": 4.58697948380485e-05, "loss": 0.7719943523406982, "memory(GiB)": 91.52, "step": 84775, "token_acc": 0.7876544076361595, "train_speed(iter/s)": 0.13498 }, { "epoch": 1.1000814223940778, "grad_norm": 0.79408198595047, "learning_rate": 4.586444944042612e-05, "loss": 0.768748664855957, "memory(GiB)": 91.52, "step": 84780, "token_acc": 0.8046132008516679, "train_speed(iter/s)": 0.134979 }, { "epoch": 1.1001463007957335, "grad_norm": 0.7373859882354736, "learning_rate": 4.58591040903954e-05, "loss": 0.7553201675415039, "memory(GiB)": 91.52, "step": 84785, "token_acc": 0.7865379353476918, "train_speed(iter/s)": 0.134978 }, { "epoch": 1.1002111791973892, "grad_norm": 0.6578139066696167, "learning_rate": 4.585375878801785e-05, "loss": 0.7425517559051513, "memory(GiB)": 91.52, "step": 84790, "token_acc": 0.7880392156862746, "train_speed(iter/s)": 0.134977 }, { "epoch": 1.100276057599045, "grad_norm": 0.6397427916526794, "learning_rate": 4.584841353335495e-05, "loss": 0.7689479351043701, "memory(GiB)": 91.52, "step": 84795, "token_acc": 0.7933806965863802, "train_speed(iter/s)": 0.134977 }, { "epoch": 1.1003409360007006, "grad_norm": 0.6458004713058472, "learning_rate": 4.584306832646826e-05, "loss": 0.752635145187378, "memory(GiB)": 91.52, "step": 84800, "token_acc": 0.7973553817682526, "train_speed(iter/s)": 0.134975 }, { "epoch": 1.1004058144023563, "grad_norm": 0.5944003462791443, "learning_rate": 4.583772316741927e-05, "loss": 0.695956802368164, "memory(GiB)": 91.52, "step": 84805, "token_acc": 0.8002160956397476, "train_speed(iter/s)": 0.134974 }, { "epoch": 1.100470692804012, "grad_norm": 0.6973534226417542, "learning_rate": 4.5832378056269474e-05, "loss": 0.8217985153198242, "memory(GiB)": 91.52, "step": 84810, "token_acc": 0.7685804601926046, "train_speed(iter/s)": 0.134974 }, { "epoch": 1.1005355712056677, "grad_norm": 0.7327893376350403, "learning_rate": 4.582703299308042e-05, "loss": 0.7556448936462402, "memory(GiB)": 91.52, "step": 84815, "token_acc": 0.7784262048192772, "train_speed(iter/s)": 0.134973 }, { "epoch": 1.1006004496073234, "grad_norm": 0.6495431065559387, "learning_rate": 4.5821687977913605e-05, "loss": 0.7793204307556152, "memory(GiB)": 91.52, "step": 84820, "token_acc": 0.7642760871107328, "train_speed(iter/s)": 0.134973 }, { "epoch": 1.100665328008979, "grad_norm": 0.7754120230674744, "learning_rate": 4.5816343010830544e-05, "loss": 0.7452188491821289, "memory(GiB)": 91.52, "step": 84825, "token_acc": 0.7965823552495451, "train_speed(iter/s)": 0.134972 }, { "epoch": 1.1007302064106348, "grad_norm": 0.7504702806472778, "learning_rate": 4.581099809189272e-05, "loss": 0.7436945915222168, "memory(GiB)": 91.52, "step": 84830, "token_acc": 0.7751990677801515, "train_speed(iter/s)": 0.134971 }, { "epoch": 1.1007950848122905, "grad_norm": 0.7130650281906128, "learning_rate": 4.5805653221161664e-05, "loss": 0.7474178791046142, "memory(GiB)": 91.52, "step": 84835, "token_acc": 0.8053983156150766, "train_speed(iter/s)": 0.13497 }, { "epoch": 1.1008599632139462, "grad_norm": 0.653052806854248, "learning_rate": 4.580030839869889e-05, "loss": 0.7219805717468262, "memory(GiB)": 91.52, "step": 84840, "token_acc": 0.7862904471984424, "train_speed(iter/s)": 0.134969 }, { "epoch": 1.100924841615602, "grad_norm": 0.6460460424423218, "learning_rate": 4.579496362456588e-05, "loss": 0.7488284587860108, "memory(GiB)": 91.52, "step": 84845, "token_acc": 0.7768535262206149, "train_speed(iter/s)": 0.134968 }, { "epoch": 1.1009897200172576, "grad_norm": 0.7889198660850525, "learning_rate": 4.578961889882417e-05, "loss": 0.7373312473297119, "memory(GiB)": 91.52, "step": 84850, "token_acc": 0.8011858363048345, "train_speed(iter/s)": 0.134968 }, { "epoch": 1.1010545984189133, "grad_norm": 0.7806344628334045, "learning_rate": 4.578427422153525e-05, "loss": 0.7142874717712402, "memory(GiB)": 91.52, "step": 84855, "token_acc": 0.7845018199135966, "train_speed(iter/s)": 0.134967 }, { "epoch": 1.101119476820569, "grad_norm": 0.65939861536026, "learning_rate": 4.5778929592760645e-05, "loss": 0.7381385803222656, "memory(GiB)": 91.52, "step": 84860, "token_acc": 0.7907877268033633, "train_speed(iter/s)": 0.134966 }, { "epoch": 1.1011843552222247, "grad_norm": 0.7611923217773438, "learning_rate": 4.577358501256183e-05, "loss": 0.7337020874023438, "memory(GiB)": 91.52, "step": 84865, "token_acc": 0.792161058159891, "train_speed(iter/s)": 0.134965 }, { "epoch": 1.1012492336238804, "grad_norm": 0.7677692770957947, "learning_rate": 4.576824048100033e-05, "loss": 0.7652372360229492, "memory(GiB)": 91.52, "step": 84870, "token_acc": 0.7796386520503451, "train_speed(iter/s)": 0.134964 }, { "epoch": 1.101314112025536, "grad_norm": 0.6983362436294556, "learning_rate": 4.576289599813765e-05, "loss": 0.7961916446685791, "memory(GiB)": 91.52, "step": 84875, "token_acc": 0.7658058404382388, "train_speed(iter/s)": 0.134964 }, { "epoch": 1.1013789904271918, "grad_norm": 0.6804841756820679, "learning_rate": 4.575755156403528e-05, "loss": 0.7508144378662109, "memory(GiB)": 91.52, "step": 84880, "token_acc": 0.7839780402354896, "train_speed(iter/s)": 0.134963 }, { "epoch": 1.1014438688288475, "grad_norm": 0.6611834764480591, "learning_rate": 4.575220717875474e-05, "loss": 0.7559849739074707, "memory(GiB)": 91.52, "step": 84885, "token_acc": 0.7739823288103502, "train_speed(iter/s)": 0.134962 }, { "epoch": 1.1015087472305032, "grad_norm": 0.7532942295074463, "learning_rate": 4.574686284235753e-05, "loss": 0.7735312461853028, "memory(GiB)": 91.52, "step": 84890, "token_acc": 0.762184845045231, "train_speed(iter/s)": 0.134962 }, { "epoch": 1.1015736256321589, "grad_norm": 0.6668655872344971, "learning_rate": 4.574151855490516e-05, "loss": 0.7250293731689453, "memory(GiB)": 91.52, "step": 84895, "token_acc": 0.797755926142844, "train_speed(iter/s)": 0.134961 }, { "epoch": 1.1016385040338146, "grad_norm": 0.6991117000579834, "learning_rate": 4.57361743164591e-05, "loss": 0.7188000679016113, "memory(GiB)": 91.52, "step": 84900, "token_acc": 0.7986050895381716, "train_speed(iter/s)": 0.13496 }, { "epoch": 1.1017033824354703, "grad_norm": 0.746327817440033, "learning_rate": 4.573083012708088e-05, "loss": 0.7878899574279785, "memory(GiB)": 91.52, "step": 84905, "token_acc": 0.7775989309685061, "train_speed(iter/s)": 0.134959 }, { "epoch": 1.101768260837126, "grad_norm": 0.7010692358016968, "learning_rate": 4.5725485986831995e-05, "loss": 0.8141333580017089, "memory(GiB)": 91.52, "step": 84910, "token_acc": 0.7734136202469739, "train_speed(iter/s)": 0.134958 }, { "epoch": 1.1018331392387817, "grad_norm": 0.8063840866088867, "learning_rate": 4.572014189577393e-05, "loss": 0.7821379184722901, "memory(GiB)": 91.52, "step": 84915, "token_acc": 0.7694813226684258, "train_speed(iter/s)": 0.134957 }, { "epoch": 1.1018980176404374, "grad_norm": 0.7972639203071594, "learning_rate": 4.5714797853968206e-05, "loss": 0.7771319389343262, "memory(GiB)": 91.52, "step": 84920, "token_acc": 0.780737633280895, "train_speed(iter/s)": 0.134956 }, { "epoch": 1.101962896042093, "grad_norm": 0.7851574420928955, "learning_rate": 4.570945386147631e-05, "loss": 0.7592296123504638, "memory(GiB)": 91.52, "step": 84925, "token_acc": 0.7925438783789555, "train_speed(iter/s)": 0.134955 }, { "epoch": 1.1020277744437488, "grad_norm": 0.7667064666748047, "learning_rate": 4.5704109918359755e-05, "loss": 0.732326889038086, "memory(GiB)": 91.52, "step": 84930, "token_acc": 0.7929753820033956, "train_speed(iter/s)": 0.134955 }, { "epoch": 1.1020926528454045, "grad_norm": 0.720500111579895, "learning_rate": 4.569876602468001e-05, "loss": 0.7613451957702637, "memory(GiB)": 91.52, "step": 84935, "token_acc": 0.801330136880365, "train_speed(iter/s)": 0.134954 }, { "epoch": 1.1021575312470602, "grad_norm": 0.6440877914428711, "learning_rate": 4.569342218049859e-05, "loss": 0.769853162765503, "memory(GiB)": 91.52, "step": 84940, "token_acc": 0.7847694492190419, "train_speed(iter/s)": 0.134953 }, { "epoch": 1.1022224096487159, "grad_norm": 0.7240380644798279, "learning_rate": 4.5688078385877e-05, "loss": 0.7397631168365478, "memory(GiB)": 91.52, "step": 84945, "token_acc": 0.8009810791871058, "train_speed(iter/s)": 0.134952 }, { "epoch": 1.1022872880503716, "grad_norm": 0.755531907081604, "learning_rate": 4.568273464087671e-05, "loss": 0.7558005809783935, "memory(GiB)": 91.52, "step": 84950, "token_acc": 0.7556170298900209, "train_speed(iter/s)": 0.134952 }, { "epoch": 1.1023521664520273, "grad_norm": 0.776971697807312, "learning_rate": 4.567739094555925e-05, "loss": 0.7748757362365722, "memory(GiB)": 91.52, "step": 84955, "token_acc": 0.7725652988501525, "train_speed(iter/s)": 0.134951 }, { "epoch": 1.102417044853683, "grad_norm": 0.7760571241378784, "learning_rate": 4.5672047299986084e-05, "loss": 0.7199895858764649, "memory(GiB)": 91.52, "step": 84960, "token_acc": 0.7873862417040893, "train_speed(iter/s)": 0.13495 }, { "epoch": 1.1024819232553387, "grad_norm": 0.7606105208396912, "learning_rate": 4.5666703704218716e-05, "loss": 0.7902886390686035, "memory(GiB)": 91.52, "step": 84965, "token_acc": 0.7738523302496073, "train_speed(iter/s)": 0.13495 }, { "epoch": 1.1025468016569944, "grad_norm": 0.7168834209442139, "learning_rate": 4.5661360158318664e-05, "loss": 0.7697805404663086, "memory(GiB)": 91.52, "step": 84970, "token_acc": 0.7659432184327315, "train_speed(iter/s)": 0.134949 }, { "epoch": 1.10261168005865, "grad_norm": 0.6957480311393738, "learning_rate": 4.5656016662347396e-05, "loss": 0.7730655193328857, "memory(GiB)": 91.52, "step": 84975, "token_acc": 0.7726163099239342, "train_speed(iter/s)": 0.134948 }, { "epoch": 1.1026765584603058, "grad_norm": 0.6583550572395325, "learning_rate": 4.56506732163664e-05, "loss": 0.7225426197052002, "memory(GiB)": 91.52, "step": 84980, "token_acc": 0.7908426927582167, "train_speed(iter/s)": 0.134947 }, { "epoch": 1.1027414368619615, "grad_norm": 0.7817355990409851, "learning_rate": 4.5645329820437164e-05, "loss": 0.7620371818542481, "memory(GiB)": 91.52, "step": 84985, "token_acc": 0.7832824606715192, "train_speed(iter/s)": 0.134947 }, { "epoch": 1.1028063152636172, "grad_norm": 0.7264835834503174, "learning_rate": 4.563998647462121e-05, "loss": 0.8017936706542969, "memory(GiB)": 91.52, "step": 84990, "token_acc": 0.7728138222849084, "train_speed(iter/s)": 0.134946 }, { "epoch": 1.1028711936652729, "grad_norm": 0.7336232662200928, "learning_rate": 4.563464317898001e-05, "loss": 0.7968256950378418, "memory(GiB)": 91.52, "step": 84995, "token_acc": 0.7802112676056338, "train_speed(iter/s)": 0.134945 }, { "epoch": 1.1029360720669286, "grad_norm": 0.6996043920516968, "learning_rate": 4.562929993357504e-05, "loss": 0.7631890296936035, "memory(GiB)": 91.52, "step": 85000, "token_acc": 0.7764855221627556, "train_speed(iter/s)": 0.134945 }, { "epoch": 1.1030009504685843, "grad_norm": 0.6973549723625183, "learning_rate": 4.5623956738467835e-05, "loss": 0.7704249858856201, "memory(GiB)": 91.52, "step": 85005, "token_acc": 0.7828181962276114, "train_speed(iter/s)": 0.134944 }, { "epoch": 1.10306582887024, "grad_norm": 0.7087757587432861, "learning_rate": 4.5618613593719836e-05, "loss": 0.7889305114746094, "memory(GiB)": 91.52, "step": 85010, "token_acc": 0.7738687220288414, "train_speed(iter/s)": 0.134943 }, { "epoch": 1.1031307072718957, "grad_norm": 0.7581685185432434, "learning_rate": 4.5613270499392544e-05, "loss": 0.7402614116668701, "memory(GiB)": 91.52, "step": 85015, "token_acc": 0.7831402831402832, "train_speed(iter/s)": 0.134942 }, { "epoch": 1.1031955856735514, "grad_norm": 0.7622677087783813, "learning_rate": 4.560792745554745e-05, "loss": 0.7925296783447265, "memory(GiB)": 91.52, "step": 85020, "token_acc": 0.7711524356042628, "train_speed(iter/s)": 0.134942 }, { "epoch": 1.103260464075207, "grad_norm": 0.7163086533546448, "learning_rate": 4.560258446224606e-05, "loss": 0.7681934356689453, "memory(GiB)": 91.52, "step": 85025, "token_acc": 0.7836277646404228, "train_speed(iter/s)": 0.134941 }, { "epoch": 1.1033253424768628, "grad_norm": 0.7251564264297485, "learning_rate": 4.559724151954984e-05, "loss": 0.8016119003295898, "memory(GiB)": 91.52, "step": 85030, "token_acc": 0.7793455313159746, "train_speed(iter/s)": 0.134941 }, { "epoch": 1.1033902208785185, "grad_norm": 0.691117525100708, "learning_rate": 4.559189862752027e-05, "loss": 0.7448118686676025, "memory(GiB)": 91.52, "step": 85035, "token_acc": 0.7850775950850527, "train_speed(iter/s)": 0.13494 }, { "epoch": 1.1034550992801742, "grad_norm": 0.5653213262557983, "learning_rate": 4.558655578621887e-05, "loss": 0.7240038871765136, "memory(GiB)": 91.52, "step": 85040, "token_acc": 0.8076132840225634, "train_speed(iter/s)": 0.134939 }, { "epoch": 1.1035199776818299, "grad_norm": 0.7789538502693176, "learning_rate": 4.558121299570709e-05, "loss": 0.7741905689239502, "memory(GiB)": 91.52, "step": 85045, "token_acc": 0.775564803804994, "train_speed(iter/s)": 0.134938 }, { "epoch": 1.1035848560834856, "grad_norm": 0.7600678205490112, "learning_rate": 4.557587025604642e-05, "loss": 0.7713819026947022, "memory(GiB)": 91.52, "step": 85050, "token_acc": 0.7912213016254507, "train_speed(iter/s)": 0.134938 }, { "epoch": 1.1036497344851413, "grad_norm": 0.7387691140174866, "learning_rate": 4.557052756729835e-05, "loss": 0.7999258518218995, "memory(GiB)": 91.52, "step": 85055, "token_acc": 0.7838330353458947, "train_speed(iter/s)": 0.134937 }, { "epoch": 1.103714612886797, "grad_norm": 0.7254182696342468, "learning_rate": 4.5565184929524366e-05, "loss": 0.8016470909118653, "memory(GiB)": 91.52, "step": 85060, "token_acc": 0.7661912308505019, "train_speed(iter/s)": 0.134936 }, { "epoch": 1.1037794912884527, "grad_norm": 0.7583939433097839, "learning_rate": 4.555984234278594e-05, "loss": 0.7397291660308838, "memory(GiB)": 91.52, "step": 85065, "token_acc": 0.7982139232798864, "train_speed(iter/s)": 0.134936 }, { "epoch": 1.1038443696901084, "grad_norm": 0.7291883230209351, "learning_rate": 4.5554499807144576e-05, "loss": 0.7467888832092285, "memory(GiB)": 91.52, "step": 85070, "token_acc": 0.7786461417012038, "train_speed(iter/s)": 0.134935 }, { "epoch": 1.103909248091764, "grad_norm": 0.7014917731285095, "learning_rate": 4.554915732266175e-05, "loss": 0.7579137325286865, "memory(GiB)": 91.52, "step": 85075, "token_acc": 0.7904005154085687, "train_speed(iter/s)": 0.134934 }, { "epoch": 1.1039741264934197, "grad_norm": 0.6387937664985657, "learning_rate": 4.554381488939893e-05, "loss": 0.7456232070922851, "memory(GiB)": 91.52, "step": 85080, "token_acc": 0.7827803301003914, "train_speed(iter/s)": 0.134933 }, { "epoch": 1.1040390048950754, "grad_norm": 0.6790006756782532, "learning_rate": 4.5538472507417595e-05, "loss": 0.7494349956512452, "memory(GiB)": 91.52, "step": 85085, "token_acc": 0.7821355766746102, "train_speed(iter/s)": 0.134932 }, { "epoch": 1.1041038832967311, "grad_norm": 0.6900462508201599, "learning_rate": 4.553313017677923e-05, "loss": 0.7778213500976563, "memory(GiB)": 91.52, "step": 85090, "token_acc": 0.7799821486530347, "train_speed(iter/s)": 0.134932 }, { "epoch": 1.1041687616983868, "grad_norm": 0.650976836681366, "learning_rate": 4.5527787897545316e-05, "loss": 0.749626636505127, "memory(GiB)": 91.52, "step": 85095, "token_acc": 0.7723060427356974, "train_speed(iter/s)": 0.134931 }, { "epoch": 1.1042336401000425, "grad_norm": 0.735633909702301, "learning_rate": 4.5522445669777316e-05, "loss": 0.7979812622070312, "memory(GiB)": 91.52, "step": 85100, "token_acc": 0.7690138976417128, "train_speed(iter/s)": 0.134931 }, { "epoch": 1.1042985185016982, "grad_norm": 0.761432945728302, "learning_rate": 4.5517103493536743e-05, "loss": 0.7778913497924804, "memory(GiB)": 91.52, "step": 85105, "token_acc": 0.7795571797076526, "train_speed(iter/s)": 0.13493 }, { "epoch": 1.104363396903354, "grad_norm": 0.6578496098518372, "learning_rate": 4.5511761368885065e-05, "loss": 0.7617604255676269, "memory(GiB)": 91.52, "step": 85110, "token_acc": 0.7810238276215341, "train_speed(iter/s)": 0.134929 }, { "epoch": 1.1044282753050096, "grad_norm": 0.6820142269134521, "learning_rate": 4.550641929588374e-05, "loss": 0.7749237060546875, "memory(GiB)": 91.52, "step": 85115, "token_acc": 0.781272283730531, "train_speed(iter/s)": 0.134929 }, { "epoch": 1.1044931537066653, "grad_norm": 0.7665696740150452, "learning_rate": 4.5501077274594245e-05, "loss": 0.7908251762390137, "memory(GiB)": 91.52, "step": 85120, "token_acc": 0.7844249613202682, "train_speed(iter/s)": 0.134928 }, { "epoch": 1.104558032108321, "grad_norm": 0.6645449995994568, "learning_rate": 4.549573530507806e-05, "loss": 0.7174547195434571, "memory(GiB)": 91.52, "step": 85125, "token_acc": 0.7898111863199145, "train_speed(iter/s)": 0.134927 }, { "epoch": 1.1046229105099767, "grad_norm": 0.7843366861343384, "learning_rate": 4.5490393387396666e-05, "loss": 0.8046524047851562, "memory(GiB)": 91.52, "step": 85130, "token_acc": 0.7695060164661178, "train_speed(iter/s)": 0.134926 }, { "epoch": 1.1046877889116324, "grad_norm": 0.6923621892929077, "learning_rate": 4.548505152161153e-05, "loss": 0.6984736442565918, "memory(GiB)": 91.52, "step": 85135, "token_acc": 0.7979823455233291, "train_speed(iter/s)": 0.134925 }, { "epoch": 1.1047526673132881, "grad_norm": 0.7046691179275513, "learning_rate": 4.547970970778414e-05, "loss": 0.7555892944335938, "memory(GiB)": 91.52, "step": 85140, "token_acc": 0.7833683212568565, "train_speed(iter/s)": 0.134925 }, { "epoch": 1.1048175457149438, "grad_norm": 0.6272200345993042, "learning_rate": 4.547436794597596e-05, "loss": 0.756251049041748, "memory(GiB)": 91.52, "step": 85145, "token_acc": 0.79139201240791, "train_speed(iter/s)": 0.134924 }, { "epoch": 1.1048824241165995, "grad_norm": 0.635875940322876, "learning_rate": 4.5469026236248466e-05, "loss": 0.7486613273620606, "memory(GiB)": 91.52, "step": 85150, "token_acc": 0.79513171057019, "train_speed(iter/s)": 0.134923 }, { "epoch": 1.1049473025182552, "grad_norm": 0.6470164656639099, "learning_rate": 4.546368457866312e-05, "loss": 0.723042106628418, "memory(GiB)": 91.52, "step": 85155, "token_acc": 0.772408238407548, "train_speed(iter/s)": 0.134922 }, { "epoch": 1.105012180919911, "grad_norm": 0.7110853791236877, "learning_rate": 4.54583429732814e-05, "loss": 0.7326934337615967, "memory(GiB)": 91.52, "step": 85160, "token_acc": 0.7919295370749693, "train_speed(iter/s)": 0.134921 }, { "epoch": 1.1050770593215666, "grad_norm": 0.7453333735466003, "learning_rate": 4.5453001420164774e-05, "loss": 0.7894024848937988, "memory(GiB)": 91.52, "step": 85165, "token_acc": 0.7539169497096527, "train_speed(iter/s)": 0.13492 }, { "epoch": 1.1051419377232223, "grad_norm": 0.6883120536804199, "learning_rate": 4.544765991937471e-05, "loss": 0.7560907363891601, "memory(GiB)": 91.52, "step": 85170, "token_acc": 0.8150921893442956, "train_speed(iter/s)": 0.134919 }, { "epoch": 1.105206816124878, "grad_norm": 0.6438111662864685, "learning_rate": 4.5442318470972693e-05, "loss": 0.7830218315124512, "memory(GiB)": 91.52, "step": 85175, "token_acc": 0.7640345781938702, "train_speed(iter/s)": 0.134918 }, { "epoch": 1.1052716945265337, "grad_norm": 0.6856728196144104, "learning_rate": 4.543697707502018e-05, "loss": 0.7524438858032226, "memory(GiB)": 91.52, "step": 85180, "token_acc": 0.7909211629857422, "train_speed(iter/s)": 0.134918 }, { "epoch": 1.1053365729281894, "grad_norm": 0.6858382225036621, "learning_rate": 4.543163573157865e-05, "loss": 0.7825991153717041, "memory(GiB)": 91.52, "step": 85185, "token_acc": 0.7769694611943202, "train_speed(iter/s)": 0.134917 }, { "epoch": 1.1054014513298451, "grad_norm": 0.7371180653572083, "learning_rate": 4.542629444070956e-05, "loss": 0.7706794738769531, "memory(GiB)": 91.52, "step": 85190, "token_acc": 0.7730342416940985, "train_speed(iter/s)": 0.134916 }, { "epoch": 1.1054663297315008, "grad_norm": 0.7238978743553162, "learning_rate": 4.542095320247436e-05, "loss": 0.7515382289886474, "memory(GiB)": 91.52, "step": 85195, "token_acc": 0.8141159524666907, "train_speed(iter/s)": 0.134915 }, { "epoch": 1.1055312081331565, "grad_norm": 0.7681032419204712, "learning_rate": 4.541561201693455e-05, "loss": 0.7594487190246582, "memory(GiB)": 91.52, "step": 85200, "token_acc": 0.8014169874682611, "train_speed(iter/s)": 0.134914 }, { "epoch": 1.1055960865348122, "grad_norm": 0.6274248361587524, "learning_rate": 4.541027088415157e-05, "loss": 0.7530145168304443, "memory(GiB)": 91.52, "step": 85205, "token_acc": 0.784758877159309, "train_speed(iter/s)": 0.134913 }, { "epoch": 1.105660964936468, "grad_norm": 0.7733912467956543, "learning_rate": 4.5404929804186906e-05, "loss": 0.7943397998809815, "memory(GiB)": 91.52, "step": 85210, "token_acc": 0.7673409150099835, "train_speed(iter/s)": 0.134913 }, { "epoch": 1.1057258433381234, "grad_norm": 0.7153539657592773, "learning_rate": 4.539958877710202e-05, "loss": 0.7302619457244873, "memory(GiB)": 91.52, "step": 85215, "token_acc": 0.7761182714177407, "train_speed(iter/s)": 0.134912 }, { "epoch": 1.1057907217397793, "grad_norm": 0.6790603995323181, "learning_rate": 4.5394247802958376e-05, "loss": 0.727137565612793, "memory(GiB)": 91.52, "step": 85220, "token_acc": 0.8014506244666821, "train_speed(iter/s)": 0.134911 }, { "epoch": 1.1058556001414348, "grad_norm": 0.7162036299705505, "learning_rate": 4.538890688181741e-05, "loss": 0.7492822647094727, "memory(GiB)": 91.52, "step": 85225, "token_acc": 0.7696233779230743, "train_speed(iter/s)": 0.13491 }, { "epoch": 1.1059204785430907, "grad_norm": 0.7242763638496399, "learning_rate": 4.538356601374062e-05, "loss": 0.741586971282959, "memory(GiB)": 91.52, "step": 85230, "token_acc": 0.8027549611107073, "train_speed(iter/s)": 0.13491 }, { "epoch": 1.1059853569447462, "grad_norm": 0.7090831995010376, "learning_rate": 4.537822519878945e-05, "loss": 0.7414949417114258, "memory(GiB)": 91.52, "step": 85235, "token_acc": 0.7852809285149037, "train_speed(iter/s)": 0.134909 }, { "epoch": 1.1060502353464021, "grad_norm": 0.7025143504142761, "learning_rate": 4.5372884437025365e-05, "loss": 0.7272956371307373, "memory(GiB)": 91.52, "step": 85240, "token_acc": 0.7988167572753437, "train_speed(iter/s)": 0.134908 }, { "epoch": 1.1061151137480576, "grad_norm": 0.7795483469963074, "learning_rate": 4.5367543728509836e-05, "loss": 0.7244316577911377, "memory(GiB)": 91.52, "step": 85245, "token_acc": 0.7868543832364737, "train_speed(iter/s)": 0.134907 }, { "epoch": 1.1061799921497133, "grad_norm": 0.6865291595458984, "learning_rate": 4.536220307330431e-05, "loss": 0.7501118183135986, "memory(GiB)": 91.52, "step": 85250, "token_acc": 0.781453473094105, "train_speed(iter/s)": 0.134906 }, { "epoch": 1.106244870551369, "grad_norm": 0.7688020467758179, "learning_rate": 4.535686247147028e-05, "loss": 0.7984389781951904, "memory(GiB)": 91.52, "step": 85255, "token_acc": 0.7709169496758259, "train_speed(iter/s)": 0.134906 }, { "epoch": 1.1063097489530247, "grad_norm": 0.7106149196624756, "learning_rate": 4.5351521923069135e-05, "loss": 0.7906574726104736, "memory(GiB)": 91.52, "step": 85260, "token_acc": 0.7863264448109699, "train_speed(iter/s)": 0.134906 }, { "epoch": 1.1063746273546804, "grad_norm": 0.7176656723022461, "learning_rate": 4.534618142816239e-05, "loss": 0.7757458686828613, "memory(GiB)": 91.52, "step": 85265, "token_acc": 0.7817910447761194, "train_speed(iter/s)": 0.134905 }, { "epoch": 1.106439505756336, "grad_norm": 0.6920357942581177, "learning_rate": 4.53408409868115e-05, "loss": 0.74620680809021, "memory(GiB)": 91.52, "step": 85270, "token_acc": 0.7985428051001822, "train_speed(iter/s)": 0.134904 }, { "epoch": 1.1065043841579918, "grad_norm": 0.729633629322052, "learning_rate": 4.5335500599077896e-05, "loss": 0.7591167449951172, "memory(GiB)": 91.52, "step": 85275, "token_acc": 0.771909517825253, "train_speed(iter/s)": 0.134903 }, { "epoch": 1.1065692625596475, "grad_norm": 0.7161145806312561, "learning_rate": 4.533016026502306e-05, "loss": 0.7341750144958497, "memory(GiB)": 91.52, "step": 85280, "token_acc": 0.787251627122146, "train_speed(iter/s)": 0.134902 }, { "epoch": 1.1066341409613032, "grad_norm": 0.7170582413673401, "learning_rate": 4.5324819984708445e-05, "loss": 0.7856839179992676, "memory(GiB)": 91.52, "step": 85285, "token_acc": 0.7776497695852534, "train_speed(iter/s)": 0.134901 }, { "epoch": 1.106699019362959, "grad_norm": 0.6960098147392273, "learning_rate": 4.531947975819549e-05, "loss": 0.7192561149597168, "memory(GiB)": 91.52, "step": 85290, "token_acc": 0.790283292735827, "train_speed(iter/s)": 0.1349 }, { "epoch": 1.1067638977646146, "grad_norm": 0.7214906811714172, "learning_rate": 4.531413958554568e-05, "loss": 0.7831057071685791, "memory(GiB)": 91.52, "step": 85295, "token_acc": 0.7896636840481333, "train_speed(iter/s)": 0.1349 }, { "epoch": 1.1068287761662703, "grad_norm": 0.7080379128456116, "learning_rate": 4.5308799466820436e-05, "loss": 0.73776273727417, "memory(GiB)": 91.52, "step": 85300, "token_acc": 0.7877816793372652, "train_speed(iter/s)": 0.134898 }, { "epoch": 1.106893654567926, "grad_norm": 0.7576904296875, "learning_rate": 4.5303459402081226e-05, "loss": 0.742093563079834, "memory(GiB)": 91.52, "step": 85305, "token_acc": 0.7736231773125646, "train_speed(iter/s)": 0.134897 }, { "epoch": 1.1069585329695817, "grad_norm": 0.6930847764015198, "learning_rate": 4.52981193913895e-05, "loss": 0.7438881874084473, "memory(GiB)": 91.52, "step": 85310, "token_acc": 0.7836911299717091, "train_speed(iter/s)": 0.134896 }, { "epoch": 1.1070234113712374, "grad_norm": 0.7327734231948853, "learning_rate": 4.5292779434806715e-05, "loss": 0.7754459381103516, "memory(GiB)": 91.52, "step": 85315, "token_acc": 0.7787514103046258, "train_speed(iter/s)": 0.134895 }, { "epoch": 1.107088289772893, "grad_norm": 0.7449295520782471, "learning_rate": 4.528743953239433e-05, "loss": 0.7703907012939453, "memory(GiB)": 91.52, "step": 85320, "token_acc": 0.7949021207177814, "train_speed(iter/s)": 0.134895 }, { "epoch": 1.1071531681745488, "grad_norm": 0.8056806921958923, "learning_rate": 4.528209968421376e-05, "loss": 0.7565752029418945, "memory(GiB)": 91.52, "step": 85325, "token_acc": 0.7871691999074145, "train_speed(iter/s)": 0.134894 }, { "epoch": 1.1072180465762045, "grad_norm": 0.7476207613945007, "learning_rate": 4.5276759890326523e-05, "loss": 0.7210311412811279, "memory(GiB)": 91.52, "step": 85330, "token_acc": 0.7971911178591763, "train_speed(iter/s)": 0.134893 }, { "epoch": 1.1072829249778602, "grad_norm": 0.7346546649932861, "learning_rate": 4.5271420150794006e-05, "loss": 0.7502808570861816, "memory(GiB)": 91.52, "step": 85335, "token_acc": 0.7695589371493836, "train_speed(iter/s)": 0.134892 }, { "epoch": 1.1073478033795159, "grad_norm": 0.5989171862602234, "learning_rate": 4.526608046567769e-05, "loss": 0.7510833740234375, "memory(GiB)": 91.52, "step": 85340, "token_acc": 0.7758985879332477, "train_speed(iter/s)": 0.134892 }, { "epoch": 1.1074126817811716, "grad_norm": 0.7623167634010315, "learning_rate": 4.526074083503901e-05, "loss": 0.78363037109375, "memory(GiB)": 91.52, "step": 85345, "token_acc": 0.7780028640285006, "train_speed(iter/s)": 0.134891 }, { "epoch": 1.1074775601828273, "grad_norm": 0.6984736919403076, "learning_rate": 4.525540125893942e-05, "loss": 0.744495964050293, "memory(GiB)": 91.52, "step": 85350, "token_acc": 0.7850461382884591, "train_speed(iter/s)": 0.13489 }, { "epoch": 1.107542438584483, "grad_norm": 0.7239658832550049, "learning_rate": 4.5250061737440364e-05, "loss": 0.7913105010986328, "memory(GiB)": 91.52, "step": 85355, "token_acc": 0.7843732433951658, "train_speed(iter/s)": 0.134889 }, { "epoch": 1.1076073169861387, "grad_norm": 0.7061218023300171, "learning_rate": 4.5244722270603294e-05, "loss": 0.7875012397766114, "memory(GiB)": 91.52, "step": 85360, "token_acc": 0.7849739177562496, "train_speed(iter/s)": 0.134889 }, { "epoch": 1.1076721953877944, "grad_norm": 0.6749738454818726, "learning_rate": 4.523938285848966e-05, "loss": 0.7870279788970947, "memory(GiB)": 91.52, "step": 85365, "token_acc": 0.7563375426941998, "train_speed(iter/s)": 0.134888 }, { "epoch": 1.10773707378945, "grad_norm": 0.6811597347259521, "learning_rate": 4.52340435011609e-05, "loss": 0.7651830196380616, "memory(GiB)": 91.52, "step": 85370, "token_acc": 0.7803668615198549, "train_speed(iter/s)": 0.134887 }, { "epoch": 1.1078019521911058, "grad_norm": 0.6264373660087585, "learning_rate": 4.522870419867846e-05, "loss": 0.7654778480529785, "memory(GiB)": 91.52, "step": 85375, "token_acc": 0.777083206894459, "train_speed(iter/s)": 0.134886 }, { "epoch": 1.1078668305927615, "grad_norm": 0.6969841718673706, "learning_rate": 4.522336495110377e-05, "loss": 0.7351591110229492, "memory(GiB)": 91.52, "step": 85380, "token_acc": 0.7918043417842695, "train_speed(iter/s)": 0.134885 }, { "epoch": 1.1079317089944172, "grad_norm": 0.6719347238540649, "learning_rate": 4.521802575849831e-05, "loss": 0.7493186473846436, "memory(GiB)": 91.52, "step": 85385, "token_acc": 0.7928380545163014, "train_speed(iter/s)": 0.134884 }, { "epoch": 1.1079965873960729, "grad_norm": 0.7792527675628662, "learning_rate": 4.521268662092349e-05, "loss": 0.7473913192749023, "memory(GiB)": 91.52, "step": 85390, "token_acc": 0.789273987595768, "train_speed(iter/s)": 0.134883 }, { "epoch": 1.1080614657977286, "grad_norm": 0.7293208241462708, "learning_rate": 4.520734753844076e-05, "loss": 0.7372378349304199, "memory(GiB)": 91.52, "step": 85395, "token_acc": 0.7894185803029246, "train_speed(iter/s)": 0.134882 }, { "epoch": 1.1081263441993843, "grad_norm": 0.7006362676620483, "learning_rate": 4.520200851111158e-05, "loss": 0.7476601600646973, "memory(GiB)": 91.52, "step": 85400, "token_acc": 0.7841631265930331, "train_speed(iter/s)": 0.134882 }, { "epoch": 1.10819122260104, "grad_norm": 0.7600893378257751, "learning_rate": 4.519666953899738e-05, "loss": 0.7944860935211182, "memory(GiB)": 91.52, "step": 85405, "token_acc": 0.7896724251864439, "train_speed(iter/s)": 0.134881 }, { "epoch": 1.1082561010026957, "grad_norm": 0.68388831615448, "learning_rate": 4.519133062215959e-05, "loss": 0.7720951080322266, "memory(GiB)": 91.52, "step": 85410, "token_acc": 0.7814226258040106, "train_speed(iter/s)": 0.13488 }, { "epoch": 1.1083209794043514, "grad_norm": 0.7436371445655823, "learning_rate": 4.518599176065965e-05, "loss": 0.7189826965332031, "memory(GiB)": 91.52, "step": 85415, "token_acc": 0.801895122945252, "train_speed(iter/s)": 0.13488 }, { "epoch": 1.108385857806007, "grad_norm": 0.7220588326454163, "learning_rate": 4.518065295455901e-05, "loss": 0.721437931060791, "memory(GiB)": 91.52, "step": 85420, "token_acc": 0.7782428085079801, "train_speed(iter/s)": 0.134879 }, { "epoch": 1.1084507362076628, "grad_norm": 0.7150870561599731, "learning_rate": 4.517531420391911e-05, "loss": 0.7381404876708985, "memory(GiB)": 91.52, "step": 85425, "token_acc": 0.7730341049655416, "train_speed(iter/s)": 0.134878 }, { "epoch": 1.1085156146093185, "grad_norm": 0.7424299716949463, "learning_rate": 4.516997550880137e-05, "loss": 0.7663496494293213, "memory(GiB)": 91.52, "step": 85430, "token_acc": 0.7570288094411662, "train_speed(iter/s)": 0.134878 }, { "epoch": 1.1085804930109742, "grad_norm": 0.6813856959342957, "learning_rate": 4.516463686926727e-05, "loss": 0.7387910842895508, "memory(GiB)": 91.52, "step": 85435, "token_acc": 0.7857035319623017, "train_speed(iter/s)": 0.134877 }, { "epoch": 1.1086453714126299, "grad_norm": 0.6658034324645996, "learning_rate": 4.51592982853782e-05, "loss": 0.7299381732940674, "memory(GiB)": 91.52, "step": 85440, "token_acc": 0.8004196537856269, "train_speed(iter/s)": 0.134876 }, { "epoch": 1.1087102498142856, "grad_norm": 0.689300000667572, "learning_rate": 4.515395975719562e-05, "loss": 0.7606245994567871, "memory(GiB)": 91.52, "step": 85445, "token_acc": 0.7782379939549984, "train_speed(iter/s)": 0.134875 }, { "epoch": 1.1087751282159413, "grad_norm": 0.7510712742805481, "learning_rate": 4.514862128478095e-05, "loss": 0.7244627952575684, "memory(GiB)": 91.52, "step": 85450, "token_acc": 0.7942028985507247, "train_speed(iter/s)": 0.134874 }, { "epoch": 1.108840006617597, "grad_norm": 0.7662381529808044, "learning_rate": 4.5143282868195634e-05, "loss": 0.7903197288513184, "memory(GiB)": 91.52, "step": 85455, "token_acc": 0.773083435051384, "train_speed(iter/s)": 0.134873 }, { "epoch": 1.1089048850192527, "grad_norm": 0.7153543829917908, "learning_rate": 4.51379445075011e-05, "loss": 0.7133261680603027, "memory(GiB)": 91.52, "step": 85460, "token_acc": 0.7985113380647395, "train_speed(iter/s)": 0.134872 }, { "epoch": 1.1089697634209084, "grad_norm": 0.7547348141670227, "learning_rate": 4.5132606202758795e-05, "loss": 0.7762164115905762, "memory(GiB)": 91.52, "step": 85465, "token_acc": 0.7917123838782825, "train_speed(iter/s)": 0.134871 }, { "epoch": 1.109034641822564, "grad_norm": 0.6277894973754883, "learning_rate": 4.5127267954030156e-05, "loss": 0.7598354816436768, "memory(GiB)": 91.52, "step": 85470, "token_acc": 0.8078077211610595, "train_speed(iter/s)": 0.13487 }, { "epoch": 1.1090995202242198, "grad_norm": 0.7070624828338623, "learning_rate": 4.512192976137661e-05, "loss": 0.7593062400817872, "memory(GiB)": 91.52, "step": 85475, "token_acc": 0.7817200674536257, "train_speed(iter/s)": 0.134869 }, { "epoch": 1.1091643986258755, "grad_norm": 0.7091169953346252, "learning_rate": 4.511659162485957e-05, "loss": 0.7367457389831543, "memory(GiB)": 91.52, "step": 85480, "token_acc": 0.7757185159379899, "train_speed(iter/s)": 0.134868 }, { "epoch": 1.1092292770275312, "grad_norm": 0.7654958963394165, "learning_rate": 4.511125354454047e-05, "loss": 0.8005990982055664, "memory(GiB)": 91.52, "step": 85485, "token_acc": 0.7744263241448982, "train_speed(iter/s)": 0.134867 }, { "epoch": 1.1092941554291869, "grad_norm": 0.6831095814704895, "learning_rate": 4.510591552048075e-05, "loss": 0.7611078262329102, "memory(GiB)": 91.52, "step": 85490, "token_acc": 0.798756825440434, "train_speed(iter/s)": 0.134866 }, { "epoch": 1.1093590338308426, "grad_norm": 0.7395604252815247, "learning_rate": 4.510057755274184e-05, "loss": 0.739203929901123, "memory(GiB)": 91.52, "step": 85495, "token_acc": 0.7913030506451854, "train_speed(iter/s)": 0.134866 }, { "epoch": 1.1094239122324983, "grad_norm": 0.7261387705802917, "learning_rate": 4.5095239641385174e-05, "loss": 0.7996846675872803, "memory(GiB)": 91.52, "step": 85500, "token_acc": 0.7699054301403294, "train_speed(iter/s)": 0.134865 }, { "epoch": 1.109488790634154, "grad_norm": 0.6817463636398315, "learning_rate": 4.508990178647218e-05, "loss": 0.7485987663269043, "memory(GiB)": 91.52, "step": 85505, "token_acc": 0.7873614972005795, "train_speed(iter/s)": 0.134864 }, { "epoch": 1.1095536690358097, "grad_norm": 0.661457896232605, "learning_rate": 4.508456398806429e-05, "loss": 0.7416172981262207, "memory(GiB)": 91.52, "step": 85510, "token_acc": 0.7851811790156842, "train_speed(iter/s)": 0.134863 }, { "epoch": 1.1096185474374654, "grad_norm": 0.7261120676994324, "learning_rate": 4.507922624622291e-05, "loss": 0.7754199981689454, "memory(GiB)": 91.52, "step": 85515, "token_acc": 0.7786576658444762, "train_speed(iter/s)": 0.134863 }, { "epoch": 1.109683425839121, "grad_norm": 0.6631700992584229, "learning_rate": 4.507388856100946e-05, "loss": 0.7768776893615723, "memory(GiB)": 91.52, "step": 85520, "token_acc": 0.786992722743067, "train_speed(iter/s)": 0.134862 }, { "epoch": 1.1097483042407768, "grad_norm": 0.7194666266441345, "learning_rate": 4.50685509324854e-05, "loss": 0.7525660991668701, "memory(GiB)": 91.52, "step": 85525, "token_acc": 0.7829545076375254, "train_speed(iter/s)": 0.134861 }, { "epoch": 1.1098131826424324, "grad_norm": 0.7057946920394897, "learning_rate": 4.506321336071212e-05, "loss": 0.7958577156066895, "memory(GiB)": 91.52, "step": 85530, "token_acc": 0.7699743622073719, "train_speed(iter/s)": 0.13486 }, { "epoch": 1.1098780610440881, "grad_norm": 0.7700397968292236, "learning_rate": 4.505787584575109e-05, "loss": 0.7783327579498291, "memory(GiB)": 91.52, "step": 85535, "token_acc": 0.7610971457663792, "train_speed(iter/s)": 0.134859 }, { "epoch": 1.1099429394457438, "grad_norm": 0.637910783290863, "learning_rate": 4.505253838766369e-05, "loss": 0.7425171852111816, "memory(GiB)": 91.52, "step": 85540, "token_acc": 0.7632396503142836, "train_speed(iter/s)": 0.134859 }, { "epoch": 1.1100078178473995, "grad_norm": 0.7737793922424316, "learning_rate": 4.504720098651138e-05, "loss": 0.7432124614715576, "memory(GiB)": 91.52, "step": 85545, "token_acc": 0.772452068617558, "train_speed(iter/s)": 0.134858 }, { "epoch": 1.1100726962490552, "grad_norm": 0.6947700381278992, "learning_rate": 4.5041863642355545e-05, "loss": 0.7762850761413574, "memory(GiB)": 91.52, "step": 85550, "token_acc": 0.7790359939703666, "train_speed(iter/s)": 0.134857 }, { "epoch": 1.110137574650711, "grad_norm": 0.7248266339302063, "learning_rate": 4.503652635525762e-05, "loss": 0.7032530307769775, "memory(GiB)": 91.52, "step": 85555, "token_acc": 0.7865351097456501, "train_speed(iter/s)": 0.134856 }, { "epoch": 1.1102024530523666, "grad_norm": 0.6849887371063232, "learning_rate": 4.503118912527904e-05, "loss": 0.7413613796234131, "memory(GiB)": 91.52, "step": 85560, "token_acc": 0.7886917960088692, "train_speed(iter/s)": 0.134855 }, { "epoch": 1.1102673314540223, "grad_norm": 0.725007176399231, "learning_rate": 4.50258519524812e-05, "loss": 0.7532386779785156, "memory(GiB)": 91.52, "step": 85565, "token_acc": 0.7808456380064855, "train_speed(iter/s)": 0.134854 }, { "epoch": 1.110332209855678, "grad_norm": 0.7500874996185303, "learning_rate": 4.5020514836925546e-05, "loss": 0.7381915092468262, "memory(GiB)": 91.52, "step": 85570, "token_acc": 0.7815005616454275, "train_speed(iter/s)": 0.134854 }, { "epoch": 1.1103970882573337, "grad_norm": 0.7027859091758728, "learning_rate": 4.50151777786735e-05, "loss": 0.7866871356964111, "memory(GiB)": 91.52, "step": 85575, "token_acc": 0.7988326848249028, "train_speed(iter/s)": 0.134853 }, { "epoch": 1.1104619666589894, "grad_norm": 0.7164962887763977, "learning_rate": 4.500984077778646e-05, "loss": 0.7345434665679932, "memory(GiB)": 91.52, "step": 85580, "token_acc": 0.802934857808688, "train_speed(iter/s)": 0.134852 }, { "epoch": 1.1105268450606451, "grad_norm": 0.7602453827857971, "learning_rate": 4.500450383432585e-05, "loss": 0.7699056148529053, "memory(GiB)": 91.52, "step": 85585, "token_acc": 0.7919068203650336, "train_speed(iter/s)": 0.134852 }, { "epoch": 1.1105917234623008, "grad_norm": 0.7491016387939453, "learning_rate": 4.499916694835308e-05, "loss": 0.7367319583892822, "memory(GiB)": 91.52, "step": 85590, "token_acc": 0.7851969002055986, "train_speed(iter/s)": 0.134851 }, { "epoch": 1.1106566018639565, "grad_norm": 0.8086138367652893, "learning_rate": 4.4993830119929584e-05, "loss": 0.7797993659973145, "memory(GiB)": 91.52, "step": 85595, "token_acc": 0.7789491583064104, "train_speed(iter/s)": 0.13485 }, { "epoch": 1.1107214802656122, "grad_norm": 0.8498863577842712, "learning_rate": 4.498849334911675e-05, "loss": 0.7795622825622559, "memory(GiB)": 91.52, "step": 85600, "token_acc": 0.7695610868326049, "train_speed(iter/s)": 0.134849 }, { "epoch": 1.110786358667268, "grad_norm": 0.7327868938446045, "learning_rate": 4.498315663597603e-05, "loss": 0.7448728561401368, "memory(GiB)": 91.52, "step": 85605, "token_acc": 0.7996121525533291, "train_speed(iter/s)": 0.134849 }, { "epoch": 1.1108512370689236, "grad_norm": 0.727729320526123, "learning_rate": 4.4977819980568816e-05, "loss": 0.7414459228515625, "memory(GiB)": 91.52, "step": 85610, "token_acc": 0.7907598603580183, "train_speed(iter/s)": 0.134847 }, { "epoch": 1.1109161154705793, "grad_norm": 0.6538671851158142, "learning_rate": 4.497248338295655e-05, "loss": 0.7152561187744141, "memory(GiB)": 91.52, "step": 85615, "token_acc": 0.7835448926696351, "train_speed(iter/s)": 0.134846 }, { "epoch": 1.110980993872235, "grad_norm": 0.6705058217048645, "learning_rate": 4.496714684320059e-05, "loss": 0.7522136688232421, "memory(GiB)": 91.52, "step": 85620, "token_acc": 0.7728864753703287, "train_speed(iter/s)": 0.134845 }, { "epoch": 1.1110458722738907, "grad_norm": 0.7595874071121216, "learning_rate": 4.496181036136239e-05, "loss": 0.7365601062774658, "memory(GiB)": 91.52, "step": 85625, "token_acc": 0.7800699300699301, "train_speed(iter/s)": 0.134844 }, { "epoch": 1.1111107506755464, "grad_norm": 0.7809958457946777, "learning_rate": 4.495647393750335e-05, "loss": 0.7693756103515625, "memory(GiB)": 91.52, "step": 85630, "token_acc": 0.7732293937266782, "train_speed(iter/s)": 0.134844 }, { "epoch": 1.1111756290772021, "grad_norm": 0.6948636174201965, "learning_rate": 4.495113757168488e-05, "loss": 0.7210978984832763, "memory(GiB)": 91.52, "step": 85635, "token_acc": 0.7723547366041148, "train_speed(iter/s)": 0.134842 }, { "epoch": 1.1112405074788578, "grad_norm": 0.7210325002670288, "learning_rate": 4.49458012639684e-05, "loss": 0.7819366931915284, "memory(GiB)": 91.52, "step": 85640, "token_acc": 0.7811400868215089, "train_speed(iter/s)": 0.134842 }, { "epoch": 1.1113053858805135, "grad_norm": 0.7246759533882141, "learning_rate": 4.494046501441532e-05, "loss": 0.7578346729278564, "memory(GiB)": 91.52, "step": 85645, "token_acc": 0.7791258046035117, "train_speed(iter/s)": 0.13484 }, { "epoch": 1.1113702642821692, "grad_norm": 0.6124911904335022, "learning_rate": 4.493512882308703e-05, "loss": 0.7287559986114502, "memory(GiB)": 91.52, "step": 85650, "token_acc": 0.7798020074247216, "train_speed(iter/s)": 0.134839 }, { "epoch": 1.111435142683825, "grad_norm": 0.6431220173835754, "learning_rate": 4.492979269004496e-05, "loss": 0.767219066619873, "memory(GiB)": 91.52, "step": 85655, "token_acc": 0.793992003007416, "train_speed(iter/s)": 0.134838 }, { "epoch": 1.1115000210854806, "grad_norm": 0.7949993014335632, "learning_rate": 4.4924456615350516e-05, "loss": 0.7925103664398193, "memory(GiB)": 91.52, "step": 85660, "token_acc": 0.7585237147280943, "train_speed(iter/s)": 0.134838 }, { "epoch": 1.1115648994871363, "grad_norm": 0.7707425951957703, "learning_rate": 4.491912059906509e-05, "loss": 0.7983873367309571, "memory(GiB)": 91.52, "step": 85665, "token_acc": 0.7760402449011427, "train_speed(iter/s)": 0.134837 }, { "epoch": 1.111629777888792, "grad_norm": 0.6690454483032227, "learning_rate": 4.491378464125009e-05, "loss": 0.7689037799835206, "memory(GiB)": 91.52, "step": 85670, "token_acc": 0.7774432529346098, "train_speed(iter/s)": 0.134836 }, { "epoch": 1.1116946562904477, "grad_norm": 0.6993139982223511, "learning_rate": 4.4908448741966936e-05, "loss": 0.7513651371002197, "memory(GiB)": 91.52, "step": 85675, "token_acc": 0.8011152416356877, "train_speed(iter/s)": 0.134835 }, { "epoch": 1.1117595346921034, "grad_norm": 0.6708598732948303, "learning_rate": 4.4903112901277025e-05, "loss": 0.7548148155212402, "memory(GiB)": 91.52, "step": 85680, "token_acc": 0.7808458072954513, "train_speed(iter/s)": 0.134834 }, { "epoch": 1.1118244130937591, "grad_norm": 0.703588604927063, "learning_rate": 4.489777711924176e-05, "loss": 0.7363887786865234, "memory(GiB)": 91.52, "step": 85685, "token_acc": 0.8066555043951444, "train_speed(iter/s)": 0.134834 }, { "epoch": 1.1118892914954148, "grad_norm": 0.6607214212417603, "learning_rate": 4.4892441395922555e-05, "loss": 0.7183669090270997, "memory(GiB)": 91.52, "step": 85690, "token_acc": 0.7705385286030447, "train_speed(iter/s)": 0.134833 }, { "epoch": 1.1119541698970705, "grad_norm": 0.688077986240387, "learning_rate": 4.488710573138081e-05, "loss": 0.7754478931427002, "memory(GiB)": 91.52, "step": 85695, "token_acc": 0.7691677691677692, "train_speed(iter/s)": 0.134832 }, { "epoch": 1.112019048298726, "grad_norm": 0.7492594122886658, "learning_rate": 4.4881770125677916e-05, "loss": 0.7800909996032714, "memory(GiB)": 91.52, "step": 85700, "token_acc": 0.783101786220584, "train_speed(iter/s)": 0.134831 }, { "epoch": 1.112083926700382, "grad_norm": 0.7174457907676697, "learning_rate": 4.487643457887527e-05, "loss": 0.7455036163330078, "memory(GiB)": 91.52, "step": 85705, "token_acc": 0.7891299276883602, "train_speed(iter/s)": 0.13483 }, { "epoch": 1.1121488051020374, "grad_norm": 0.7579342722892761, "learning_rate": 4.48710990910343e-05, "loss": 0.7679129600524902, "memory(GiB)": 91.52, "step": 85710, "token_acc": 0.7797614847223202, "train_speed(iter/s)": 0.134829 }, { "epoch": 1.1122136835036933, "grad_norm": 0.7266806960105896, "learning_rate": 4.486576366221639e-05, "loss": 0.7251986980438232, "memory(GiB)": 91.52, "step": 85715, "token_acc": 0.7930806376926639, "train_speed(iter/s)": 0.134828 }, { "epoch": 1.1122785619053488, "grad_norm": 0.6899767518043518, "learning_rate": 4.486042829248293e-05, "loss": 0.7487297534942627, "memory(GiB)": 91.52, "step": 85720, "token_acc": 0.7944762216048374, "train_speed(iter/s)": 0.134828 }, { "epoch": 1.1123434403070045, "grad_norm": 0.7138172388076782, "learning_rate": 4.485509298189535e-05, "loss": 0.7682054519653321, "memory(GiB)": 91.52, "step": 85725, "token_acc": 0.7851143932812048, "train_speed(iter/s)": 0.134827 }, { "epoch": 1.1124083187086602, "grad_norm": 0.7172846794128418, "learning_rate": 4.484975773051502e-05, "loss": 0.7352116584777832, "memory(GiB)": 91.52, "step": 85730, "token_acc": 0.7738278073900081, "train_speed(iter/s)": 0.134826 }, { "epoch": 1.112473197110316, "grad_norm": 0.7131621241569519, "learning_rate": 4.484442253840335e-05, "loss": 0.7522383689880371, "memory(GiB)": 91.52, "step": 85735, "token_acc": 0.7894445709823317, "train_speed(iter/s)": 0.134825 }, { "epoch": 1.1125380755119716, "grad_norm": 0.7561207413673401, "learning_rate": 4.483908740562172e-05, "loss": 0.7594902992248536, "memory(GiB)": 91.52, "step": 85740, "token_acc": 0.792391773335251, "train_speed(iter/s)": 0.134825 }, { "epoch": 1.1126029539136273, "grad_norm": 0.6935327053070068, "learning_rate": 4.483375233223155e-05, "loss": 0.7506021976470947, "memory(GiB)": 91.52, "step": 85745, "token_acc": 0.7701918347270045, "train_speed(iter/s)": 0.134824 }, { "epoch": 1.112667832315283, "grad_norm": 0.7338516116142273, "learning_rate": 4.4828417318294225e-05, "loss": 0.8015181541442871, "memory(GiB)": 91.52, "step": 85750, "token_acc": 0.7674100878812495, "train_speed(iter/s)": 0.134823 }, { "epoch": 1.1127327107169387, "grad_norm": 0.7036575078964233, "learning_rate": 4.482308236387113e-05, "loss": 0.7599599838256836, "memory(GiB)": 91.52, "step": 85755, "token_acc": 0.7835147383506992, "train_speed(iter/s)": 0.134822 }, { "epoch": 1.1127975891185944, "grad_norm": 0.6672145128250122, "learning_rate": 4.481774746902369e-05, "loss": 0.7179549217224122, "memory(GiB)": 91.52, "step": 85760, "token_acc": 0.7802153432032302, "train_speed(iter/s)": 0.134821 }, { "epoch": 1.11286246752025, "grad_norm": 0.753290593624115, "learning_rate": 4.481241263381327e-05, "loss": 0.753135871887207, "memory(GiB)": 91.52, "step": 85765, "token_acc": 0.7733155299917831, "train_speed(iter/s)": 0.13482 }, { "epoch": 1.1129273459219058, "grad_norm": 0.7740817666053772, "learning_rate": 4.4807077858301274e-05, "loss": 0.7710945129394531, "memory(GiB)": 91.52, "step": 85770, "token_acc": 0.7805472771606611, "train_speed(iter/s)": 0.13482 }, { "epoch": 1.1129922243235615, "grad_norm": 0.6828952431678772, "learning_rate": 4.480174314254908e-05, "loss": 0.7861064434051513, "memory(GiB)": 91.52, "step": 85775, "token_acc": 0.7791616038882139, "train_speed(iter/s)": 0.134819 }, { "epoch": 1.1130571027252172, "grad_norm": 0.7262572050094604, "learning_rate": 4.47964084866181e-05, "loss": 0.7641239166259766, "memory(GiB)": 91.52, "step": 85780, "token_acc": 0.7775649350649351, "train_speed(iter/s)": 0.134819 }, { "epoch": 1.1131219811268729, "grad_norm": 0.7485693097114563, "learning_rate": 4.479107389056972e-05, "loss": 0.7541865348815918, "memory(GiB)": 91.52, "step": 85785, "token_acc": 0.7833994708994709, "train_speed(iter/s)": 0.134818 }, { "epoch": 1.1131868595285286, "grad_norm": 0.6384598016738892, "learning_rate": 4.478573935446531e-05, "loss": 0.7672755241394043, "memory(GiB)": 91.52, "step": 85790, "token_acc": 0.7811042944785276, "train_speed(iter/s)": 0.134818 }, { "epoch": 1.1132517379301843, "grad_norm": 0.8177391886711121, "learning_rate": 4.47804048783663e-05, "loss": 0.7449419975280762, "memory(GiB)": 91.52, "step": 85795, "token_acc": 0.798191278493558, "train_speed(iter/s)": 0.134817 }, { "epoch": 1.11331661633184, "grad_norm": 0.7590845823287964, "learning_rate": 4.4775070462334044e-05, "loss": 0.7684147834777832, "memory(GiB)": 91.52, "step": 85800, "token_acc": 0.7787167988961711, "train_speed(iter/s)": 0.134817 }, { "epoch": 1.1133814947334957, "grad_norm": 0.7747043371200562, "learning_rate": 4.4769736106429945e-05, "loss": 0.7770855903625489, "memory(GiB)": 91.52, "step": 85805, "token_acc": 0.7658573535860942, "train_speed(iter/s)": 0.134816 }, { "epoch": 1.1134463731351514, "grad_norm": 0.7128260135650635, "learning_rate": 4.476440181071537e-05, "loss": 0.7798954963684082, "memory(GiB)": 91.52, "step": 85810, "token_acc": 0.7819750241350316, "train_speed(iter/s)": 0.134815 }, { "epoch": 1.113511251536807, "grad_norm": 0.7108712792396545, "learning_rate": 4.475906757525173e-05, "loss": 0.7836215019226074, "memory(GiB)": 91.52, "step": 85815, "token_acc": 0.7943504971815821, "train_speed(iter/s)": 0.134814 }, { "epoch": 1.1135761299384628, "grad_norm": 0.7331897020339966, "learning_rate": 4.475373340010041e-05, "loss": 0.781037425994873, "memory(GiB)": 91.52, "step": 85820, "token_acc": 0.7939339554917444, "train_speed(iter/s)": 0.134814 }, { "epoch": 1.1136410083401185, "grad_norm": 0.8154362440109253, "learning_rate": 4.4748399285322774e-05, "loss": 0.7263857364654541, "memory(GiB)": 91.52, "step": 85825, "token_acc": 0.7925677546374627, "train_speed(iter/s)": 0.134813 }, { "epoch": 1.1137058867417742, "grad_norm": 0.7465680837631226, "learning_rate": 4.474306523098023e-05, "loss": 0.7728532314300537, "memory(GiB)": 91.52, "step": 85830, "token_acc": 0.7838578527330221, "train_speed(iter/s)": 0.134812 }, { "epoch": 1.1137707651434299, "grad_norm": 0.6785392761230469, "learning_rate": 4.4737731237134173e-05, "loss": 0.7471033573150635, "memory(GiB)": 91.52, "step": 85835, "token_acc": 0.7663795922816963, "train_speed(iter/s)": 0.134811 }, { "epoch": 1.1138356435450856, "grad_norm": 0.7207012176513672, "learning_rate": 4.473239730384595e-05, "loss": 0.765409278869629, "memory(GiB)": 91.52, "step": 85840, "token_acc": 0.7671712149670387, "train_speed(iter/s)": 0.134811 }, { "epoch": 1.1139005219467413, "grad_norm": 0.8403677940368652, "learning_rate": 4.4727063431176944e-05, "loss": 0.7755160331726074, "memory(GiB)": 91.52, "step": 85845, "token_acc": 0.7877815848946562, "train_speed(iter/s)": 0.13481 }, { "epoch": 1.113965400348397, "grad_norm": 0.7763471603393555, "learning_rate": 4.472172961918857e-05, "loss": 0.7429604053497314, "memory(GiB)": 91.52, "step": 85850, "token_acc": 0.7989246635643316, "train_speed(iter/s)": 0.13481 }, { "epoch": 1.1140302787500527, "grad_norm": 0.6996423602104187, "learning_rate": 4.471639586794219e-05, "loss": 0.7725069046020507, "memory(GiB)": 91.52, "step": 85855, "token_acc": 0.7755557882246187, "train_speed(iter/s)": 0.134809 }, { "epoch": 1.1140951571517084, "grad_norm": 0.7051880955696106, "learning_rate": 4.471106217749917e-05, "loss": 0.7469954490661621, "memory(GiB)": 91.52, "step": 85860, "token_acc": 0.7903382247113669, "train_speed(iter/s)": 0.134808 }, { "epoch": 1.114160035553364, "grad_norm": 0.7162537574768066, "learning_rate": 4.470572854792092e-05, "loss": 0.7541937828063965, "memory(GiB)": 91.52, "step": 85865, "token_acc": 0.7752057217565177, "train_speed(iter/s)": 0.134807 }, { "epoch": 1.1142249139550198, "grad_norm": 0.7468294501304626, "learning_rate": 4.4700394979268824e-05, "loss": 0.7502964973449707, "memory(GiB)": 91.52, "step": 85870, "token_acc": 0.7964543206436557, "train_speed(iter/s)": 0.134806 }, { "epoch": 1.1142897923566755, "grad_norm": 0.7612380385398865, "learning_rate": 4.469506147160422e-05, "loss": 0.7354426383972168, "memory(GiB)": 91.52, "step": 85875, "token_acc": 0.7828966208476518, "train_speed(iter/s)": 0.134805 }, { "epoch": 1.1143546707583312, "grad_norm": 0.6939606070518494, "learning_rate": 4.4689728024988505e-05, "loss": 0.70748929977417, "memory(GiB)": 91.52, "step": 85880, "token_acc": 0.7920395914499315, "train_speed(iter/s)": 0.134804 }, { "epoch": 1.1144195491599869, "grad_norm": 0.7397820949554443, "learning_rate": 4.468439463948306e-05, "loss": 0.7487261772155762, "memory(GiB)": 91.52, "step": 85885, "token_acc": 0.7770079047526105, "train_speed(iter/s)": 0.134803 }, { "epoch": 1.1144844275616426, "grad_norm": 0.7060378789901733, "learning_rate": 4.467906131514926e-05, "loss": 0.7741489887237549, "memory(GiB)": 91.52, "step": 85890, "token_acc": 0.7807929386876755, "train_speed(iter/s)": 0.134803 }, { "epoch": 1.1145493059632983, "grad_norm": 0.7347224950790405, "learning_rate": 4.467372805204848e-05, "loss": 0.7749200820922851, "memory(GiB)": 91.52, "step": 85895, "token_acc": 0.7927327647476902, "train_speed(iter/s)": 0.134802 }, { "epoch": 1.114614184364954, "grad_norm": 0.7055299878120422, "learning_rate": 4.46683948502421e-05, "loss": 0.7619043350219726, "memory(GiB)": 91.52, "step": 85900, "token_acc": 0.7746197108670215, "train_speed(iter/s)": 0.134801 }, { "epoch": 1.1146790627666097, "grad_norm": 0.7288388609886169, "learning_rate": 4.46630617097915e-05, "loss": 0.756615161895752, "memory(GiB)": 91.52, "step": 85905, "token_acc": 0.7957838807617996, "train_speed(iter/s)": 0.1348 }, { "epoch": 1.1147439411682654, "grad_norm": 0.7060744166374207, "learning_rate": 4.465772863075803e-05, "loss": 0.779121208190918, "memory(GiB)": 91.52, "step": 85910, "token_acc": 0.7704090636360428, "train_speed(iter/s)": 0.134799 }, { "epoch": 1.114808819569921, "grad_norm": 0.7262293100357056, "learning_rate": 4.4652395613203064e-05, "loss": 0.7653062820434571, "memory(GiB)": 91.52, "step": 85915, "token_acc": 0.78362553420601, "train_speed(iter/s)": 0.134798 }, { "epoch": 1.1148736979715768, "grad_norm": 0.7673357129096985, "learning_rate": 4.464706265718799e-05, "loss": 0.7787083625793457, "memory(GiB)": 91.52, "step": 85920, "token_acc": 0.7685094084368467, "train_speed(iter/s)": 0.134797 }, { "epoch": 1.1149385763732325, "grad_norm": 0.6871738433837891, "learning_rate": 4.464172976277418e-05, "loss": 0.7205763816833496, "memory(GiB)": 91.52, "step": 85925, "token_acc": 0.8047021633902993, "train_speed(iter/s)": 0.134796 }, { "epoch": 1.1150034547748882, "grad_norm": 0.7308151125907898, "learning_rate": 4.463639693002299e-05, "loss": 0.7583408355712891, "memory(GiB)": 91.52, "step": 85930, "token_acc": 0.7713308221308359, "train_speed(iter/s)": 0.134796 }, { "epoch": 1.1150683331765439, "grad_norm": 0.7034666538238525, "learning_rate": 4.463106415899581e-05, "loss": 0.7022178173065186, "memory(GiB)": 91.52, "step": 85935, "token_acc": 0.7767022003147549, "train_speed(iter/s)": 0.134794 }, { "epoch": 1.1151332115781996, "grad_norm": 0.8664982318878174, "learning_rate": 4.462573144975401e-05, "loss": 0.7907665252685547, "memory(GiB)": 91.52, "step": 85940, "token_acc": 0.7916416020262783, "train_speed(iter/s)": 0.134794 }, { "epoch": 1.1151980899798553, "grad_norm": 0.7392001748085022, "learning_rate": 4.462039880235894e-05, "loss": 0.7500790596008301, "memory(GiB)": 91.52, "step": 85945, "token_acc": 0.7817657101048695, "train_speed(iter/s)": 0.134793 }, { "epoch": 1.115262968381511, "grad_norm": 0.6571299433708191, "learning_rate": 4.461506621687195e-05, "loss": 0.7431375026702881, "memory(GiB)": 91.52, "step": 85950, "token_acc": 0.8132536224541493, "train_speed(iter/s)": 0.134793 }, { "epoch": 1.1153278467831667, "grad_norm": 0.7144637703895569, "learning_rate": 4.460973369335445e-05, "loss": 0.7826746463775635, "memory(GiB)": 91.52, "step": 85955, "token_acc": 0.7850927965943463, "train_speed(iter/s)": 0.134792 }, { "epoch": 1.1153927251848224, "grad_norm": 0.6885789036750793, "learning_rate": 4.4604401231867774e-05, "loss": 0.7195310592651367, "memory(GiB)": 91.52, "step": 85960, "token_acc": 0.7831903675139686, "train_speed(iter/s)": 0.134791 }, { "epoch": 1.115457603586478, "grad_norm": 0.6690971255302429, "learning_rate": 4.4599068832473315e-05, "loss": 0.7395313262939454, "memory(GiB)": 91.52, "step": 85965, "token_acc": 0.7849123064504293, "train_speed(iter/s)": 0.13479 }, { "epoch": 1.1155224819881338, "grad_norm": 0.679957926273346, "learning_rate": 4.459373649523243e-05, "loss": 0.7300662040710449, "memory(GiB)": 91.52, "step": 85970, "token_acc": 0.8013783597518952, "train_speed(iter/s)": 0.134789 }, { "epoch": 1.1155873603897895, "grad_norm": 0.6588703393936157, "learning_rate": 4.4588404220206475e-05, "loss": 0.7604629993438721, "memory(GiB)": 91.52, "step": 85975, "token_acc": 0.7833667137275503, "train_speed(iter/s)": 0.134789 }, { "epoch": 1.1156522387914451, "grad_norm": 0.7640119194984436, "learning_rate": 4.458307200745682e-05, "loss": 0.7681457996368408, "memory(GiB)": 91.52, "step": 85980, "token_acc": 0.7787703625853915, "train_speed(iter/s)": 0.134788 }, { "epoch": 1.1157171171931008, "grad_norm": 0.6954837441444397, "learning_rate": 4.45777398570448e-05, "loss": 0.7865930557250976, "memory(GiB)": 91.52, "step": 85985, "token_acc": 0.776889661164205, "train_speed(iter/s)": 0.134787 }, { "epoch": 1.1157819955947565, "grad_norm": 0.7528683543205261, "learning_rate": 4.457240776903182e-05, "loss": 0.7760355949401856, "memory(GiB)": 91.52, "step": 85990, "token_acc": 0.7813692168967191, "train_speed(iter/s)": 0.134786 }, { "epoch": 1.1158468739964122, "grad_norm": 0.7839611172676086, "learning_rate": 4.4567075743479206e-05, "loss": 0.7802170753479004, "memory(GiB)": 91.52, "step": 85995, "token_acc": 0.7886853278817424, "train_speed(iter/s)": 0.134785 }, { "epoch": 1.115911752398068, "grad_norm": 0.7125051021575928, "learning_rate": 4.4561743780448346e-05, "loss": 0.8247233390808105, "memory(GiB)": 91.52, "step": 86000, "token_acc": 0.7640358014646054, "train_speed(iter/s)": 0.134784 }, { "epoch": 1.1159766307997236, "grad_norm": 0.7593160271644592, "learning_rate": 4.455641188000059e-05, "loss": 0.7329576492309571, "memory(GiB)": 91.52, "step": 86005, "token_acc": 0.7884024424312568, "train_speed(iter/s)": 0.134783 }, { "epoch": 1.1160415092013793, "grad_norm": 0.6657531261444092, "learning_rate": 4.455108004219728e-05, "loss": 0.7371776103973389, "memory(GiB)": 91.52, "step": 86010, "token_acc": 0.7915357142857142, "train_speed(iter/s)": 0.134782 }, { "epoch": 1.116106387603035, "grad_norm": 0.7675507664680481, "learning_rate": 4.4545748267099826e-05, "loss": 0.7448445320129394, "memory(GiB)": 91.52, "step": 86015, "token_acc": 0.776548080781062, "train_speed(iter/s)": 0.134782 }, { "epoch": 1.1161712660046907, "grad_norm": 0.6492167115211487, "learning_rate": 4.454041655476952e-05, "loss": 0.7547037124633789, "memory(GiB)": 91.52, "step": 86020, "token_acc": 0.7950745006181273, "train_speed(iter/s)": 0.134781 }, { "epoch": 1.1162361444063464, "grad_norm": 0.7482413649559021, "learning_rate": 4.453508490526776e-05, "loss": 0.7739119529724121, "memory(GiB)": 91.52, "step": 86025, "token_acc": 0.7727837683496255, "train_speed(iter/s)": 0.13478 }, { "epoch": 1.1163010228080021, "grad_norm": 0.735619068145752, "learning_rate": 4.452975331865588e-05, "loss": 0.7741106986999512, "memory(GiB)": 91.52, "step": 86030, "token_acc": 0.785088938299055, "train_speed(iter/s)": 0.134779 }, { "epoch": 1.1163659012096578, "grad_norm": 0.7404927611351013, "learning_rate": 4.4524421794995255e-05, "loss": 0.744115161895752, "memory(GiB)": 91.52, "step": 86035, "token_acc": 0.7819266173991961, "train_speed(iter/s)": 0.134778 }, { "epoch": 1.1164307796113135, "grad_norm": 0.7351924180984497, "learning_rate": 4.451909033434723e-05, "loss": 0.7542874336242675, "memory(GiB)": 91.52, "step": 86040, "token_acc": 0.785730508419584, "train_speed(iter/s)": 0.134777 }, { "epoch": 1.1164956580129692, "grad_norm": 0.8559114336967468, "learning_rate": 4.451375893677316e-05, "loss": 0.7378703117370605, "memory(GiB)": 91.52, "step": 86045, "token_acc": 0.7877369262388674, "train_speed(iter/s)": 0.134777 }, { "epoch": 1.116560536414625, "grad_norm": 0.7300543189048767, "learning_rate": 4.4508427602334415e-05, "loss": 0.7708658695220947, "memory(GiB)": 91.52, "step": 86050, "token_acc": 0.7909531502423264, "train_speed(iter/s)": 0.134776 }, { "epoch": 1.1166254148162806, "grad_norm": 0.7111931443214417, "learning_rate": 4.450309633109232e-05, "loss": 0.7593966007232666, "memory(GiB)": 91.52, "step": 86055, "token_acc": 0.7974397328416878, "train_speed(iter/s)": 0.134775 }, { "epoch": 1.1166902932179363, "grad_norm": 0.7424680590629578, "learning_rate": 4.449776512310823e-05, "loss": 0.7563712120056152, "memory(GiB)": 91.52, "step": 86060, "token_acc": 0.7905359224862301, "train_speed(iter/s)": 0.134774 }, { "epoch": 1.116755171619592, "grad_norm": 0.6824519634246826, "learning_rate": 4.449243397844351e-05, "loss": 0.753023624420166, "memory(GiB)": 91.52, "step": 86065, "token_acc": 0.7781474580782539, "train_speed(iter/s)": 0.134774 }, { "epoch": 1.1168200500212477, "grad_norm": 0.6507415771484375, "learning_rate": 4.4487102897159505e-05, "loss": 0.7174115180969238, "memory(GiB)": 91.52, "step": 86070, "token_acc": 0.7919531448943213, "train_speed(iter/s)": 0.134773 }, { "epoch": 1.1168849284229034, "grad_norm": 0.6262795329093933, "learning_rate": 4.4481771879317566e-05, "loss": 0.7678522109985352, "memory(GiB)": 91.52, "step": 86075, "token_acc": 0.7812718636079096, "train_speed(iter/s)": 0.134772 }, { "epoch": 1.1169498068245591, "grad_norm": 0.7224017977714539, "learning_rate": 4.4476440924979036e-05, "loss": 0.7303783893585205, "memory(GiB)": 91.52, "step": 86080, "token_acc": 0.8028797856903672, "train_speed(iter/s)": 0.134771 }, { "epoch": 1.1170146852262148, "grad_norm": 0.7932985424995422, "learning_rate": 4.447111003420528e-05, "loss": 0.766851806640625, "memory(GiB)": 91.52, "step": 86085, "token_acc": 0.7869370054777846, "train_speed(iter/s)": 0.13477 }, { "epoch": 1.1170795636278705, "grad_norm": 0.7314268350601196, "learning_rate": 4.446577920705764e-05, "loss": 0.7372992515563965, "memory(GiB)": 91.52, "step": 86090, "token_acc": 0.800484796634941, "train_speed(iter/s)": 0.134769 }, { "epoch": 1.1171444420295262, "grad_norm": 0.7093640565872192, "learning_rate": 4.446044844359744e-05, "loss": 0.7632041931152344, "memory(GiB)": 91.52, "step": 86095, "token_acc": 0.7794063217114516, "train_speed(iter/s)": 0.134769 }, { "epoch": 1.117209320431182, "grad_norm": 0.6399531364440918, "learning_rate": 4.445511774388604e-05, "loss": 0.763233757019043, "memory(GiB)": 91.52, "step": 86100, "token_acc": 0.7698498453742688, "train_speed(iter/s)": 0.134768 }, { "epoch": 1.1172741988328376, "grad_norm": 0.7836777567863464, "learning_rate": 4.4449787107984795e-05, "loss": 0.740338659286499, "memory(GiB)": 91.52, "step": 86105, "token_acc": 0.7977010820162449, "train_speed(iter/s)": 0.134767 }, { "epoch": 1.1173390772344933, "grad_norm": 0.7381860017776489, "learning_rate": 4.4444456535955046e-05, "loss": 0.7504495143890381, "memory(GiB)": 91.52, "step": 86110, "token_acc": 0.7849989495062679, "train_speed(iter/s)": 0.134766 }, { "epoch": 1.117403955636149, "grad_norm": 0.8049814701080322, "learning_rate": 4.4439126027858125e-05, "loss": 0.7861898422241211, "memory(GiB)": 91.52, "step": 86115, "token_acc": 0.7786583926343125, "train_speed(iter/s)": 0.134765 }, { "epoch": 1.1174688340378047, "grad_norm": 0.7204961180686951, "learning_rate": 4.44337955837554e-05, "loss": 0.7826742172241211, "memory(GiB)": 91.52, "step": 86120, "token_acc": 0.7775718581719001, "train_speed(iter/s)": 0.134764 }, { "epoch": 1.1175337124394604, "grad_norm": 0.6872599720954895, "learning_rate": 4.4428465203708186e-05, "loss": 0.763810396194458, "memory(GiB)": 91.52, "step": 86125, "token_acc": 0.7784050151264698, "train_speed(iter/s)": 0.134764 }, { "epoch": 1.1175985908411161, "grad_norm": 0.6069431900978088, "learning_rate": 4.442313488777784e-05, "loss": 0.7422836303710938, "memory(GiB)": 91.52, "step": 86130, "token_acc": 0.7853299167200513, "train_speed(iter/s)": 0.134763 }, { "epoch": 1.1176634692427718, "grad_norm": 0.694410502910614, "learning_rate": 4.441780463602568e-05, "loss": 0.8038738250732422, "memory(GiB)": 91.52, "step": 86135, "token_acc": 0.7759302211623272, "train_speed(iter/s)": 0.134762 }, { "epoch": 1.1177283476444275, "grad_norm": 0.736430287361145, "learning_rate": 4.4412474448513084e-05, "loss": 0.7879659175872803, "memory(GiB)": 91.52, "step": 86140, "token_acc": 0.7758038415535073, "train_speed(iter/s)": 0.134761 }, { "epoch": 1.1177932260460832, "grad_norm": 0.6292182803153992, "learning_rate": 4.4407144325301374e-05, "loss": 0.7682316780090332, "memory(GiB)": 91.52, "step": 86145, "token_acc": 0.789548267684655, "train_speed(iter/s)": 0.13476 }, { "epoch": 1.117858104447739, "grad_norm": 0.6204240322113037, "learning_rate": 4.4401814266451877e-05, "loss": 0.7152616024017334, "memory(GiB)": 91.52, "step": 86150, "token_acc": 0.7984360253471754, "train_speed(iter/s)": 0.134758 }, { "epoch": 1.1179229828493946, "grad_norm": 0.6635211706161499, "learning_rate": 4.4396484272025956e-05, "loss": 0.7772709846496582, "memory(GiB)": 91.52, "step": 86155, "token_acc": 0.7618151740244575, "train_speed(iter/s)": 0.134757 }, { "epoch": 1.1179878612510503, "grad_norm": 0.729027509689331, "learning_rate": 4.4391154342084925e-05, "loss": 0.7672149658203125, "memory(GiB)": 91.52, "step": 86160, "token_acc": 0.7817350663528128, "train_speed(iter/s)": 0.134756 }, { "epoch": 1.118052739652706, "grad_norm": 0.7519996762275696, "learning_rate": 4.438582447669013e-05, "loss": 0.7524784088134766, "memory(GiB)": 91.52, "step": 86165, "token_acc": 0.7887883827526062, "train_speed(iter/s)": 0.134756 }, { "epoch": 1.1181176180543617, "grad_norm": 0.6877403855323792, "learning_rate": 4.438049467590289e-05, "loss": 0.7394620895385742, "memory(GiB)": 91.52, "step": 86170, "token_acc": 0.7899638435411417, "train_speed(iter/s)": 0.134754 }, { "epoch": 1.1181824964560172, "grad_norm": 0.6387489438056946, "learning_rate": 4.4375164939784576e-05, "loss": 0.7456652164459229, "memory(GiB)": 91.52, "step": 86175, "token_acc": 0.7778288218248639, "train_speed(iter/s)": 0.134754 }, { "epoch": 1.1182473748576731, "grad_norm": 0.7477761507034302, "learning_rate": 4.436983526839649e-05, "loss": 0.7747312545776367, "memory(GiB)": 91.52, "step": 86180, "token_acc": 0.7761675276496897, "train_speed(iter/s)": 0.134753 }, { "epoch": 1.1183122532593286, "grad_norm": 0.7260550260543823, "learning_rate": 4.436450566179998e-05, "loss": 0.7860612392425537, "memory(GiB)": 91.52, "step": 86185, "token_acc": 0.7682097167788212, "train_speed(iter/s)": 0.134752 }, { "epoch": 1.1183771316609845, "grad_norm": 0.7622547149658203, "learning_rate": 4.435917612005638e-05, "loss": 0.7646842002868652, "memory(GiB)": 91.52, "step": 86190, "token_acc": 0.7618756786102063, "train_speed(iter/s)": 0.134751 }, { "epoch": 1.11844201006264, "grad_norm": 0.7699909806251526, "learning_rate": 4.435384664322702e-05, "loss": 0.7338574409484864, "memory(GiB)": 91.52, "step": 86195, "token_acc": 0.784184462038293, "train_speed(iter/s)": 0.13475 }, { "epoch": 1.1185068884642957, "grad_norm": 0.6368474364280701, "learning_rate": 4.4348517231373233e-05, "loss": 0.7652612686157226, "memory(GiB)": 91.52, "step": 86200, "token_acc": 0.7812325866488354, "train_speed(iter/s)": 0.134749 }, { "epoch": 1.1185717668659514, "grad_norm": 0.6198367476463318, "learning_rate": 4.434318788455633e-05, "loss": 0.7506436347961426, "memory(GiB)": 91.52, "step": 86205, "token_acc": 0.7811277054548359, "train_speed(iter/s)": 0.134749 }, { "epoch": 1.118636645267607, "grad_norm": 0.6941920518875122, "learning_rate": 4.4337858602837665e-05, "loss": 0.7663496017456055, "memory(GiB)": 91.52, "step": 86210, "token_acc": 0.7811043585250363, "train_speed(iter/s)": 0.134748 }, { "epoch": 1.1187015236692628, "grad_norm": 0.6689561009407043, "learning_rate": 4.433252938627856e-05, "loss": 0.7577583312988281, "memory(GiB)": 91.52, "step": 86215, "token_acc": 0.7827653089498476, "train_speed(iter/s)": 0.134747 }, { "epoch": 1.1187664020709185, "grad_norm": 0.6919525265693665, "learning_rate": 4.4327200234940334e-05, "loss": 0.7683710098266602, "memory(GiB)": 91.52, "step": 86220, "token_acc": 0.7864706812075233, "train_speed(iter/s)": 0.134746 }, { "epoch": 1.1188312804725742, "grad_norm": 0.6907522082328796, "learning_rate": 4.432187114888433e-05, "loss": 0.7183358192443847, "memory(GiB)": 91.52, "step": 86225, "token_acc": 0.7923329376440192, "train_speed(iter/s)": 0.134745 }, { "epoch": 1.1188961588742299, "grad_norm": 0.6651031374931335, "learning_rate": 4.431654212817188e-05, "loss": 0.7772544384002685, "memory(GiB)": 91.52, "step": 86230, "token_acc": 0.7926408237431859, "train_speed(iter/s)": 0.134745 }, { "epoch": 1.1189610372758856, "grad_norm": 0.7239784598350525, "learning_rate": 4.4311213172864285e-05, "loss": 0.7404253482818604, "memory(GiB)": 91.52, "step": 86235, "token_acc": 0.77822523668221, "train_speed(iter/s)": 0.134744 }, { "epoch": 1.1190259156775413, "grad_norm": 0.7145221829414368, "learning_rate": 4.4305884283022874e-05, "loss": 0.7395007133483886, "memory(GiB)": 91.52, "step": 86240, "token_acc": 0.8178805334175018, "train_speed(iter/s)": 0.134743 }, { "epoch": 1.119090794079197, "grad_norm": 0.7483913898468018, "learning_rate": 4.430055545870899e-05, "loss": 0.776726245880127, "memory(GiB)": 91.52, "step": 86245, "token_acc": 0.7859298186133243, "train_speed(iter/s)": 0.134742 }, { "epoch": 1.1191556724808527, "grad_norm": 0.6780906915664673, "learning_rate": 4.429522669998395e-05, "loss": 0.7299086570739746, "memory(GiB)": 91.52, "step": 86250, "token_acc": 0.7953086046687315, "train_speed(iter/s)": 0.134741 }, { "epoch": 1.1192205508825084, "grad_norm": 0.669677734375, "learning_rate": 4.428989800690906e-05, "loss": 0.7580836296081543, "memory(GiB)": 91.52, "step": 86255, "token_acc": 0.7911720386636775, "train_speed(iter/s)": 0.13474 }, { "epoch": 1.119285429284164, "grad_norm": 0.6626295447349548, "learning_rate": 4.4284569379545667e-05, "loss": 0.7567344188690186, "memory(GiB)": 91.52, "step": 86260, "token_acc": 0.7910578930023069, "train_speed(iter/s)": 0.134739 }, { "epoch": 1.1193503076858198, "grad_norm": 0.7142822742462158, "learning_rate": 4.427924081795509e-05, "loss": 0.7864863395690918, "memory(GiB)": 91.52, "step": 86265, "token_acc": 0.7691165553080921, "train_speed(iter/s)": 0.134738 }, { "epoch": 1.1194151860874755, "grad_norm": 0.643746554851532, "learning_rate": 4.427391232219864e-05, "loss": 0.753312063217163, "memory(GiB)": 91.52, "step": 86270, "token_acc": 0.7701290719114936, "train_speed(iter/s)": 0.134737 }, { "epoch": 1.1194800644891312, "grad_norm": 0.7201321125030518, "learning_rate": 4.4268583892337614e-05, "loss": 0.7451827049255371, "memory(GiB)": 91.52, "step": 86275, "token_acc": 0.7798739547720064, "train_speed(iter/s)": 0.134736 }, { "epoch": 1.1195449428907869, "grad_norm": 0.647666871547699, "learning_rate": 4.426325552843338e-05, "loss": 0.7622115612030029, "memory(GiB)": 91.52, "step": 86280, "token_acc": 0.7911259667059903, "train_speed(iter/s)": 0.134735 }, { "epoch": 1.1196098212924426, "grad_norm": 0.6835795640945435, "learning_rate": 4.4257927230547226e-05, "loss": 0.7490001678466797, "memory(GiB)": 91.52, "step": 86285, "token_acc": 0.7954691002241434, "train_speed(iter/s)": 0.134734 }, { "epoch": 1.1196746996940983, "grad_norm": 0.6859974265098572, "learning_rate": 4.425259899874046e-05, "loss": 0.7421558380126954, "memory(GiB)": 91.52, "step": 86290, "token_acc": 0.7907787527420871, "train_speed(iter/s)": 0.134734 }, { "epoch": 1.119739578095754, "grad_norm": 0.6978462934494019, "learning_rate": 4.424727083307443e-05, "loss": 0.7508539199829102, "memory(GiB)": 91.52, "step": 86295, "token_acc": 0.7982942430703625, "train_speed(iter/s)": 0.134732 }, { "epoch": 1.1198044564974097, "grad_norm": 0.700547456741333, "learning_rate": 4.4241942733610454e-05, "loss": 0.7696827411651611, "memory(GiB)": 91.52, "step": 86300, "token_acc": 0.777119944867337, "train_speed(iter/s)": 0.134732 }, { "epoch": 1.1198693348990654, "grad_norm": 0.6674253940582275, "learning_rate": 4.423661470040981e-05, "loss": 0.7152104377746582, "memory(GiB)": 91.52, "step": 86305, "token_acc": 0.8061100178052666, "train_speed(iter/s)": 0.134731 }, { "epoch": 1.119934213300721, "grad_norm": 0.6861858367919922, "learning_rate": 4.4231286733533825e-05, "loss": 0.7791184425354004, "memory(GiB)": 91.52, "step": 86310, "token_acc": 0.7739978818318944, "train_speed(iter/s)": 0.13473 }, { "epoch": 1.1199990917023768, "grad_norm": 0.7535227537155151, "learning_rate": 4.422595883304384e-05, "loss": 0.7722050666809082, "memory(GiB)": 91.52, "step": 86315, "token_acc": 0.7920298879202988, "train_speed(iter/s)": 0.134729 }, { "epoch": 1.1200639701040325, "grad_norm": 0.713243842124939, "learning_rate": 4.422063099900113e-05, "loss": 0.7423604965209961, "memory(GiB)": 91.52, "step": 86320, "token_acc": 0.7842068349910737, "train_speed(iter/s)": 0.134727 }, { "epoch": 1.1201288485056882, "grad_norm": 0.6843502521514893, "learning_rate": 4.421530323146704e-05, "loss": 0.7898880958557128, "memory(GiB)": 91.52, "step": 86325, "token_acc": 0.788102261553589, "train_speed(iter/s)": 0.134727 }, { "epoch": 1.1201937269073439, "grad_norm": 0.6575638055801392, "learning_rate": 4.4209975530502864e-05, "loss": 0.7123766899108886, "memory(GiB)": 91.52, "step": 86330, "token_acc": 0.7929874662858956, "train_speed(iter/s)": 0.134726 }, { "epoch": 1.1202586053089996, "grad_norm": 0.7680379152297974, "learning_rate": 4.420464789616993e-05, "loss": 0.8126195907592774, "memory(GiB)": 91.52, "step": 86335, "token_acc": 0.7666385228148747, "train_speed(iter/s)": 0.134725 }, { "epoch": 1.1203234837106553, "grad_norm": 0.6834102272987366, "learning_rate": 4.4199320328529525e-05, "loss": 0.7511762142181396, "memory(GiB)": 91.52, "step": 86340, "token_acc": 0.7966693839524863, "train_speed(iter/s)": 0.134724 }, { "epoch": 1.120388362112311, "grad_norm": 0.7778897881507874, "learning_rate": 4.4193992827642956e-05, "loss": 0.7436537742614746, "memory(GiB)": 91.52, "step": 86345, "token_acc": 0.7943113452707273, "train_speed(iter/s)": 0.134723 }, { "epoch": 1.1204532405139667, "grad_norm": 0.7663781046867371, "learning_rate": 4.418866539357155e-05, "loss": 0.7599437236785889, "memory(GiB)": 91.52, "step": 86350, "token_acc": 0.7781876443848275, "train_speed(iter/s)": 0.134722 }, { "epoch": 1.1205181189156224, "grad_norm": 0.6123982667922974, "learning_rate": 4.418333802637661e-05, "loss": 0.7504850864410401, "memory(GiB)": 91.52, "step": 86355, "token_acc": 0.7989235440242892, "train_speed(iter/s)": 0.134721 }, { "epoch": 1.120582997317278, "grad_norm": 0.7478379011154175, "learning_rate": 4.4178010726119445e-05, "loss": 0.7550500869750977, "memory(GiB)": 91.52, "step": 86360, "token_acc": 0.783352822366323, "train_speed(iter/s)": 0.134721 }, { "epoch": 1.1206478757189338, "grad_norm": 0.7035622596740723, "learning_rate": 4.417268349286136e-05, "loss": 0.7132733345031739, "memory(GiB)": 91.52, "step": 86365, "token_acc": 0.7974822306305032, "train_speed(iter/s)": 0.13472 }, { "epoch": 1.1207127541205895, "grad_norm": 0.7624941468238831, "learning_rate": 4.416735632666364e-05, "loss": 0.7347945690155029, "memory(GiB)": 91.52, "step": 86370, "token_acc": 0.7823122151480361, "train_speed(iter/s)": 0.134719 }, { "epoch": 1.1207776325222452, "grad_norm": 0.7199580073356628, "learning_rate": 4.416202922758765e-05, "loss": 0.7048617362976074, "memory(GiB)": 91.52, "step": 86375, "token_acc": 0.7918962615897317, "train_speed(iter/s)": 0.134718 }, { "epoch": 1.1208425109239009, "grad_norm": 0.7556313872337341, "learning_rate": 4.415670219569462e-05, "loss": 0.7684359550476074, "memory(GiB)": 91.52, "step": 86380, "token_acc": 0.7759134973900075, "train_speed(iter/s)": 0.134718 }, { "epoch": 1.1209073893255566, "grad_norm": 0.6849318742752075, "learning_rate": 4.415137523104589e-05, "loss": 0.7577109336853027, "memory(GiB)": 91.52, "step": 86385, "token_acc": 0.764142703574308, "train_speed(iter/s)": 0.134717 }, { "epoch": 1.1209722677272123, "grad_norm": 0.693343460559845, "learning_rate": 4.414604833370275e-05, "loss": 0.786834716796875, "memory(GiB)": 91.52, "step": 86390, "token_acc": 0.765482424508482, "train_speed(iter/s)": 0.134716 }, { "epoch": 1.121037146128868, "grad_norm": 0.7688422203063965, "learning_rate": 4.414072150372652e-05, "loss": 0.8116531372070312, "memory(GiB)": 91.52, "step": 86395, "token_acc": 0.752409906569776, "train_speed(iter/s)": 0.134715 }, { "epoch": 1.1211020245305237, "grad_norm": 0.7747924327850342, "learning_rate": 4.4135394741178496e-05, "loss": 0.7686384201049805, "memory(GiB)": 91.52, "step": 86400, "token_acc": 0.77530127889818, "train_speed(iter/s)": 0.134714 }, { "epoch": 1.1211669029321794, "grad_norm": 0.6849399209022522, "learning_rate": 4.4130068046119947e-05, "loss": 0.7676888465881347, "memory(GiB)": 91.52, "step": 86405, "token_acc": 0.7857690164391196, "train_speed(iter/s)": 0.134714 }, { "epoch": 1.121231781333835, "grad_norm": 0.7131370306015015, "learning_rate": 4.4124741418612234e-05, "loss": 0.74357008934021, "memory(GiB)": 91.52, "step": 86410, "token_acc": 0.7839895988112927, "train_speed(iter/s)": 0.134713 }, { "epoch": 1.1212966597354908, "grad_norm": 0.7939115166664124, "learning_rate": 4.411941485871659e-05, "loss": 0.7672548294067383, "memory(GiB)": 91.52, "step": 86415, "token_acc": 0.8041533914245135, "train_speed(iter/s)": 0.134712 }, { "epoch": 1.1213615381371465, "grad_norm": 0.7093925476074219, "learning_rate": 4.411408836649436e-05, "loss": 0.7551544189453125, "memory(GiB)": 91.52, "step": 86420, "token_acc": 0.786973988832902, "train_speed(iter/s)": 0.134711 }, { "epoch": 1.1214264165388022, "grad_norm": 0.8092456459999084, "learning_rate": 4.4108761942006795e-05, "loss": 0.7879795074462891, "memory(GiB)": 91.52, "step": 86425, "token_acc": 0.7602018150663198, "train_speed(iter/s)": 0.134711 }, { "epoch": 1.1214912949404579, "grad_norm": 0.6766154766082764, "learning_rate": 4.410343558531524e-05, "loss": 0.7631741523742676, "memory(GiB)": 91.52, "step": 86430, "token_acc": 0.7819039096916299, "train_speed(iter/s)": 0.13471 }, { "epoch": 1.1215561733421135, "grad_norm": 0.7248331308364868, "learning_rate": 4.409810929648096e-05, "loss": 0.7781148433685303, "memory(GiB)": 91.52, "step": 86435, "token_acc": 0.7820412346274416, "train_speed(iter/s)": 0.134709 }, { "epoch": 1.1216210517437692, "grad_norm": 0.8149036169052124, "learning_rate": 4.409278307556525e-05, "loss": 0.7785784244537354, "memory(GiB)": 91.52, "step": 86440, "token_acc": 0.7817300521998508, "train_speed(iter/s)": 0.134708 }, { "epoch": 1.121685930145425, "grad_norm": 0.6471711993217468, "learning_rate": 4.408745692262945e-05, "loss": 0.7373920440673828, "memory(GiB)": 91.52, "step": 86445, "token_acc": 0.8072567598798244, "train_speed(iter/s)": 0.134708 }, { "epoch": 1.1217508085470806, "grad_norm": 0.695289134979248, "learning_rate": 4.408213083773478e-05, "loss": 0.7777667999267578, "memory(GiB)": 91.52, "step": 86450, "token_acc": 0.7736932620846385, "train_speed(iter/s)": 0.134707 }, { "epoch": 1.1218156869487363, "grad_norm": 0.7111836671829224, "learning_rate": 4.407680482094258e-05, "loss": 0.7661525726318359, "memory(GiB)": 91.52, "step": 86455, "token_acc": 0.7806790559084137, "train_speed(iter/s)": 0.134705 }, { "epoch": 1.121880565350392, "grad_norm": 0.6768500208854675, "learning_rate": 4.407147887231412e-05, "loss": 0.75533447265625, "memory(GiB)": 91.52, "step": 86460, "token_acc": 0.7908758250401833, "train_speed(iter/s)": 0.134705 }, { "epoch": 1.1219454437520477, "grad_norm": 0.7315950393676758, "learning_rate": 4.4066152991910706e-05, "loss": 0.729685115814209, "memory(GiB)": 91.52, "step": 86465, "token_acc": 0.7887004535594557, "train_speed(iter/s)": 0.134704 }, { "epoch": 1.1220103221537034, "grad_norm": 0.730158269405365, "learning_rate": 4.4060827179793616e-05, "loss": 0.7513458251953125, "memory(GiB)": 91.52, "step": 86470, "token_acc": 0.7949933868834479, "train_speed(iter/s)": 0.134703 }, { "epoch": 1.1220752005553591, "grad_norm": 0.6548022031784058, "learning_rate": 4.405550143602414e-05, "loss": 0.7205376625061035, "memory(GiB)": 91.52, "step": 86475, "token_acc": 0.8141095790472022, "train_speed(iter/s)": 0.134702 }, { "epoch": 1.1221400789570148, "grad_norm": 0.734250009059906, "learning_rate": 4.40501757606636e-05, "loss": 0.7980134963989258, "memory(GiB)": 91.52, "step": 86480, "token_acc": 0.7626465392644933, "train_speed(iter/s)": 0.134701 }, { "epoch": 1.1222049573586705, "grad_norm": 0.7262457609176636, "learning_rate": 4.404485015377323e-05, "loss": 0.7849285125732421, "memory(GiB)": 91.52, "step": 86485, "token_acc": 0.7730389697273792, "train_speed(iter/s)": 0.1347 }, { "epoch": 1.1222698357603262, "grad_norm": 0.6699021458625793, "learning_rate": 4.403952461541435e-05, "loss": 0.7609830856323242, "memory(GiB)": 91.52, "step": 86490, "token_acc": 0.7721651114977068, "train_speed(iter/s)": 0.134699 }, { "epoch": 1.122334714161982, "grad_norm": 0.6880224347114563, "learning_rate": 4.4034199145648227e-05, "loss": 0.7695869445800781, "memory(GiB)": 91.52, "step": 86495, "token_acc": 0.7679817689787779, "train_speed(iter/s)": 0.134698 }, { "epoch": 1.1223995925636376, "grad_norm": 0.6856430768966675, "learning_rate": 4.402887374453617e-05, "loss": 0.7366163253784179, "memory(GiB)": 91.52, "step": 86500, "token_acc": 0.7941577273839058, "train_speed(iter/s)": 0.134696 }, { "epoch": 1.1224644709652933, "grad_norm": 0.6613052487373352, "learning_rate": 4.402354841213944e-05, "loss": 0.7435949325561524, "memory(GiB)": 91.52, "step": 86505, "token_acc": 0.791990146312332, "train_speed(iter/s)": 0.134696 }, { "epoch": 1.122529349366949, "grad_norm": 0.6634288430213928, "learning_rate": 4.4018223148519314e-05, "loss": 0.7663632392883301, "memory(GiB)": 91.52, "step": 86510, "token_acc": 0.8012404232032105, "train_speed(iter/s)": 0.134695 }, { "epoch": 1.1225942277686047, "grad_norm": 0.7954193353652954, "learning_rate": 4.401289795373713e-05, "loss": 0.7771810054779053, "memory(GiB)": 91.52, "step": 86515, "token_acc": 0.7701461862299864, "train_speed(iter/s)": 0.134694 }, { "epoch": 1.1226591061702604, "grad_norm": 0.7361058592796326, "learning_rate": 4.400757282785411e-05, "loss": 0.7589411735534668, "memory(GiB)": 91.52, "step": 86520, "token_acc": 0.791818000399122, "train_speed(iter/s)": 0.134694 }, { "epoch": 1.1227239845719161, "grad_norm": 0.7288852334022522, "learning_rate": 4.4002247770931556e-05, "loss": 0.7920198917388916, "memory(GiB)": 91.52, "step": 86525, "token_acc": 0.7738239225921446, "train_speed(iter/s)": 0.134693 }, { "epoch": 1.1227888629735718, "grad_norm": 0.6883911490440369, "learning_rate": 4.3996922783030744e-05, "loss": 0.7787826538085938, "memory(GiB)": 91.52, "step": 86530, "token_acc": 0.7723301276007292, "train_speed(iter/s)": 0.134692 }, { "epoch": 1.1228537413752275, "grad_norm": 0.7113566398620605, "learning_rate": 4.399159786421296e-05, "loss": 0.7854493141174317, "memory(GiB)": 91.52, "step": 86535, "token_acc": 0.7848642556435841, "train_speed(iter/s)": 0.134692 }, { "epoch": 1.1229186197768832, "grad_norm": 0.7285596132278442, "learning_rate": 4.3986273014539485e-05, "loss": 0.7187961578369141, "memory(GiB)": 91.52, "step": 86540, "token_acc": 0.7893801879784245, "train_speed(iter/s)": 0.134691 }, { "epoch": 1.122983498178539, "grad_norm": 0.7433406710624695, "learning_rate": 4.398094823407158e-05, "loss": 0.7623661041259766, "memory(GiB)": 91.52, "step": 86545, "token_acc": 0.7759776738791635, "train_speed(iter/s)": 0.134691 }, { "epoch": 1.1230483765801946, "grad_norm": 0.6979642510414124, "learning_rate": 4.3975623522870554e-05, "loss": 0.749747371673584, "memory(GiB)": 91.52, "step": 86550, "token_acc": 0.7809092148309705, "train_speed(iter/s)": 0.13469 }, { "epoch": 1.1231132549818503, "grad_norm": 0.6896217465400696, "learning_rate": 4.397029888099767e-05, "loss": 0.770419979095459, "memory(GiB)": 91.52, "step": 86555, "token_acc": 0.7708617574394336, "train_speed(iter/s)": 0.134689 }, { "epoch": 1.123178133383506, "grad_norm": 0.7631088495254517, "learning_rate": 4.396497430851419e-05, "loss": 0.802029037475586, "memory(GiB)": 91.52, "step": 86560, "token_acc": 0.7705401880705608, "train_speed(iter/s)": 0.134688 }, { "epoch": 1.1232430117851617, "grad_norm": 0.8530959486961365, "learning_rate": 4.3959649805481386e-05, "loss": 0.7765658378601075, "memory(GiB)": 91.52, "step": 86565, "token_acc": 0.7747924173586088, "train_speed(iter/s)": 0.134688 }, { "epoch": 1.1233078901868174, "grad_norm": 0.611056387424469, "learning_rate": 4.3954325371960555e-05, "loss": 0.7435377597808838, "memory(GiB)": 91.52, "step": 86570, "token_acc": 0.7818124207858048, "train_speed(iter/s)": 0.134687 }, { "epoch": 1.1233727685884731, "grad_norm": 0.7534090876579285, "learning_rate": 4.394900100801295e-05, "loss": 0.7555005550384521, "memory(GiB)": 91.52, "step": 86575, "token_acc": 0.7823250661579174, "train_speed(iter/s)": 0.134686 }, { "epoch": 1.1234376469901288, "grad_norm": 0.6990293264389038, "learning_rate": 4.394367671369985e-05, "loss": 0.7459431648254394, "memory(GiB)": 91.52, "step": 86580, "token_acc": 0.7752738550877656, "train_speed(iter/s)": 0.134685 }, { "epoch": 1.1235025253917845, "grad_norm": 0.7132588624954224, "learning_rate": 4.393835248908254e-05, "loss": 0.7506203174591064, "memory(GiB)": 91.52, "step": 86585, "token_acc": 0.8072696205972584, "train_speed(iter/s)": 0.134684 }, { "epoch": 1.1235674037934402, "grad_norm": 0.7511664032936096, "learning_rate": 4.3933028334222285e-05, "loss": 0.7830975532531739, "memory(GiB)": 91.52, "step": 86590, "token_acc": 0.7662013958125623, "train_speed(iter/s)": 0.134684 }, { "epoch": 1.123632282195096, "grad_norm": 0.7329415082931519, "learning_rate": 4.392770424918034e-05, "loss": 0.7614447116851807, "memory(GiB)": 91.52, "step": 86595, "token_acc": 0.7657481071524656, "train_speed(iter/s)": 0.134683 }, { "epoch": 1.1236971605967516, "grad_norm": 0.7345479130744934, "learning_rate": 4.3922380234017975e-05, "loss": 0.7763398170471192, "memory(GiB)": 91.52, "step": 86600, "token_acc": 0.7802101576182137, "train_speed(iter/s)": 0.134682 }, { "epoch": 1.1237620389984073, "grad_norm": 0.6821203827857971, "learning_rate": 4.391705628879648e-05, "loss": 0.7980690956115722, "memory(GiB)": 91.52, "step": 86605, "token_acc": 0.7677707224987129, "train_speed(iter/s)": 0.134682 }, { "epoch": 1.123826917400063, "grad_norm": 0.6871881484985352, "learning_rate": 4.391173241357711e-05, "loss": 0.7788853168487548, "memory(GiB)": 91.52, "step": 86610, "token_acc": 0.7835712106315896, "train_speed(iter/s)": 0.134681 }, { "epoch": 1.1238917958017187, "grad_norm": 0.7299737930297852, "learning_rate": 4.3906408608421115e-05, "loss": 0.7699658393859863, "memory(GiB)": 91.52, "step": 86615, "token_acc": 0.7762545266425246, "train_speed(iter/s)": 0.13468 }, { "epoch": 1.1239566742033744, "grad_norm": 0.7072117924690247, "learning_rate": 4.390108487338979e-05, "loss": 0.741300106048584, "memory(GiB)": 91.52, "step": 86620, "token_acc": 0.79470393527032, "train_speed(iter/s)": 0.134679 }, { "epoch": 1.1240215526050301, "grad_norm": 0.7075163125991821, "learning_rate": 4.38957612085444e-05, "loss": 0.7639311790466309, "memory(GiB)": 91.52, "step": 86625, "token_acc": 0.7718805253851982, "train_speed(iter/s)": 0.134678 }, { "epoch": 1.1240864310066858, "grad_norm": 0.7444789409637451, "learning_rate": 4.389043761394618e-05, "loss": 0.7374309539794922, "memory(GiB)": 91.52, "step": 86630, "token_acc": 0.7875737046794631, "train_speed(iter/s)": 0.134677 }, { "epoch": 1.1241513094083415, "grad_norm": 0.6836791038513184, "learning_rate": 4.388511408965641e-05, "loss": 0.7344345569610595, "memory(GiB)": 91.52, "step": 86635, "token_acc": 0.7937141753075289, "train_speed(iter/s)": 0.134676 }, { "epoch": 1.1242161878099972, "grad_norm": 0.7591850161552429, "learning_rate": 4.3879790635736356e-05, "loss": 0.78006010055542, "memory(GiB)": 91.52, "step": 86640, "token_acc": 0.7601459854014598, "train_speed(iter/s)": 0.134675 }, { "epoch": 1.124281066211653, "grad_norm": 0.7362320423126221, "learning_rate": 4.3874467252247284e-05, "loss": 0.7179287910461426, "memory(GiB)": 91.52, "step": 86645, "token_acc": 0.803779366700715, "train_speed(iter/s)": 0.134675 }, { "epoch": 1.1243459446133084, "grad_norm": 0.589537501335144, "learning_rate": 4.3869143939250435e-05, "loss": 0.7501872539520263, "memory(GiB)": 91.52, "step": 86650, "token_acc": 0.777799769091209, "train_speed(iter/s)": 0.134673 }, { "epoch": 1.1244108230149643, "grad_norm": 0.6503085494041443, "learning_rate": 4.3863820696807093e-05, "loss": 0.7383856773376465, "memory(GiB)": 91.52, "step": 86655, "token_acc": 0.785431177127025, "train_speed(iter/s)": 0.134673 }, { "epoch": 1.1244757014166198, "grad_norm": 0.7593466639518738, "learning_rate": 4.385849752497852e-05, "loss": 0.7393617630004883, "memory(GiB)": 91.52, "step": 86660, "token_acc": 0.7810600012282749, "train_speed(iter/s)": 0.134672 }, { "epoch": 1.1245405798182757, "grad_norm": 0.6670956611633301, "learning_rate": 4.385317442382595e-05, "loss": 0.7431893348693848, "memory(GiB)": 91.52, "step": 86665, "token_acc": 0.7715791167243182, "train_speed(iter/s)": 0.13467 }, { "epoch": 1.1246054582199312, "grad_norm": 0.7646792531013489, "learning_rate": 4.384785139341064e-05, "loss": 0.752793550491333, "memory(GiB)": 91.52, "step": 86670, "token_acc": 0.7869813701477029, "train_speed(iter/s)": 0.13467 }, { "epoch": 1.124670336621587, "grad_norm": 0.7001494765281677, "learning_rate": 4.384252843379388e-05, "loss": 0.7475533008575439, "memory(GiB)": 91.52, "step": 86675, "token_acc": 0.7702555407533003, "train_speed(iter/s)": 0.134669 }, { "epoch": 1.1247352150232426, "grad_norm": 0.796863853931427, "learning_rate": 4.38372055450369e-05, "loss": 0.8007004737854004, "memory(GiB)": 91.52, "step": 86680, "token_acc": 0.7813895979578813, "train_speed(iter/s)": 0.134668 }, { "epoch": 1.1248000934248983, "grad_norm": 0.6881595849990845, "learning_rate": 4.3831882727200954e-05, "loss": 0.7682184696197509, "memory(GiB)": 91.52, "step": 86685, "token_acc": 0.7660275334845706, "train_speed(iter/s)": 0.134667 }, { "epoch": 1.124864971826554, "grad_norm": 0.6857233047485352, "learning_rate": 4.3826559980347306e-05, "loss": 0.7679032325744629, "memory(GiB)": 91.52, "step": 86690, "token_acc": 0.7812295525020625, "train_speed(iter/s)": 0.134666 }, { "epoch": 1.1249298502282097, "grad_norm": 0.7361639142036438, "learning_rate": 4.3821237304537206e-05, "loss": 0.7277215957641602, "memory(GiB)": 91.52, "step": 86695, "token_acc": 0.7882929679890917, "train_speed(iter/s)": 0.134665 }, { "epoch": 1.1249947286298654, "grad_norm": 0.7023575901985168, "learning_rate": 4.3815914699831936e-05, "loss": 0.7619400501251221, "memory(GiB)": 91.52, "step": 86700, "token_acc": 0.7837827948333272, "train_speed(iter/s)": 0.134664 }, { "epoch": 1.125059607031521, "grad_norm": 0.7977370023727417, "learning_rate": 4.3810592166292695e-05, "loss": 0.7845280647277832, "memory(GiB)": 91.52, "step": 86705, "token_acc": 0.7769722013523667, "train_speed(iter/s)": 0.134663 }, { "epoch": 1.1251244854331768, "grad_norm": 0.6736907362937927, "learning_rate": 4.380526970398077e-05, "loss": 0.7548062324523925, "memory(GiB)": 91.52, "step": 86710, "token_acc": 0.7863152498416814, "train_speed(iter/s)": 0.134662 }, { "epoch": 1.1251893638348325, "grad_norm": 0.6259605884552002, "learning_rate": 4.379994731295739e-05, "loss": 0.7174067497253418, "memory(GiB)": 91.52, "step": 86715, "token_acc": 0.7895910780669145, "train_speed(iter/s)": 0.134661 }, { "epoch": 1.1252542422364882, "grad_norm": 0.7852146029472351, "learning_rate": 4.379462499328384e-05, "loss": 0.7816527366638184, "memory(GiB)": 91.52, "step": 86720, "token_acc": 0.7787250601386727, "train_speed(iter/s)": 0.13466 }, { "epoch": 1.1253191206381439, "grad_norm": 0.7123223543167114, "learning_rate": 4.378930274502133e-05, "loss": 0.7321087837219238, "memory(GiB)": 91.52, "step": 86725, "token_acc": 0.8044370211684397, "train_speed(iter/s)": 0.134659 }, { "epoch": 1.1253839990397996, "grad_norm": 0.7630718350410461, "learning_rate": 4.378398056823112e-05, "loss": 0.7747389793395996, "memory(GiB)": 91.52, "step": 86730, "token_acc": 0.7831919367703575, "train_speed(iter/s)": 0.134658 }, { "epoch": 1.1254488774414553, "grad_norm": 0.7284091114997864, "learning_rate": 4.37786584629745e-05, "loss": 0.7188066482543946, "memory(GiB)": 91.52, "step": 86735, "token_acc": 0.7871535607768968, "train_speed(iter/s)": 0.134657 }, { "epoch": 1.125513755843111, "grad_norm": 0.7761090397834778, "learning_rate": 4.377333642931264e-05, "loss": 0.7445573329925537, "memory(GiB)": 91.52, "step": 86740, "token_acc": 0.8080131069407209, "train_speed(iter/s)": 0.134656 }, { "epoch": 1.1255786342447667, "grad_norm": 0.648007333278656, "learning_rate": 4.3768014467306834e-05, "loss": 0.7475944519042969, "memory(GiB)": 91.52, "step": 86745, "token_acc": 0.7884797938703553, "train_speed(iter/s)": 0.134655 }, { "epoch": 1.1256435126464224, "grad_norm": 0.7028777599334717, "learning_rate": 4.3762692577018303e-05, "loss": 0.7344903469085693, "memory(GiB)": 91.52, "step": 86750, "token_acc": 0.7937448709765661, "train_speed(iter/s)": 0.134655 }, { "epoch": 1.125708391048078, "grad_norm": 0.7332250475883484, "learning_rate": 4.375737075850832e-05, "loss": 0.7611721992492676, "memory(GiB)": 91.52, "step": 86755, "token_acc": 0.787090157366751, "train_speed(iter/s)": 0.134654 }, { "epoch": 1.1257732694497338, "grad_norm": 0.7351505756378174, "learning_rate": 4.375204901183811e-05, "loss": 0.7938915252685547, "memory(GiB)": 91.52, "step": 86760, "token_acc": 0.7695728072197885, "train_speed(iter/s)": 0.134653 }, { "epoch": 1.1258381478513895, "grad_norm": 0.7337234616279602, "learning_rate": 4.374672733706891e-05, "loss": 0.776795768737793, "memory(GiB)": 91.52, "step": 86765, "token_acc": 0.7800614564007421, "train_speed(iter/s)": 0.134652 }, { "epoch": 1.1259030262530452, "grad_norm": 0.7226522564888, "learning_rate": 4.3741405734262e-05, "loss": 0.7621268272399903, "memory(GiB)": 91.52, "step": 86770, "token_acc": 0.7681918008784773, "train_speed(iter/s)": 0.134652 }, { "epoch": 1.1259679046547009, "grad_norm": 0.6990911364555359, "learning_rate": 4.373608420347855e-05, "loss": 0.7422161102294922, "memory(GiB)": 91.52, "step": 86775, "token_acc": 0.7812786677016442, "train_speed(iter/s)": 0.134651 }, { "epoch": 1.1260327830563566, "grad_norm": 0.6885524392127991, "learning_rate": 4.373076274477986e-05, "loss": 0.7680548667907715, "memory(GiB)": 91.52, "step": 86780, "token_acc": 0.7676672104404567, "train_speed(iter/s)": 0.134651 }, { "epoch": 1.1260976614580123, "grad_norm": 0.6996941566467285, "learning_rate": 4.372544135822714e-05, "loss": 0.7042523384094238, "memory(GiB)": 91.52, "step": 86785, "token_acc": 0.8018463327760413, "train_speed(iter/s)": 0.13465 }, { "epoch": 1.126162539859668, "grad_norm": 0.7486891746520996, "learning_rate": 4.372012004388164e-05, "loss": 0.7704473495483398, "memory(GiB)": 91.52, "step": 86790, "token_acc": 0.7835633431539774, "train_speed(iter/s)": 0.134649 }, { "epoch": 1.1262274182613237, "grad_norm": 0.6789999008178711, "learning_rate": 4.37147988018046e-05, "loss": 0.7490562438964844, "memory(GiB)": 91.52, "step": 86795, "token_acc": 0.7880166975298954, "train_speed(iter/s)": 0.134648 }, { "epoch": 1.1262922966629794, "grad_norm": 0.7922138571739197, "learning_rate": 4.3709477632057235e-05, "loss": 0.7689557075500488, "memory(GiB)": 91.52, "step": 86800, "token_acc": 0.7926267281105991, "train_speed(iter/s)": 0.134647 }, { "epoch": 1.126357175064635, "grad_norm": 0.7301903367042542, "learning_rate": 4.370415653470083e-05, "loss": 0.7250192642211915, "memory(GiB)": 91.52, "step": 86805, "token_acc": 0.7782668605986374, "train_speed(iter/s)": 0.134646 }, { "epoch": 1.1264220534662908, "grad_norm": 0.7470558881759644, "learning_rate": 4.3698835509796556e-05, "loss": 0.7364098072052002, "memory(GiB)": 91.52, "step": 86810, "token_acc": 0.7895966497685696, "train_speed(iter/s)": 0.134646 }, { "epoch": 1.1264869318679465, "grad_norm": 0.6743496060371399, "learning_rate": 4.3693514557405694e-05, "loss": 0.7779495239257812, "memory(GiB)": 91.52, "step": 86815, "token_acc": 0.7756631040727195, "train_speed(iter/s)": 0.134645 }, { "epoch": 1.1265518102696022, "grad_norm": 0.6884680390357971, "learning_rate": 4.368819367758944e-05, "loss": 0.7395977973937988, "memory(GiB)": 91.52, "step": 86820, "token_acc": 0.7761991841327894, "train_speed(iter/s)": 0.134644 }, { "epoch": 1.1266166886712579, "grad_norm": 0.7208070158958435, "learning_rate": 4.3682872870409056e-05, "loss": 0.7605062484741211, "memory(GiB)": 91.52, "step": 86825, "token_acc": 0.7941628813619943, "train_speed(iter/s)": 0.134643 }, { "epoch": 1.1266815670729136, "grad_norm": 0.7195098996162415, "learning_rate": 4.367755213592577e-05, "loss": 0.7814175128936768, "memory(GiB)": 91.52, "step": 86830, "token_acc": 0.7709752806030085, "train_speed(iter/s)": 0.134642 }, { "epoch": 1.1267464454745693, "grad_norm": 0.8352765440940857, "learning_rate": 4.36722314742008e-05, "loss": 0.77461838722229, "memory(GiB)": 91.52, "step": 86835, "token_acc": 0.7813232913817594, "train_speed(iter/s)": 0.134642 }, { "epoch": 1.126811323876225, "grad_norm": 0.6840143203735352, "learning_rate": 4.366691088529541e-05, "loss": 0.773501205444336, "memory(GiB)": 91.52, "step": 86840, "token_acc": 0.7968267785075571, "train_speed(iter/s)": 0.134641 }, { "epoch": 1.1268762022778807, "grad_norm": 0.7138380408287048, "learning_rate": 4.366159036927077e-05, "loss": 0.7611994743347168, "memory(GiB)": 91.52, "step": 86845, "token_acc": 0.7790525170606271, "train_speed(iter/s)": 0.13464 }, { "epoch": 1.1269410806795364, "grad_norm": 0.6550845503807068, "learning_rate": 4.3656269926188155e-05, "loss": 0.7134544372558593, "memory(GiB)": 91.52, "step": 86850, "token_acc": 0.7930155577623303, "train_speed(iter/s)": 0.134639 }, { "epoch": 1.127005959081192, "grad_norm": 0.7946265339851379, "learning_rate": 4.365094955610876e-05, "loss": 0.7727495670318604, "memory(GiB)": 91.52, "step": 86855, "token_acc": 0.7792447640378797, "train_speed(iter/s)": 0.134638 }, { "epoch": 1.1270708374828478, "grad_norm": 0.7107258439064026, "learning_rate": 4.364562925909385e-05, "loss": 0.8050654411315918, "memory(GiB)": 91.52, "step": 86860, "token_acc": 0.7804109364767517, "train_speed(iter/s)": 0.134637 }, { "epoch": 1.1271357158845035, "grad_norm": 0.6811023354530334, "learning_rate": 4.364030903520462e-05, "loss": 0.7248244762420655, "memory(GiB)": 91.52, "step": 86865, "token_acc": 0.785298481908458, "train_speed(iter/s)": 0.134636 }, { "epoch": 1.1272005942861592, "grad_norm": 0.6617447137832642, "learning_rate": 4.3634988884502295e-05, "loss": 0.7588626384735108, "memory(GiB)": 91.52, "step": 86870, "token_acc": 0.7916407292129836, "train_speed(iter/s)": 0.134635 }, { "epoch": 1.1272654726878149, "grad_norm": 0.789177417755127, "learning_rate": 4.362966880704812e-05, "loss": 0.7836488723754883, "memory(GiB)": 91.52, "step": 86875, "token_acc": 0.7753970698226935, "train_speed(iter/s)": 0.134635 }, { "epoch": 1.1273303510894706, "grad_norm": 0.7153753638267517, "learning_rate": 4.3624348802903316e-05, "loss": 0.7057705879211426, "memory(GiB)": 91.52, "step": 86880, "token_acc": 0.7994160583941606, "train_speed(iter/s)": 0.134633 }, { "epoch": 1.1273952294911262, "grad_norm": 0.7772853374481201, "learning_rate": 4.3619028872129084e-05, "loss": 0.7790033340454101, "memory(GiB)": 91.52, "step": 86885, "token_acc": 0.7646942410449238, "train_speed(iter/s)": 0.134633 }, { "epoch": 1.127460107892782, "grad_norm": 0.7883388996124268, "learning_rate": 4.361370901478665e-05, "loss": 0.742979907989502, "memory(GiB)": 91.52, "step": 86890, "token_acc": 0.7781135472256157, "train_speed(iter/s)": 0.134632 }, { "epoch": 1.1275249862944376, "grad_norm": 0.7100921273231506, "learning_rate": 4.360838923093725e-05, "loss": 0.7438711166381836, "memory(GiB)": 91.52, "step": 86895, "token_acc": 0.7740753719218885, "train_speed(iter/s)": 0.134631 }, { "epoch": 1.1275898646960933, "grad_norm": 0.6926799416542053, "learning_rate": 4.36030695206421e-05, "loss": 0.6781670093536377, "memory(GiB)": 91.52, "step": 86900, "token_acc": 0.8119011248437717, "train_speed(iter/s)": 0.13463 }, { "epoch": 1.127654743097749, "grad_norm": 0.7408965826034546, "learning_rate": 4.35977498839624e-05, "loss": 0.7609977722167969, "memory(GiB)": 91.52, "step": 86905, "token_acc": 0.7823565421915778, "train_speed(iter/s)": 0.134629 }, { "epoch": 1.1277196214994047, "grad_norm": 0.7296462059020996, "learning_rate": 4.359243032095939e-05, "loss": 0.771239948272705, "memory(GiB)": 91.52, "step": 86910, "token_acc": 0.7715942199593867, "train_speed(iter/s)": 0.134628 }, { "epoch": 1.1277844999010604, "grad_norm": 0.8013674020767212, "learning_rate": 4.35871108316943e-05, "loss": 0.7453596115112304, "memory(GiB)": 91.52, "step": 86915, "token_acc": 0.801008100810081, "train_speed(iter/s)": 0.134627 }, { "epoch": 1.1278493783027161, "grad_norm": 0.7268789410591125, "learning_rate": 4.358179141622831e-05, "loss": 0.7679281234741211, "memory(GiB)": 91.52, "step": 86920, "token_acc": 0.7764230403477789, "train_speed(iter/s)": 0.134627 }, { "epoch": 1.1279142567043718, "grad_norm": 0.7906427383422852, "learning_rate": 4.357647207462263e-05, "loss": 0.7478073120117188, "memory(GiB)": 91.52, "step": 86925, "token_acc": 0.7959371507461706, "train_speed(iter/s)": 0.134626 }, { "epoch": 1.1279791351060275, "grad_norm": 0.7056055068969727, "learning_rate": 4.357115280693852e-05, "loss": 0.7070889949798584, "memory(GiB)": 91.52, "step": 86930, "token_acc": 0.7857191640486272, "train_speed(iter/s)": 0.134625 }, { "epoch": 1.1280440135076832, "grad_norm": 0.6579484939575195, "learning_rate": 4.356583361323717e-05, "loss": 0.7061470508575439, "memory(GiB)": 91.52, "step": 86935, "token_acc": 0.798399131732465, "train_speed(iter/s)": 0.134624 }, { "epoch": 1.128108891909339, "grad_norm": 0.7184232473373413, "learning_rate": 4.3560514493579775e-05, "loss": 0.7609172344207764, "memory(GiB)": 91.52, "step": 86940, "token_acc": 0.7875028197608843, "train_speed(iter/s)": 0.134623 }, { "epoch": 1.1281737703109946, "grad_norm": 0.6906607151031494, "learning_rate": 4.355519544802757e-05, "loss": 0.7335347652435302, "memory(GiB)": 91.52, "step": 86945, "token_acc": 0.7663016247942166, "train_speed(iter/s)": 0.134622 }, { "epoch": 1.1282386487126503, "grad_norm": 0.7181830406188965, "learning_rate": 4.354987647664178e-05, "loss": 0.7469398021697998, "memory(GiB)": 91.52, "step": 86950, "token_acc": 0.7957362604540024, "train_speed(iter/s)": 0.134622 }, { "epoch": 1.128303527114306, "grad_norm": 0.6883105039596558, "learning_rate": 4.354455757948359e-05, "loss": 0.7211768150329589, "memory(GiB)": 91.52, "step": 86955, "token_acc": 0.7935733849956577, "train_speed(iter/s)": 0.134621 }, { "epoch": 1.1283684055159617, "grad_norm": 0.7287868857383728, "learning_rate": 4.3539238756614195e-05, "loss": 0.7141324996948242, "memory(GiB)": 91.52, "step": 86960, "token_acc": 0.783825692815395, "train_speed(iter/s)": 0.134621 }, { "epoch": 1.1284332839176174, "grad_norm": 0.6711397767066956, "learning_rate": 4.3533920008094845e-05, "loss": 0.8014020919799805, "memory(GiB)": 91.52, "step": 86965, "token_acc": 0.7684218606079214, "train_speed(iter/s)": 0.13462 }, { "epoch": 1.1284981623192731, "grad_norm": 0.7958894371986389, "learning_rate": 4.352860133398672e-05, "loss": 0.8121623992919922, "memory(GiB)": 91.52, "step": 86970, "token_acc": 0.7648105386682817, "train_speed(iter/s)": 0.134619 }, { "epoch": 1.1285630407209288, "grad_norm": 0.7121984362602234, "learning_rate": 4.352328273435103e-05, "loss": 0.7667052745819092, "memory(GiB)": 91.52, "step": 86975, "token_acc": 0.7678128056579641, "train_speed(iter/s)": 0.134619 }, { "epoch": 1.1286279191225845, "grad_norm": 0.6872950196266174, "learning_rate": 4.3517964209248994e-05, "loss": 0.746387243270874, "memory(GiB)": 91.52, "step": 86980, "token_acc": 0.7806546499298833, "train_speed(iter/s)": 0.134618 }, { "epoch": 1.1286927975242402, "grad_norm": 0.7467210292816162, "learning_rate": 4.351264575874182e-05, "loss": 0.7766156196594238, "memory(GiB)": 91.52, "step": 86985, "token_acc": 0.804869957537155, "train_speed(iter/s)": 0.134617 }, { "epoch": 1.128757675925896, "grad_norm": 0.653863787651062, "learning_rate": 4.350732738289068e-05, "loss": 0.7628990173339844, "memory(GiB)": 91.52, "step": 86990, "token_acc": 0.780674675493733, "train_speed(iter/s)": 0.134616 }, { "epoch": 1.1288225543275516, "grad_norm": 0.7743024230003357, "learning_rate": 4.35020090817568e-05, "loss": 0.7692721843719482, "memory(GiB)": 91.52, "step": 86995, "token_acc": 0.7837892254149049, "train_speed(iter/s)": 0.134615 }, { "epoch": 1.1288874327292073, "grad_norm": 0.7235759496688843, "learning_rate": 4.349669085540139e-05, "loss": 0.7858908176422119, "memory(GiB)": 91.52, "step": 87000, "token_acc": 0.7855668730444045, "train_speed(iter/s)": 0.134614 }, { "epoch": 1.128952311130863, "grad_norm": 0.6991355419158936, "learning_rate": 4.3491372703885645e-05, "loss": 0.7699632167816162, "memory(GiB)": 91.52, "step": 87005, "token_acc": 0.7570841598186455, "train_speed(iter/s)": 0.134614 }, { "epoch": 1.1290171895325187, "grad_norm": 0.7096983194351196, "learning_rate": 4.348605462727074e-05, "loss": 0.783712100982666, "memory(GiB)": 91.52, "step": 87010, "token_acc": 0.7878768233387358, "train_speed(iter/s)": 0.134613 }, { "epoch": 1.1290820679341744, "grad_norm": 0.6646832227706909, "learning_rate": 4.348073662561793e-05, "loss": 0.7385207176208496, "memory(GiB)": 91.52, "step": 87015, "token_acc": 0.7955468155290535, "train_speed(iter/s)": 0.134611 }, { "epoch": 1.1291469463358301, "grad_norm": 0.717356264591217, "learning_rate": 4.3475418698988376e-05, "loss": 0.7626935005187988, "memory(GiB)": 91.52, "step": 87020, "token_acc": 0.7990451255197906, "train_speed(iter/s)": 0.13461 }, { "epoch": 1.1292118247374858, "grad_norm": 0.6960331797599792, "learning_rate": 4.3470100847443286e-05, "loss": 0.7611184120178223, "memory(GiB)": 91.52, "step": 87025, "token_acc": 0.7895249160046472, "train_speed(iter/s)": 0.134609 }, { "epoch": 1.1292767031391415, "grad_norm": 0.8237718343734741, "learning_rate": 4.346478307104384e-05, "loss": 0.7827141761779786, "memory(GiB)": 91.52, "step": 87030, "token_acc": 0.7891814370817092, "train_speed(iter/s)": 0.134609 }, { "epoch": 1.1293415815407972, "grad_norm": 0.7667146325111389, "learning_rate": 4.345946536985126e-05, "loss": 0.762470293045044, "memory(GiB)": 91.52, "step": 87035, "token_acc": 0.8047208138871168, "train_speed(iter/s)": 0.134608 }, { "epoch": 1.129406459942453, "grad_norm": 0.7074242830276489, "learning_rate": 4.345414774392673e-05, "loss": 0.8003190994262696, "memory(GiB)": 91.52, "step": 87040, "token_acc": 0.7756431129925105, "train_speed(iter/s)": 0.134607 }, { "epoch": 1.1294713383441086, "grad_norm": 0.7686281204223633, "learning_rate": 4.344883019333143e-05, "loss": 0.7551663398742676, "memory(GiB)": 91.52, "step": 87045, "token_acc": 0.7916260602779281, "train_speed(iter/s)": 0.134607 }, { "epoch": 1.1295362167457643, "grad_norm": 0.6816063523292542, "learning_rate": 4.344351271812659e-05, "loss": 0.7652447700500489, "memory(GiB)": 91.52, "step": 87050, "token_acc": 0.7816174419068117, "train_speed(iter/s)": 0.134606 }, { "epoch": 1.12960109514742, "grad_norm": 0.6833862066268921, "learning_rate": 4.343819531837337e-05, "loss": 0.7550677299499512, "memory(GiB)": 91.52, "step": 87055, "token_acc": 0.7809883132419251, "train_speed(iter/s)": 0.134605 }, { "epoch": 1.1296659735490757, "grad_norm": 0.7975884079933167, "learning_rate": 4.343287799413301e-05, "loss": 0.7668362617492676, "memory(GiB)": 91.52, "step": 87060, "token_acc": 0.7629589240276263, "train_speed(iter/s)": 0.134605 }, { "epoch": 1.1297308519507314, "grad_norm": 0.7260253429412842, "learning_rate": 4.3427560745466644e-05, "loss": 0.7340628623962402, "memory(GiB)": 91.52, "step": 87065, "token_acc": 0.7787701599948597, "train_speed(iter/s)": 0.134604 }, { "epoch": 1.1297957303523871, "grad_norm": 0.7369164824485779, "learning_rate": 4.342224357243548e-05, "loss": 0.6982908248901367, "memory(GiB)": 91.52, "step": 87070, "token_acc": 0.8037322376305427, "train_speed(iter/s)": 0.134603 }, { "epoch": 1.1298606087540428, "grad_norm": 0.7319468855857849, "learning_rate": 4.3416926475100725e-05, "loss": 0.7523652076721191, "memory(GiB)": 91.52, "step": 87075, "token_acc": 0.7864849313097892, "train_speed(iter/s)": 0.134603 }, { "epoch": 1.1299254871556985, "grad_norm": 0.7535687685012817, "learning_rate": 4.3411609453523545e-05, "loss": 0.7572065353393554, "memory(GiB)": 91.52, "step": 87080, "token_acc": 0.7779465883385664, "train_speed(iter/s)": 0.134602 }, { "epoch": 1.1299903655573542, "grad_norm": 0.8196312785148621, "learning_rate": 4.3406292507765145e-05, "loss": 0.7749744415283203, "memory(GiB)": 91.52, "step": 87085, "token_acc": 0.7957677509618748, "train_speed(iter/s)": 0.134601 }, { "epoch": 1.13005524395901, "grad_norm": 0.7774093747138977, "learning_rate": 4.34009756378867e-05, "loss": 0.7273169040679932, "memory(GiB)": 91.52, "step": 87090, "token_acc": 0.790426957687153, "train_speed(iter/s)": 0.1346 }, { "epoch": 1.1301201223606656, "grad_norm": 0.746045708656311, "learning_rate": 4.339565884394943e-05, "loss": 0.765077543258667, "memory(GiB)": 91.52, "step": 87095, "token_acc": 0.7908324552160169, "train_speed(iter/s)": 0.134599 }, { "epoch": 1.1301850007623213, "grad_norm": 0.752531886100769, "learning_rate": 4.339034212601446e-05, "loss": 0.7813259601593018, "memory(GiB)": 91.52, "step": 87100, "token_acc": 0.7675636694879873, "train_speed(iter/s)": 0.134598 }, { "epoch": 1.130249879163977, "grad_norm": 0.6149684190750122, "learning_rate": 4.338502548414302e-05, "loss": 0.725556468963623, "memory(GiB)": 91.52, "step": 87105, "token_acc": 0.7869753402813372, "train_speed(iter/s)": 0.134597 }, { "epoch": 1.1303147575656327, "grad_norm": 0.666293740272522, "learning_rate": 4.337970891839628e-05, "loss": 0.7737002372741699, "memory(GiB)": 91.52, "step": 87110, "token_acc": 0.7791199309749784, "train_speed(iter/s)": 0.134596 }, { "epoch": 1.1303796359672882, "grad_norm": 0.6983011364936829, "learning_rate": 4.337439242883542e-05, "loss": 0.7828380584716796, "memory(GiB)": 91.52, "step": 87115, "token_acc": 0.7788551666181842, "train_speed(iter/s)": 0.134596 }, { "epoch": 1.130444514368944, "grad_norm": 0.7120607495307922, "learning_rate": 4.3369076015521634e-05, "loss": 0.7655254364013672, "memory(GiB)": 91.52, "step": 87120, "token_acc": 0.7889623734259986, "train_speed(iter/s)": 0.134595 }, { "epoch": 1.1305093927705996, "grad_norm": 0.7042768597602844, "learning_rate": 4.336375967851608e-05, "loss": 0.7361330986022949, "memory(GiB)": 91.52, "step": 87125, "token_acc": 0.7873608126587224, "train_speed(iter/s)": 0.134594 }, { "epoch": 1.1305742711722555, "grad_norm": 0.7257538437843323, "learning_rate": 4.3358443417879983e-05, "loss": 0.7128757476806641, "memory(GiB)": 91.52, "step": 87130, "token_acc": 0.7951219512195122, "train_speed(iter/s)": 0.134593 }, { "epoch": 1.130639149573911, "grad_norm": 0.7252004146575928, "learning_rate": 4.335312723367446e-05, "loss": 0.7745030403137207, "memory(GiB)": 91.52, "step": 87135, "token_acc": 0.7692898913951546, "train_speed(iter/s)": 0.134592 }, { "epoch": 1.130704027975567, "grad_norm": 0.6801677346229553, "learning_rate": 4.3347811125960733e-05, "loss": 0.7601472854614257, "memory(GiB)": 91.52, "step": 87140, "token_acc": 0.7850583543701752, "train_speed(iter/s)": 0.134591 }, { "epoch": 1.1307689063772224, "grad_norm": 0.7129473686218262, "learning_rate": 4.334249509479996e-05, "loss": 0.7376704216003418, "memory(GiB)": 91.52, "step": 87145, "token_acc": 0.7823100644799551, "train_speed(iter/s)": 0.134591 }, { "epoch": 1.1308337847788783, "grad_norm": 0.752585232257843, "learning_rate": 4.333717914025334e-05, "loss": 0.7677063941955566, "memory(GiB)": 91.52, "step": 87150, "token_acc": 0.7916038539686702, "train_speed(iter/s)": 0.134591 }, { "epoch": 1.1308986631805338, "grad_norm": 0.6705551147460938, "learning_rate": 4.3331863262382025e-05, "loss": 0.7397802352905274, "memory(GiB)": 91.52, "step": 87155, "token_acc": 0.7946970876810465, "train_speed(iter/s)": 0.134589 }, { "epoch": 1.1309635415821897, "grad_norm": 0.7473207712173462, "learning_rate": 4.332654746124719e-05, "loss": 0.7348739624023437, "memory(GiB)": 91.52, "step": 87160, "token_acc": 0.8105202809230329, "train_speed(iter/s)": 0.134588 }, { "epoch": 1.1310284199838452, "grad_norm": 0.602871298789978, "learning_rate": 4.332123173691005e-05, "loss": 0.7422433853149414, "memory(GiB)": 91.52, "step": 87165, "token_acc": 0.7761609562344556, "train_speed(iter/s)": 0.134587 }, { "epoch": 1.1310932983855009, "grad_norm": 0.7954002022743225, "learning_rate": 4.331591608943171e-05, "loss": 0.7509127140045166, "memory(GiB)": 91.52, "step": 87170, "token_acc": 0.7926751592356688, "train_speed(iter/s)": 0.134587 }, { "epoch": 1.1311581767871566, "grad_norm": 0.7169038653373718, "learning_rate": 4.331060051887339e-05, "loss": 0.7624519348144532, "memory(GiB)": 91.52, "step": 87175, "token_acc": 0.771698728399087, "train_speed(iter/s)": 0.134586 }, { "epoch": 1.1312230551888123, "grad_norm": 0.7468252778053284, "learning_rate": 4.330528502529624e-05, "loss": 0.7706159114837646, "memory(GiB)": 91.52, "step": 87180, "token_acc": 0.7820396219700265, "train_speed(iter/s)": 0.134585 }, { "epoch": 1.131287933590468, "grad_norm": 0.7135940194129944, "learning_rate": 4.3299969608761445e-05, "loss": 0.750706958770752, "memory(GiB)": 91.52, "step": 87185, "token_acc": 0.7794860919662108, "train_speed(iter/s)": 0.134585 }, { "epoch": 1.1313528119921237, "grad_norm": 0.6401788592338562, "learning_rate": 4.329465426933018e-05, "loss": 0.7319415092468262, "memory(GiB)": 91.52, "step": 87190, "token_acc": 0.7928171606747061, "train_speed(iter/s)": 0.134584 }, { "epoch": 1.1314176903937794, "grad_norm": 0.7956870794296265, "learning_rate": 4.3289339007063576e-05, "loss": 0.7859731197357178, "memory(GiB)": 91.52, "step": 87195, "token_acc": 0.7872539432847243, "train_speed(iter/s)": 0.134583 }, { "epoch": 1.131482568795435, "grad_norm": 0.7174650430679321, "learning_rate": 4.328402382202287e-05, "loss": 0.7476914405822754, "memory(GiB)": 91.52, "step": 87200, "token_acc": 0.7997280429552666, "train_speed(iter/s)": 0.134582 }, { "epoch": 1.1315474471970908, "grad_norm": 0.6461482644081116, "learning_rate": 4.327870871426915e-05, "loss": 0.7383297920227051, "memory(GiB)": 91.52, "step": 87205, "token_acc": 0.7974839480364342, "train_speed(iter/s)": 0.134581 }, { "epoch": 1.1316123255987465, "grad_norm": 0.7195336818695068, "learning_rate": 4.327339368386363e-05, "loss": 0.7564145565032959, "memory(GiB)": 91.52, "step": 87210, "token_acc": 0.7813104189044039, "train_speed(iter/s)": 0.13458 }, { "epoch": 1.1316772040004022, "grad_norm": 0.7418314218521118, "learning_rate": 4.326807873086745e-05, "loss": 0.7567291259765625, "memory(GiB)": 91.52, "step": 87215, "token_acc": 0.7844027373128261, "train_speed(iter/s)": 0.134579 }, { "epoch": 1.1317420824020579, "grad_norm": 0.6484593152999878, "learning_rate": 4.326276385534179e-05, "loss": 0.7247949600219726, "memory(GiB)": 91.52, "step": 87220, "token_acc": 0.7847772711329358, "train_speed(iter/s)": 0.134578 }, { "epoch": 1.1318069608037136, "grad_norm": 0.6146893501281738, "learning_rate": 4.325744905734782e-05, "loss": 0.7265047550201416, "memory(GiB)": 91.52, "step": 87225, "token_acc": 0.7831238779174147, "train_speed(iter/s)": 0.134577 }, { "epoch": 1.1318718392053693, "grad_norm": 0.7595562934875488, "learning_rate": 4.325213433694668e-05, "loss": 0.7347110271453857, "memory(GiB)": 91.52, "step": 87230, "token_acc": 0.7781909742397831, "train_speed(iter/s)": 0.134576 }, { "epoch": 1.131936717607025, "grad_norm": 0.8022733330726624, "learning_rate": 4.324681969419955e-05, "loss": 0.7316436290740966, "memory(GiB)": 91.52, "step": 87235, "token_acc": 0.7842320619611282, "train_speed(iter/s)": 0.134576 }, { "epoch": 1.1320015960086807, "grad_norm": 0.7243444323539734, "learning_rate": 4.32415051291676e-05, "loss": 0.7459570884704589, "memory(GiB)": 91.52, "step": 87240, "token_acc": 0.7803174603174603, "train_speed(iter/s)": 0.134575 }, { "epoch": 1.1320664744103364, "grad_norm": 0.6543285250663757, "learning_rate": 4.323619064191197e-05, "loss": 0.763487434387207, "memory(GiB)": 91.52, "step": 87245, "token_acc": 0.7757776883883079, "train_speed(iter/s)": 0.134575 }, { "epoch": 1.132131352811992, "grad_norm": 0.7692190408706665, "learning_rate": 4.32308762324938e-05, "loss": 0.7630887031555176, "memory(GiB)": 91.52, "step": 87250, "token_acc": 0.7677457795431977, "train_speed(iter/s)": 0.134574 }, { "epoch": 1.1321962312136478, "grad_norm": 0.7225836515426636, "learning_rate": 4.3225561900974284e-05, "loss": 0.7792351245880127, "memory(GiB)": 91.52, "step": 87255, "token_acc": 0.7733801611959116, "train_speed(iter/s)": 0.134573 }, { "epoch": 1.1322611096153035, "grad_norm": 0.747249186038971, "learning_rate": 4.322024764741456e-05, "loss": 0.7829721450805665, "memory(GiB)": 91.52, "step": 87260, "token_acc": 0.7815245429735691, "train_speed(iter/s)": 0.134573 }, { "epoch": 1.1323259880169592, "grad_norm": 0.6940569877624512, "learning_rate": 4.321493347187579e-05, "loss": 0.7628179550170898, "memory(GiB)": 91.52, "step": 87265, "token_acc": 0.7878002349224351, "train_speed(iter/s)": 0.134572 }, { "epoch": 1.1323908664186149, "grad_norm": 0.7786260843276978, "learning_rate": 4.3209619374419136e-05, "loss": 0.7913149356842041, "memory(GiB)": 91.52, "step": 87270, "token_acc": 0.762933727071384, "train_speed(iter/s)": 0.134572 }, { "epoch": 1.1324557448202706, "grad_norm": 0.6662027835845947, "learning_rate": 4.320430535510576e-05, "loss": 0.7455721855163574, "memory(GiB)": 91.52, "step": 87275, "token_acc": 0.781610337972167, "train_speed(iter/s)": 0.134571 }, { "epoch": 1.1325206232219263, "grad_norm": 0.7499805092811584, "learning_rate": 4.319899141399678e-05, "loss": 0.728356122970581, "memory(GiB)": 91.52, "step": 87280, "token_acc": 0.7697765700483091, "train_speed(iter/s)": 0.13457 }, { "epoch": 1.132585501623582, "grad_norm": 0.7887160181999207, "learning_rate": 4.3193677551153365e-05, "loss": 0.7499669075012207, "memory(GiB)": 91.52, "step": 87285, "token_acc": 0.7897946527012127, "train_speed(iter/s)": 0.134569 }, { "epoch": 1.1326503800252377, "grad_norm": 0.8114398717880249, "learning_rate": 4.318836376663668e-05, "loss": 0.791547155380249, "memory(GiB)": 91.52, "step": 87290, "token_acc": 0.7856400702232095, "train_speed(iter/s)": 0.134568 }, { "epoch": 1.1327152584268934, "grad_norm": 0.7022684812545776, "learning_rate": 4.3183050060507866e-05, "loss": 0.7458109855651855, "memory(GiB)": 91.52, "step": 87295, "token_acc": 0.7846364883401921, "train_speed(iter/s)": 0.134567 }, { "epoch": 1.132780136828549, "grad_norm": 0.7358949780464172, "learning_rate": 4.3177736432828066e-05, "loss": 0.7546965599060058, "memory(GiB)": 91.52, "step": 87300, "token_acc": 0.7921291624621595, "train_speed(iter/s)": 0.134566 }, { "epoch": 1.1328450152302048, "grad_norm": 0.7845853567123413, "learning_rate": 4.3172422883658435e-05, "loss": 0.7622783660888672, "memory(GiB)": 91.52, "step": 87305, "token_acc": 0.779187314172448, "train_speed(iter/s)": 0.134566 }, { "epoch": 1.1329098936318605, "grad_norm": 0.8062589764595032, "learning_rate": 4.316710941306015e-05, "loss": 0.7932828426361084, "memory(GiB)": 91.52, "step": 87310, "token_acc": 0.7717960056010023, "train_speed(iter/s)": 0.134565 }, { "epoch": 1.1329747720335162, "grad_norm": 0.6554123759269714, "learning_rate": 4.316179602109431e-05, "loss": 0.7413952350616455, "memory(GiB)": 91.52, "step": 87315, "token_acc": 0.7907578820441891, "train_speed(iter/s)": 0.134563 }, { "epoch": 1.1330396504351719, "grad_norm": 0.8000393509864807, "learning_rate": 4.315648270782206e-05, "loss": 0.7812262535095215, "memory(GiB)": 91.52, "step": 87320, "token_acc": 0.7747555686245681, "train_speed(iter/s)": 0.134563 }, { "epoch": 1.1331045288368276, "grad_norm": 0.6800060272216797, "learning_rate": 4.31511694733046e-05, "loss": 0.715147590637207, "memory(GiB)": 91.52, "step": 87325, "token_acc": 0.7892069503230118, "train_speed(iter/s)": 0.134562 }, { "epoch": 1.1331694072384833, "grad_norm": 0.7007843255996704, "learning_rate": 4.314585631760303e-05, "loss": 0.755375862121582, "memory(GiB)": 91.52, "step": 87330, "token_acc": 0.7723763048983878, "train_speed(iter/s)": 0.13456 }, { "epoch": 1.133234285640139, "grad_norm": 0.6898946166038513, "learning_rate": 4.314054324077849e-05, "loss": 0.7687418460845947, "memory(GiB)": 91.52, "step": 87335, "token_acc": 0.7767637540453074, "train_speed(iter/s)": 0.13456 }, { "epoch": 1.1332991640417946, "grad_norm": 0.7145248055458069, "learning_rate": 4.313523024289215e-05, "loss": 0.7568900108337402, "memory(GiB)": 91.52, "step": 87340, "token_acc": 0.7855555884705394, "train_speed(iter/s)": 0.134559 }, { "epoch": 1.1333640424434503, "grad_norm": 0.6598849892616272, "learning_rate": 4.3129917324005154e-05, "loss": 0.7826020240783691, "memory(GiB)": 91.52, "step": 87345, "token_acc": 0.7800112593966289, "train_speed(iter/s)": 0.134558 }, { "epoch": 1.133428920845106, "grad_norm": 0.7718244194984436, "learning_rate": 4.312460448417861e-05, "loss": 0.7754100799560547, "memory(GiB)": 91.52, "step": 87350, "token_acc": 0.7760597693681988, "train_speed(iter/s)": 0.134557 }, { "epoch": 1.1334937992467617, "grad_norm": 0.7271690964698792, "learning_rate": 4.3119291723473664e-05, "loss": 0.772395133972168, "memory(GiB)": 91.52, "step": 87355, "token_acc": 0.7865057170165316, "train_speed(iter/s)": 0.134556 }, { "epoch": 1.1335586776484174, "grad_norm": 0.6911306977272034, "learning_rate": 4.311397904195148e-05, "loss": 0.7528334617614746, "memory(GiB)": 91.52, "step": 87360, "token_acc": 0.8004096495220756, "train_speed(iter/s)": 0.134556 }, { "epoch": 1.1336235560500731, "grad_norm": 0.7147826552391052, "learning_rate": 4.310866643967318e-05, "loss": 0.7786452293395996, "memory(GiB)": 91.52, "step": 87365, "token_acc": 0.7678668239946423, "train_speed(iter/s)": 0.134555 }, { "epoch": 1.1336884344517288, "grad_norm": 0.7477113604545593, "learning_rate": 4.31033539166999e-05, "loss": 0.7622835159301757, "memory(GiB)": 91.52, "step": 87370, "token_acc": 0.7818387486835433, "train_speed(iter/s)": 0.134555 }, { "epoch": 1.1337533128533845, "grad_norm": 0.6809335350990295, "learning_rate": 4.309804147309278e-05, "loss": 0.7514758110046387, "memory(GiB)": 91.52, "step": 87375, "token_acc": 0.782154760470626, "train_speed(iter/s)": 0.134554 }, { "epoch": 1.1338181912550402, "grad_norm": 0.7099711298942566, "learning_rate": 4.3092729108912975e-05, "loss": 0.7455883979797363, "memory(GiB)": 91.52, "step": 87380, "token_acc": 0.7813499048097992, "train_speed(iter/s)": 0.134553 }, { "epoch": 1.133883069656696, "grad_norm": 0.6994419097900391, "learning_rate": 4.308741682422158e-05, "loss": 0.757593822479248, "memory(GiB)": 91.52, "step": 87385, "token_acc": 0.7799562804775517, "train_speed(iter/s)": 0.134553 }, { "epoch": 1.1339479480583516, "grad_norm": 0.7990574240684509, "learning_rate": 4.308210461907974e-05, "loss": 0.7446315288543701, "memory(GiB)": 91.52, "step": 87390, "token_acc": 0.7912881608339538, "train_speed(iter/s)": 0.134552 }, { "epoch": 1.1340128264600073, "grad_norm": 0.7784428000450134, "learning_rate": 4.30767924935486e-05, "loss": 0.7433617591857911, "memory(GiB)": 91.52, "step": 87395, "token_acc": 0.7845471713605386, "train_speed(iter/s)": 0.134551 }, { "epoch": 1.134077704861663, "grad_norm": 0.6459296941757202, "learning_rate": 4.307148044768929e-05, "loss": 0.7227924346923829, "memory(GiB)": 91.52, "step": 87400, "token_acc": 0.7721650221650221, "train_speed(iter/s)": 0.134551 }, { "epoch": 1.1341425832633187, "grad_norm": 0.6518885493278503, "learning_rate": 4.306616848156293e-05, "loss": 0.7506890296936035, "memory(GiB)": 91.52, "step": 87405, "token_acc": 0.7681636008632207, "train_speed(iter/s)": 0.13455 }, { "epoch": 1.1342074616649744, "grad_norm": 0.7520225644111633, "learning_rate": 4.3060856595230665e-05, "loss": 0.7346646785736084, "memory(GiB)": 91.52, "step": 87410, "token_acc": 0.7826198410663933, "train_speed(iter/s)": 0.134549 }, { "epoch": 1.1342723400666301, "grad_norm": 0.7446692585945129, "learning_rate": 4.3055544788753606e-05, "loss": 0.7472617626190186, "memory(GiB)": 91.52, "step": 87415, "token_acc": 0.76848766555596, "train_speed(iter/s)": 0.134548 }, { "epoch": 1.1343372184682858, "grad_norm": 0.780823826789856, "learning_rate": 4.305023306219293e-05, "loss": 0.802609920501709, "memory(GiB)": 91.52, "step": 87420, "token_acc": 0.761875925178307, "train_speed(iter/s)": 0.134547 }, { "epoch": 1.1344020968699415, "grad_norm": 0.7853103280067444, "learning_rate": 4.3044921415609674e-05, "loss": 0.7381617069244385, "memory(GiB)": 91.52, "step": 87425, "token_acc": 0.8092129865714771, "train_speed(iter/s)": 0.134547 }, { "epoch": 1.1344669752715972, "grad_norm": 0.7229925394058228, "learning_rate": 4.303960984906504e-05, "loss": 0.7681624412536621, "memory(GiB)": 91.52, "step": 87430, "token_acc": 0.786042944785276, "train_speed(iter/s)": 0.134546 }, { "epoch": 1.134531853673253, "grad_norm": 0.712342381477356, "learning_rate": 4.303429836262012e-05, "loss": 0.7636609077453613, "memory(GiB)": 91.52, "step": 87435, "token_acc": 0.7926529740504752, "train_speed(iter/s)": 0.134545 }, { "epoch": 1.1345967320749086, "grad_norm": 0.6573714017868042, "learning_rate": 4.3028986956336036e-05, "loss": 0.771408224105835, "memory(GiB)": 91.52, "step": 87440, "token_acc": 0.7773151048384125, "train_speed(iter/s)": 0.134544 }, { "epoch": 1.1346616104765643, "grad_norm": 0.7810189127922058, "learning_rate": 4.302367563027394e-05, "loss": 0.7691228866577149, "memory(GiB)": 91.52, "step": 87445, "token_acc": 0.780507435229189, "train_speed(iter/s)": 0.134543 }, { "epoch": 1.13472648887822, "grad_norm": 0.7421699166297913, "learning_rate": 4.301836438449492e-05, "loss": 0.7361619949340821, "memory(GiB)": 91.52, "step": 87450, "token_acc": 0.795618745894989, "train_speed(iter/s)": 0.134542 }, { "epoch": 1.1347913672798757, "grad_norm": 0.7576490044593811, "learning_rate": 4.301305321906014e-05, "loss": 0.7504133701324462, "memory(GiB)": 91.52, "step": 87455, "token_acc": 0.7637200813041856, "train_speed(iter/s)": 0.134541 }, { "epoch": 1.1348562456815314, "grad_norm": 0.6768220663070679, "learning_rate": 4.300774213403067e-05, "loss": 0.799981164932251, "memory(GiB)": 91.52, "step": 87460, "token_acc": 0.7628279883381924, "train_speed(iter/s)": 0.13454 }, { "epoch": 1.1349211240831871, "grad_norm": 0.6990116238594055, "learning_rate": 4.300243112946766e-05, "loss": 0.7594796180725097, "memory(GiB)": 91.52, "step": 87465, "token_acc": 0.795711790957894, "train_speed(iter/s)": 0.134539 }, { "epoch": 1.1349860024848428, "grad_norm": 0.7546815276145935, "learning_rate": 4.2997120205432226e-05, "loss": 0.7671504020690918, "memory(GiB)": 91.52, "step": 87470, "token_acc": 0.7938835794960903, "train_speed(iter/s)": 0.134539 }, { "epoch": 1.1350508808864985, "grad_norm": 0.6668093204498291, "learning_rate": 4.299180936198547e-05, "loss": 0.7572714328765869, "memory(GiB)": 91.52, "step": 87475, "token_acc": 0.795412620514427, "train_speed(iter/s)": 0.134537 }, { "epoch": 1.1351157592881542, "grad_norm": 0.6179412603378296, "learning_rate": 4.298649859918853e-05, "loss": 0.7261480331420899, "memory(GiB)": 91.52, "step": 87480, "token_acc": 0.7818080134222826, "train_speed(iter/s)": 0.134537 }, { "epoch": 1.13518063768981, "grad_norm": 0.7130052447319031, "learning_rate": 4.2981187917102507e-05, "loss": 0.7585537910461426, "memory(GiB)": 91.52, "step": 87485, "token_acc": 0.7911917532685216, "train_speed(iter/s)": 0.134536 }, { "epoch": 1.1352455160914656, "grad_norm": 0.7874459624290466, "learning_rate": 4.297587731578855e-05, "loss": 0.7865143299102784, "memory(GiB)": 91.52, "step": 87490, "token_acc": 0.790156182834217, "train_speed(iter/s)": 0.134536 }, { "epoch": 1.1353103944931213, "grad_norm": 0.6418361663818359, "learning_rate": 4.2970566795307707e-05, "loss": 0.7444436073303222, "memory(GiB)": 91.52, "step": 87495, "token_acc": 0.7811420982735724, "train_speed(iter/s)": 0.134535 }, { "epoch": 1.135375272894777, "grad_norm": 0.6951016187667847, "learning_rate": 4.296525635572114e-05, "loss": 0.7313294410705566, "memory(GiB)": 91.52, "step": 87500, "token_acc": 0.7844946025515211, "train_speed(iter/s)": 0.134534 }, { "epoch": 1.1354401512964327, "grad_norm": 0.6970971822738647, "learning_rate": 4.295994599708996e-05, "loss": 0.741586971282959, "memory(GiB)": 91.52, "step": 87505, "token_acc": 0.8030913012221423, "train_speed(iter/s)": 0.134533 }, { "epoch": 1.1355050296980884, "grad_norm": 0.7169091105461121, "learning_rate": 4.295463571947525e-05, "loss": 0.7790214061737061, "memory(GiB)": 91.52, "step": 87510, "token_acc": 0.7860184528034067, "train_speed(iter/s)": 0.134532 }, { "epoch": 1.1355699080997441, "grad_norm": 0.7121113538742065, "learning_rate": 4.294932552293815e-05, "loss": 0.7623234272003174, "memory(GiB)": 91.52, "step": 87515, "token_acc": 0.7925878497985009, "train_speed(iter/s)": 0.134531 }, { "epoch": 1.1356347865013998, "grad_norm": 0.6359761953353882, "learning_rate": 4.2944015407539754e-05, "loss": 0.7336514472961426, "memory(GiB)": 91.52, "step": 87520, "token_acc": 0.8039947251616223, "train_speed(iter/s)": 0.13453 }, { "epoch": 1.1356996649030555, "grad_norm": 0.8731226325035095, "learning_rate": 4.2938705373341206e-05, "loss": 0.7592445850372315, "memory(GiB)": 91.52, "step": 87525, "token_acc": 0.7822081978708485, "train_speed(iter/s)": 0.134529 }, { "epoch": 1.1357645433047112, "grad_norm": 0.7797975540161133, "learning_rate": 4.2933395420403544e-05, "loss": 0.7617183685302734, "memory(GiB)": 91.52, "step": 87530, "token_acc": 0.7802060449965912, "train_speed(iter/s)": 0.134529 }, { "epoch": 1.135829421706367, "grad_norm": 0.7199251651763916, "learning_rate": 4.292808554878792e-05, "loss": 0.7620120048522949, "memory(GiB)": 91.52, "step": 87535, "token_acc": 0.7950181446640595, "train_speed(iter/s)": 0.134528 }, { "epoch": 1.1358943001080226, "grad_norm": 0.7703250646591187, "learning_rate": 4.292277575855544e-05, "loss": 0.7734565734863281, "memory(GiB)": 91.52, "step": 87540, "token_acc": 0.7902692879563853, "train_speed(iter/s)": 0.134527 }, { "epoch": 1.1359591785096783, "grad_norm": 0.7490678429603577, "learning_rate": 4.291746604976719e-05, "loss": 0.7992898464202881, "memory(GiB)": 91.52, "step": 87545, "token_acc": 0.7628491330079951, "train_speed(iter/s)": 0.134527 }, { "epoch": 1.136024056911334, "grad_norm": 0.6977384090423584, "learning_rate": 4.29121564224843e-05, "loss": 0.7445605278015137, "memory(GiB)": 91.52, "step": 87550, "token_acc": 0.7783695720283226, "train_speed(iter/s)": 0.134526 }, { "epoch": 1.1360889353129897, "grad_norm": 0.6125417351722717, "learning_rate": 4.2906846876767835e-05, "loss": 0.7529636859893799, "memory(GiB)": 91.52, "step": 87555, "token_acc": 0.7726466489294311, "train_speed(iter/s)": 0.134525 }, { "epoch": 1.1361538137146454, "grad_norm": 0.6627274751663208, "learning_rate": 4.290153741267896e-05, "loss": 0.7663776397705078, "memory(GiB)": 91.52, "step": 87560, "token_acc": 0.7890413670251369, "train_speed(iter/s)": 0.134524 }, { "epoch": 1.136218692116301, "grad_norm": 0.633657693862915, "learning_rate": 4.28962280302787e-05, "loss": 0.76610107421875, "memory(GiB)": 91.52, "step": 87565, "token_acc": 0.7632432059594427, "train_speed(iter/s)": 0.134523 }, { "epoch": 1.1362835705179568, "grad_norm": 0.7232488989830017, "learning_rate": 4.28909187296282e-05, "loss": 0.7644779205322265, "memory(GiB)": 91.52, "step": 87570, "token_acc": 0.7745595700209018, "train_speed(iter/s)": 0.134522 }, { "epoch": 1.1363484489196125, "grad_norm": 0.7433861494064331, "learning_rate": 4.288560951078854e-05, "loss": 0.7507675647735595, "memory(GiB)": 91.52, "step": 87575, "token_acc": 0.7759972980072803, "train_speed(iter/s)": 0.134521 }, { "epoch": 1.1364133273212682, "grad_norm": 0.7377007007598877, "learning_rate": 4.2880300373820844e-05, "loss": 0.7883275985717774, "memory(GiB)": 91.52, "step": 87580, "token_acc": 0.7647993342663691, "train_speed(iter/s)": 0.134521 }, { "epoch": 1.136478205722924, "grad_norm": 0.7339348793029785, "learning_rate": 4.287499131878618e-05, "loss": 0.7649120330810547, "memory(GiB)": 91.52, "step": 87585, "token_acc": 0.7567873303167421, "train_speed(iter/s)": 0.13452 }, { "epoch": 1.1365430841245794, "grad_norm": 0.7692973613739014, "learning_rate": 4.286968234574567e-05, "loss": 0.7182573318481446, "memory(GiB)": 91.52, "step": 87590, "token_acc": 0.7799911465250111, "train_speed(iter/s)": 0.134519 }, { "epoch": 1.1366079625262353, "grad_norm": 0.6980583071708679, "learning_rate": 4.286437345476039e-05, "loss": 0.773252534866333, "memory(GiB)": 91.52, "step": 87595, "token_acc": 0.7784880191408063, "train_speed(iter/s)": 0.134519 }, { "epoch": 1.1366728409278908, "grad_norm": 0.7188717722892761, "learning_rate": 4.285906464589145e-05, "loss": 0.7676961898803711, "memory(GiB)": 91.52, "step": 87600, "token_acc": 0.7839556446560582, "train_speed(iter/s)": 0.134518 }, { "epoch": 1.1367377193295467, "grad_norm": 0.6495080590248108, "learning_rate": 4.285375591919993e-05, "loss": 0.7420918941497803, "memory(GiB)": 91.52, "step": 87605, "token_acc": 0.8005323948745714, "train_speed(iter/s)": 0.134517 }, { "epoch": 1.1368025977312022, "grad_norm": 0.6752895712852478, "learning_rate": 4.284844727474692e-05, "loss": 0.7709677696228028, "memory(GiB)": 91.52, "step": 87610, "token_acc": 0.7647361735727282, "train_speed(iter/s)": 0.134516 }, { "epoch": 1.136867476132858, "grad_norm": 0.7137033939361572, "learning_rate": 4.284313871259352e-05, "loss": 0.7319971084594726, "memory(GiB)": 91.52, "step": 87615, "token_acc": 0.7766074709124311, "train_speed(iter/s)": 0.134515 }, { "epoch": 1.1369323545345136, "grad_norm": 0.6635063290596008, "learning_rate": 4.283783023280082e-05, "loss": 0.7225951194763184, "memory(GiB)": 91.52, "step": 87620, "token_acc": 0.7927437482438887, "train_speed(iter/s)": 0.134514 }, { "epoch": 1.1369972329361695, "grad_norm": 0.7203707098960876, "learning_rate": 4.28325218354299e-05, "loss": 0.7367583751678467, "memory(GiB)": 91.52, "step": 87625, "token_acc": 0.7846460934034598, "train_speed(iter/s)": 0.134513 }, { "epoch": 1.137062111337825, "grad_norm": 0.6778368949890137, "learning_rate": 4.282721352054187e-05, "loss": 0.7206412792205811, "memory(GiB)": 91.52, "step": 87630, "token_acc": 0.7862236116477572, "train_speed(iter/s)": 0.134512 }, { "epoch": 1.137126989739481, "grad_norm": 0.719383180141449, "learning_rate": 4.2821905288197815e-05, "loss": 0.7563968658447265, "memory(GiB)": 91.52, "step": 87635, "token_acc": 0.795347866903098, "train_speed(iter/s)": 0.134511 }, { "epoch": 1.1371918681411364, "grad_norm": 0.6915913820266724, "learning_rate": 4.28165971384588e-05, "loss": 0.7665114402770996, "memory(GiB)": 91.52, "step": 87640, "token_acc": 0.7715534658193439, "train_speed(iter/s)": 0.13451 }, { "epoch": 1.1372567465427923, "grad_norm": 0.7486288547515869, "learning_rate": 4.2811289071385915e-05, "loss": 0.7528998851776123, "memory(GiB)": 91.52, "step": 87645, "token_acc": 0.7751331876892416, "train_speed(iter/s)": 0.134509 }, { "epoch": 1.1373216249444478, "grad_norm": 0.6771302819252014, "learning_rate": 4.280598108704026e-05, "loss": 0.7575925827026367, "memory(GiB)": 91.52, "step": 87650, "token_acc": 0.7741877451156766, "train_speed(iter/s)": 0.134509 }, { "epoch": 1.1373865033461035, "grad_norm": 0.6565261483192444, "learning_rate": 4.2800673185482913e-05, "loss": 0.7671179294586181, "memory(GiB)": 91.52, "step": 87655, "token_acc": 0.7796754593953764, "train_speed(iter/s)": 0.134508 }, { "epoch": 1.1374513817477592, "grad_norm": 0.7990767955780029, "learning_rate": 4.279536536677495e-05, "loss": 0.7670474529266358, "memory(GiB)": 91.52, "step": 87660, "token_acc": 0.7667448261800387, "train_speed(iter/s)": 0.134507 }, { "epoch": 1.1375162601494149, "grad_norm": 0.747788667678833, "learning_rate": 4.2790057630977466e-05, "loss": 0.7784204006195068, "memory(GiB)": 91.52, "step": 87665, "token_acc": 0.8074563179368985, "train_speed(iter/s)": 0.134506 }, { "epoch": 1.1375811385510706, "grad_norm": 0.6947304606437683, "learning_rate": 4.278474997815154e-05, "loss": 0.7316353797912598, "memory(GiB)": 91.52, "step": 87670, "token_acc": 0.7857013574660634, "train_speed(iter/s)": 0.134506 }, { "epoch": 1.1376460169527263, "grad_norm": 0.7107187509536743, "learning_rate": 4.277944240835824e-05, "loss": 0.7364093780517578, "memory(GiB)": 91.52, "step": 87675, "token_acc": 0.7874778657579099, "train_speed(iter/s)": 0.134505 }, { "epoch": 1.137710895354382, "grad_norm": 0.7299922704696655, "learning_rate": 4.2774134921658644e-05, "loss": 0.7829390525817871, "memory(GiB)": 91.52, "step": 87680, "token_acc": 0.7753162461090138, "train_speed(iter/s)": 0.134504 }, { "epoch": 1.1377757737560377, "grad_norm": 0.7203572392463684, "learning_rate": 4.276882751811385e-05, "loss": 0.74874906539917, "memory(GiB)": 91.52, "step": 87685, "token_acc": 0.7772369266132576, "train_speed(iter/s)": 0.134503 }, { "epoch": 1.1378406521576934, "grad_norm": 0.6183233857154846, "learning_rate": 4.276352019778492e-05, "loss": 0.7196192264556884, "memory(GiB)": 91.52, "step": 87690, "token_acc": 0.8082749935177983, "train_speed(iter/s)": 0.134502 }, { "epoch": 1.137905530559349, "grad_norm": 0.6005482077598572, "learning_rate": 4.275821296073292e-05, "loss": 0.729800796508789, "memory(GiB)": 91.52, "step": 87695, "token_acc": 0.7960680845587053, "train_speed(iter/s)": 0.134501 }, { "epoch": 1.1379704089610048, "grad_norm": 0.6386622190475464, "learning_rate": 4.275290580701895e-05, "loss": 0.7057385444641113, "memory(GiB)": 91.52, "step": 87700, "token_acc": 0.7930157488017217, "train_speed(iter/s)": 0.134499 }, { "epoch": 1.1380352873626605, "grad_norm": 0.7420447468757629, "learning_rate": 4.274759873670409e-05, "loss": 0.7232322692871094, "memory(GiB)": 91.52, "step": 87705, "token_acc": 0.7859581881533101, "train_speed(iter/s)": 0.134498 }, { "epoch": 1.1381001657643162, "grad_norm": 0.7170855402946472, "learning_rate": 4.274229174984938e-05, "loss": 0.7320639610290527, "memory(GiB)": 91.52, "step": 87710, "token_acc": 0.787132573485303, "train_speed(iter/s)": 0.134498 }, { "epoch": 1.1381650441659719, "grad_norm": 0.7304447889328003, "learning_rate": 4.273698484651591e-05, "loss": 0.754310941696167, "memory(GiB)": 91.52, "step": 87715, "token_acc": 0.7890295358649789, "train_speed(iter/s)": 0.134497 }, { "epoch": 1.1382299225676276, "grad_norm": 0.6986724138259888, "learning_rate": 4.273167802676475e-05, "loss": 0.7421313285827636, "memory(GiB)": 91.52, "step": 87720, "token_acc": 0.8145630334617419, "train_speed(iter/s)": 0.134497 }, { "epoch": 1.1382948009692833, "grad_norm": 0.7986506819725037, "learning_rate": 4.2726371290656976e-05, "loss": 0.7647587776184082, "memory(GiB)": 91.52, "step": 87725, "token_acc": 0.7853030238410031, "train_speed(iter/s)": 0.134496 }, { "epoch": 1.138359679370939, "grad_norm": 0.6526042819023132, "learning_rate": 4.2721064638253646e-05, "loss": 0.7531026840209961, "memory(GiB)": 91.52, "step": 87730, "token_acc": 0.789163541746384, "train_speed(iter/s)": 0.134495 }, { "epoch": 1.1384245577725947, "grad_norm": 0.7102811336517334, "learning_rate": 4.271575806961584e-05, "loss": 0.763698148727417, "memory(GiB)": 91.52, "step": 87735, "token_acc": 0.7908184039087948, "train_speed(iter/s)": 0.134494 }, { "epoch": 1.1384894361742504, "grad_norm": 0.7929748892784119, "learning_rate": 4.2710451584804635e-05, "loss": 0.7508083820343018, "memory(GiB)": 91.52, "step": 87740, "token_acc": 0.7973906911142454, "train_speed(iter/s)": 0.134493 }, { "epoch": 1.138554314575906, "grad_norm": 0.7245523929595947, "learning_rate": 4.270514518388108e-05, "loss": 0.7173223495483398, "memory(GiB)": 91.52, "step": 87745, "token_acc": 0.7848950954597584, "train_speed(iter/s)": 0.134492 }, { "epoch": 1.1386191929775618, "grad_norm": 0.8159865736961365, "learning_rate": 4.2699838866906215e-05, "loss": 0.7907967567443848, "memory(GiB)": 91.52, "step": 87750, "token_acc": 0.7519047108058805, "train_speed(iter/s)": 0.134491 }, { "epoch": 1.1386840713792175, "grad_norm": 0.6576656103134155, "learning_rate": 4.269453263394116e-05, "loss": 0.7071700096130371, "memory(GiB)": 91.52, "step": 87755, "token_acc": 0.7807660108832147, "train_speed(iter/s)": 0.13449 }, { "epoch": 1.1387489497808732, "grad_norm": 0.6862199902534485, "learning_rate": 4.268922648504695e-05, "loss": 0.736506175994873, "memory(GiB)": 91.52, "step": 87760, "token_acc": 0.7920318725099602, "train_speed(iter/s)": 0.134489 }, { "epoch": 1.1388138281825289, "grad_norm": 0.6998077630996704, "learning_rate": 4.268392042028464e-05, "loss": 0.7597121238708496, "memory(GiB)": 91.52, "step": 87765, "token_acc": 0.7828350872082438, "train_speed(iter/s)": 0.134489 }, { "epoch": 1.1388787065841846, "grad_norm": 0.6211798787117004, "learning_rate": 4.2678614439715315e-05, "loss": 0.7895263195037842, "memory(GiB)": 91.52, "step": 87770, "token_acc": 0.7880478378181288, "train_speed(iter/s)": 0.134488 }, { "epoch": 1.1389435849858403, "grad_norm": 0.7986134886741638, "learning_rate": 4.267330854340003e-05, "loss": 0.7618974208831787, "memory(GiB)": 91.52, "step": 87775, "token_acc": 0.77423900789177, "train_speed(iter/s)": 0.134487 }, { "epoch": 1.139008463387496, "grad_norm": 0.636825442314148, "learning_rate": 4.2668002731399835e-05, "loss": 0.7331056594848633, "memory(GiB)": 91.52, "step": 87780, "token_acc": 0.798708288482239, "train_speed(iter/s)": 0.134486 }, { "epoch": 1.1390733417891516, "grad_norm": 0.6652206778526306, "learning_rate": 4.266269700377578e-05, "loss": 0.721407127380371, "memory(GiB)": 91.52, "step": 87785, "token_acc": 0.7932751306055187, "train_speed(iter/s)": 0.134486 }, { "epoch": 1.1391382201908073, "grad_norm": 0.6910904049873352, "learning_rate": 4.2657391360588944e-05, "loss": 0.7727501392364502, "memory(GiB)": 91.52, "step": 87790, "token_acc": 0.7741022530966295, "train_speed(iter/s)": 0.134485 }, { "epoch": 1.139203098592463, "grad_norm": 0.7143104672431946, "learning_rate": 4.265208580190038e-05, "loss": 0.7511547088623047, "memory(GiB)": 91.52, "step": 87795, "token_acc": 0.7813173819294706, "train_speed(iter/s)": 0.134484 }, { "epoch": 1.1392679769941187, "grad_norm": 0.7230908870697021, "learning_rate": 4.264678032777113e-05, "loss": 0.7576482772827149, "memory(GiB)": 91.52, "step": 87800, "token_acc": 0.7970473345588235, "train_speed(iter/s)": 0.134483 }, { "epoch": 1.1393328553957744, "grad_norm": 0.7562171816825867, "learning_rate": 4.264147493826226e-05, "loss": 0.7649223327636718, "memory(GiB)": 91.52, "step": 87805, "token_acc": 0.7800361782601546, "train_speed(iter/s)": 0.134482 }, { "epoch": 1.1393977337974301, "grad_norm": 0.6801256537437439, "learning_rate": 4.263616963343482e-05, "loss": 0.7469090461730957, "memory(GiB)": 91.52, "step": 87810, "token_acc": 0.7804241227790868, "train_speed(iter/s)": 0.134481 }, { "epoch": 1.1394626121990858, "grad_norm": 0.672233521938324, "learning_rate": 4.263086441334989e-05, "loss": 0.7658252716064453, "memory(GiB)": 91.52, "step": 87815, "token_acc": 0.7799805126667666, "train_speed(iter/s)": 0.13448 }, { "epoch": 1.1395274906007415, "grad_norm": 0.7262153625488281, "learning_rate": 4.262555927806847e-05, "loss": 0.7693684577941895, "memory(GiB)": 91.52, "step": 87820, "token_acc": 0.7710184821077467, "train_speed(iter/s)": 0.13448 }, { "epoch": 1.1395923690023972, "grad_norm": 0.6811403632164001, "learning_rate": 4.2620254227651655e-05, "loss": 0.742266321182251, "memory(GiB)": 91.52, "step": 87825, "token_acc": 0.804399664147775, "train_speed(iter/s)": 0.134479 }, { "epoch": 1.139657247404053, "grad_norm": 0.6564825177192688, "learning_rate": 4.2614949262160477e-05, "loss": 0.7745763778686523, "memory(GiB)": 91.52, "step": 87830, "token_acc": 0.7966395574454032, "train_speed(iter/s)": 0.134477 }, { "epoch": 1.1397221258057086, "grad_norm": 0.6793181896209717, "learning_rate": 4.2609644381655976e-05, "loss": 0.7405974388122558, "memory(GiB)": 91.52, "step": 87835, "token_acc": 0.787685408761642, "train_speed(iter/s)": 0.134476 }, { "epoch": 1.1397870042073643, "grad_norm": 0.6896059513092041, "learning_rate": 4.260433958619922e-05, "loss": 0.7261409759521484, "memory(GiB)": 91.52, "step": 87840, "token_acc": 0.7861526735930041, "train_speed(iter/s)": 0.134475 }, { "epoch": 1.13985188260902, "grad_norm": 0.713503360748291, "learning_rate": 4.259903487585124e-05, "loss": 0.6941057205200195, "memory(GiB)": 91.52, "step": 87845, "token_acc": 0.7982521203521559, "train_speed(iter/s)": 0.134474 }, { "epoch": 1.1399167610106757, "grad_norm": 0.6915876269340515, "learning_rate": 4.2593730250673126e-05, "loss": 0.7643547058105469, "memory(GiB)": 91.52, "step": 87850, "token_acc": 0.7703311368754692, "train_speed(iter/s)": 0.134474 }, { "epoch": 1.1399816394123314, "grad_norm": 0.7530323266983032, "learning_rate": 4.2588425710725846e-05, "loss": 0.7791924953460694, "memory(GiB)": 91.52, "step": 87855, "token_acc": 0.7725431522096121, "train_speed(iter/s)": 0.134473 }, { "epoch": 1.1400465178139871, "grad_norm": 0.7163341641426086, "learning_rate": 4.25831212560705e-05, "loss": 0.7681430816650391, "memory(GiB)": 91.52, "step": 87860, "token_acc": 0.7620336039015265, "train_speed(iter/s)": 0.134472 }, { "epoch": 1.1401113962156428, "grad_norm": 0.7436225414276123, "learning_rate": 4.2577816886768125e-05, "loss": 0.7611394882202148, "memory(GiB)": 91.52, "step": 87865, "token_acc": 0.7806534639307928, "train_speed(iter/s)": 0.134472 }, { "epoch": 1.1401762746172985, "grad_norm": 0.6867504715919495, "learning_rate": 4.257251260287974e-05, "loss": 0.7730553627014161, "memory(GiB)": 91.52, "step": 87870, "token_acc": 0.776011859615252, "train_speed(iter/s)": 0.134471 }, { "epoch": 1.1402411530189542, "grad_norm": 0.7036277651786804, "learning_rate": 4.256720840446641e-05, "loss": 0.7395090103149414, "memory(GiB)": 91.52, "step": 87875, "token_acc": 0.7847428199377848, "train_speed(iter/s)": 0.134469 }, { "epoch": 1.14030603142061, "grad_norm": 0.7399093508720398, "learning_rate": 4.2561904291589155e-05, "loss": 0.8037237167358399, "memory(GiB)": 91.52, "step": 87880, "token_acc": 0.7591370090355891, "train_speed(iter/s)": 0.134469 }, { "epoch": 1.1403709098222656, "grad_norm": 0.7759787440299988, "learning_rate": 4.255660026430907e-05, "loss": 0.7680039405822754, "memory(GiB)": 91.52, "step": 87885, "token_acc": 0.7782503952420387, "train_speed(iter/s)": 0.134468 }, { "epoch": 1.1404357882239213, "grad_norm": 0.6612361073493958, "learning_rate": 4.2551296322687104e-05, "loss": 0.7518130779266358, "memory(GiB)": 91.52, "step": 87890, "token_acc": 0.7739626480633675, "train_speed(iter/s)": 0.134468 }, { "epoch": 1.140500666625577, "grad_norm": 0.6834450364112854, "learning_rate": 4.2545992466784353e-05, "loss": 0.7599016189575195, "memory(GiB)": 91.52, "step": 87895, "token_acc": 0.7905549706722815, "train_speed(iter/s)": 0.134467 }, { "epoch": 1.1405655450272327, "grad_norm": 0.7375254034996033, "learning_rate": 4.254068869666185e-05, "loss": 0.7571850776672363, "memory(GiB)": 91.52, "step": 87900, "token_acc": 0.802362964153673, "train_speed(iter/s)": 0.134466 }, { "epoch": 1.1406304234288884, "grad_norm": 0.7517027854919434, "learning_rate": 4.2535385012380605e-05, "loss": 0.7743950843811035, "memory(GiB)": 91.52, "step": 87905, "token_acc": 0.7776315789473685, "train_speed(iter/s)": 0.134465 }, { "epoch": 1.1406953018305441, "grad_norm": 0.7196430563926697, "learning_rate": 4.253008141400168e-05, "loss": 0.7060972213745117, "memory(GiB)": 91.52, "step": 87910, "token_acc": 0.7741602694420002, "train_speed(iter/s)": 0.134465 }, { "epoch": 1.1407601802321998, "grad_norm": 0.7163969874382019, "learning_rate": 4.252477790158609e-05, "loss": 0.769615650177002, "memory(GiB)": 91.52, "step": 87915, "token_acc": 0.7732458550143823, "train_speed(iter/s)": 0.134464 }, { "epoch": 1.1408250586338555, "grad_norm": 0.6903505325317383, "learning_rate": 4.25194744751949e-05, "loss": 0.6778131484985351, "memory(GiB)": 91.52, "step": 87920, "token_acc": 0.7917931422147274, "train_speed(iter/s)": 0.134464 }, { "epoch": 1.1408899370355112, "grad_norm": 0.751496434211731, "learning_rate": 4.251417113488909e-05, "loss": 0.7624136447906494, "memory(GiB)": 91.52, "step": 87925, "token_acc": 0.7683959451401312, "train_speed(iter/s)": 0.134463 }, { "epoch": 1.140954815437167, "grad_norm": 0.7372376918792725, "learning_rate": 4.250886788072973e-05, "loss": 0.7859132289886475, "memory(GiB)": 91.52, "step": 87930, "token_acc": 0.7770386118921885, "train_speed(iter/s)": 0.134462 }, { "epoch": 1.1410196938388226, "grad_norm": 0.6859009861946106, "learning_rate": 4.250356471277783e-05, "loss": 0.6998690605163574, "memory(GiB)": 91.52, "step": 87935, "token_acc": 0.7840372174165879, "train_speed(iter/s)": 0.134462 }, { "epoch": 1.1410845722404783, "grad_norm": 0.6793208718299866, "learning_rate": 4.249826163109442e-05, "loss": 0.760935640335083, "memory(GiB)": 91.52, "step": 87940, "token_acc": 0.7903962390866354, "train_speed(iter/s)": 0.134461 }, { "epoch": 1.141149450642134, "grad_norm": 0.7371911406517029, "learning_rate": 4.249295863574053e-05, "loss": 0.8010922431945801, "memory(GiB)": 91.52, "step": 87945, "token_acc": 0.7698178237321517, "train_speed(iter/s)": 0.13446 }, { "epoch": 1.1412143290437897, "grad_norm": 0.7108789086341858, "learning_rate": 4.248765572677719e-05, "loss": 0.7325225353240967, "memory(GiB)": 91.52, "step": 87950, "token_acc": 0.7713810058461419, "train_speed(iter/s)": 0.134459 }, { "epoch": 1.1412792074454454, "grad_norm": 0.7380265593528748, "learning_rate": 4.248235290426543e-05, "loss": 0.7519752502441406, "memory(GiB)": 91.52, "step": 87955, "token_acc": 0.7645552398863591, "train_speed(iter/s)": 0.134458 }, { "epoch": 1.1413440858471011, "grad_norm": 0.6624996066093445, "learning_rate": 4.247705016826628e-05, "loss": 0.7636659622192383, "memory(GiB)": 91.52, "step": 87960, "token_acc": 0.7866661440777613, "train_speed(iter/s)": 0.134458 }, { "epoch": 1.1414089642487568, "grad_norm": 0.6941739320755005, "learning_rate": 4.247174751884075e-05, "loss": 0.7258284568786622, "memory(GiB)": 91.52, "step": 87965, "token_acc": 0.7933844269067495, "train_speed(iter/s)": 0.134457 }, { "epoch": 1.1414738426504125, "grad_norm": 0.6676560044288635, "learning_rate": 4.246644495604985e-05, "loss": 0.7505743980407715, "memory(GiB)": 91.52, "step": 87970, "token_acc": 0.7944192922479903, "train_speed(iter/s)": 0.134456 }, { "epoch": 1.1415387210520682, "grad_norm": 0.7938168048858643, "learning_rate": 4.2461142479954625e-05, "loss": 0.7553403854370118, "memory(GiB)": 91.52, "step": 87975, "token_acc": 0.7788731851965957, "train_speed(iter/s)": 0.134455 }, { "epoch": 1.141603599453724, "grad_norm": 0.6789577603340149, "learning_rate": 4.245584009061609e-05, "loss": 0.7616086959838867, "memory(GiB)": 91.52, "step": 87980, "token_acc": 0.7695632732150842, "train_speed(iter/s)": 0.134454 }, { "epoch": 1.1416684778553796, "grad_norm": 0.6958352327346802, "learning_rate": 4.245053778809525e-05, "loss": 0.7237896919250488, "memory(GiB)": 91.52, "step": 87985, "token_acc": 0.7917217770170388, "train_speed(iter/s)": 0.134453 }, { "epoch": 1.1417333562570353, "grad_norm": 0.7609395384788513, "learning_rate": 4.2445235572453144e-05, "loss": 0.7021509170532226, "memory(GiB)": 91.52, "step": 87990, "token_acc": 0.8067797722574082, "train_speed(iter/s)": 0.134452 }, { "epoch": 1.141798234658691, "grad_norm": 0.7097877860069275, "learning_rate": 4.243993344375079e-05, "loss": 0.7712574005126953, "memory(GiB)": 91.52, "step": 87995, "token_acc": 0.7788591969614759, "train_speed(iter/s)": 0.134451 }, { "epoch": 1.1418631130603467, "grad_norm": 0.7164960503578186, "learning_rate": 4.243463140204918e-05, "loss": 0.6995001792907715, "memory(GiB)": 91.52, "step": 88000, "token_acc": 0.8177064420074079, "train_speed(iter/s)": 0.134451 }, { "epoch": 1.1419279914620024, "grad_norm": 0.7110539674758911, "learning_rate": 4.242932944740935e-05, "loss": 0.7609229564666748, "memory(GiB)": 91.52, "step": 88005, "token_acc": 0.7762342302924232, "train_speed(iter/s)": 0.13445 }, { "epoch": 1.1419928698636581, "grad_norm": 0.6606597304344177, "learning_rate": 4.2424027579892315e-05, "loss": 0.7726423740386963, "memory(GiB)": 91.52, "step": 88010, "token_acc": 0.752842544617156, "train_speed(iter/s)": 0.134449 }, { "epoch": 1.1420577482653138, "grad_norm": 0.6840968728065491, "learning_rate": 4.241872579955907e-05, "loss": 0.765796709060669, "memory(GiB)": 91.52, "step": 88015, "token_acc": 0.776885043263288, "train_speed(iter/s)": 0.134449 }, { "epoch": 1.1421226266669695, "grad_norm": 0.7020424008369446, "learning_rate": 4.241342410647065e-05, "loss": 0.7642602920532227, "memory(GiB)": 91.52, "step": 88020, "token_acc": 0.7719882680957382, "train_speed(iter/s)": 0.134448 }, { "epoch": 1.1421875050686252, "grad_norm": 0.799426257610321, "learning_rate": 4.240812250068805e-05, "loss": 0.7604034900665283, "memory(GiB)": 91.52, "step": 88025, "token_acc": 0.7778963246385343, "train_speed(iter/s)": 0.134446 }, { "epoch": 1.142252383470281, "grad_norm": 0.8281062245368958, "learning_rate": 4.240282098227231e-05, "loss": 0.8063173294067383, "memory(GiB)": 91.52, "step": 88030, "token_acc": 0.7660193817016102, "train_speed(iter/s)": 0.134446 }, { "epoch": 1.1423172618719366, "grad_norm": 0.7077211141586304, "learning_rate": 4.23975195512844e-05, "loss": 0.7429022789001465, "memory(GiB)": 91.52, "step": 88035, "token_acc": 0.7720323140606222, "train_speed(iter/s)": 0.134445 }, { "epoch": 1.1423821402735923, "grad_norm": 0.7242961525917053, "learning_rate": 4.2392218207785333e-05, "loss": 0.7482157707214355, "memory(GiB)": 91.52, "step": 88040, "token_acc": 0.7912042175360711, "train_speed(iter/s)": 0.134444 }, { "epoch": 1.142447018675248, "grad_norm": 0.6985286474227905, "learning_rate": 4.238691695183614e-05, "loss": 0.7389308452606201, "memory(GiB)": 91.52, "step": 88045, "token_acc": 0.7869792138051709, "train_speed(iter/s)": 0.134443 }, { "epoch": 1.1425118970769037, "grad_norm": 0.718183696269989, "learning_rate": 4.238161578349783e-05, "loss": 0.7607264041900634, "memory(GiB)": 91.52, "step": 88050, "token_acc": 0.8088724696974323, "train_speed(iter/s)": 0.134442 }, { "epoch": 1.1425767754785594, "grad_norm": 0.6377739310264587, "learning_rate": 4.237631470283137e-05, "loss": 0.7233833789825439, "memory(GiB)": 91.52, "step": 88055, "token_acc": 0.782283956875866, "train_speed(iter/s)": 0.134441 }, { "epoch": 1.142641653880215, "grad_norm": 0.686330258846283, "learning_rate": 4.2371013709897805e-05, "loss": 0.7446614265441894, "memory(GiB)": 91.52, "step": 88060, "token_acc": 0.7806120863756408, "train_speed(iter/s)": 0.13444 }, { "epoch": 1.1427065322818706, "grad_norm": 0.7370418906211853, "learning_rate": 4.2365712804758135e-05, "loss": 0.7341103553771973, "memory(GiB)": 91.52, "step": 88065, "token_acc": 0.7934139534883721, "train_speed(iter/s)": 0.134439 }, { "epoch": 1.1427714106835265, "grad_norm": 0.6860151290893555, "learning_rate": 4.236041198747334e-05, "loss": 0.761334753036499, "memory(GiB)": 91.52, "step": 88070, "token_acc": 0.7721748656328096, "train_speed(iter/s)": 0.134438 }, { "epoch": 1.142836289085182, "grad_norm": 0.702360212802887, "learning_rate": 4.235511125810442e-05, "loss": 0.7694465637207031, "memory(GiB)": 91.52, "step": 88075, "token_acc": 0.7636166650251157, "train_speed(iter/s)": 0.134438 }, { "epoch": 1.142901167486838, "grad_norm": 0.8091323375701904, "learning_rate": 4.2349810616712396e-05, "loss": 0.7297737121582031, "memory(GiB)": 91.52, "step": 88080, "token_acc": 0.8020990729774983, "train_speed(iter/s)": 0.134437 }, { "epoch": 1.1429660458884934, "grad_norm": 0.6564017534255981, "learning_rate": 4.234451006335827e-05, "loss": 0.7307185173034668, "memory(GiB)": 91.52, "step": 88085, "token_acc": 0.7981689421972694, "train_speed(iter/s)": 0.134435 }, { "epoch": 1.1430309242901493, "grad_norm": 0.7164912819862366, "learning_rate": 4.233920959810301e-05, "loss": 0.7544111251831055, "memory(GiB)": 91.52, "step": 88090, "token_acc": 0.7882037533512064, "train_speed(iter/s)": 0.134434 }, { "epoch": 1.1430958026918048, "grad_norm": 0.7440571784973145, "learning_rate": 4.2333909221007636e-05, "loss": 0.7337518215179444, "memory(GiB)": 91.52, "step": 88095, "token_acc": 0.7882800552053415, "train_speed(iter/s)": 0.134433 }, { "epoch": 1.1431606810934607, "grad_norm": 0.7117645740509033, "learning_rate": 4.232860893213317e-05, "loss": 0.7831466674804688, "memory(GiB)": 91.52, "step": 88100, "token_acc": 0.777473630424468, "train_speed(iter/s)": 0.134433 }, { "epoch": 1.1432255594951162, "grad_norm": 0.7061954140663147, "learning_rate": 4.232330873154056e-05, "loss": 0.7242473125457763, "memory(GiB)": 91.52, "step": 88105, "token_acc": 0.7971442969931127, "train_speed(iter/s)": 0.134432 }, { "epoch": 1.143290437896772, "grad_norm": 0.7896761894226074, "learning_rate": 4.23180086192908e-05, "loss": 0.7668614864349366, "memory(GiB)": 91.52, "step": 88110, "token_acc": 0.7844580224825305, "train_speed(iter/s)": 0.134431 }, { "epoch": 1.1433553162984276, "grad_norm": 0.6806729435920715, "learning_rate": 4.2312708595444925e-05, "loss": 0.7495286941528321, "memory(GiB)": 91.52, "step": 88115, "token_acc": 0.7950845948352627, "train_speed(iter/s)": 0.13443 }, { "epoch": 1.1434201947000835, "grad_norm": 0.6986842155456543, "learning_rate": 4.23074086600639e-05, "loss": 0.7474715232849121, "memory(GiB)": 91.52, "step": 88120, "token_acc": 0.8007091150834688, "train_speed(iter/s)": 0.13443 }, { "epoch": 1.143485073101739, "grad_norm": 0.6281819939613342, "learning_rate": 4.230210881320871e-05, "loss": 0.7215829372406006, "memory(GiB)": 91.52, "step": 88125, "token_acc": 0.7881188118811882, "train_speed(iter/s)": 0.134428 }, { "epoch": 1.1435499515033947, "grad_norm": 0.7241522669792175, "learning_rate": 4.229680905494037e-05, "loss": 0.7519460678100586, "memory(GiB)": 91.52, "step": 88130, "token_acc": 0.7902439024390244, "train_speed(iter/s)": 0.134428 }, { "epoch": 1.1436148299050504, "grad_norm": 0.7733384370803833, "learning_rate": 4.229150938531985e-05, "loss": 0.7795959949493408, "memory(GiB)": 91.52, "step": 88135, "token_acc": 0.7795287296513789, "train_speed(iter/s)": 0.134428 }, { "epoch": 1.143679708306706, "grad_norm": 0.7470332980155945, "learning_rate": 4.2286209804408155e-05, "loss": 0.7727568626403809, "memory(GiB)": 91.52, "step": 88140, "token_acc": 0.7896090709087832, "train_speed(iter/s)": 0.134427 }, { "epoch": 1.1437445867083618, "grad_norm": 0.6420565843582153, "learning_rate": 4.228091031226624e-05, "loss": 0.7779589653015136, "memory(GiB)": 91.52, "step": 88145, "token_acc": 0.7604460566975682, "train_speed(iter/s)": 0.134426 }, { "epoch": 1.1438094651100175, "grad_norm": 0.7877939939498901, "learning_rate": 4.227561090895513e-05, "loss": 0.806153678894043, "memory(GiB)": 91.52, "step": 88150, "token_acc": 0.7651439920556107, "train_speed(iter/s)": 0.134425 }, { "epoch": 1.1438743435116732, "grad_norm": 0.6868630647659302, "learning_rate": 4.2270311594535784e-05, "loss": 0.7550135612487793, "memory(GiB)": 91.52, "step": 88155, "token_acc": 0.7759132567445463, "train_speed(iter/s)": 0.134425 }, { "epoch": 1.1439392219133289, "grad_norm": 0.8236095309257507, "learning_rate": 4.226501236906919e-05, "loss": 0.7935413837432861, "memory(GiB)": 91.52, "step": 88160, "token_acc": 0.7871945178335535, "train_speed(iter/s)": 0.134424 }, { "epoch": 1.1440041003149846, "grad_norm": 0.7768616676330566, "learning_rate": 4.225971323261634e-05, "loss": 0.7916996002197265, "memory(GiB)": 91.52, "step": 88165, "token_acc": 0.7676990350908893, "train_speed(iter/s)": 0.134423 }, { "epoch": 1.1440689787166403, "grad_norm": 0.7015578150749207, "learning_rate": 4.2254414185238216e-05, "loss": 0.7608355522155762, "memory(GiB)": 91.52, "step": 88170, "token_acc": 0.7815106757480677, "train_speed(iter/s)": 0.134422 }, { "epoch": 1.144133857118296, "grad_norm": 0.6534326672554016, "learning_rate": 4.2249115226995813e-05, "loss": 0.7380498886108399, "memory(GiB)": 91.52, "step": 88175, "token_acc": 0.7814638400644036, "train_speed(iter/s)": 0.134421 }, { "epoch": 1.1441987355199517, "grad_norm": 0.7111989259719849, "learning_rate": 4.224381635795006e-05, "loss": 0.7748476505279541, "memory(GiB)": 91.52, "step": 88180, "token_acc": 0.7674958323799562, "train_speed(iter/s)": 0.13442 }, { "epoch": 1.1442636139216074, "grad_norm": 0.6998803615570068, "learning_rate": 4.223851757816199e-05, "loss": 0.7350497245788574, "memory(GiB)": 91.52, "step": 88185, "token_acc": 0.7859074681764091, "train_speed(iter/s)": 0.134419 }, { "epoch": 1.144328492323263, "grad_norm": 0.6799151301383972, "learning_rate": 4.223321888769255e-05, "loss": 0.740244197845459, "memory(GiB)": 91.52, "step": 88190, "token_acc": 0.7826074514965947, "train_speed(iter/s)": 0.134418 }, { "epoch": 1.1443933707249188, "grad_norm": 0.7113497853279114, "learning_rate": 4.2227920286602716e-05, "loss": 0.7756882667541504, "memory(GiB)": 91.52, "step": 88195, "token_acc": 0.7888031925098806, "train_speed(iter/s)": 0.134418 }, { "epoch": 1.1444582491265745, "grad_norm": 0.6535026431083679, "learning_rate": 4.2222621774953495e-05, "loss": 0.7753829956054688, "memory(GiB)": 91.52, "step": 88200, "token_acc": 0.763657801650727, "train_speed(iter/s)": 0.134417 }, { "epoch": 1.1445231275282302, "grad_norm": 0.7120672464370728, "learning_rate": 4.221732335280584e-05, "loss": 0.7627823352813721, "memory(GiB)": 91.52, "step": 88205, "token_acc": 0.7775393948361323, "train_speed(iter/s)": 0.134416 }, { "epoch": 1.1445880059298859, "grad_norm": 0.7099764943122864, "learning_rate": 4.221202502022073e-05, "loss": 0.7569624900817871, "memory(GiB)": 91.52, "step": 88210, "token_acc": 0.7859626249456758, "train_speed(iter/s)": 0.134414 }, { "epoch": 1.1446528843315416, "grad_norm": 0.6341323852539062, "learning_rate": 4.2206726777259125e-05, "loss": 0.7611470222473145, "memory(GiB)": 91.52, "step": 88215, "token_acc": 0.7764266961448114, "train_speed(iter/s)": 0.134414 }, { "epoch": 1.1447177627331973, "grad_norm": 0.6289256811141968, "learning_rate": 4.220142862398201e-05, "loss": 0.771498155593872, "memory(GiB)": 91.52, "step": 88220, "token_acc": 0.7752482269503546, "train_speed(iter/s)": 0.134413 }, { "epoch": 1.144782641134853, "grad_norm": 0.7794569134712219, "learning_rate": 4.219613056045035e-05, "loss": 0.7132392883300781, "memory(GiB)": 91.52, "step": 88225, "token_acc": 0.7800169472792249, "train_speed(iter/s)": 0.134412 }, { "epoch": 1.1448475195365087, "grad_norm": 0.6656138896942139, "learning_rate": 4.2190832586725117e-05, "loss": 0.7583171844482421, "memory(GiB)": 91.52, "step": 88230, "token_acc": 0.798611684984712, "train_speed(iter/s)": 0.134412 }, { "epoch": 1.1449123979381644, "grad_norm": 0.6403953433036804, "learning_rate": 4.218553470286728e-05, "loss": 0.7400599479675293, "memory(GiB)": 91.52, "step": 88235, "token_acc": 0.8014858002633064, "train_speed(iter/s)": 0.134411 }, { "epoch": 1.14497727633982, "grad_norm": 0.6867300868034363, "learning_rate": 4.2180236908937806e-05, "loss": 0.7400773525238037, "memory(GiB)": 91.52, "step": 88240, "token_acc": 0.7973875790167471, "train_speed(iter/s)": 0.13441 }, { "epoch": 1.1450421547414757, "grad_norm": 0.649225652217865, "learning_rate": 4.217493920499769e-05, "loss": 0.7669061660766602, "memory(GiB)": 91.52, "step": 88245, "token_acc": 0.7844609840703196, "train_speed(iter/s)": 0.134409 }, { "epoch": 1.1451070331431314, "grad_norm": 0.6593641042709351, "learning_rate": 4.216964159110783e-05, "loss": 0.7370021343231201, "memory(GiB)": 91.52, "step": 88250, "token_acc": 0.7886843601066853, "train_speed(iter/s)": 0.134409 }, { "epoch": 1.1451719115447871, "grad_norm": 0.7587539553642273, "learning_rate": 4.216434406732925e-05, "loss": 0.7994794368743896, "memory(GiB)": 91.52, "step": 88255, "token_acc": 0.7859134760325596, "train_speed(iter/s)": 0.134408 }, { "epoch": 1.1452367899464428, "grad_norm": 0.7383955717086792, "learning_rate": 4.215904663372289e-05, "loss": 0.7834866046905518, "memory(GiB)": 91.52, "step": 88260, "token_acc": 0.7683239719852992, "train_speed(iter/s)": 0.134408 }, { "epoch": 1.1453016683480985, "grad_norm": 0.6136283874511719, "learning_rate": 4.215374929034971e-05, "loss": 0.7442394256591797, "memory(GiB)": 91.52, "step": 88265, "token_acc": 0.7874596588289534, "train_speed(iter/s)": 0.134406 }, { "epoch": 1.1453665467497542, "grad_norm": 0.7087845802307129, "learning_rate": 4.214845203727069e-05, "loss": 0.7612743377685547, "memory(GiB)": 91.52, "step": 88270, "token_acc": 0.7769902951717472, "train_speed(iter/s)": 0.134406 }, { "epoch": 1.14543142515141, "grad_norm": 0.7239570021629333, "learning_rate": 4.214315487454677e-05, "loss": 0.7630542755126953, "memory(GiB)": 91.52, "step": 88275, "token_acc": 0.7699682840581823, "train_speed(iter/s)": 0.134405 }, { "epoch": 1.1454963035530656, "grad_norm": 0.7176023721694946, "learning_rate": 4.2137857802238915e-05, "loss": 0.7385663032531739, "memory(GiB)": 91.52, "step": 88280, "token_acc": 0.7931629955947137, "train_speed(iter/s)": 0.134404 }, { "epoch": 1.1455611819547213, "grad_norm": 0.6660927534103394, "learning_rate": 4.213256082040811e-05, "loss": 0.7850345611572266, "memory(GiB)": 91.52, "step": 88285, "token_acc": 0.7755989804587935, "train_speed(iter/s)": 0.134403 }, { "epoch": 1.145626060356377, "grad_norm": 0.8610800504684448, "learning_rate": 4.2127263929115276e-05, "loss": 0.7629030227661133, "memory(GiB)": 91.52, "step": 88290, "token_acc": 0.7893035160986124, "train_speed(iter/s)": 0.134402 }, { "epoch": 1.1456909387580327, "grad_norm": 0.7759537696838379, "learning_rate": 4.212196712842137e-05, "loss": 0.7461350440979004, "memory(GiB)": 91.52, "step": 88295, "token_acc": 0.7837343195943112, "train_speed(iter/s)": 0.134402 }, { "epoch": 1.1457558171596884, "grad_norm": 0.7249438166618347, "learning_rate": 4.2116670418387365e-05, "loss": 0.7281522750854492, "memory(GiB)": 91.52, "step": 88300, "token_acc": 0.7905853274912931, "train_speed(iter/s)": 0.134401 }, { "epoch": 1.1458206955613441, "grad_norm": 0.6496248841285706, "learning_rate": 4.211137379907422e-05, "loss": 0.7244687080383301, "memory(GiB)": 91.52, "step": 88305, "token_acc": 0.7879249291784702, "train_speed(iter/s)": 0.1344 }, { "epoch": 1.1458855739629998, "grad_norm": 0.7898886203765869, "learning_rate": 4.2106077270542856e-05, "loss": 0.7611661911010742, "memory(GiB)": 91.52, "step": 88310, "token_acc": 0.7870125466259749, "train_speed(iter/s)": 0.134399 }, { "epoch": 1.1459504523646555, "grad_norm": 0.7530964016914368, "learning_rate": 4.210078083285426e-05, "loss": 0.7573648452758789, "memory(GiB)": 91.52, "step": 88315, "token_acc": 0.7806527062091145, "train_speed(iter/s)": 0.134398 }, { "epoch": 1.1460153307663112, "grad_norm": 0.6645600199699402, "learning_rate": 4.209548448606938e-05, "loss": 0.7648794174194335, "memory(GiB)": 91.52, "step": 88320, "token_acc": 0.788574073461831, "train_speed(iter/s)": 0.134397 }, { "epoch": 1.146080209167967, "grad_norm": 0.6202709078788757, "learning_rate": 4.209018823024915e-05, "loss": 0.70445876121521, "memory(GiB)": 91.52, "step": 88325, "token_acc": 0.8130504267401135, "train_speed(iter/s)": 0.134396 }, { "epoch": 1.1461450875696226, "grad_norm": 0.6261945962905884, "learning_rate": 4.2084892065454524e-05, "loss": 0.7743989944458007, "memory(GiB)": 91.52, "step": 88330, "token_acc": 0.7665883634630443, "train_speed(iter/s)": 0.134396 }, { "epoch": 1.1462099659712783, "grad_norm": 0.6408817768096924, "learning_rate": 4.2079595991746435e-05, "loss": 0.7403875350952148, "memory(GiB)": 91.52, "step": 88335, "token_acc": 0.7915682863178825, "train_speed(iter/s)": 0.134395 }, { "epoch": 1.146274844372934, "grad_norm": 0.7046403288841248, "learning_rate": 4.207430000918586e-05, "loss": 0.7468387603759765, "memory(GiB)": 91.52, "step": 88340, "token_acc": 0.7989522156967432, "train_speed(iter/s)": 0.134393 }, { "epoch": 1.1463397227745897, "grad_norm": 0.6643826961517334, "learning_rate": 4.206900411783372e-05, "loss": 0.7784123420715332, "memory(GiB)": 91.52, "step": 88345, "token_acc": 0.778247539131838, "train_speed(iter/s)": 0.134393 }, { "epoch": 1.1464046011762454, "grad_norm": 0.7435877919197083, "learning_rate": 4.206370831775098e-05, "loss": 0.7586521148681641, "memory(GiB)": 91.52, "step": 88350, "token_acc": 0.7742968440512347, "train_speed(iter/s)": 0.134392 }, { "epoch": 1.1464694795779011, "grad_norm": 0.7229765057563782, "learning_rate": 4.2058412608998586e-05, "loss": 0.7744170188903808, "memory(GiB)": 91.52, "step": 88355, "token_acc": 0.7637156904242504, "train_speed(iter/s)": 0.134391 }, { "epoch": 1.1465343579795568, "grad_norm": 0.6926152110099792, "learning_rate": 4.205311699163745e-05, "loss": 0.7819928169250489, "memory(GiB)": 91.52, "step": 88360, "token_acc": 0.7815027393686408, "train_speed(iter/s)": 0.13439 }, { "epoch": 1.1465992363812125, "grad_norm": 0.6553632020950317, "learning_rate": 4.204782146572854e-05, "loss": 0.7448570251464843, "memory(GiB)": 91.52, "step": 88365, "token_acc": 0.7878749724574271, "train_speed(iter/s)": 0.134388 }, { "epoch": 1.1466641147828682, "grad_norm": 0.6927667260169983, "learning_rate": 4.2042526031332786e-05, "loss": 0.7695859909057617, "memory(GiB)": 91.52, "step": 88370, "token_acc": 0.7855646100116415, "train_speed(iter/s)": 0.134388 }, { "epoch": 1.146728993184524, "grad_norm": 0.659049391746521, "learning_rate": 4.203723068851113e-05, "loss": 0.7216962814331055, "memory(GiB)": 91.52, "step": 88375, "token_acc": 0.802000284738041, "train_speed(iter/s)": 0.134387 }, { "epoch": 1.1467938715861796, "grad_norm": 0.785844087600708, "learning_rate": 4.20319354373245e-05, "loss": 0.7477731704711914, "memory(GiB)": 91.52, "step": 88380, "token_acc": 0.793170223066324, "train_speed(iter/s)": 0.134386 }, { "epoch": 1.1468587499878353, "grad_norm": 0.7317821383476257, "learning_rate": 4.202664027783386e-05, "loss": 0.752492094039917, "memory(GiB)": 91.52, "step": 88385, "token_acc": 0.7815304175793119, "train_speed(iter/s)": 0.134385 }, { "epoch": 1.146923628389491, "grad_norm": 0.7430275082588196, "learning_rate": 4.202134521010014e-05, "loss": 0.7736562252044678, "memory(GiB)": 91.52, "step": 88390, "token_acc": 0.7785058977719528, "train_speed(iter/s)": 0.134384 }, { "epoch": 1.1469885067911467, "grad_norm": 0.8403403162956238, "learning_rate": 4.201605023418424e-05, "loss": 0.7866199493408204, "memory(GiB)": 91.52, "step": 88395, "token_acc": 0.7622349823321555, "train_speed(iter/s)": 0.134384 }, { "epoch": 1.1470533851928024, "grad_norm": 0.7749624252319336, "learning_rate": 4.201075535014713e-05, "loss": 0.731314754486084, "memory(GiB)": 91.52, "step": 88400, "token_acc": 0.7849278601242318, "train_speed(iter/s)": 0.134383 }, { "epoch": 1.1471182635944581, "grad_norm": 0.8219757676124573, "learning_rate": 4.2005460558049734e-05, "loss": 0.7958084106445312, "memory(GiB)": 91.52, "step": 88405, "token_acc": 0.7769086748491782, "train_speed(iter/s)": 0.134382 }, { "epoch": 1.1471831419961138, "grad_norm": 0.7600803971290588, "learning_rate": 4.2000165857952987e-05, "loss": 0.7882384777069091, "memory(GiB)": 91.52, "step": 88410, "token_acc": 0.7786516853932585, "train_speed(iter/s)": 0.134382 }, { "epoch": 1.1472480203977695, "grad_norm": 0.7797754406929016, "learning_rate": 4.1994871249917804e-05, "loss": 0.7669005870819092, "memory(GiB)": 91.52, "step": 88415, "token_acc": 0.7778248437996399, "train_speed(iter/s)": 0.134381 }, { "epoch": 1.1473128987994252, "grad_norm": 0.701585054397583, "learning_rate": 4.1989576734005145e-05, "loss": 0.7757769584655761, "memory(GiB)": 91.52, "step": 88420, "token_acc": 0.7668802365697388, "train_speed(iter/s)": 0.13438 }, { "epoch": 1.147377777201081, "grad_norm": 0.5763723850250244, "learning_rate": 4.198428231027592e-05, "loss": 0.7287098407745362, "memory(GiB)": 91.52, "step": 88425, "token_acc": 0.7764780271893772, "train_speed(iter/s)": 0.134379 }, { "epoch": 1.1474426556027366, "grad_norm": 0.6749125123023987, "learning_rate": 4.1978987978791054e-05, "loss": 0.7683182716369629, "memory(GiB)": 91.52, "step": 88430, "token_acc": 0.7857077219958954, "train_speed(iter/s)": 0.134378 }, { "epoch": 1.1475075340043923, "grad_norm": 0.8009558916091919, "learning_rate": 4.197369373961147e-05, "loss": 0.7621035575866699, "memory(GiB)": 91.52, "step": 88435, "token_acc": 0.78529183142384, "train_speed(iter/s)": 0.134378 }, { "epoch": 1.147572412406048, "grad_norm": 0.7413989901542664, "learning_rate": 4.196839959279811e-05, "loss": 0.7762440204620361, "memory(GiB)": 91.52, "step": 88440, "token_acc": 0.7696326255476913, "train_speed(iter/s)": 0.134377 }, { "epoch": 1.1476372908077037, "grad_norm": 0.8263944387435913, "learning_rate": 4.196310553841189e-05, "loss": 0.7698781967163086, "memory(GiB)": 91.52, "step": 88445, "token_acc": 0.7921875, "train_speed(iter/s)": 0.134376 }, { "epoch": 1.1477021692093594, "grad_norm": 0.7036772966384888, "learning_rate": 4.195781157651372e-05, "loss": 0.7967310905456543, "memory(GiB)": 91.52, "step": 88450, "token_acc": 0.7767881611400383, "train_speed(iter/s)": 0.134376 }, { "epoch": 1.1477670476110151, "grad_norm": 0.7083677649497986, "learning_rate": 4.195251770716455e-05, "loss": 0.7899791717529296, "memory(GiB)": 91.52, "step": 88455, "token_acc": 0.7745765165773852, "train_speed(iter/s)": 0.134375 }, { "epoch": 1.1478319260126708, "grad_norm": 0.6825978755950928, "learning_rate": 4.194722393042529e-05, "loss": 0.7405952453613281, "memory(GiB)": 91.52, "step": 88460, "token_acc": 0.7976888523128622, "train_speed(iter/s)": 0.134374 }, { "epoch": 1.1478968044143265, "grad_norm": 0.7737236022949219, "learning_rate": 4.194193024635687e-05, "loss": 0.7517275810241699, "memory(GiB)": 91.52, "step": 88465, "token_acc": 0.7762124601399267, "train_speed(iter/s)": 0.134373 }, { "epoch": 1.1479616828159822, "grad_norm": 0.7248621582984924, "learning_rate": 4.193663665502018e-05, "loss": 0.7623584747314454, "memory(GiB)": 91.52, "step": 88470, "token_acc": 0.7761541016273663, "train_speed(iter/s)": 0.134373 }, { "epoch": 1.148026561217638, "grad_norm": 0.7359911203384399, "learning_rate": 4.193134315647616e-05, "loss": 0.7754881381988525, "memory(GiB)": 91.52, "step": 88475, "token_acc": 0.7862148801702021, "train_speed(iter/s)": 0.134371 }, { "epoch": 1.1480914396192936, "grad_norm": 0.6495792865753174, "learning_rate": 4.1926049750785724e-05, "loss": 0.746858024597168, "memory(GiB)": 91.52, "step": 88480, "token_acc": 0.7834269943138363, "train_speed(iter/s)": 0.13437 }, { "epoch": 1.1481563180209493, "grad_norm": 0.6713228821754456, "learning_rate": 4.1920756438009776e-05, "loss": 0.7108184814453125, "memory(GiB)": 91.52, "step": 88485, "token_acc": 0.7854742722966681, "train_speed(iter/s)": 0.134369 }, { "epoch": 1.148221196422605, "grad_norm": 0.7240744233131409, "learning_rate": 4.191546321820926e-05, "loss": 0.7615365982055664, "memory(GiB)": 91.52, "step": 88490, "token_acc": 0.7606475883380357, "train_speed(iter/s)": 0.134369 }, { "epoch": 1.1482860748242607, "grad_norm": 0.7437121868133545, "learning_rate": 4.191017009144507e-05, "loss": 0.7893843173980712, "memory(GiB)": 91.52, "step": 88495, "token_acc": 0.7796249366447034, "train_speed(iter/s)": 0.134368 }, { "epoch": 1.1483509532259164, "grad_norm": 0.7391822338104248, "learning_rate": 4.190487705777813e-05, "loss": 0.7578961372375488, "memory(GiB)": 91.52, "step": 88500, "token_acc": 0.7793050079227622, "train_speed(iter/s)": 0.134368 }, { "epoch": 1.148415831627572, "grad_norm": 0.7267091274261475, "learning_rate": 4.189958411726933e-05, "loss": 0.7849105834960938, "memory(GiB)": 91.52, "step": 88505, "token_acc": 0.7968432129077516, "train_speed(iter/s)": 0.134367 }, { "epoch": 1.1484807100292278, "grad_norm": 0.8178050518035889, "learning_rate": 4.18942912699796e-05, "loss": 0.7812739372253418, "memory(GiB)": 91.52, "step": 88510, "token_acc": 0.757501172058134, "train_speed(iter/s)": 0.134367 }, { "epoch": 1.1485455884308835, "grad_norm": 0.7598398923873901, "learning_rate": 4.188899851596985e-05, "loss": 0.7587443828582764, "memory(GiB)": 91.52, "step": 88515, "token_acc": 0.785705667055464, "train_speed(iter/s)": 0.134366 }, { "epoch": 1.1486104668325392, "grad_norm": 0.7886874675750732, "learning_rate": 4.188370585530096e-05, "loss": 0.7923937320709229, "memory(GiB)": 91.52, "step": 88520, "token_acc": 0.7687194097450508, "train_speed(iter/s)": 0.134365 }, { "epoch": 1.148675345234195, "grad_norm": 0.6383728384971619, "learning_rate": 4.1878413288033877e-05, "loss": 0.7096503257751465, "memory(GiB)": 91.52, "step": 88525, "token_acc": 0.7897631285719421, "train_speed(iter/s)": 0.134365 }, { "epoch": 1.1487402236358506, "grad_norm": 0.6924296617507935, "learning_rate": 4.1873120814229494e-05, "loss": 0.7389664649963379, "memory(GiB)": 91.52, "step": 88530, "token_acc": 0.7875682519585344, "train_speed(iter/s)": 0.134364 }, { "epoch": 1.1488051020375063, "grad_norm": 0.7092418670654297, "learning_rate": 4.1867828433948724e-05, "loss": 0.7864705562591553, "memory(GiB)": 91.52, "step": 88535, "token_acc": 0.7930788048706743, "train_speed(iter/s)": 0.134363 }, { "epoch": 1.148869980439162, "grad_norm": 0.6613959074020386, "learning_rate": 4.186253614725244e-05, "loss": 0.7262049674987793, "memory(GiB)": 91.52, "step": 88540, "token_acc": 0.7865980644370942, "train_speed(iter/s)": 0.134363 }, { "epoch": 1.1489348588408177, "grad_norm": 0.7430468797683716, "learning_rate": 4.1857243954201576e-05, "loss": 0.7668214797973633, "memory(GiB)": 91.52, "step": 88545, "token_acc": 0.7781451612903226, "train_speed(iter/s)": 0.134362 }, { "epoch": 1.1489997372424732, "grad_norm": 0.8155292272567749, "learning_rate": 4.185195185485702e-05, "loss": 0.7674009799957275, "memory(GiB)": 91.52, "step": 88550, "token_acc": 0.7701905937200055, "train_speed(iter/s)": 0.134361 }, { "epoch": 1.149064615644129, "grad_norm": 0.6476951837539673, "learning_rate": 4.184665984927967e-05, "loss": 0.737978458404541, "memory(GiB)": 91.52, "step": 88555, "token_acc": 0.7922998986828774, "train_speed(iter/s)": 0.134361 }, { "epoch": 1.1491294940457846, "grad_norm": 0.8091416954994202, "learning_rate": 4.1841367937530454e-05, "loss": 0.7696374893188477, "memory(GiB)": 91.52, "step": 88560, "token_acc": 0.7928331038702768, "train_speed(iter/s)": 0.13436 }, { "epoch": 1.1491943724474405, "grad_norm": 0.6938063502311707, "learning_rate": 4.1836076119670236e-05, "loss": 0.7786121845245362, "memory(GiB)": 91.52, "step": 88565, "token_acc": 0.7784048437733593, "train_speed(iter/s)": 0.134359 }, { "epoch": 1.149259250849096, "grad_norm": 0.8154862523078918, "learning_rate": 4.1830784395759945e-05, "loss": 0.7758280277252197, "memory(GiB)": 91.52, "step": 88570, "token_acc": 0.7808294761364681, "train_speed(iter/s)": 0.134358 }, { "epoch": 1.149324129250752, "grad_norm": 0.7374590039253235, "learning_rate": 4.1825492765860444e-05, "loss": 0.7557351112365722, "memory(GiB)": 91.52, "step": 88575, "token_acc": 0.7570458989991947, "train_speed(iter/s)": 0.134357 }, { "epoch": 1.1493890076524074, "grad_norm": 0.7942487001419067, "learning_rate": 4.1820201230032653e-05, "loss": 0.7513912200927735, "memory(GiB)": 91.52, "step": 88580, "token_acc": 0.7666091299717214, "train_speed(iter/s)": 0.134357 }, { "epoch": 1.1494538860540633, "grad_norm": 0.6584351658821106, "learning_rate": 4.181490978833746e-05, "loss": 0.7815855979919434, "memory(GiB)": 91.52, "step": 88585, "token_acc": 0.7691047174865282, "train_speed(iter/s)": 0.134356 }, { "epoch": 1.1495187644557188, "grad_norm": 0.7639154195785522, "learning_rate": 4.1809618440835746e-05, "loss": 0.7653745651245117, "memory(GiB)": 91.52, "step": 88590, "token_acc": 0.782562038900067, "train_speed(iter/s)": 0.134356 }, { "epoch": 1.1495836428573747, "grad_norm": 0.7592513561248779, "learning_rate": 4.180432718758843e-05, "loss": 0.7596803665161133, "memory(GiB)": 91.52, "step": 88595, "token_acc": 0.7859564529495969, "train_speed(iter/s)": 0.134355 }, { "epoch": 1.1496485212590302, "grad_norm": 0.7835453152656555, "learning_rate": 4.179903602865639e-05, "loss": 0.7769640445709228, "memory(GiB)": 91.52, "step": 88600, "token_acc": 0.7676038454561944, "train_speed(iter/s)": 0.134355 }, { "epoch": 1.1497133996606859, "grad_norm": 0.7245311141014099, "learning_rate": 4.1793744964100524e-05, "loss": 0.7147633552551269, "memory(GiB)": 91.52, "step": 88605, "token_acc": 0.7892780277832127, "train_speed(iter/s)": 0.134354 }, { "epoch": 1.1497782780623416, "grad_norm": 0.7595716118812561, "learning_rate": 4.178845399398169e-05, "loss": 0.7430952072143555, "memory(GiB)": 91.52, "step": 88610, "token_acc": 0.80188432347553, "train_speed(iter/s)": 0.134353 }, { "epoch": 1.1498431564639973, "grad_norm": 0.6942127346992493, "learning_rate": 4.178316311836081e-05, "loss": 0.7113070964813233, "memory(GiB)": 91.52, "step": 88615, "token_acc": 0.7921023281751011, "train_speed(iter/s)": 0.134352 }, { "epoch": 1.149908034865653, "grad_norm": 0.678657591342926, "learning_rate": 4.177787233729876e-05, "loss": 0.7312829494476318, "memory(GiB)": 91.52, "step": 88620, "token_acc": 0.7880758057822659, "train_speed(iter/s)": 0.134351 }, { "epoch": 1.1499729132673087, "grad_norm": 0.6613963842391968, "learning_rate": 4.177258165085641e-05, "loss": 0.7327835083007812, "memory(GiB)": 91.52, "step": 88625, "token_acc": 0.7759063758855397, "train_speed(iter/s)": 0.13435 }, { "epoch": 1.1500377916689644, "grad_norm": 0.7647121548652649, "learning_rate": 4.176729105909468e-05, "loss": 0.7858344078063965, "memory(GiB)": 91.52, "step": 88630, "token_acc": 0.7739400407490056, "train_speed(iter/s)": 0.134349 }, { "epoch": 1.15010267007062, "grad_norm": 0.7976469397544861, "learning_rate": 4.176200056207443e-05, "loss": 0.7529121398925781, "memory(GiB)": 91.52, "step": 88635, "token_acc": 0.7917077986179665, "train_speed(iter/s)": 0.134349 }, { "epoch": 1.1501675484722758, "grad_norm": 0.8032996654510498, "learning_rate": 4.1756710159856544e-05, "loss": 0.8120233535766601, "memory(GiB)": 91.52, "step": 88640, "token_acc": 0.7780152778751139, "train_speed(iter/s)": 0.134348 }, { "epoch": 1.1502324268739315, "grad_norm": 0.7721975445747375, "learning_rate": 4.175141985250193e-05, "loss": 0.7892637252807617, "memory(GiB)": 91.52, "step": 88645, "token_acc": 0.7736102337334175, "train_speed(iter/s)": 0.134347 }, { "epoch": 1.1502973052755872, "grad_norm": 0.7092704176902771, "learning_rate": 4.174612964007143e-05, "loss": 0.7275418281555176, "memory(GiB)": 91.52, "step": 88650, "token_acc": 0.7822747518675692, "train_speed(iter/s)": 0.134346 }, { "epoch": 1.1503621836772429, "grad_norm": 0.7068308591842651, "learning_rate": 4.174083952262593e-05, "loss": 0.7591907501220703, "memory(GiB)": 91.52, "step": 88655, "token_acc": 0.7804265588312027, "train_speed(iter/s)": 0.134345 }, { "epoch": 1.1504270620788986, "grad_norm": 0.6419057250022888, "learning_rate": 4.173554950022631e-05, "loss": 0.770885705947876, "memory(GiB)": 91.52, "step": 88660, "token_acc": 0.7693443858327054, "train_speed(iter/s)": 0.134344 }, { "epoch": 1.1504919404805543, "grad_norm": 0.6976821422576904, "learning_rate": 4.1730259572933476e-05, "loss": 0.7591529369354248, "memory(GiB)": 91.52, "step": 88665, "token_acc": 0.7871482303892691, "train_speed(iter/s)": 0.134344 }, { "epoch": 1.15055681888221, "grad_norm": 0.7096119523048401, "learning_rate": 4.172496974080826e-05, "loss": 0.7328061103820801, "memory(GiB)": 91.52, "step": 88670, "token_acc": 0.800299091222823, "train_speed(iter/s)": 0.134343 }, { "epoch": 1.1506216972838657, "grad_norm": 0.7005694508552551, "learning_rate": 4.1719680003911576e-05, "loss": 0.7263062477111817, "memory(GiB)": 91.52, "step": 88675, "token_acc": 0.7816776965455112, "train_speed(iter/s)": 0.134342 }, { "epoch": 1.1506865756855214, "grad_norm": 0.7062340974807739, "learning_rate": 4.171439036230429e-05, "loss": 0.7115665435791015, "memory(GiB)": 91.52, "step": 88680, "token_acc": 0.7736285643088534, "train_speed(iter/s)": 0.134341 }, { "epoch": 1.150751454087177, "grad_norm": 0.7030295729637146, "learning_rate": 4.170910081604726e-05, "loss": 0.7673059463500976, "memory(GiB)": 91.52, "step": 88685, "token_acc": 0.7578819592533054, "train_speed(iter/s)": 0.13434 }, { "epoch": 1.1508163324888327, "grad_norm": 0.6244174838066101, "learning_rate": 4.170381136520136e-05, "loss": 0.7320613861083984, "memory(GiB)": 91.52, "step": 88690, "token_acc": 0.7977180227502, "train_speed(iter/s)": 0.134339 }, { "epoch": 1.1508812108904884, "grad_norm": 0.6728355884552002, "learning_rate": 4.169852200982745e-05, "loss": 0.7541440010070801, "memory(GiB)": 91.52, "step": 88695, "token_acc": 0.7878540660488187, "train_speed(iter/s)": 0.134338 }, { "epoch": 1.1509460892921441, "grad_norm": 0.7315468788146973, "learning_rate": 4.1693232749986434e-05, "loss": 0.7095829010009765, "memory(GiB)": 91.52, "step": 88700, "token_acc": 0.7857166561577008, "train_speed(iter/s)": 0.134337 }, { "epoch": 1.1510109676937998, "grad_norm": 0.6771226525306702, "learning_rate": 4.168794358573915e-05, "loss": 0.7542270660400391, "memory(GiB)": 91.52, "step": 88705, "token_acc": 0.7872734684801542, "train_speed(iter/s)": 0.134336 }, { "epoch": 1.1510758460954555, "grad_norm": 0.5919477939605713, "learning_rate": 4.168265451714648e-05, "loss": 0.7134439945220947, "memory(GiB)": 91.52, "step": 88710, "token_acc": 0.799877099549365, "train_speed(iter/s)": 0.134334 }, { "epoch": 1.1511407244971112, "grad_norm": 0.8343738913536072, "learning_rate": 4.16773655442693e-05, "loss": 0.7911423206329345, "memory(GiB)": 91.52, "step": 88715, "token_acc": 0.7785286901123499, "train_speed(iter/s)": 0.134334 }, { "epoch": 1.151205602898767, "grad_norm": 0.5937207341194153, "learning_rate": 4.167207666716845e-05, "loss": 0.7279552459716797, "memory(GiB)": 91.52, "step": 88720, "token_acc": 0.7875112620417215, "train_speed(iter/s)": 0.134333 }, { "epoch": 1.1512704813004226, "grad_norm": 0.7174897789955139, "learning_rate": 4.166678788590481e-05, "loss": 0.7301549434661865, "memory(GiB)": 91.52, "step": 88725, "token_acc": 0.7868711748022093, "train_speed(iter/s)": 0.134333 }, { "epoch": 1.1513353597020783, "grad_norm": 0.6824886202812195, "learning_rate": 4.166149920053923e-05, "loss": 0.7368780136108398, "memory(GiB)": 91.52, "step": 88730, "token_acc": 0.7773810773915782, "train_speed(iter/s)": 0.134332 }, { "epoch": 1.151400238103734, "grad_norm": 0.6750330924987793, "learning_rate": 4.165621061113259e-05, "loss": 0.7616455554962158, "memory(GiB)": 91.52, "step": 88735, "token_acc": 0.7919277488010642, "train_speed(iter/s)": 0.134331 }, { "epoch": 1.1514651165053897, "grad_norm": 0.6702504754066467, "learning_rate": 4.1650922117745716e-05, "loss": 0.7665340423583984, "memory(GiB)": 91.52, "step": 88740, "token_acc": 0.7800689862027594, "train_speed(iter/s)": 0.13433 }, { "epoch": 1.1515299949070454, "grad_norm": 0.6292784214019775, "learning_rate": 4.164563372043952e-05, "loss": 0.7090666770935059, "memory(GiB)": 91.52, "step": 88745, "token_acc": 0.8060576453346361, "train_speed(iter/s)": 0.134329 }, { "epoch": 1.1515948733087011, "grad_norm": 0.7128511071205139, "learning_rate": 4.164034541927484e-05, "loss": 0.7899508476257324, "memory(GiB)": 91.52, "step": 88750, "token_acc": 0.7856640039176078, "train_speed(iter/s)": 0.134328 }, { "epoch": 1.1516597517103568, "grad_norm": 0.6818573474884033, "learning_rate": 4.1635057214312515e-05, "loss": 0.738014554977417, "memory(GiB)": 91.52, "step": 88755, "token_acc": 0.797914597815293, "train_speed(iter/s)": 0.134327 }, { "epoch": 1.1517246301120125, "grad_norm": 0.7241778373718262, "learning_rate": 4.1629769105613415e-05, "loss": 0.7518102169036865, "memory(GiB)": 91.52, "step": 88760, "token_acc": 0.7885858130447851, "train_speed(iter/s)": 0.134327 }, { "epoch": 1.1517895085136682, "grad_norm": 0.703102707862854, "learning_rate": 4.1624481093238374e-05, "loss": 0.7843783855438232, "memory(GiB)": 91.52, "step": 88765, "token_acc": 0.7796843554708188, "train_speed(iter/s)": 0.134326 }, { "epoch": 1.151854386915324, "grad_norm": 0.774433434009552, "learning_rate": 4.161919317724829e-05, "loss": 0.7743505001068115, "memory(GiB)": 91.52, "step": 88770, "token_acc": 0.7687778112349025, "train_speed(iter/s)": 0.134326 }, { "epoch": 1.1519192653169796, "grad_norm": 0.7006446123123169, "learning_rate": 4.161390535770398e-05, "loss": 0.7538261413574219, "memory(GiB)": 91.52, "step": 88775, "token_acc": 0.788649577167019, "train_speed(iter/s)": 0.134325 }, { "epoch": 1.1519841437186353, "grad_norm": 0.7006314396858215, "learning_rate": 4.1608617634666303e-05, "loss": 0.7374663829803467, "memory(GiB)": 91.52, "step": 88780, "token_acc": 0.7844767238361918, "train_speed(iter/s)": 0.134324 }, { "epoch": 1.152049022120291, "grad_norm": 0.6900457739830017, "learning_rate": 4.160333000819613e-05, "loss": 0.7620491027832031, "memory(GiB)": 91.52, "step": 88785, "token_acc": 0.7761565307789411, "train_speed(iter/s)": 0.134323 }, { "epoch": 1.1521139005219467, "grad_norm": 0.7932555079460144, "learning_rate": 4.159804247835429e-05, "loss": 0.7453066349029541, "memory(GiB)": 91.52, "step": 88790, "token_acc": 0.7862636107515419, "train_speed(iter/s)": 0.134322 }, { "epoch": 1.1521787789236024, "grad_norm": 0.7007162570953369, "learning_rate": 4.159275504520164e-05, "loss": 0.7311413288116455, "memory(GiB)": 91.52, "step": 88795, "token_acc": 0.7890922138500179, "train_speed(iter/s)": 0.134322 }, { "epoch": 1.1522436573252581, "grad_norm": 0.8014063239097595, "learning_rate": 4.158746770879901e-05, "loss": 0.7709487438201904, "memory(GiB)": 91.52, "step": 88800, "token_acc": 0.7802686249879215, "train_speed(iter/s)": 0.134321 }, { "epoch": 1.1523085357269138, "grad_norm": 0.6634889245033264, "learning_rate": 4.158218046920727e-05, "loss": 0.7593876361846924, "memory(GiB)": 91.52, "step": 88805, "token_acc": 0.7756969650166036, "train_speed(iter/s)": 0.13432 }, { "epoch": 1.1523734141285695, "grad_norm": 0.6774401068687439, "learning_rate": 4.157689332648724e-05, "loss": 0.7457386970520019, "memory(GiB)": 91.52, "step": 88810, "token_acc": 0.8052826818712647, "train_speed(iter/s)": 0.134318 }, { "epoch": 1.1524382925302252, "grad_norm": 0.7093320488929749, "learning_rate": 4.157160628069979e-05, "loss": 0.7132461547851563, "memory(GiB)": 91.52, "step": 88815, "token_acc": 0.7926341306697041, "train_speed(iter/s)": 0.134317 }, { "epoch": 1.152503170931881, "grad_norm": 0.7248069047927856, "learning_rate": 4.156631933190576e-05, "loss": 0.7491748809814454, "memory(GiB)": 91.52, "step": 88820, "token_acc": 0.7971737601714496, "train_speed(iter/s)": 0.134316 }, { "epoch": 1.1525680493335366, "grad_norm": 0.6356683373451233, "learning_rate": 4.156103248016599e-05, "loss": 0.7183442115783691, "memory(GiB)": 91.52, "step": 88825, "token_acc": 0.7791331006816477, "train_speed(iter/s)": 0.134315 }, { "epoch": 1.1526329277351923, "grad_norm": 0.7052215337753296, "learning_rate": 4.155574572554129e-05, "loss": 0.7531803607940674, "memory(GiB)": 91.52, "step": 88830, "token_acc": 0.7891033623910336, "train_speed(iter/s)": 0.134314 }, { "epoch": 1.152697806136848, "grad_norm": 0.7840345501899719, "learning_rate": 4.155045906809255e-05, "loss": 0.7551225662231446, "memory(GiB)": 91.52, "step": 88835, "token_acc": 0.7682373714641831, "train_speed(iter/s)": 0.134313 }, { "epoch": 1.1527626845385037, "grad_norm": 0.6967939734458923, "learning_rate": 4.154517250788057e-05, "loss": 0.7476775646209717, "memory(GiB)": 91.52, "step": 88840, "token_acc": 0.7867093847870694, "train_speed(iter/s)": 0.134312 }, { "epoch": 1.1528275629401594, "grad_norm": 0.7204099893569946, "learning_rate": 4.1539886044966194e-05, "loss": 0.7904692649841308, "memory(GiB)": 91.52, "step": 88845, "token_acc": 0.7767154105736783, "train_speed(iter/s)": 0.134312 }, { "epoch": 1.1528924413418151, "grad_norm": 0.7729039788246155, "learning_rate": 4.1534599679410275e-05, "loss": 0.71005277633667, "memory(GiB)": 91.52, "step": 88850, "token_acc": 0.7826122250182644, "train_speed(iter/s)": 0.134311 }, { "epoch": 1.1529573197434708, "grad_norm": 0.7429335117340088, "learning_rate": 4.152931341127364e-05, "loss": 0.7402298927307129, "memory(GiB)": 91.52, "step": 88855, "token_acc": 0.7967732045681, "train_speed(iter/s)": 0.13431 }, { "epoch": 1.1530221981451265, "grad_norm": 0.777094841003418, "learning_rate": 4.152402724061713e-05, "loss": 0.7344244003295899, "memory(GiB)": 91.52, "step": 88860, "token_acc": 0.7941046425939573, "train_speed(iter/s)": 0.134309 }, { "epoch": 1.1530870765467822, "grad_norm": 0.7843533158302307, "learning_rate": 4.151874116750154e-05, "loss": 0.765858268737793, "memory(GiB)": 91.52, "step": 88865, "token_acc": 0.7946443185910237, "train_speed(iter/s)": 0.134308 }, { "epoch": 1.153151954948438, "grad_norm": 0.7408039569854736, "learning_rate": 4.1513455191987754e-05, "loss": 0.7387287139892578, "memory(GiB)": 91.52, "step": 88870, "token_acc": 0.7830763147489126, "train_speed(iter/s)": 0.134308 }, { "epoch": 1.1532168333500936, "grad_norm": 0.7530072927474976, "learning_rate": 4.150816931413658e-05, "loss": 0.7618844985961915, "memory(GiB)": 91.52, "step": 88875, "token_acc": 0.7790434610041675, "train_speed(iter/s)": 0.134307 }, { "epoch": 1.1532817117517493, "grad_norm": 0.6903296709060669, "learning_rate": 4.150288353400883e-05, "loss": 0.7652918815612793, "memory(GiB)": 91.52, "step": 88880, "token_acc": 0.773317362167762, "train_speed(iter/s)": 0.134307 }, { "epoch": 1.153346590153405, "grad_norm": 0.7468043565750122, "learning_rate": 4.149759785166536e-05, "loss": 0.742207384109497, "memory(GiB)": 91.52, "step": 88885, "token_acc": 0.8022987008758506, "train_speed(iter/s)": 0.134306 }, { "epoch": 1.1534114685550607, "grad_norm": 0.7234847545623779, "learning_rate": 4.149231226716699e-05, "loss": 0.7248624801635742, "memory(GiB)": 91.52, "step": 88890, "token_acc": 0.8088883293465945, "train_speed(iter/s)": 0.134305 }, { "epoch": 1.1534763469567164, "grad_norm": 0.7500243782997131, "learning_rate": 4.148702678057455e-05, "loss": 0.754033374786377, "memory(GiB)": 91.52, "step": 88895, "token_acc": 0.7986270845313399, "train_speed(iter/s)": 0.134304 }, { "epoch": 1.1535412253583721, "grad_norm": 0.7273382544517517, "learning_rate": 4.1481741391948835e-05, "loss": 0.7394155502319336, "memory(GiB)": 91.52, "step": 88900, "token_acc": 0.7901584070233475, "train_speed(iter/s)": 0.134303 }, { "epoch": 1.1536061037600278, "grad_norm": 0.7475613355636597, "learning_rate": 4.1476456101350705e-05, "loss": 0.7827951431274414, "memory(GiB)": 91.52, "step": 88905, "token_acc": 0.7713150175062297, "train_speed(iter/s)": 0.134302 }, { "epoch": 1.1536709821616835, "grad_norm": 0.5423612594604492, "learning_rate": 4.147117090884097e-05, "loss": 0.7278775215148926, "memory(GiB)": 91.52, "step": 88910, "token_acc": 0.7977020799601445, "train_speed(iter/s)": 0.134301 }, { "epoch": 1.1537358605633392, "grad_norm": 0.7065948247909546, "learning_rate": 4.1465885814480446e-05, "loss": 0.7482295513153077, "memory(GiB)": 91.52, "step": 88915, "token_acc": 0.776819317235637, "train_speed(iter/s)": 0.1343 }, { "epoch": 1.153800738964995, "grad_norm": 0.6326248049736023, "learning_rate": 4.1460600818329956e-05, "loss": 0.7259993553161621, "memory(GiB)": 91.52, "step": 88920, "token_acc": 0.8117695248498088, "train_speed(iter/s)": 0.134299 }, { "epoch": 1.1538656173666506, "grad_norm": 0.6749030351638794, "learning_rate": 4.145531592045033e-05, "loss": 0.7514790534973145, "memory(GiB)": 91.52, "step": 88925, "token_acc": 0.7976569725619155, "train_speed(iter/s)": 0.134299 }, { "epoch": 1.1539304957683063, "grad_norm": 0.6626773476600647, "learning_rate": 4.1450031120902396e-05, "loss": 0.7501926422119141, "memory(GiB)": 91.52, "step": 88930, "token_acc": 0.7754132159206066, "train_speed(iter/s)": 0.134298 }, { "epoch": 1.153995374169962, "grad_norm": 0.6819464564323425, "learning_rate": 4.144474641974692e-05, "loss": 0.7620053768157959, "memory(GiB)": 91.52, "step": 88935, "token_acc": 0.7961434661868815, "train_speed(iter/s)": 0.134297 }, { "epoch": 1.1540602525716177, "grad_norm": 0.8207980394363403, "learning_rate": 4.143946181704477e-05, "loss": 0.7694353103637696, "memory(GiB)": 91.52, "step": 88940, "token_acc": 0.7839705051981267, "train_speed(iter/s)": 0.134296 }, { "epoch": 1.1541251309732734, "grad_norm": 0.71928471326828, "learning_rate": 4.143417731285674e-05, "loss": 0.719597578048706, "memory(GiB)": 91.52, "step": 88945, "token_acc": 0.8011392533821585, "train_speed(iter/s)": 0.134295 }, { "epoch": 1.154190009374929, "grad_norm": 0.7038962244987488, "learning_rate": 4.142889290724364e-05, "loss": 0.7200666427612304, "memory(GiB)": 91.52, "step": 88950, "token_acc": 0.7980662779540926, "train_speed(iter/s)": 0.134294 }, { "epoch": 1.1542548877765848, "grad_norm": 0.6867343187332153, "learning_rate": 4.142360860026629e-05, "loss": 0.7466578483581543, "memory(GiB)": 91.52, "step": 88955, "token_acc": 0.7831368345837799, "train_speed(iter/s)": 0.134294 }, { "epoch": 1.1543197661782405, "grad_norm": 0.6637178659439087, "learning_rate": 4.1418324391985505e-05, "loss": 0.7110044956207275, "memory(GiB)": 91.52, "step": 88960, "token_acc": 0.7996935452978357, "train_speed(iter/s)": 0.134293 }, { "epoch": 1.1543846445798962, "grad_norm": 0.7064062356948853, "learning_rate": 4.1413040282462104e-05, "loss": 0.7256811141967774, "memory(GiB)": 91.52, "step": 88965, "token_acc": 0.7845313072744843, "train_speed(iter/s)": 0.134292 }, { "epoch": 1.154449522981552, "grad_norm": 0.6866298913955688, "learning_rate": 4.140775627175686e-05, "loss": 0.7379539489746094, "memory(GiB)": 91.52, "step": 88970, "token_acc": 0.7956284985339477, "train_speed(iter/s)": 0.134291 }, { "epoch": 1.1545144013832076, "grad_norm": 0.7297512292861938, "learning_rate": 4.140247235993062e-05, "loss": 0.7243529319763183, "memory(GiB)": 91.52, "step": 88975, "token_acc": 0.7996535978586049, "train_speed(iter/s)": 0.13429 }, { "epoch": 1.1545792797848633, "grad_norm": 0.6569032073020935, "learning_rate": 4.1397188547044165e-05, "loss": 0.7665425300598144, "memory(GiB)": 91.52, "step": 88980, "token_acc": 0.78351755526658, "train_speed(iter/s)": 0.134289 }, { "epoch": 1.154644158186519, "grad_norm": 0.739289402961731, "learning_rate": 4.13919048331583e-05, "loss": 0.743277645111084, "memory(GiB)": 91.52, "step": 88985, "token_acc": 0.7817087429091066, "train_speed(iter/s)": 0.134289 }, { "epoch": 1.1547090365881747, "grad_norm": 0.733821451663971, "learning_rate": 4.1386621218333854e-05, "loss": 0.727168607711792, "memory(GiB)": 91.52, "step": 88990, "token_acc": 0.8031234628627644, "train_speed(iter/s)": 0.134288 }, { "epoch": 1.1547739149898304, "grad_norm": 0.7434675097465515, "learning_rate": 4.1381337702631615e-05, "loss": 0.7681728363037109, "memory(GiB)": 91.52, "step": 88995, "token_acc": 0.7901504124211548, "train_speed(iter/s)": 0.134287 }, { "epoch": 1.154838793391486, "grad_norm": 0.7561978101730347, "learning_rate": 4.137605428611238e-05, "loss": 0.7928287982940674, "memory(GiB)": 91.52, "step": 89000, "token_acc": 0.7738723271421882, "train_speed(iter/s)": 0.134287 }, { "epoch": 1.1549036717931418, "grad_norm": 0.6837778091430664, "learning_rate": 4.137077096883698e-05, "loss": 0.7784607887268067, "memory(GiB)": 91.52, "step": 89005, "token_acc": 0.7895011169024572, "train_speed(iter/s)": 0.134286 }, { "epoch": 1.1549685501947975, "grad_norm": 0.7060525417327881, "learning_rate": 4.136548775086618e-05, "loss": 0.7549237728118896, "memory(GiB)": 91.52, "step": 89010, "token_acc": 0.78660469047116, "train_speed(iter/s)": 0.134285 }, { "epoch": 1.1550334285964532, "grad_norm": 0.7867281436920166, "learning_rate": 4.136020463226079e-05, "loss": 0.7745987892150878, "memory(GiB)": 91.52, "step": 89015, "token_acc": 0.7840429750874284, "train_speed(iter/s)": 0.134285 }, { "epoch": 1.155098306998109, "grad_norm": 0.7059725522994995, "learning_rate": 4.13549216130816e-05, "loss": 0.740293312072754, "memory(GiB)": 91.52, "step": 89020, "token_acc": 0.7964224150111623, "train_speed(iter/s)": 0.134284 }, { "epoch": 1.1551631853997644, "grad_norm": 0.7773672938346863, "learning_rate": 4.134963869338942e-05, "loss": 0.7882817268371582, "memory(GiB)": 91.52, "step": 89025, "token_acc": 0.7772936636374953, "train_speed(iter/s)": 0.134283 }, { "epoch": 1.1552280638014203, "grad_norm": 0.7016081213951111, "learning_rate": 4.1344355873245046e-05, "loss": 0.7372269630432129, "memory(GiB)": 91.52, "step": 89030, "token_acc": 0.7802150251306461, "train_speed(iter/s)": 0.134282 }, { "epoch": 1.1552929422030758, "grad_norm": 0.7266326546669006, "learning_rate": 4.133907315270926e-05, "loss": 0.7555527687072754, "memory(GiB)": 91.52, "step": 89035, "token_acc": 0.7961771016188804, "train_speed(iter/s)": 0.134281 }, { "epoch": 1.1553578206047317, "grad_norm": 0.7826082706451416, "learning_rate": 4.133379053184287e-05, "loss": 0.7514330863952636, "memory(GiB)": 91.52, "step": 89040, "token_acc": 0.7824594665271967, "train_speed(iter/s)": 0.13428 }, { "epoch": 1.1554226990063872, "grad_norm": 0.7647123336791992, "learning_rate": 4.132850801070667e-05, "loss": 0.768031120300293, "memory(GiB)": 91.52, "step": 89045, "token_acc": 0.7785775094806904, "train_speed(iter/s)": 0.134279 }, { "epoch": 1.155487577408043, "grad_norm": 0.6855292916297913, "learning_rate": 4.132322558936143e-05, "loss": 0.7063207626342773, "memory(GiB)": 91.52, "step": 89050, "token_acc": 0.7740726195151558, "train_speed(iter/s)": 0.134279 }, { "epoch": 1.1555524558096986, "grad_norm": 0.7011728286743164, "learning_rate": 4.131794326786794e-05, "loss": 0.7447373390197753, "memory(GiB)": 91.52, "step": 89055, "token_acc": 0.7857358490566038, "train_speed(iter/s)": 0.134278 }, { "epoch": 1.1556173342113545, "grad_norm": 0.6892698407173157, "learning_rate": 4.131266104628702e-05, "loss": 0.711127758026123, "memory(GiB)": 91.52, "step": 89060, "token_acc": 0.792676074957861, "train_speed(iter/s)": 0.134277 }, { "epoch": 1.15568221261301, "grad_norm": 0.6958277225494385, "learning_rate": 4.130737892467942e-05, "loss": 0.7785576820373535, "memory(GiB)": 91.52, "step": 89065, "token_acc": 0.7784457281031704, "train_speed(iter/s)": 0.134276 }, { "epoch": 1.1557470910146659, "grad_norm": 0.6309027671813965, "learning_rate": 4.1302096903105954e-05, "loss": 0.664164686203003, "memory(GiB)": 91.52, "step": 89070, "token_acc": 0.8106074973133574, "train_speed(iter/s)": 0.134275 }, { "epoch": 1.1558119694163214, "grad_norm": 0.6861658096313477, "learning_rate": 4.129681498162741e-05, "loss": 0.7379987716674805, "memory(GiB)": 91.52, "step": 89075, "token_acc": 0.7916390410719893, "train_speed(iter/s)": 0.134274 }, { "epoch": 1.155876847817977, "grad_norm": 0.757164478302002, "learning_rate": 4.129153316030455e-05, "loss": 0.7664042472839355, "memory(GiB)": 91.52, "step": 89080, "token_acc": 0.7878218380003759, "train_speed(iter/s)": 0.134273 }, { "epoch": 1.1559417262196328, "grad_norm": 0.6951079964637756, "learning_rate": 4.128625143919816e-05, "loss": 0.7713384628295898, "memory(GiB)": 91.52, "step": 89085, "token_acc": 0.7833645263460984, "train_speed(iter/s)": 0.134272 }, { "epoch": 1.1560066046212885, "grad_norm": 0.7127852439880371, "learning_rate": 4.128096981836902e-05, "loss": 0.8154644012451172, "memory(GiB)": 91.52, "step": 89090, "token_acc": 0.7635065910798443, "train_speed(iter/s)": 0.134271 }, { "epoch": 1.1560714830229442, "grad_norm": 0.6773269176483154, "learning_rate": 4.127568829787793e-05, "loss": 0.750458812713623, "memory(GiB)": 91.52, "step": 89095, "token_acc": 0.7842523048446232, "train_speed(iter/s)": 0.134271 }, { "epoch": 1.1561363614245999, "grad_norm": 0.6540678143501282, "learning_rate": 4.1270406877785644e-05, "loss": 0.7471665382385254, "memory(GiB)": 91.52, "step": 89100, "token_acc": 0.7773558586484811, "train_speed(iter/s)": 0.13427 }, { "epoch": 1.1562012398262556, "grad_norm": 0.6830919981002808, "learning_rate": 4.1265125558152964e-05, "loss": 0.7865782260894776, "memory(GiB)": 91.52, "step": 89105, "token_acc": 0.7701286931303245, "train_speed(iter/s)": 0.134269 }, { "epoch": 1.1562661182279113, "grad_norm": 0.7821295261383057, "learning_rate": 4.125984433904068e-05, "loss": 0.7689859390258789, "memory(GiB)": 91.52, "step": 89110, "token_acc": 0.7791431792559188, "train_speed(iter/s)": 0.134269 }, { "epoch": 1.156330996629567, "grad_norm": 0.7539458274841309, "learning_rate": 4.125456322050952e-05, "loss": 0.7414932250976562, "memory(GiB)": 91.52, "step": 89115, "token_acc": 0.7871055226824457, "train_speed(iter/s)": 0.134268 }, { "epoch": 1.1563958750312227, "grad_norm": 0.7912964224815369, "learning_rate": 4.1249282202620285e-05, "loss": 0.7682958126068116, "memory(GiB)": 91.52, "step": 89120, "token_acc": 0.7585665229780083, "train_speed(iter/s)": 0.134267 }, { "epoch": 1.1564607534328784, "grad_norm": 0.6701748371124268, "learning_rate": 4.124400128543374e-05, "loss": 0.7199945449829102, "memory(GiB)": 91.52, "step": 89125, "token_acc": 0.7997820014987397, "train_speed(iter/s)": 0.134266 }, { "epoch": 1.156525631834534, "grad_norm": 0.7567071914672852, "learning_rate": 4.123872046901068e-05, "loss": 0.7874890327453613, "memory(GiB)": 91.52, "step": 89130, "token_acc": 0.7830350576121599, "train_speed(iter/s)": 0.134265 }, { "epoch": 1.1565905102361898, "grad_norm": 0.7269290089607239, "learning_rate": 4.123343975341184e-05, "loss": 0.7185891151428223, "memory(GiB)": 91.52, "step": 89135, "token_acc": 0.7947335159047819, "train_speed(iter/s)": 0.134264 }, { "epoch": 1.1566553886378454, "grad_norm": 0.7767406702041626, "learning_rate": 4.122815913869803e-05, "loss": 0.7423874855041503, "memory(GiB)": 91.52, "step": 89140, "token_acc": 0.7782066324955228, "train_speed(iter/s)": 0.134264 }, { "epoch": 1.1567202670395011, "grad_norm": 0.7347124814987183, "learning_rate": 4.122287862493001e-05, "loss": 0.7636178970336914, "memory(GiB)": 91.52, "step": 89145, "token_acc": 0.7873702983138781, "train_speed(iter/s)": 0.134263 }, { "epoch": 1.1567851454411568, "grad_norm": 0.681040346622467, "learning_rate": 4.1217598212168524e-05, "loss": 0.7186817169189453, "memory(GiB)": 91.52, "step": 89150, "token_acc": 0.7917052740817613, "train_speed(iter/s)": 0.134262 }, { "epoch": 1.1568500238428125, "grad_norm": 0.6717743277549744, "learning_rate": 4.121231790047435e-05, "loss": 0.7848525047302246, "memory(GiB)": 91.52, "step": 89155, "token_acc": 0.792075297834529, "train_speed(iter/s)": 0.134261 }, { "epoch": 1.1569149022444682, "grad_norm": 0.728776216506958, "learning_rate": 4.120703768990825e-05, "loss": 0.7721218585968017, "memory(GiB)": 91.52, "step": 89160, "token_acc": 0.7701185715288009, "train_speed(iter/s)": 0.13426 }, { "epoch": 1.156979780646124, "grad_norm": 0.7039821147918701, "learning_rate": 4.1201757580531e-05, "loss": 0.725321626663208, "memory(GiB)": 91.52, "step": 89165, "token_acc": 0.7888634775427228, "train_speed(iter/s)": 0.134259 }, { "epoch": 1.1570446590477796, "grad_norm": 0.7294819951057434, "learning_rate": 4.119647757240336e-05, "loss": 0.7603461742401123, "memory(GiB)": 91.52, "step": 89170, "token_acc": 0.7815013404825737, "train_speed(iter/s)": 0.134259 }, { "epoch": 1.1571095374494353, "grad_norm": 0.7255065441131592, "learning_rate": 4.119119766558609e-05, "loss": 0.7561186790466309, "memory(GiB)": 91.52, "step": 89175, "token_acc": 0.803664840437057, "train_speed(iter/s)": 0.134258 }, { "epoch": 1.157174415851091, "grad_norm": 0.6998085975646973, "learning_rate": 4.118591786013995e-05, "loss": 0.7873578548431397, "memory(GiB)": 91.52, "step": 89180, "token_acc": 0.7769303737431228, "train_speed(iter/s)": 0.134257 }, { "epoch": 1.1572392942527467, "grad_norm": 0.645393431186676, "learning_rate": 4.118063815612571e-05, "loss": 0.7714863777160644, "memory(GiB)": 91.52, "step": 89185, "token_acc": 0.7866556186868687, "train_speed(iter/s)": 0.134256 }, { "epoch": 1.1573041726544024, "grad_norm": 0.7400208711624146, "learning_rate": 4.117535855360411e-05, "loss": 0.7716323375701905, "memory(GiB)": 91.52, "step": 89190, "token_acc": 0.7722679881580641, "train_speed(iter/s)": 0.134255 }, { "epoch": 1.1573690510560581, "grad_norm": 0.7084410190582275, "learning_rate": 4.1170079052635905e-05, "loss": 0.7412469863891602, "memory(GiB)": 91.52, "step": 89195, "token_acc": 0.7700897229949457, "train_speed(iter/s)": 0.134254 }, { "epoch": 1.1574339294577138, "grad_norm": 0.7032425403594971, "learning_rate": 4.116479965328187e-05, "loss": 0.7207651138305664, "memory(GiB)": 91.52, "step": 89200, "token_acc": 0.8024250834241914, "train_speed(iter/s)": 0.134253 }, { "epoch": 1.1574988078593695, "grad_norm": 0.6604031920433044, "learning_rate": 4.115952035560274e-05, "loss": 0.7569313049316406, "memory(GiB)": 91.52, "step": 89205, "token_acc": 0.7700676090964966, "train_speed(iter/s)": 0.134252 }, { "epoch": 1.1575636862610252, "grad_norm": 0.6931920051574707, "learning_rate": 4.11542411596593e-05, "loss": 0.7614672660827637, "memory(GiB)": 91.52, "step": 89210, "token_acc": 0.7781858007988661, "train_speed(iter/s)": 0.134252 }, { "epoch": 1.157628564662681, "grad_norm": 0.7835177779197693, "learning_rate": 4.1148962065512284e-05, "loss": 0.7898493766784668, "memory(GiB)": 91.52, "step": 89215, "token_acc": 0.7696454488907319, "train_speed(iter/s)": 0.134251 }, { "epoch": 1.1576934430643366, "grad_norm": 0.7783824801445007, "learning_rate": 4.1143683073222445e-05, "loss": 0.7578680992126465, "memory(GiB)": 91.52, "step": 89220, "token_acc": 0.7866854336367992, "train_speed(iter/s)": 0.13425 }, { "epoch": 1.1577583214659923, "grad_norm": 0.7095963358879089, "learning_rate": 4.113840418285051e-05, "loss": 0.7293922424316406, "memory(GiB)": 91.52, "step": 89225, "token_acc": 0.7875655487229788, "train_speed(iter/s)": 0.134249 }, { "epoch": 1.157823199867648, "grad_norm": 0.6468709111213684, "learning_rate": 4.113312539445727e-05, "loss": 0.7459277629852294, "memory(GiB)": 91.52, "step": 89230, "token_acc": 0.7960044315785711, "train_speed(iter/s)": 0.134248 }, { "epoch": 1.1578880782693037, "grad_norm": 0.6992401480674744, "learning_rate": 4.112784670810344e-05, "loss": 0.8071701049804687, "memory(GiB)": 91.52, "step": 89235, "token_acc": 0.7617082298625442, "train_speed(iter/s)": 0.134248 }, { "epoch": 1.1579529566709594, "grad_norm": 0.6836540102958679, "learning_rate": 4.1122568123849774e-05, "loss": 0.7659790515899658, "memory(GiB)": 91.52, "step": 89240, "token_acc": 0.779356029397225, "train_speed(iter/s)": 0.134247 }, { "epoch": 1.1580178350726151, "grad_norm": 0.6429727077484131, "learning_rate": 4.111728964175704e-05, "loss": 0.736335277557373, "memory(GiB)": 91.52, "step": 89245, "token_acc": 0.7898989898989899, "train_speed(iter/s)": 0.134246 }, { "epoch": 1.1580827134742708, "grad_norm": 0.7579625248908997, "learning_rate": 4.111201126188595e-05, "loss": 0.7426679134368896, "memory(GiB)": 91.52, "step": 89250, "token_acc": 0.7822665267576075, "train_speed(iter/s)": 0.134245 }, { "epoch": 1.1581475918759265, "grad_norm": 0.7544557452201843, "learning_rate": 4.1106732984297284e-05, "loss": 0.7167757987976074, "memory(GiB)": 91.52, "step": 89255, "token_acc": 0.7897302683083055, "train_speed(iter/s)": 0.134244 }, { "epoch": 1.1582124702775822, "grad_norm": 0.7477899193763733, "learning_rate": 4.110145480905174e-05, "loss": 0.7679222106933594, "memory(GiB)": 91.52, "step": 89260, "token_acc": 0.7951704755565416, "train_speed(iter/s)": 0.134244 }, { "epoch": 1.158277348679238, "grad_norm": 0.7963544130325317, "learning_rate": 4.109617673621009e-05, "loss": 0.7607120990753173, "memory(GiB)": 91.52, "step": 89265, "token_acc": 0.7788791075868557, "train_speed(iter/s)": 0.134243 }, { "epoch": 1.1583422270808936, "grad_norm": 0.7317503094673157, "learning_rate": 4.1090898765833067e-05, "loss": 0.7596276760101318, "memory(GiB)": 91.52, "step": 89270, "token_acc": 0.7776475185619383, "train_speed(iter/s)": 0.134242 }, { "epoch": 1.1584071054825493, "grad_norm": 0.726810097694397, "learning_rate": 4.108562089798139e-05, "loss": 0.7400562286376953, "memory(GiB)": 91.52, "step": 89275, "token_acc": 0.7948261978257952, "train_speed(iter/s)": 0.134241 }, { "epoch": 1.158471983884205, "grad_norm": 0.7461198568344116, "learning_rate": 4.108034313271583e-05, "loss": 0.7528394222259521, "memory(GiB)": 91.52, "step": 89280, "token_acc": 0.7793386011120866, "train_speed(iter/s)": 0.13424 }, { "epoch": 1.1585368622858607, "grad_norm": 0.7485880255699158, "learning_rate": 4.107506547009711e-05, "loss": 0.7365263938903809, "memory(GiB)": 91.52, "step": 89285, "token_acc": 0.7951629345800963, "train_speed(iter/s)": 0.13424 }, { "epoch": 1.1586017406875164, "grad_norm": 0.6537411212921143, "learning_rate": 4.106978791018597e-05, "loss": 0.7566225051879882, "memory(GiB)": 91.52, "step": 89290, "token_acc": 0.7705835331734612, "train_speed(iter/s)": 0.134239 }, { "epoch": 1.1586666190891721, "grad_norm": 0.7324155569076538, "learning_rate": 4.1064510453043114e-05, "loss": 0.7132441997528076, "memory(GiB)": 91.52, "step": 89295, "token_acc": 0.7894273127753304, "train_speed(iter/s)": 0.134238 }, { "epoch": 1.1587314974908278, "grad_norm": 0.7083739042282104, "learning_rate": 4.105923309872931e-05, "loss": 0.7375002861022949, "memory(GiB)": 91.52, "step": 89300, "token_acc": 0.7662391738840773, "train_speed(iter/s)": 0.134237 }, { "epoch": 1.1587963758924835, "grad_norm": 0.5878435373306274, "learning_rate": 4.105395584730527e-05, "loss": 0.7167055130004882, "memory(GiB)": 91.52, "step": 89305, "token_acc": 0.8056254676552616, "train_speed(iter/s)": 0.134236 }, { "epoch": 1.1588612542941392, "grad_norm": 0.6970428824424744, "learning_rate": 4.1048678698831725e-05, "loss": 0.7487861156463623, "memory(GiB)": 91.52, "step": 89310, "token_acc": 0.7808332349817066, "train_speed(iter/s)": 0.134235 }, { "epoch": 1.158926132695795, "grad_norm": 0.7454648613929749, "learning_rate": 4.1043401653369425e-05, "loss": 0.7556224822998047, "memory(GiB)": 91.52, "step": 89315, "token_acc": 0.7635622452179367, "train_speed(iter/s)": 0.134234 }, { "epoch": 1.1589910110974506, "grad_norm": 0.6677675247192383, "learning_rate": 4.1038124710979075e-05, "loss": 0.7539851188659668, "memory(GiB)": 91.52, "step": 89320, "token_acc": 0.7787824963668658, "train_speed(iter/s)": 0.134233 }, { "epoch": 1.1590558894991063, "grad_norm": 0.7370718121528625, "learning_rate": 4.1032847871721416e-05, "loss": 0.7544790267944336, "memory(GiB)": 91.52, "step": 89325, "token_acc": 0.7674444610108841, "train_speed(iter/s)": 0.134232 }, { "epoch": 1.159120767900762, "grad_norm": 0.781813383102417, "learning_rate": 4.102757113565716e-05, "loss": 0.7486445426940918, "memory(GiB)": 91.52, "step": 89330, "token_acc": 0.786940852236591, "train_speed(iter/s)": 0.134232 }, { "epoch": 1.1591856463024177, "grad_norm": 0.7138918042182922, "learning_rate": 4.102229450284704e-05, "loss": 0.7249096393585205, "memory(GiB)": 91.52, "step": 89335, "token_acc": 0.7980769230769231, "train_speed(iter/s)": 0.134231 }, { "epoch": 1.1592505247040734, "grad_norm": 0.6189805865287781, "learning_rate": 4.101701797335177e-05, "loss": 0.7205961227416993, "memory(GiB)": 91.52, "step": 89340, "token_acc": 0.8112760759255779, "train_speed(iter/s)": 0.13423 }, { "epoch": 1.1593154031057291, "grad_norm": 0.6922235488891602, "learning_rate": 4.101174154723208e-05, "loss": 0.7619979858398438, "memory(GiB)": 91.52, "step": 89345, "token_acc": 0.7970639366428434, "train_speed(iter/s)": 0.134229 }, { "epoch": 1.1593802815073848, "grad_norm": 0.809975802898407, "learning_rate": 4.1006465224548684e-05, "loss": 0.7746211051940918, "memory(GiB)": 91.52, "step": 89350, "token_acc": 0.7479013940061534, "train_speed(iter/s)": 0.134228 }, { "epoch": 1.1594451599090405, "grad_norm": 0.6306625604629517, "learning_rate": 4.100118900536232e-05, "loss": 0.7519424438476563, "memory(GiB)": 91.52, "step": 89355, "token_acc": 0.800304262783694, "train_speed(iter/s)": 0.134227 }, { "epoch": 1.1595100383106962, "grad_norm": 0.7758505344390869, "learning_rate": 4.0995912889733685e-05, "loss": 0.774560546875, "memory(GiB)": 91.52, "step": 89360, "token_acc": 0.7775572579684833, "train_speed(iter/s)": 0.134227 }, { "epoch": 1.159574916712352, "grad_norm": 0.7011573910713196, "learning_rate": 4.099063687772352e-05, "loss": 0.7362169742584228, "memory(GiB)": 91.52, "step": 89365, "token_acc": 0.7866505995767693, "train_speed(iter/s)": 0.134226 }, { "epoch": 1.1596397951140076, "grad_norm": 0.8944578170776367, "learning_rate": 4.0985360969392514e-05, "loss": 0.7954465866088867, "memory(GiB)": 91.52, "step": 89370, "token_acc": 0.787291169451074, "train_speed(iter/s)": 0.134225 }, { "epoch": 1.1597046735156633, "grad_norm": 0.6902380585670471, "learning_rate": 4.09800851648014e-05, "loss": 0.7472735404968261, "memory(GiB)": 91.52, "step": 89375, "token_acc": 0.7947540983606557, "train_speed(iter/s)": 0.134224 }, { "epoch": 1.159769551917319, "grad_norm": 0.8020069003105164, "learning_rate": 4.097480946401088e-05, "loss": 0.7614334106445313, "memory(GiB)": 91.52, "step": 89380, "token_acc": 0.7949437165528696, "train_speed(iter/s)": 0.134223 }, { "epoch": 1.1598344303189747, "grad_norm": 0.6154283285140991, "learning_rate": 4.0969533867081666e-05, "loss": 0.7137197017669678, "memory(GiB)": 91.52, "step": 89385, "token_acc": 0.8097049745172636, "train_speed(iter/s)": 0.134222 }, { "epoch": 1.1598993087206304, "grad_norm": 0.6862161755561829, "learning_rate": 4.0964258374074485e-05, "loss": 0.7373924255371094, "memory(GiB)": 91.52, "step": 89390, "token_acc": 0.7753030922217682, "train_speed(iter/s)": 0.134222 }, { "epoch": 1.159964187122286, "grad_norm": 0.7833378911018372, "learning_rate": 4.095898298505002e-05, "loss": 0.7323143005371093, "memory(GiB)": 91.52, "step": 89395, "token_acc": 0.7926237722990579, "train_speed(iter/s)": 0.134221 }, { "epoch": 1.1600290655239418, "grad_norm": 0.7480514049530029, "learning_rate": 4.095370770006903e-05, "loss": 0.7958756923675537, "memory(GiB)": 91.52, "step": 89400, "token_acc": 0.7917426273458446, "train_speed(iter/s)": 0.13422 }, { "epoch": 1.1600939439255975, "grad_norm": 0.698144257068634, "learning_rate": 4.094843251919217e-05, "loss": 0.8132342338562012, "memory(GiB)": 91.52, "step": 89405, "token_acc": 0.7687239531653106, "train_speed(iter/s)": 0.13422 }, { "epoch": 1.1601588223272532, "grad_norm": 0.6480888724327087, "learning_rate": 4.094315744248017e-05, "loss": 0.742896556854248, "memory(GiB)": 91.52, "step": 89410, "token_acc": 0.786013986013986, "train_speed(iter/s)": 0.134219 }, { "epoch": 1.160223700728909, "grad_norm": 0.7160648107528687, "learning_rate": 4.0937882469993724e-05, "loss": 0.7510253429412842, "memory(GiB)": 91.52, "step": 89415, "token_acc": 0.7819023063610355, "train_speed(iter/s)": 0.134218 }, { "epoch": 1.1602885791305646, "grad_norm": 0.7038810849189758, "learning_rate": 4.093260760179355e-05, "loss": 0.7791835308074951, "memory(GiB)": 91.52, "step": 89420, "token_acc": 0.7686425791340888, "train_speed(iter/s)": 0.134218 }, { "epoch": 1.1603534575322203, "grad_norm": 0.6651509404182434, "learning_rate": 4.092733283794034e-05, "loss": 0.7828254222869873, "memory(GiB)": 91.52, "step": 89425, "token_acc": 0.7740731252058705, "train_speed(iter/s)": 0.134217 }, { "epoch": 1.160418335933876, "grad_norm": 0.7207894325256348, "learning_rate": 4.0922058178494796e-05, "loss": 0.7744139671325684, "memory(GiB)": 91.52, "step": 89430, "token_acc": 0.7683252863014245, "train_speed(iter/s)": 0.134217 }, { "epoch": 1.1604832143355317, "grad_norm": 0.6865477561950684, "learning_rate": 4.091678362351764e-05, "loss": 0.7188291549682617, "memory(GiB)": 91.52, "step": 89435, "token_acc": 0.7821533442088091, "train_speed(iter/s)": 0.134216 }, { "epoch": 1.1605480927371874, "grad_norm": 0.7852330803871155, "learning_rate": 4.091150917306954e-05, "loss": 0.7657797813415528, "memory(GiB)": 91.52, "step": 89440, "token_acc": 0.769201163188505, "train_speed(iter/s)": 0.134215 }, { "epoch": 1.160612971138843, "grad_norm": 0.8004571199417114, "learning_rate": 4.090623482721121e-05, "loss": 0.8086316108703613, "memory(GiB)": 91.52, "step": 89445, "token_acc": 0.7812970858846536, "train_speed(iter/s)": 0.134215 }, { "epoch": 1.1606778495404988, "grad_norm": 0.6477665305137634, "learning_rate": 4.090096058600334e-05, "loss": 0.7635296821594239, "memory(GiB)": 91.52, "step": 89450, "token_acc": 0.7852228751612441, "train_speed(iter/s)": 0.134214 }, { "epoch": 1.1607427279421545, "grad_norm": 0.762333333492279, "learning_rate": 4.0895686449506626e-05, "loss": 0.7224162578582763, "memory(GiB)": 91.52, "step": 89455, "token_acc": 0.7993643986675345, "train_speed(iter/s)": 0.134213 }, { "epoch": 1.1608076063438102, "grad_norm": 0.6935476064682007, "learning_rate": 4.089041241778177e-05, "loss": 0.7510751247406006, "memory(GiB)": 91.52, "step": 89460, "token_acc": 0.779536582745407, "train_speed(iter/s)": 0.134212 }, { "epoch": 1.160872484745466, "grad_norm": 0.6567149758338928, "learning_rate": 4.0885138490889464e-05, "loss": 0.7318008422851563, "memory(GiB)": 91.52, "step": 89465, "token_acc": 0.8152198678514468, "train_speed(iter/s)": 0.134211 }, { "epoch": 1.1609373631471216, "grad_norm": 0.5916088819503784, "learning_rate": 4.087986466889041e-05, "loss": 0.7734490871429444, "memory(GiB)": 91.52, "step": 89470, "token_acc": 0.7906315427642886, "train_speed(iter/s)": 0.13421 }, { "epoch": 1.1610022415487773, "grad_norm": 0.663856029510498, "learning_rate": 4.0874590951845274e-05, "loss": 0.7860239982604981, "memory(GiB)": 91.52, "step": 89475, "token_acc": 0.781044704374299, "train_speed(iter/s)": 0.134209 }, { "epoch": 1.161067119950433, "grad_norm": 0.7267821431159973, "learning_rate": 4.086931733981476e-05, "loss": 0.759091854095459, "memory(GiB)": 91.52, "step": 89480, "token_acc": 0.7892010092514719, "train_speed(iter/s)": 0.134208 }, { "epoch": 1.1611319983520887, "grad_norm": 0.7245824337005615, "learning_rate": 4.086404383285954e-05, "loss": 0.6874757289886475, "memory(GiB)": 91.52, "step": 89485, "token_acc": 0.79103400780507, "train_speed(iter/s)": 0.134207 }, { "epoch": 1.1611968767537444, "grad_norm": 0.680305004119873, "learning_rate": 4.0858770431040336e-05, "loss": 0.7276242733001709, "memory(GiB)": 91.52, "step": 89490, "token_acc": 0.7915788620521965, "train_speed(iter/s)": 0.134207 }, { "epoch": 1.1612617551554, "grad_norm": 0.7220405340194702, "learning_rate": 4.0853497134417796e-05, "loss": 0.7869078159332276, "memory(GiB)": 91.52, "step": 89495, "token_acc": 0.7897866789441375, "train_speed(iter/s)": 0.134206 }, { "epoch": 1.1613266335570556, "grad_norm": 0.7203760147094727, "learning_rate": 4.084822394305264e-05, "loss": 0.7458435535430908, "memory(GiB)": 91.52, "step": 89500, "token_acc": 0.7743258447356005, "train_speed(iter/s)": 0.134205 }, { "epoch": 1.1613915119587115, "grad_norm": 0.7097975611686707, "learning_rate": 4.084295085700553e-05, "loss": 0.7856714248657226, "memory(GiB)": 91.52, "step": 89505, "token_acc": 0.7739282660912755, "train_speed(iter/s)": 0.134205 }, { "epoch": 1.161456390360367, "grad_norm": 0.7342192530632019, "learning_rate": 4.083767787633716e-05, "loss": 0.7601790428161621, "memory(GiB)": 91.52, "step": 89510, "token_acc": 0.7722801567931366, "train_speed(iter/s)": 0.134204 }, { "epoch": 1.1615212687620229, "grad_norm": 0.7441356778144836, "learning_rate": 4.083240500110819e-05, "loss": 0.7546091556549073, "memory(GiB)": 91.52, "step": 89515, "token_acc": 0.7969746303079637, "train_speed(iter/s)": 0.134203 }, { "epoch": 1.1615861471636784, "grad_norm": 0.7675141096115112, "learning_rate": 4.08271322313793e-05, "loss": 0.7467837810516358, "memory(GiB)": 91.52, "step": 89520, "token_acc": 0.7903920723825937, "train_speed(iter/s)": 0.134203 }, { "epoch": 1.1616510255653343, "grad_norm": 0.7655520439147949, "learning_rate": 4.0821859567211206e-05, "loss": 0.7551039218902588, "memory(GiB)": 91.52, "step": 89525, "token_acc": 0.7837160377780762, "train_speed(iter/s)": 0.134202 }, { "epoch": 1.1617159039669898, "grad_norm": 0.6947376728057861, "learning_rate": 4.081658700866454e-05, "loss": 0.777078914642334, "memory(GiB)": 91.52, "step": 89530, "token_acc": 0.7554930434121493, "train_speed(iter/s)": 0.134201 }, { "epoch": 1.1617807823686457, "grad_norm": 0.7058237791061401, "learning_rate": 4.081131455580002e-05, "loss": 0.739409065246582, "memory(GiB)": 91.52, "step": 89535, "token_acc": 0.7799454049135578, "train_speed(iter/s)": 0.1342 }, { "epoch": 1.1618456607703012, "grad_norm": 0.6261345148086548, "learning_rate": 4.080604220867829e-05, "loss": 0.7090469837188721, "memory(GiB)": 91.52, "step": 89540, "token_acc": 0.7939226519337017, "train_speed(iter/s)": 0.1342 }, { "epoch": 1.161910539171957, "grad_norm": 0.701393723487854, "learning_rate": 4.0800769967360055e-05, "loss": 0.7347745418548584, "memory(GiB)": 91.52, "step": 89545, "token_acc": 0.7974129483242961, "train_speed(iter/s)": 0.134199 }, { "epoch": 1.1619754175736126, "grad_norm": 0.737343430519104, "learning_rate": 4.0795497831905946e-05, "loss": 0.7731739521026612, "memory(GiB)": 91.52, "step": 89550, "token_acc": 0.7788907633909743, "train_speed(iter/s)": 0.134198 }, { "epoch": 1.1620402959752685, "grad_norm": 0.6786221265792847, "learning_rate": 4.0790225802376655e-05, "loss": 0.706669807434082, "memory(GiB)": 91.52, "step": 89555, "token_acc": 0.7922710113252533, "train_speed(iter/s)": 0.134197 }, { "epoch": 1.162105174376924, "grad_norm": 0.7351375222206116, "learning_rate": 4.0784953878832856e-05, "loss": 0.8656379699707031, "memory(GiB)": 91.52, "step": 89560, "token_acc": 0.77205775120315, "train_speed(iter/s)": 0.134196 }, { "epoch": 1.1621700527785797, "grad_norm": 0.7958292961120605, "learning_rate": 4.0779682061335203e-05, "loss": 0.7965211868286133, "memory(GiB)": 91.52, "step": 89565, "token_acc": 0.7850126506920673, "train_speed(iter/s)": 0.134195 }, { "epoch": 1.1622349311802354, "grad_norm": 0.6992281079292297, "learning_rate": 4.077441034994439e-05, "loss": 0.7711111545562744, "memory(GiB)": 91.52, "step": 89570, "token_acc": 0.7816499085923218, "train_speed(iter/s)": 0.134194 }, { "epoch": 1.162299809581891, "grad_norm": 0.7033147811889648, "learning_rate": 4.076913874472107e-05, "loss": 0.756501293182373, "memory(GiB)": 91.52, "step": 89575, "token_acc": 0.7888082493185774, "train_speed(iter/s)": 0.134193 }, { "epoch": 1.1623646879835468, "grad_norm": 0.6780709028244019, "learning_rate": 4.076386724572592e-05, "loss": 0.7147919654846191, "memory(GiB)": 91.52, "step": 89580, "token_acc": 0.7980804166162044, "train_speed(iter/s)": 0.134192 }, { "epoch": 1.1624295663852025, "grad_norm": 0.7670702338218689, "learning_rate": 4.075859585301958e-05, "loss": 0.757101583480835, "memory(GiB)": 91.52, "step": 89585, "token_acc": 0.7892338278024162, "train_speed(iter/s)": 0.134191 }, { "epoch": 1.1624944447868581, "grad_norm": 0.7376911044120789, "learning_rate": 4.075332456666271e-05, "loss": 0.7576854705810547, "memory(GiB)": 91.52, "step": 89590, "token_acc": 0.772642136426802, "train_speed(iter/s)": 0.13419 }, { "epoch": 1.1625593231885138, "grad_norm": 0.7796515226364136, "learning_rate": 4.0748053386716e-05, "loss": 0.7721554279327393, "memory(GiB)": 91.52, "step": 89595, "token_acc": 0.7893734123624048, "train_speed(iter/s)": 0.134189 }, { "epoch": 1.1626242015901695, "grad_norm": 0.7331069707870483, "learning_rate": 4.074278231324008e-05, "loss": 0.7563010215759277, "memory(GiB)": 91.52, "step": 89600, "token_acc": 0.7885687946531459, "train_speed(iter/s)": 0.134188 }, { "epoch": 1.1626890799918252, "grad_norm": 0.7818451523780823, "learning_rate": 4.0737511346295635e-05, "loss": 0.7618775367736816, "memory(GiB)": 91.52, "step": 89605, "token_acc": 0.790184464376375, "train_speed(iter/s)": 0.134188 }, { "epoch": 1.162753958393481, "grad_norm": 0.814983069896698, "learning_rate": 4.0732240485943316e-05, "loss": 0.7446871757507324, "memory(GiB)": 91.52, "step": 89610, "token_acc": 0.7961657347271907, "train_speed(iter/s)": 0.134187 }, { "epoch": 1.1628188367951366, "grad_norm": 0.7699145674705505, "learning_rate": 4.0726969732243794e-05, "loss": 0.7362582206726074, "memory(GiB)": 91.52, "step": 89615, "token_acc": 0.7821370789822033, "train_speed(iter/s)": 0.134186 }, { "epoch": 1.1628837151967923, "grad_norm": 0.741870641708374, "learning_rate": 4.0721699085257685e-05, "loss": 0.7477604389190674, "memory(GiB)": 91.52, "step": 89620, "token_acc": 0.7892011154352956, "train_speed(iter/s)": 0.134185 }, { "epoch": 1.162948593598448, "grad_norm": 0.8006219267845154, "learning_rate": 4.071642854504566e-05, "loss": 0.7262575149536132, "memory(GiB)": 91.52, "step": 89625, "token_acc": 0.7991803278688525, "train_speed(iter/s)": 0.134184 }, { "epoch": 1.1630134720001037, "grad_norm": 0.7355496883392334, "learning_rate": 4.0711158111668395e-05, "loss": 0.7310218811035156, "memory(GiB)": 91.52, "step": 89630, "token_acc": 0.7758452481076535, "train_speed(iter/s)": 0.134183 }, { "epoch": 1.1630783504017594, "grad_norm": 0.5722700357437134, "learning_rate": 4.070588778518651e-05, "loss": 0.7691028594970704, "memory(GiB)": 91.52, "step": 89635, "token_acc": 0.7864848537153707, "train_speed(iter/s)": 0.134182 }, { "epoch": 1.1631432288034151, "grad_norm": 0.770226240158081, "learning_rate": 4.070061756566068e-05, "loss": 0.8069231033325195, "memory(GiB)": 91.52, "step": 89640, "token_acc": 0.7637824023366192, "train_speed(iter/s)": 0.134182 }, { "epoch": 1.1632081072050708, "grad_norm": 0.6363857388496399, "learning_rate": 4.0695347453151546e-05, "loss": 0.7669947147369385, "memory(GiB)": 91.52, "step": 89645, "token_acc": 0.7831085700080791, "train_speed(iter/s)": 0.134181 }, { "epoch": 1.1632729856067265, "grad_norm": 0.7548373937606812, "learning_rate": 4.069007744771976e-05, "loss": 0.7924521446228028, "memory(GiB)": 91.52, "step": 89650, "token_acc": 0.7808255357681859, "train_speed(iter/s)": 0.13418 }, { "epoch": 1.1633378640083822, "grad_norm": 0.5958771705627441, "learning_rate": 4.068480754942595e-05, "loss": 0.7387741088867188, "memory(GiB)": 91.52, "step": 89655, "token_acc": 0.8011515128096139, "train_speed(iter/s)": 0.134179 }, { "epoch": 1.163402742410038, "grad_norm": 0.7198265194892883, "learning_rate": 4.067953775833078e-05, "loss": 0.7756207466125489, "memory(GiB)": 91.52, "step": 89660, "token_acc": 0.7884663275107617, "train_speed(iter/s)": 0.134178 }, { "epoch": 1.1634676208116936, "grad_norm": 0.6888742446899414, "learning_rate": 4.067426807449489e-05, "loss": 0.759923791885376, "memory(GiB)": 91.52, "step": 89665, "token_acc": 0.7845890410958904, "train_speed(iter/s)": 0.134178 }, { "epoch": 1.1635324992133493, "grad_norm": 0.6198583245277405, "learning_rate": 4.066899849797892e-05, "loss": 0.7703680038452149, "memory(GiB)": 91.52, "step": 89670, "token_acc": 0.7781429097218571, "train_speed(iter/s)": 0.134176 }, { "epoch": 1.163597377615005, "grad_norm": 0.7574059963226318, "learning_rate": 4.066372902884351e-05, "loss": 0.7641963958740234, "memory(GiB)": 91.52, "step": 89675, "token_acc": 0.7853779429987608, "train_speed(iter/s)": 0.134176 }, { "epoch": 1.1636622560166607, "grad_norm": 0.7312679886817932, "learning_rate": 4.065845966714932e-05, "loss": 0.7580066680908203, "memory(GiB)": 91.52, "step": 89680, "token_acc": 0.7832835820895523, "train_speed(iter/s)": 0.134175 }, { "epoch": 1.1637271344183164, "grad_norm": 0.6773310899734497, "learning_rate": 4.065319041295697e-05, "loss": 0.7577945232391358, "memory(GiB)": 91.52, "step": 89685, "token_acc": 0.7674274501704017, "train_speed(iter/s)": 0.134174 }, { "epoch": 1.1637920128199721, "grad_norm": 0.7318724989891052, "learning_rate": 4.06479212663271e-05, "loss": 0.7756507873535157, "memory(GiB)": 91.52, "step": 89690, "token_acc": 0.7722852512155591, "train_speed(iter/s)": 0.134173 }, { "epoch": 1.1638568912216278, "grad_norm": 0.7465975880622864, "learning_rate": 4.064265222732035e-05, "loss": 0.7751732349395752, "memory(GiB)": 91.52, "step": 89695, "token_acc": 0.7858514505177971, "train_speed(iter/s)": 0.134173 }, { "epoch": 1.1639217696232835, "grad_norm": 0.7086920142173767, "learning_rate": 4.063738329599736e-05, "loss": 0.8035455703735351, "memory(GiB)": 91.52, "step": 89700, "token_acc": 0.7799957148978718, "train_speed(iter/s)": 0.134173 }, { "epoch": 1.1639866480249392, "grad_norm": 0.6386823058128357, "learning_rate": 4.063211447241875e-05, "loss": 0.7317201614379882, "memory(GiB)": 91.52, "step": 89705, "token_acc": 0.7896756294088132, "train_speed(iter/s)": 0.134171 }, { "epoch": 1.164051526426595, "grad_norm": 0.6889777183532715, "learning_rate": 4.062684575664517e-05, "loss": 0.7646600246429444, "memory(GiB)": 91.52, "step": 89710, "token_acc": 0.7908154476866366, "train_speed(iter/s)": 0.13417 }, { "epoch": 1.1641164048282506, "grad_norm": 0.7302616834640503, "learning_rate": 4.062157714873725e-05, "loss": 0.7689062595367432, "memory(GiB)": 91.52, "step": 89715, "token_acc": 0.7833317644105557, "train_speed(iter/s)": 0.13417 }, { "epoch": 1.1641812832299063, "grad_norm": 0.7079169154167175, "learning_rate": 4.061630864875561e-05, "loss": 0.742647933959961, "memory(GiB)": 91.52, "step": 89720, "token_acc": 0.7837574655933524, "train_speed(iter/s)": 0.134169 }, { "epoch": 1.164246161631562, "grad_norm": 0.7279176712036133, "learning_rate": 4.0611040256760905e-05, "loss": 0.7937285900115967, "memory(GiB)": 91.52, "step": 89725, "token_acc": 0.7649371969610201, "train_speed(iter/s)": 0.134169 }, { "epoch": 1.1643110400332177, "grad_norm": 0.5579119920730591, "learning_rate": 4.0605771972813726e-05, "loss": 0.7295082092285157, "memory(GiB)": 91.52, "step": 89730, "token_acc": 0.7896938805574379, "train_speed(iter/s)": 0.134168 }, { "epoch": 1.1643759184348734, "grad_norm": 0.7018164992332458, "learning_rate": 4.060050379697472e-05, "loss": 0.7998364925384521, "memory(GiB)": 91.52, "step": 89735, "token_acc": 0.7630750273822563, "train_speed(iter/s)": 0.134167 }, { "epoch": 1.1644407968365291, "grad_norm": 0.8036117553710938, "learning_rate": 4.0595235729304505e-05, "loss": 0.7550741195678711, "memory(GiB)": 91.52, "step": 89740, "token_acc": 0.7876135855802174, "train_speed(iter/s)": 0.134167 }, { "epoch": 1.1645056752381848, "grad_norm": 0.7518190145492554, "learning_rate": 4.0589967769863725e-05, "loss": 0.7543772697448731, "memory(GiB)": 91.52, "step": 89745, "token_acc": 0.7745642886847378, "train_speed(iter/s)": 0.134166 }, { "epoch": 1.1645705536398405, "grad_norm": 0.6488047242164612, "learning_rate": 4.058469991871299e-05, "loss": 0.7229716777801514, "memory(GiB)": 91.52, "step": 89750, "token_acc": 0.7827297148590479, "train_speed(iter/s)": 0.134165 }, { "epoch": 1.1646354320414962, "grad_norm": 0.6473517417907715, "learning_rate": 4.05794321759129e-05, "loss": 0.7710417747497559, "memory(GiB)": 91.52, "step": 89755, "token_acc": 0.763421950967208, "train_speed(iter/s)": 0.134165 }, { "epoch": 1.164700310443152, "grad_norm": 0.6347808241844177, "learning_rate": 4.057416454152413e-05, "loss": 0.7388210296630859, "memory(GiB)": 91.52, "step": 89760, "token_acc": 0.7839401977333011, "train_speed(iter/s)": 0.134164 }, { "epoch": 1.1647651888448076, "grad_norm": 0.6586408615112305, "learning_rate": 4.056889701560726e-05, "loss": 0.7395577907562256, "memory(GiB)": 91.52, "step": 89765, "token_acc": 0.7809361438313701, "train_speed(iter/s)": 0.134163 }, { "epoch": 1.1648300672464633, "grad_norm": 0.7465136647224426, "learning_rate": 4.0563629598222906e-05, "loss": 0.7633070945739746, "memory(GiB)": 91.52, "step": 89770, "token_acc": 0.7839305822348811, "train_speed(iter/s)": 0.134162 }, { "epoch": 1.164894945648119, "grad_norm": 0.6977663636207581, "learning_rate": 4.05583622894317e-05, "loss": 0.7134531974792481, "memory(GiB)": 91.52, "step": 89775, "token_acc": 0.798379621682859, "train_speed(iter/s)": 0.134161 }, { "epoch": 1.1649598240497747, "grad_norm": 0.7188889980316162, "learning_rate": 4.055309508929424e-05, "loss": 0.7567771911621094, "memory(GiB)": 91.52, "step": 89780, "token_acc": 0.7654130481590469, "train_speed(iter/s)": 0.134161 }, { "epoch": 1.1650247024514304, "grad_norm": 0.7352569103240967, "learning_rate": 4.054782799787118e-05, "loss": 0.715949535369873, "memory(GiB)": 91.52, "step": 89785, "token_acc": 0.7833685198217218, "train_speed(iter/s)": 0.13416 }, { "epoch": 1.1650895808530861, "grad_norm": 0.7089543342590332, "learning_rate": 4.054256101522308e-05, "loss": 0.7705940246582031, "memory(GiB)": 91.52, "step": 89790, "token_acc": 0.7605806731185902, "train_speed(iter/s)": 0.134159 }, { "epoch": 1.1651544592547418, "grad_norm": 0.7397445440292358, "learning_rate": 4.0537294141410605e-05, "loss": 0.7860593795776367, "memory(GiB)": 91.52, "step": 89795, "token_acc": 0.7911452525502829, "train_speed(iter/s)": 0.134159 }, { "epoch": 1.1652193376563975, "grad_norm": 0.6858336329460144, "learning_rate": 4.053202737649433e-05, "loss": 0.7304860115051269, "memory(GiB)": 91.52, "step": 89800, "token_acc": 0.7764862530588743, "train_speed(iter/s)": 0.134158 }, { "epoch": 1.1652842160580532, "grad_norm": 0.636615514755249, "learning_rate": 4.0526760720534875e-05, "loss": 0.7420376300811767, "memory(GiB)": 91.52, "step": 89805, "token_acc": 0.7889566613162119, "train_speed(iter/s)": 0.134157 }, { "epoch": 1.165349094459709, "grad_norm": 0.7141087651252747, "learning_rate": 4.0521494173592835e-05, "loss": 0.7700991630554199, "memory(GiB)": 91.52, "step": 89810, "token_acc": 0.7720620720123702, "train_speed(iter/s)": 0.134156 }, { "epoch": 1.1654139728613646, "grad_norm": 0.6836382746696472, "learning_rate": 4.051622773572884e-05, "loss": 0.7671414375305176, "memory(GiB)": 91.52, "step": 89815, "token_acc": 0.7722586359610275, "train_speed(iter/s)": 0.134155 }, { "epoch": 1.1654788512630203, "grad_norm": 0.7219676375389099, "learning_rate": 4.051096140700349e-05, "loss": 0.7801114559173584, "memory(GiB)": 91.52, "step": 89820, "token_acc": 0.7654428701053954, "train_speed(iter/s)": 0.134155 }, { "epoch": 1.165543729664676, "grad_norm": 0.7096689939498901, "learning_rate": 4.050569518747737e-05, "loss": 0.7280051708221436, "memory(GiB)": 91.52, "step": 89825, "token_acc": 0.7696944625022162, "train_speed(iter/s)": 0.134154 }, { "epoch": 1.1656086080663317, "grad_norm": 0.679563045501709, "learning_rate": 4.050042907721112e-05, "loss": 0.7326128959655762, "memory(GiB)": 91.52, "step": 89830, "token_acc": 0.7833851445201402, "train_speed(iter/s)": 0.134153 }, { "epoch": 1.1656734864679874, "grad_norm": 0.7188275456428528, "learning_rate": 4.049516307626531e-05, "loss": 0.7194869041442871, "memory(GiB)": 91.52, "step": 89835, "token_acc": 0.79663443186156, "train_speed(iter/s)": 0.134152 }, { "epoch": 1.165738364869643, "grad_norm": 0.6820980310440063, "learning_rate": 4.048989718470056e-05, "loss": 0.7363009452819824, "memory(GiB)": 91.52, "step": 89840, "token_acc": 0.7827245804540968, "train_speed(iter/s)": 0.134151 }, { "epoch": 1.1658032432712988, "grad_norm": 0.7765663266181946, "learning_rate": 4.0484631402577447e-05, "loss": 0.7795706748962402, "memory(GiB)": 91.52, "step": 89845, "token_acc": 0.7769312409976499, "train_speed(iter/s)": 0.134151 }, { "epoch": 1.1658681216729545, "grad_norm": 0.7084358334541321, "learning_rate": 4.0479365729956584e-05, "loss": 0.7466425895690918, "memory(GiB)": 91.52, "step": 89850, "token_acc": 0.7790347521770977, "train_speed(iter/s)": 0.13415 }, { "epoch": 1.1659330000746102, "grad_norm": 0.6862719655036926, "learning_rate": 4.047410016689858e-05, "loss": 0.7630516529083252, "memory(GiB)": 91.52, "step": 89855, "token_acc": 0.775295137730941, "train_speed(iter/s)": 0.134149 }, { "epoch": 1.165997878476266, "grad_norm": 0.7173282504081726, "learning_rate": 4.0468834713464e-05, "loss": 0.7814631938934327, "memory(GiB)": 91.52, "step": 89860, "token_acc": 0.7891203784216201, "train_speed(iter/s)": 0.134149 }, { "epoch": 1.1660627568779216, "grad_norm": 0.7568050026893616, "learning_rate": 4.0463569369713475e-05, "loss": 0.7242742538452148, "memory(GiB)": 91.52, "step": 89865, "token_acc": 0.8102879841112215, "train_speed(iter/s)": 0.134148 }, { "epoch": 1.1661276352795773, "grad_norm": 0.7354573011398315, "learning_rate": 4.045830413570758e-05, "loss": 0.7482537269592285, "memory(GiB)": 91.52, "step": 89870, "token_acc": 0.7853692201518289, "train_speed(iter/s)": 0.134148 }, { "epoch": 1.166192513681233, "grad_norm": 0.8068205118179321, "learning_rate": 4.04530390115069e-05, "loss": 0.7398956298828125, "memory(GiB)": 91.52, "step": 89875, "token_acc": 0.7896398982856205, "train_speed(iter/s)": 0.134148 }, { "epoch": 1.1662573920828887, "grad_norm": 0.6673454642295837, "learning_rate": 4.044777399717202e-05, "loss": 0.7108944892883301, "memory(GiB)": 91.52, "step": 89880, "token_acc": 0.7871745300481263, "train_speed(iter/s)": 0.134146 }, { "epoch": 1.1663222704845444, "grad_norm": 0.767239511013031, "learning_rate": 4.044250909276356e-05, "loss": 0.7297821521759034, "memory(GiB)": 91.52, "step": 89885, "token_acc": 0.7684529081783289, "train_speed(iter/s)": 0.134145 }, { "epoch": 1.1663871488862, "grad_norm": 0.7546128034591675, "learning_rate": 4.043724429834209e-05, "loss": 0.7621340274810791, "memory(GiB)": 91.52, "step": 89890, "token_acc": 0.7932652988656971, "train_speed(iter/s)": 0.134145 }, { "epoch": 1.1664520272878558, "grad_norm": 0.7560307383537292, "learning_rate": 4.043197961396818e-05, "loss": 0.7497424602508544, "memory(GiB)": 91.52, "step": 89895, "token_acc": 0.765673070349354, "train_speed(iter/s)": 0.134144 }, { "epoch": 1.1665169056895115, "grad_norm": 0.7658223509788513, "learning_rate": 4.0426715039702445e-05, "loss": 0.7740344047546387, "memory(GiB)": 91.52, "step": 89900, "token_acc": 0.7896334253054789, "train_speed(iter/s)": 0.134143 }, { "epoch": 1.1665817840911672, "grad_norm": 0.7057695984840393, "learning_rate": 4.042145057560547e-05, "loss": 0.7448554992675781, "memory(GiB)": 91.52, "step": 89905, "token_acc": 0.7755170958210216, "train_speed(iter/s)": 0.134142 }, { "epoch": 1.166646662492823, "grad_norm": 0.689384400844574, "learning_rate": 4.041618622173782e-05, "loss": 0.7202241897583008, "memory(GiB)": 91.52, "step": 89910, "token_acc": 0.7784370108834128, "train_speed(iter/s)": 0.134141 }, { "epoch": 1.1667115408944786, "grad_norm": 0.7214378118515015, "learning_rate": 4.041092197816006e-05, "loss": 0.7422053337097168, "memory(GiB)": 91.52, "step": 89915, "token_acc": 0.778544061302682, "train_speed(iter/s)": 0.13414 }, { "epoch": 1.1667764192961343, "grad_norm": 0.6680048108100891, "learning_rate": 4.040565784493281e-05, "loss": 0.736232328414917, "memory(GiB)": 91.52, "step": 89920, "token_acc": 0.7796653072704262, "train_speed(iter/s)": 0.134139 }, { "epoch": 1.16684129769779, "grad_norm": 0.7535749673843384, "learning_rate": 4.040039382211662e-05, "loss": 0.7281770706176758, "memory(GiB)": 91.52, "step": 89925, "token_acc": 0.8009058242251834, "train_speed(iter/s)": 0.134139 }, { "epoch": 1.1669061760994457, "grad_norm": 0.7669386267662048, "learning_rate": 4.039512990977209e-05, "loss": 0.7588075637817383, "memory(GiB)": 91.52, "step": 89930, "token_acc": 0.777792190945648, "train_speed(iter/s)": 0.134139 }, { "epoch": 1.1669710545011014, "grad_norm": 0.6775796413421631, "learning_rate": 4.038986610795979e-05, "loss": 0.7265515327453613, "memory(GiB)": 91.52, "step": 89935, "token_acc": 0.8006060183895235, "train_speed(iter/s)": 0.134138 }, { "epoch": 1.167035932902757, "grad_norm": 0.731225311756134, "learning_rate": 4.03846024167403e-05, "loss": 0.7363823890686035, "memory(GiB)": 91.52, "step": 89940, "token_acc": 0.7950255054548162, "train_speed(iter/s)": 0.134137 }, { "epoch": 1.1671008113044128, "grad_norm": 0.7797234058380127, "learning_rate": 4.037933883617417e-05, "loss": 0.7504884719848632, "memory(GiB)": 91.52, "step": 89945, "token_acc": 0.7786942379182156, "train_speed(iter/s)": 0.134136 }, { "epoch": 1.1671656897060685, "grad_norm": 0.7865840196609497, "learning_rate": 4.037407536632199e-05, "loss": 0.7879759788513183, "memory(GiB)": 91.52, "step": 89950, "token_acc": 0.7886706155807153, "train_speed(iter/s)": 0.134135 }, { "epoch": 1.1672305681077242, "grad_norm": 0.8149060606956482, "learning_rate": 4.036881200724434e-05, "loss": 0.7710336208343506, "memory(GiB)": 91.52, "step": 89955, "token_acc": 0.7812572412195041, "train_speed(iter/s)": 0.134135 }, { "epoch": 1.1672954465093799, "grad_norm": 0.7165640592575073, "learning_rate": 4.036354875900177e-05, "loss": 0.7393322944641113, "memory(GiB)": 91.52, "step": 89960, "token_acc": 0.7841842030763424, "train_speed(iter/s)": 0.134134 }, { "epoch": 1.1673603249110356, "grad_norm": 0.710953950881958, "learning_rate": 4.035828562165487e-05, "loss": 0.7111533641815185, "memory(GiB)": 91.52, "step": 89965, "token_acc": 0.7986902373471628, "train_speed(iter/s)": 0.134133 }, { "epoch": 1.1674252033126913, "grad_norm": 0.635184109210968, "learning_rate": 4.035302259526419e-05, "loss": 0.7353157043457031, "memory(GiB)": 91.52, "step": 89970, "token_acc": 0.7917948363460409, "train_speed(iter/s)": 0.134132 }, { "epoch": 1.1674900817143468, "grad_norm": 0.7366272211074829, "learning_rate": 4.034775967989033e-05, "loss": 0.7977316856384278, "memory(GiB)": 91.52, "step": 89975, "token_acc": 0.765138285342972, "train_speed(iter/s)": 0.134132 }, { "epoch": 1.1675549601160027, "grad_norm": 0.8119558691978455, "learning_rate": 4.034249687559382e-05, "loss": 0.7910916328430175, "memory(GiB)": 91.52, "step": 89980, "token_acc": 0.7651744929654668, "train_speed(iter/s)": 0.134131 }, { "epoch": 1.1676198385176582, "grad_norm": 0.7144041061401367, "learning_rate": 4.0337234182435213e-05, "loss": 0.7758048057556153, "memory(GiB)": 91.52, "step": 89985, "token_acc": 0.7850063938618926, "train_speed(iter/s)": 0.13413 }, { "epoch": 1.167684716919314, "grad_norm": 0.7315635085105896, "learning_rate": 4.033197160047511e-05, "loss": 0.7565053939819336, "memory(GiB)": 91.52, "step": 89990, "token_acc": 0.7809157038938811, "train_speed(iter/s)": 0.13413 }, { "epoch": 1.1677495953209696, "grad_norm": 0.6945211291313171, "learning_rate": 4.032670912977404e-05, "loss": 0.7509315013885498, "memory(GiB)": 91.52, "step": 89995, "token_acc": 0.7900523741805695, "train_speed(iter/s)": 0.134129 }, { "epoch": 1.1678144737226255, "grad_norm": 0.771865725517273, "learning_rate": 4.032144677039259e-05, "loss": 0.7493367195129395, "memory(GiB)": 91.52, "step": 90000, "token_acc": 0.7702698195644199, "train_speed(iter/s)": 0.134128 }, { "epoch": 1.1678144737226255, "eval_loss": 0.8083596229553223, "eval_runtime": 1648.8295, "eval_samples_per_second": 30.216, "eval_steps_per_second": 1.889, "eval_token_acc": 0.7735333756374319, "step": 90000 } ], "logging_steps": 5, "max_steps": 154134, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.232186859624239e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }