{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9241877256317688, "eval_steps": 25, "global_step": 204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019253910950661854, "grad_norm": 23.54662322998047, "learning_rate": 0.00019901960784313727, "loss": 9.4209, "step": 1 }, { "epoch": 0.03850782190132371, "grad_norm": 22.151025772094727, "learning_rate": 0.00019803921568627454, "loss": 9.3584, "step": 2 }, { "epoch": 0.05776173285198556, "grad_norm": 32.229759216308594, "learning_rate": 0.00019705882352941177, "loss": 9.1469, "step": 3 }, { "epoch": 0.07701564380264742, "grad_norm": 42.96324920654297, "learning_rate": 0.000196078431372549, "loss": 8.5595, "step": 4 }, { "epoch": 0.09626955475330927, "grad_norm": 32.40974044799805, "learning_rate": 0.00019509803921568628, "loss": 8.3043, "step": 5 }, { "epoch": 0.11552346570397112, "grad_norm": 32.838134765625, "learning_rate": 0.00019411764705882354, "loss": 8.1422, "step": 6 }, { "epoch": 0.13477737665463296, "grad_norm": 34.38292694091797, "learning_rate": 0.0001931372549019608, "loss": 7.7643, "step": 7 }, { "epoch": 0.15403128760529483, "grad_norm": 31.947425842285156, "learning_rate": 0.00019215686274509807, "loss": 7.4565, "step": 8 }, { "epoch": 0.17328519855595667, "grad_norm": 242.39166259765625, "learning_rate": 0.0001911764705882353, "loss": 7.436, "step": 9 }, { "epoch": 0.19253910950661854, "grad_norm": 25.68425750732422, "learning_rate": 0.00019019607843137254, "loss": 7.1307, "step": 10 }, { "epoch": 0.21179302045728038, "grad_norm": 24.717641830444336, "learning_rate": 0.0001892156862745098, "loss": 7.1206, "step": 11 }, { "epoch": 0.23104693140794225, "grad_norm": 36.47980880737305, "learning_rate": 0.00018823529411764707, "loss": 6.6912, "step": 12 }, { "epoch": 0.2503008423586041, "grad_norm": 28.181612014770508, "learning_rate": 0.00018725490196078433, "loss": 6.6547, "step": 13 }, { "epoch": 0.2695547533092659, "grad_norm": 24.55516242980957, "learning_rate": 0.00018627450980392157, "loss": 6.9486, "step": 14 }, { "epoch": 0.2888086642599278, "grad_norm": 32.426963806152344, "learning_rate": 0.00018529411764705883, "loss": 7.1069, "step": 15 }, { "epoch": 0.30806257521058966, "grad_norm": 20.413976669311523, "learning_rate": 0.00018431372549019607, "loss": 6.6628, "step": 16 }, { "epoch": 0.32731648616125153, "grad_norm": 28.58907699584961, "learning_rate": 0.00018333333333333334, "loss": 6.5333, "step": 17 }, { "epoch": 0.34657039711191334, "grad_norm": 24.02996253967285, "learning_rate": 0.0001823529411764706, "loss": 6.5981, "step": 18 }, { "epoch": 0.3658243080625752, "grad_norm": 23.250669479370117, "learning_rate": 0.00018137254901960786, "loss": 6.4779, "step": 19 }, { "epoch": 0.3850782190132371, "grad_norm": 15.006091117858887, "learning_rate": 0.0001803921568627451, "loss": 6.6096, "step": 20 }, { "epoch": 0.4043321299638989, "grad_norm": 16.560985565185547, "learning_rate": 0.00017941176470588236, "loss": 6.6496, "step": 21 }, { "epoch": 0.42358604091456076, "grad_norm": 31.329875946044922, "learning_rate": 0.00017843137254901963, "loss": 6.9627, "step": 22 }, { "epoch": 0.4428399518652226, "grad_norm": 12.381958961486816, "learning_rate": 0.00017745098039215687, "loss": 6.398, "step": 23 }, { "epoch": 0.4620938628158845, "grad_norm": 9.271923065185547, "learning_rate": 0.00017647058823529413, "loss": 6.6, "step": 24 }, { "epoch": 0.4813477737665463, "grad_norm": 12.544185638427734, "learning_rate": 0.00017549019607843137, "loss": 6.4684, "step": 25 }, { "epoch": 0.4813477737665463, "eval_clap": 0.09883298724889755, "eval_loss": 6.00625467300415, "eval_runtime": 166.3531, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.096, "step": 25 }, { "epoch": 0.5006016847172082, "grad_norm": 11.769013404846191, "learning_rate": 0.00017450980392156863, "loss": 6.5248, "step": 26 }, { "epoch": 0.51985559566787, "grad_norm": 11.039627075195312, "learning_rate": 0.0001735294117647059, "loss": 6.6403, "step": 27 }, { "epoch": 0.5391095066185319, "grad_norm": 17.4042911529541, "learning_rate": 0.00017254901960784316, "loss": 6.8092, "step": 28 }, { "epoch": 0.5583634175691937, "grad_norm": 12.926351547241211, "learning_rate": 0.0001715686274509804, "loss": 6.5886, "step": 29 }, { "epoch": 0.5776173285198556, "grad_norm": 12.865156173706055, "learning_rate": 0.00017058823529411766, "loss": 6.6176, "step": 30 }, { "epoch": 0.5968712394705175, "grad_norm": 15.517515182495117, "learning_rate": 0.0001696078431372549, "loss": 6.4096, "step": 31 }, { "epoch": 0.6161251504211793, "grad_norm": 12.356785774230957, "learning_rate": 0.00016862745098039216, "loss": 6.4528, "step": 32 }, { "epoch": 0.6353790613718412, "grad_norm": 15.226251602172852, "learning_rate": 0.00016764705882352942, "loss": 6.3188, "step": 33 }, { "epoch": 0.6546329723225031, "grad_norm": 13.221582412719727, "learning_rate": 0.0001666666666666667, "loss": 6.542, "step": 34 }, { "epoch": 0.6738868832731648, "grad_norm": 13.414304733276367, "learning_rate": 0.00016568627450980395, "loss": 6.4272, "step": 35 }, { "epoch": 0.6931407942238267, "grad_norm": 27.81321907043457, "learning_rate": 0.0001647058823529412, "loss": 6.7035, "step": 36 }, { "epoch": 0.7123947051744886, "grad_norm": 17.882911682128906, "learning_rate": 0.00016372549019607843, "loss": 6.6117, "step": 37 }, { "epoch": 0.7316486161251504, "grad_norm": 10.675613403320312, "learning_rate": 0.0001627450980392157, "loss": 6.4818, "step": 38 }, { "epoch": 0.7509025270758123, "grad_norm": 11.32511043548584, "learning_rate": 0.00016176470588235295, "loss": 6.4717, "step": 39 }, { "epoch": 0.7701564380264742, "grad_norm": 13.292048454284668, "learning_rate": 0.00016078431372549022, "loss": 6.4119, "step": 40 }, { "epoch": 0.789410348977136, "grad_norm": 9.824177742004395, "learning_rate": 0.00015980392156862746, "loss": 6.6399, "step": 41 }, { "epoch": 0.8086642599277978, "grad_norm": 18.48476791381836, "learning_rate": 0.0001588235294117647, "loss": 6.4116, "step": 42 }, { "epoch": 0.8279181708784596, "grad_norm": 10.409250259399414, "learning_rate": 0.00015784313725490196, "loss": 6.4832, "step": 43 }, { "epoch": 0.8471720818291215, "grad_norm": 18.297466278076172, "learning_rate": 0.00015686274509803922, "loss": 6.308, "step": 44 }, { "epoch": 0.8664259927797834, "grad_norm": 12.408952713012695, "learning_rate": 0.00015588235294117648, "loss": 6.3373, "step": 45 }, { "epoch": 0.8856799037304453, "grad_norm": 12.280571937561035, "learning_rate": 0.00015490196078431375, "loss": 6.3173, "step": 46 }, { "epoch": 0.9049338146811071, "grad_norm": 12.348167419433594, "learning_rate": 0.00015392156862745098, "loss": 6.2873, "step": 47 }, { "epoch": 0.924187725631769, "grad_norm": 28.005126953125, "learning_rate": 0.00015294117647058822, "loss": 6.7117, "step": 48 }, { "epoch": 0.9434416365824309, "grad_norm": 16.248571395874023, "learning_rate": 0.00015196078431372549, "loss": 6.3493, "step": 49 }, { "epoch": 0.9626955475330926, "grad_norm": 19.102869033813477, "learning_rate": 0.00015098039215686275, "loss": 6.4209, "step": 50 }, { "epoch": 0.9626955475330926, "eval_clap": 0.13957397639751434, "eval_loss": 6.070012092590332, "eval_runtime": 165.6113, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 50 }, { "epoch": 0.9819494584837545, "grad_norm": 6.675487995147705, "learning_rate": 0.00015000000000000001, "loss": 6.1695, "step": 51 }, { "epoch": 1.0, "grad_norm": 14.88092041015625, "learning_rate": 0.00014901960784313728, "loss": 5.6169, "step": 52 }, { "epoch": 1.0192539109506618, "grad_norm": 19.78269386291504, "learning_rate": 0.00014803921568627451, "loss": 6.5455, "step": 53 }, { "epoch": 1.0385078219013237, "grad_norm": 7.873740196228027, "learning_rate": 0.00014705882352941178, "loss": 6.3154, "step": 54 }, { "epoch": 1.0577617328519855, "grad_norm": 10.514632225036621, "learning_rate": 0.00014607843137254902, "loss": 6.5085, "step": 55 }, { "epoch": 1.0770156438026475, "grad_norm": 10.021757125854492, "learning_rate": 0.00014509803921568628, "loss": 6.5109, "step": 56 }, { "epoch": 1.0962695547533092, "grad_norm": 8.690667152404785, "learning_rate": 0.00014411764705882354, "loss": 6.5515, "step": 57 }, { "epoch": 1.1155234657039712, "grad_norm": 12.78662109375, "learning_rate": 0.00014313725490196078, "loss": 6.5425, "step": 58 }, { "epoch": 1.134777376654633, "grad_norm": 10.592965126037598, "learning_rate": 0.00014215686274509804, "loss": 6.5105, "step": 59 }, { "epoch": 1.154031287605295, "grad_norm": 7.947122573852539, "learning_rate": 0.0001411764705882353, "loss": 6.6142, "step": 60 }, { "epoch": 1.1732851985559567, "grad_norm": 6.823319911956787, "learning_rate": 0.00014019607843137255, "loss": 6.5339, "step": 61 }, { "epoch": 1.1925391095066185, "grad_norm": 16.670989990234375, "learning_rate": 0.0001392156862745098, "loss": 6.3022, "step": 62 }, { "epoch": 1.2117930204572804, "grad_norm": 20.09317398071289, "learning_rate": 0.00013823529411764707, "loss": 6.0779, "step": 63 }, { "epoch": 1.2310469314079422, "grad_norm": 8.030014991760254, "learning_rate": 0.0001372549019607843, "loss": 6.3284, "step": 64 }, { "epoch": 1.2503008423586042, "grad_norm": 10.324827194213867, "learning_rate": 0.00013627450980392157, "loss": 6.4022, "step": 65 }, { "epoch": 1.269554753309266, "grad_norm": 29.070960998535156, "learning_rate": 0.00013529411764705884, "loss": 6.7835, "step": 66 }, { "epoch": 1.288808664259928, "grad_norm": 17.838394165039062, "learning_rate": 0.00013431372549019608, "loss": 6.5344, "step": 67 }, { "epoch": 1.3080625752105897, "grad_norm": 10.388354301452637, "learning_rate": 0.00013333333333333334, "loss": 6.3438, "step": 68 }, { "epoch": 1.3273164861612514, "grad_norm": 9.607653617858887, "learning_rate": 0.0001323529411764706, "loss": 6.4325, "step": 69 }, { "epoch": 1.3465703971119134, "grad_norm": 9.639688491821289, "learning_rate": 0.00013137254901960784, "loss": 6.3907, "step": 70 }, { "epoch": 1.3658243080625752, "grad_norm": 9.424043655395508, "learning_rate": 0.0001303921568627451, "loss": 6.605, "step": 71 }, { "epoch": 1.3850782190132371, "grad_norm": 8.21303653717041, "learning_rate": 0.00012941176470588237, "loss": 6.6275, "step": 72 }, { "epoch": 1.404332129963899, "grad_norm": 10.479741096496582, "learning_rate": 0.00012843137254901963, "loss": 6.4801, "step": 73 }, { "epoch": 1.4235860409145609, "grad_norm": 21.424253463745117, "learning_rate": 0.00012745098039215687, "loss": 6.3391, "step": 74 }, { "epoch": 1.4428399518652226, "grad_norm": 6.5513224601745605, "learning_rate": 0.0001264705882352941, "loss": 6.7252, "step": 75 }, { "epoch": 1.4428399518652226, "eval_clap": 0.10309316217899323, "eval_loss": 6.036521911621094, "eval_runtime": 165.4554, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 75 }, { "epoch": 1.4620938628158844, "grad_norm": 32.52528762817383, "learning_rate": 0.00012549019607843137, "loss": 6.1922, "step": 76 }, { "epoch": 1.4813477737665464, "grad_norm": 23.51795196533203, "learning_rate": 0.00012450980392156863, "loss": 6.3506, "step": 77 }, { "epoch": 1.5006016847172083, "grad_norm": 10.925686836242676, "learning_rate": 0.0001235294117647059, "loss": 6.4783, "step": 78 }, { "epoch": 1.5198555956678699, "grad_norm": 7.924820899963379, "learning_rate": 0.00012254901960784316, "loss": 6.6288, "step": 79 }, { "epoch": 1.5391095066185319, "grad_norm": 6.946601390838623, "learning_rate": 0.00012156862745098039, "loss": 6.4085, "step": 80 }, { "epoch": 1.5583634175691938, "grad_norm": 10.120043754577637, "learning_rate": 0.00012058823529411765, "loss": 6.4667, "step": 81 }, { "epoch": 1.5776173285198556, "grad_norm": 9.635017395019531, "learning_rate": 0.0001196078431372549, "loss": 6.3742, "step": 82 }, { "epoch": 1.5968712394705173, "grad_norm": 6.578627586364746, "learning_rate": 0.00011862745098039216, "loss": 6.1956, "step": 83 }, { "epoch": 1.6161251504211793, "grad_norm": 18.30640983581543, "learning_rate": 0.00011764705882352942, "loss": 6.4804, "step": 84 }, { "epoch": 1.6353790613718413, "grad_norm": 11.166876792907715, "learning_rate": 0.00011666666666666668, "loss": 6.4495, "step": 85 }, { "epoch": 1.654632972322503, "grad_norm": 8.15738582611084, "learning_rate": 0.00011568627450980394, "loss": 6.1371, "step": 86 }, { "epoch": 1.6738868832731648, "grad_norm": 9.473989486694336, "learning_rate": 0.00011470588235294118, "loss": 6.366, "step": 87 }, { "epoch": 1.6931407942238268, "grad_norm": 16.634380340576172, "learning_rate": 0.00011372549019607843, "loss": 6.1748, "step": 88 }, { "epoch": 1.7123947051744886, "grad_norm": 20.92518424987793, "learning_rate": 0.0001127450980392157, "loss": 6.0918, "step": 89 }, { "epoch": 1.7316486161251503, "grad_norm": 10.186667442321777, "learning_rate": 0.00011176470588235294, "loss": 6.1072, "step": 90 }, { "epoch": 1.7509025270758123, "grad_norm": 21.300180435180664, "learning_rate": 0.00011078431372549021, "loss": 6.724, "step": 91 }, { "epoch": 1.7701564380264743, "grad_norm": 17.833845138549805, "learning_rate": 0.00010980392156862746, "loss": 6.2231, "step": 92 }, { "epoch": 1.789410348977136, "grad_norm": 12.850127220153809, "learning_rate": 0.0001088235294117647, "loss": 6.4846, "step": 93 }, { "epoch": 1.8086642599277978, "grad_norm": 16.229764938354492, "learning_rate": 0.00010784313725490196, "loss": 6.6046, "step": 94 }, { "epoch": 1.8279181708784598, "grad_norm": 41.6049690246582, "learning_rate": 0.00010686274509803922, "loss": 6.5044, "step": 95 }, { "epoch": 1.8471720818291215, "grad_norm": 8.0320463180542, "learning_rate": 0.00010588235294117647, "loss": 6.4836, "step": 96 }, { "epoch": 1.8664259927797833, "grad_norm": 19.129127502441406, "learning_rate": 0.00010490196078431374, "loss": 6.1962, "step": 97 }, { "epoch": 1.8856799037304453, "grad_norm": 14.464997291564941, "learning_rate": 0.00010392156862745099, "loss": 6.2694, "step": 98 }, { "epoch": 1.9049338146811072, "grad_norm": 25.245752334594727, "learning_rate": 0.00010294117647058823, "loss": 6.0148, "step": 99 }, { "epoch": 1.924187725631769, "grad_norm": 12.66399097442627, "learning_rate": 0.00010196078431372549, "loss": 6.1879, "step": 100 }, { "epoch": 1.924187725631769, "eval_clap": 0.12328307330608368, "eval_loss": 5.896579742431641, "eval_runtime": 165.5834, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 100 }, { "epoch": 1.9434416365824307, "grad_norm": 12.162952423095703, "learning_rate": 0.00010098039215686274, "loss": 6.1875, "step": 101 }, { "epoch": 1.9626955475330927, "grad_norm": 16.754629135131836, "learning_rate": 0.0001, "loss": 6.5483, "step": 102 }, { "epoch": 1.9819494584837545, "grad_norm": 9.804841995239258, "learning_rate": 9.901960784313727e-05, "loss": 6.0631, "step": 103 }, { "epoch": 2.0, "grad_norm": 26.169551849365234, "learning_rate": 9.80392156862745e-05, "loss": 6.3384, "step": 104 }, { "epoch": 2.019253910950662, "grad_norm": 22.054380416870117, "learning_rate": 9.705882352941177e-05, "loss": 6.5192, "step": 105 }, { "epoch": 2.0385078219013235, "grad_norm": 13.319371223449707, "learning_rate": 9.607843137254903e-05, "loss": 6.1904, "step": 106 }, { "epoch": 2.0577617328519855, "grad_norm": 13.158707618713379, "learning_rate": 9.509803921568627e-05, "loss": 6.4906, "step": 107 }, { "epoch": 2.0770156438026475, "grad_norm": 7.972289562225342, "learning_rate": 9.411764705882353e-05, "loss": 6.4551, "step": 108 }, { "epoch": 2.0962695547533094, "grad_norm": 14.052528381347656, "learning_rate": 9.313725490196079e-05, "loss": 6.2028, "step": 109 }, { "epoch": 2.115523465703971, "grad_norm": 21.128631591796875, "learning_rate": 9.215686274509804e-05, "loss": 6.121, "step": 110 }, { "epoch": 2.134777376654633, "grad_norm": 9.11488151550293, "learning_rate": 9.11764705882353e-05, "loss": 6.559, "step": 111 }, { "epoch": 2.154031287605295, "grad_norm": 10.081767082214355, "learning_rate": 9.019607843137255e-05, "loss": 6.4236, "step": 112 }, { "epoch": 2.1732851985559565, "grad_norm": 7.397235870361328, "learning_rate": 8.921568627450981e-05, "loss": 6.5415, "step": 113 }, { "epoch": 2.1925391095066185, "grad_norm": 9.652939796447754, "learning_rate": 8.823529411764706e-05, "loss": 6.3744, "step": 114 }, { "epoch": 2.2117930204572804, "grad_norm": 12.823005676269531, "learning_rate": 8.725490196078432e-05, "loss": 5.9683, "step": 115 }, { "epoch": 2.2310469314079424, "grad_norm": 9.981169700622559, "learning_rate": 8.627450980392158e-05, "loss": 6.2714, "step": 116 }, { "epoch": 2.250300842358604, "grad_norm": 11.026590347290039, "learning_rate": 8.529411764705883e-05, "loss": 6.1287, "step": 117 }, { "epoch": 2.269554753309266, "grad_norm": 14.469505310058594, "learning_rate": 8.431372549019608e-05, "loss": 6.2634, "step": 118 }, { "epoch": 2.288808664259928, "grad_norm": 10.639300346374512, "learning_rate": 8.333333333333334e-05, "loss": 6.1014, "step": 119 }, { "epoch": 2.30806257521059, "grad_norm": 10.407938003540039, "learning_rate": 8.23529411764706e-05, "loss": 6.2487, "step": 120 }, { "epoch": 2.3273164861612514, "grad_norm": 18.310867309570312, "learning_rate": 8.137254901960785e-05, "loss": 6.025, "step": 121 }, { "epoch": 2.3465703971119134, "grad_norm": 13.314108848571777, "learning_rate": 8.039215686274511e-05, "loss": 6.1319, "step": 122 }, { "epoch": 2.3658243080625754, "grad_norm": 12.528412818908691, "learning_rate": 7.941176470588235e-05, "loss": 6.27, "step": 123 }, { "epoch": 2.385078219013237, "grad_norm": 10.71603775024414, "learning_rate": 7.843137254901961e-05, "loss": 6.4118, "step": 124 }, { "epoch": 2.404332129963899, "grad_norm": 8.234016418457031, "learning_rate": 7.745098039215687e-05, "loss": 6.3642, "step": 125 }, { "epoch": 2.404332129963899, "eval_clap": 0.10650094598531723, "eval_loss": 6.806448936462402, "eval_runtime": 165.8182, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.096, "step": 125 }, { "epoch": 2.423586040914561, "grad_norm": 13.84628963470459, "learning_rate": 7.647058823529411e-05, "loss": 6.0872, "step": 126 }, { "epoch": 2.4428399518652224, "grad_norm": 7.576101779937744, "learning_rate": 7.549019607843137e-05, "loss": 6.3515, "step": 127 }, { "epoch": 2.4620938628158844, "grad_norm": 9.205301284790039, "learning_rate": 7.450980392156864e-05, "loss": 6.0883, "step": 128 }, { "epoch": 2.4813477737665464, "grad_norm": 8.85059928894043, "learning_rate": 7.352941176470589e-05, "loss": 5.824, "step": 129 }, { "epoch": 2.5006016847172083, "grad_norm": 6.963297367095947, "learning_rate": 7.254901960784314e-05, "loss": 6.4633, "step": 130 }, { "epoch": 2.51985559566787, "grad_norm": 6.612102508544922, "learning_rate": 7.156862745098039e-05, "loss": 6.3979, "step": 131 }, { "epoch": 2.539109506618532, "grad_norm": 11.322911262512207, "learning_rate": 7.058823529411765e-05, "loss": 6.2103, "step": 132 }, { "epoch": 2.558363417569194, "grad_norm": 21.0396671295166, "learning_rate": 6.96078431372549e-05, "loss": 5.6772, "step": 133 }, { "epoch": 2.577617328519856, "grad_norm": 13.040122985839844, "learning_rate": 6.862745098039216e-05, "loss": 6.0072, "step": 134 }, { "epoch": 2.5968712394705173, "grad_norm": 13.392056465148926, "learning_rate": 6.764705882352942e-05, "loss": 6.0408, "step": 135 }, { "epoch": 2.6161251504211793, "grad_norm": 9.345407485961914, "learning_rate": 6.666666666666667e-05, "loss": 6.345, "step": 136 }, { "epoch": 2.6353790613718413, "grad_norm": 9.068965911865234, "learning_rate": 6.568627450980392e-05, "loss": 6.0518, "step": 137 }, { "epoch": 2.654632972322503, "grad_norm": 9.924796104431152, "learning_rate": 6.470588235294118e-05, "loss": 6.404, "step": 138 }, { "epoch": 2.673886883273165, "grad_norm": 11.512860298156738, "learning_rate": 6.372549019607843e-05, "loss": 5.849, "step": 139 }, { "epoch": 2.693140794223827, "grad_norm": 9.558600425720215, "learning_rate": 6.274509803921569e-05, "loss": 6.0751, "step": 140 }, { "epoch": 2.7123947051744883, "grad_norm": 14.465291976928711, "learning_rate": 6.176470588235295e-05, "loss": 5.5432, "step": 141 }, { "epoch": 2.7316486161251503, "grad_norm": 14.843960762023926, "learning_rate": 6.078431372549019e-05, "loss": 5.8858, "step": 142 }, { "epoch": 2.7509025270758123, "grad_norm": 8.04920768737793, "learning_rate": 5.980392156862745e-05, "loss": 5.8131, "step": 143 }, { "epoch": 2.7701564380264743, "grad_norm": 9.71105670928955, "learning_rate": 5.882352941176471e-05, "loss": 5.9374, "step": 144 }, { "epoch": 2.7894103489771362, "grad_norm": 5.949017524719238, "learning_rate": 5.784313725490197e-05, "loss": 6.4545, "step": 145 }, { "epoch": 2.808664259927798, "grad_norm": 7.233414649963379, "learning_rate": 5.6862745098039215e-05, "loss": 6.1215, "step": 146 }, { "epoch": 2.8279181708784598, "grad_norm": 9.445034980773926, "learning_rate": 5.588235294117647e-05, "loss": 5.7711, "step": 147 }, { "epoch": 2.8471720818291217, "grad_norm": 6.351881980895996, "learning_rate": 5.490196078431373e-05, "loss": 6.3073, "step": 148 }, { "epoch": 2.8664259927797833, "grad_norm": 5.955877304077148, "learning_rate": 5.392156862745098e-05, "loss": 6.2675, "step": 149 }, { "epoch": 2.8856799037304453, "grad_norm": 7.2687764167785645, "learning_rate": 5.294117647058824e-05, "loss": 6.2382, "step": 150 }, { "epoch": 2.8856799037304453, "eval_clap": 0.07656023651361465, "eval_loss": 6.118464469909668, "eval_runtime": 165.7635, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 150 }, { "epoch": 2.9049338146811072, "grad_norm": 7.581653594970703, "learning_rate": 5.1960784313725495e-05, "loss": 6.1951, "step": 151 }, { "epoch": 2.9241877256317688, "grad_norm": 5.309889793395996, "learning_rate": 5.0980392156862745e-05, "loss": 6.1416, "step": 152 }, { "epoch": 2.9434416365824307, "grad_norm": 10.804561614990234, "learning_rate": 5e-05, "loss": 6.4203, "step": 153 }, { "epoch": 2.9626955475330927, "grad_norm": 7.452890872955322, "learning_rate": 4.901960784313725e-05, "loss": 6.3695, "step": 154 }, { "epoch": 2.9819494584837543, "grad_norm": 7.373142719268799, "learning_rate": 4.803921568627452e-05, "loss": 6.0469, "step": 155 }, { "epoch": 3.0, "grad_norm": 6.503188610076904, "learning_rate": 4.705882352941177e-05, "loss": 5.5774, "step": 156 }, { "epoch": 3.019253910950662, "grad_norm": 6.571235656738281, "learning_rate": 4.607843137254902e-05, "loss": 6.3784, "step": 157 }, { "epoch": 3.0385078219013235, "grad_norm": 6.059790134429932, "learning_rate": 4.5098039215686275e-05, "loss": 6.2638, "step": 158 }, { "epoch": 3.0577617328519855, "grad_norm": 7.978560447692871, "learning_rate": 4.411764705882353e-05, "loss": 6.2388, "step": 159 }, { "epoch": 3.0770156438026475, "grad_norm": 4.5174479484558105, "learning_rate": 4.313725490196079e-05, "loss": 6.1811, "step": 160 }, { "epoch": 3.0962695547533094, "grad_norm": 16.497093200683594, "learning_rate": 4.215686274509804e-05, "loss": 5.8567, "step": 161 }, { "epoch": 3.115523465703971, "grad_norm": 10.036762237548828, "learning_rate": 4.11764705882353e-05, "loss": 5.7851, "step": 162 }, { "epoch": 3.134777376654633, "grad_norm": 8.312905311584473, "learning_rate": 4.0196078431372555e-05, "loss": 6.3701, "step": 163 }, { "epoch": 3.154031287605295, "grad_norm": 6.305182456970215, "learning_rate": 3.9215686274509805e-05, "loss": 6.2461, "step": 164 }, { "epoch": 3.1732851985559565, "grad_norm": 6.297240257263184, "learning_rate": 3.8235294117647055e-05, "loss": 6.1583, "step": 165 }, { "epoch": 3.1925391095066185, "grad_norm": 6.377700328826904, "learning_rate": 3.725490196078432e-05, "loss": 5.8368, "step": 166 }, { "epoch": 3.2117930204572804, "grad_norm": 6.20255708694458, "learning_rate": 3.627450980392157e-05, "loss": 6.1394, "step": 167 }, { "epoch": 3.2310469314079424, "grad_norm": 10.172269821166992, "learning_rate": 3.529411764705883e-05, "loss": 5.99, "step": 168 }, { "epoch": 3.250300842358604, "grad_norm": 12.56449031829834, "learning_rate": 3.431372549019608e-05, "loss": 6.2823, "step": 169 }, { "epoch": 3.269554753309266, "grad_norm": 6.517347812652588, "learning_rate": 3.3333333333333335e-05, "loss": 6.4417, "step": 170 }, { "epoch": 3.288808664259928, "grad_norm": 7.165337085723877, "learning_rate": 3.235294117647059e-05, "loss": 6.1048, "step": 171 }, { "epoch": 3.30806257521059, "grad_norm": 14.79480266571045, "learning_rate": 3.137254901960784e-05, "loss": 5.9012, "step": 172 }, { "epoch": 3.3273164861612514, "grad_norm": 10.55307388305664, "learning_rate": 3.0392156862745097e-05, "loss": 6.0419, "step": 173 }, { "epoch": 3.3465703971119134, "grad_norm": 7.354953289031982, "learning_rate": 2.9411764705882354e-05, "loss": 5.9871, "step": 174 }, { "epoch": 3.3658243080625754, "grad_norm": 7.013256549835205, "learning_rate": 2.8431372549019608e-05, "loss": 6.3169, "step": 175 }, { "epoch": 3.3658243080625754, "eval_clap": 0.09689466655254364, "eval_loss": 6.116217613220215, "eval_runtime": 165.7689, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 175 }, { "epoch": 3.385078219013237, "grad_norm": 8.007953643798828, "learning_rate": 2.7450980392156865e-05, "loss": 6.0573, "step": 176 }, { "epoch": 3.404332129963899, "grad_norm": 7.166982173919678, "learning_rate": 2.647058823529412e-05, "loss": 6.3097, "step": 177 }, { "epoch": 3.423586040914561, "grad_norm": 5.868830680847168, "learning_rate": 2.5490196078431373e-05, "loss": 6.1856, "step": 178 }, { "epoch": 3.4428399518652224, "grad_norm": 7.172518253326416, "learning_rate": 2.4509803921568626e-05, "loss": 6.284, "step": 179 }, { "epoch": 3.4620938628158844, "grad_norm": 5.972955226898193, "learning_rate": 2.3529411764705884e-05, "loss": 6.1067, "step": 180 }, { "epoch": 3.4813477737665464, "grad_norm": 5.716938495635986, "learning_rate": 2.2549019607843138e-05, "loss": 6.2792, "step": 181 }, { "epoch": 3.5006016847172083, "grad_norm": 5.647866249084473, "learning_rate": 2.1568627450980395e-05, "loss": 6.336, "step": 182 }, { "epoch": 3.51985559566787, "grad_norm": 7.596288204193115, "learning_rate": 2.058823529411765e-05, "loss": 6.1188, "step": 183 }, { "epoch": 3.539109506618532, "grad_norm": 9.767680168151855, "learning_rate": 1.9607843137254903e-05, "loss": 6.3607, "step": 184 }, { "epoch": 3.558363417569194, "grad_norm": 5.301209926605225, "learning_rate": 1.862745098039216e-05, "loss": 6.0671, "step": 185 }, { "epoch": 3.577617328519856, "grad_norm": 6.347781658172607, "learning_rate": 1.7647058823529414e-05, "loss": 6.1538, "step": 186 }, { "epoch": 3.5968712394705173, "grad_norm": 6.653684139251709, "learning_rate": 1.6666666666666667e-05, "loss": 6.1422, "step": 187 }, { "epoch": 3.6161251504211793, "grad_norm": 9.340754508972168, "learning_rate": 1.568627450980392e-05, "loss": 5.6681, "step": 188 }, { "epoch": 3.6353790613718413, "grad_norm": 6.159310340881348, "learning_rate": 1.4705882352941177e-05, "loss": 5.8408, "step": 189 }, { "epoch": 3.654632972322503, "grad_norm": 7.5495195388793945, "learning_rate": 1.3725490196078432e-05, "loss": 6.1853, "step": 190 }, { "epoch": 3.673886883273165, "grad_norm": 6.215287208557129, "learning_rate": 1.2745098039215686e-05, "loss": 6.082, "step": 191 }, { "epoch": 3.693140794223827, "grad_norm": 5.863905906677246, "learning_rate": 1.1764705882352942e-05, "loss": 6.0772, "step": 192 }, { "epoch": 3.7123947051744883, "grad_norm": 5.785052299499512, "learning_rate": 1.0784313725490197e-05, "loss": 6.2809, "step": 193 }, { "epoch": 3.7316486161251503, "grad_norm": 8.62579345703125, "learning_rate": 9.803921568627451e-06, "loss": 5.9173, "step": 194 }, { "epoch": 3.7509025270758123, "grad_norm": 8.095368385314941, "learning_rate": 8.823529411764707e-06, "loss": 6.2614, "step": 195 }, { "epoch": 3.7701564380264743, "grad_norm": 6.416041851043701, "learning_rate": 7.84313725490196e-06, "loss": 5.7276, "step": 196 }, { "epoch": 3.7894103489771362, "grad_norm": 6.0362868309021, "learning_rate": 6.862745098039216e-06, "loss": 6.1875, "step": 197 }, { "epoch": 3.808664259927798, "grad_norm": 6.641626834869385, "learning_rate": 5.882352941176471e-06, "loss": 6.0641, "step": 198 }, { "epoch": 3.8279181708784598, "grad_norm": 6.249925136566162, "learning_rate": 4.901960784313726e-06, "loss": 6.4255, "step": 199 }, { "epoch": 3.8471720818291217, "grad_norm": 7.856912136077881, "learning_rate": 3.92156862745098e-06, "loss": 5.7667, "step": 200 }, { "epoch": 3.8471720818291217, "eval_clap": 0.11432015895843506, "eval_loss": 6.130455017089844, "eval_runtime": 165.7823, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.097, "step": 200 }, { "epoch": 3.8664259927797833, "grad_norm": 8.209946632385254, "learning_rate": 2.9411764705882355e-06, "loss": 6.1598, "step": 201 }, { "epoch": 3.8856799037304453, "grad_norm": 7.541530609130859, "learning_rate": 1.96078431372549e-06, "loss": 5.7201, "step": 202 }, { "epoch": 3.9049338146811072, "grad_norm": 36.531105041503906, "learning_rate": 9.80392156862745e-07, "loss": 6.0873, "step": 203 }, { "epoch": 3.9241877256317688, "grad_norm": 6.220560073852539, "learning_rate": 0.0, "loss": 6.0892, "step": 204 }, { "epoch": 3.9241877256317688, "step": 204, "total_flos": 784195045500888.0, "train_loss": 6.39456293629665, "train_runtime": 14405.0011, "train_samples_per_second": 0.231, "train_steps_per_second": 0.014 } ], "logging_steps": 1.0, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 784195045500888.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }