4040 base_metric : qwen3-sft/20251117105949/tracker.jsonl
4141 check_metrics :
4242 grad_norm : 0.000001
43+ loss/maxvio : 0.000001
44+ loss/local_loss : 0.000001
45+ loss/reduced_balancing_loss : 0.000001
4346 loss/reduced_llm_loss : 0.000001
4447 lr : 0
4548 memory/max_memory_GB : 0.2
6467 base_metric : qwen3-sft-ep8/cec3a8d2/tracker.jsonl
6568 check_metrics :
6669 grad_norm : 0.000001
70+ loss/maxvio : 0.000001
71+ loss/local_loss : 0.000001
72+ loss/reduced_balancing_loss : 0.000001
6773 loss/reduced_llm_loss : 0.000001
6874 lr : 0
6975 memory/max_memory_GB : 0.2
8894 base_metric : qwen3-sft-ep8/cec3a8d2_resume/tracker.jsonl
8995 check_metrics :
9096 grad_norm : 0.000001
97+ loss/maxvio : 0.000001
98+ loss/local_loss : 0.000001
99+ loss/reduced_balancing_loss : 0.000001
91100 loss/reduced_llm_loss : 0.000001
92101 lr : 0
93102 memory/max_memory_GB : 0.2
@@ -109,6 +118,9 @@ case:
109118 base_metric : qwen3-sft-tp2/cec3a8d2/tracker.jsonl
110119 check_metrics :
111120 grad_norm : 0.000001
121+ loss/maxvio : 0.000001
122+ loss/local_loss : 0.000001
123+ loss/reduced_balancing_loss : 0.000001
112124 loss/reduced_llm_loss : 0.000001
113125 lr : 0
114126 memory/max_memory_GB : 0.2
@@ -133,6 +145,9 @@ case:
133145 base_metric : qwen3-sft-recompute/d76995/tracker.jsonl
134146 check_metrics :
135147 grad_norm : 0.000001
148+ loss/maxvio : 0.000001
149+ loss/local_loss : 0.000001
150+ loss/reduced_balancing_loss : 0.000001
136151 loss/reduced_llm_loss : 0.000001
137152 lr : 0
138153 memory/max_memory_GB : 0.2
@@ -159,6 +174,9 @@ case:
159174 base_metric : qwen3-sft-fp8/d76995/tracker.jsonl
160175 check_metrics :
161176 grad_norm : 0.1
177+ loss/maxvio : 0.000001
178+ loss/local_loss : 0.000001
179+ loss/reduced_balancing_loss : 0.000001
162180 loss/reduced_llm_loss : 0.000001
163181 lr : 0
164182 memory/max_memory_GB : 0.2
@@ -183,6 +201,9 @@ case:
183201 base_metric : qwen3-sft/20251117105949/tracker.jsonl
184202 check_metrics :
185203 grad_norm : 1
204+ loss/maxvio : 0.000001
205+ loss/local_loss : 0.000001
206+ loss/reduced_balancing_loss : 0.000001
186207 loss/reduced_llm_loss : 0.02
187208 lr : 0
188209 timeout : 10800
@@ -204,6 +225,9 @@ case:
204225 base_metric : qwen3-sft-celoss/812c1021/tracker.jsonl
205226 check_metrics :
206227 grad_norm : 0.000001
228+ loss/maxvio : 0.000001
229+ loss/local_loss : 0.000001
230+ loss/reduced_balancing_loss : 0.000001
207231 loss/reduced_llm_loss : 0.000001
208232 lr : 0
209233 memory/max_memory_GB : 0.2
@@ -226,6 +250,9 @@ case:
226250 base_metric : qwen3-30B-sp4-intralayer2/c0eba147/tracker.jsonl
227251 check_metrics :
228252 grad_norm : 0.000001
253+ loss/maxvio : 0.000001
254+ loss/local_loss : 0.000001
255+ loss/reduced_balancing_loss : 0.000001
229256 loss/reduced_llm_loss : 0.000001
230257 lr : 0
231258 memory/max_memory_GB : 0.2
@@ -248,6 +275,9 @@ case:
248275 base_metric : qwen3-30B-sp8-intralayer2/c0eba147/tracker.jsonl
249276 check_metrics :
250277 grad_norm : 0.025
278+ loss/maxvio : 0.000001
279+ loss/local_loss : 0.000001
280+ loss/reduced_balancing_loss : 0.000001
251281 loss/reduced_llm_loss : 0.000001
252282 lr : 0
253283 memory/max_memory_GB : 0.2
@@ -270,6 +300,9 @@ case:
270300 base_metric : gptoss-sft/7b774a0e2/tracker.jsonl
271301 check_metrics :
272302 grad_norm : 0.9
303+ loss/maxvio : 0.000001
304+ loss/local_loss : 0.000001
305+ loss/reduced_balancing_loss : 0.000001
273306 loss/reduced_llm_loss : 0.1
274307 lr : 0
275308 memory/max_memory_GB : 0.2
@@ -294,6 +327,9 @@ case:
294327 base_metric : qwen3-sft-cache/e968368a/tracker.jsonl
295328 check_metrics :
296329 grad_norm : 0.000001
330+ loss/maxvio : 0.000001
331+ loss/local_loss : 0.000001
332+ loss/reduced_balancing_loss : 0.000001
297333 loss/reduced_llm_loss : 0.000001
298334 lr : 0
299335 memory/max_memory_GB : 0.2
@@ -317,6 +353,9 @@ case:
317353 base_metric : qwen3-sft-vl-dense/812c1021/tracker.jsonl
318354 check_metrics :
319355 grad_norm : 0.000001
356+ loss/maxvio : 0.000001
357+ loss/local_loss : 0.000001
358+ loss/reduced_balancing_loss : 0.000001
320359 loss/reduced_llm_loss : 0.000001
321360 lr : 0
322361 memory/max_memory_GB : 0.2
0 commit comments