Skip to content

Commit 92f65f6

Browse files
committed
add more loss verification
1 parent d3eba75 commit 92f65f6

1 file changed

Lines changed: 39 additions & 0 deletions

File tree

autotest/config.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ case:
4040
base_metric: qwen3-sft/20251117105949/tracker.jsonl
4141
check_metrics:
4242
grad_norm: 0.000001
43+
loss/maxvio: 0.000001
44+
loss/local_loss: 0.000001
45+
loss/reduced_balancing_loss: 0.000001
4346
loss/reduced_llm_loss: 0.000001
4447
lr: 0
4548
memory/max_memory_GB: 0.2
@@ -64,6 +67,9 @@ case:
6467
base_metric: qwen3-sft-ep8/cec3a8d2/tracker.jsonl
6568
check_metrics:
6669
grad_norm: 0.000001
70+
loss/maxvio: 0.000001
71+
loss/local_loss: 0.000001
72+
loss/reduced_balancing_loss: 0.000001
6773
loss/reduced_llm_loss: 0.000001
6874
lr: 0
6975
memory/max_memory_GB: 0.2
@@ -88,6 +94,9 @@ case:
8894
base_metric: qwen3-sft-ep8/cec3a8d2_resume/tracker.jsonl
8995
check_metrics:
9096
grad_norm: 0.000001
97+
loss/maxvio: 0.000001
98+
loss/local_loss: 0.000001
99+
loss/reduced_balancing_loss: 0.000001
91100
loss/reduced_llm_loss: 0.000001
92101
lr: 0
93102
memory/max_memory_GB: 0.2
@@ -109,6 +118,9 @@ case:
109118
base_metric: qwen3-sft-tp2/cec3a8d2/tracker.jsonl
110119
check_metrics:
111120
grad_norm: 0.000001
121+
loss/maxvio: 0.000001
122+
loss/local_loss: 0.000001
123+
loss/reduced_balancing_loss: 0.000001
112124
loss/reduced_llm_loss: 0.000001
113125
lr: 0
114126
memory/max_memory_GB: 0.2
@@ -133,6 +145,9 @@ case:
133145
base_metric: qwen3-sft-recompute/d76995/tracker.jsonl
134146
check_metrics:
135147
grad_norm: 0.000001
148+
loss/maxvio: 0.000001
149+
loss/local_loss: 0.000001
150+
loss/reduced_balancing_loss: 0.000001
136151
loss/reduced_llm_loss: 0.000001
137152
lr: 0
138153
memory/max_memory_GB: 0.2
@@ -159,6 +174,9 @@ case:
159174
base_metric: qwen3-sft-fp8/d76995/tracker.jsonl
160175
check_metrics:
161176
grad_norm: 0.1
177+
loss/maxvio: 0.000001
178+
loss/local_loss: 0.000001
179+
loss/reduced_balancing_loss: 0.000001
162180
loss/reduced_llm_loss: 0.000001
163181
lr: 0
164182
memory/max_memory_GB: 0.2
@@ -183,6 +201,9 @@ case:
183201
base_metric: qwen3-sft/20251117105949/tracker.jsonl
184202
check_metrics:
185203
grad_norm: 1
204+
loss/maxvio: 0.000001
205+
loss/local_loss: 0.000001
206+
loss/reduced_balancing_loss: 0.000001
186207
loss/reduced_llm_loss: 0.02
187208
lr: 0
188209
timeout: 10800
@@ -204,6 +225,9 @@ case:
204225
base_metric: qwen3-sft-celoss/812c1021/tracker.jsonl
205226
check_metrics:
206227
grad_norm: 0.000001
228+
loss/maxvio: 0.000001
229+
loss/local_loss: 0.000001
230+
loss/reduced_balancing_loss: 0.000001
207231
loss/reduced_llm_loss: 0.000001
208232
lr: 0
209233
memory/max_memory_GB: 0.2
@@ -226,6 +250,9 @@ case:
226250
base_metric: qwen3-30B-sp4-intralayer2/c0eba147/tracker.jsonl
227251
check_metrics:
228252
grad_norm: 0.000001
253+
loss/maxvio: 0.000001
254+
loss/local_loss: 0.000001
255+
loss/reduced_balancing_loss: 0.000001
229256
loss/reduced_llm_loss: 0.000001
230257
lr: 0
231258
memory/max_memory_GB: 0.2
@@ -248,6 +275,9 @@ case:
248275
base_metric: qwen3-30B-sp8-intralayer2/c0eba147/tracker.jsonl
249276
check_metrics:
250277
grad_norm: 0.025
278+
loss/maxvio: 0.000001
279+
loss/local_loss: 0.000001
280+
loss/reduced_balancing_loss: 0.000001
251281
loss/reduced_llm_loss: 0.000001
252282
lr: 0
253283
memory/max_memory_GB: 0.2
@@ -270,6 +300,9 @@ case:
270300
base_metric: gptoss-sft/7b774a0e2/tracker.jsonl
271301
check_metrics:
272302
grad_norm: 0.9
303+
loss/maxvio: 0.000001
304+
loss/local_loss: 0.000001
305+
loss/reduced_balancing_loss: 0.000001
273306
loss/reduced_llm_loss: 0.1
274307
lr: 0
275308
memory/max_memory_GB: 0.2
@@ -294,6 +327,9 @@ case:
294327
base_metric: qwen3-sft-cache/e968368a/tracker.jsonl
295328
check_metrics:
296329
grad_norm: 0.000001
330+
loss/maxvio: 0.000001
331+
loss/local_loss: 0.000001
332+
loss/reduced_balancing_loss: 0.000001
297333
loss/reduced_llm_loss: 0.000001
298334
lr: 0
299335
memory/max_memory_GB: 0.2
@@ -317,6 +353,9 @@ case:
317353
base_metric: qwen3-sft-vl-dense/812c1021/tracker.jsonl
318354
check_metrics:
319355
grad_norm: 0.000001
356+
loss/maxvio: 0.000001
357+
loss/local_loss: 0.000001
358+
loss/reduced_balancing_loss: 0.000001
320359
loss/reduced_llm_loss: 0.000001
321360
lr: 0
322361
memory/max_memory_GB: 0.2

0 commit comments

Comments
 (0)