_config_type: haystacks.embeddings.train_batch.TrainConfig
accelerator: gpu
accumulate_grad_batches: 1
adam_beta1: 0.9
adam_beta2: 0.95
base_save_dir: /home/sabri/code/haystacks/checkpoints
check_val_every_n_epoch: null
ckpt_path: null
devices: 8
dtype: bfloat16
embedding_module:
  _config_type: haystacks.embeddings.modeling.TokenEmbeddingModuleConfig
  batch_attention_layers: []
  embedding_dim: 8192
  kwargs: {}
  target:
    _is_type: true
    name: haystacks.embeddings.modeling.TokenEmbeddingModule
enable_checkpointing: false
foreach: null
gradient_clip_val: null
launch_id: null
learnable_bias: true
learnable_temp: true
limit_train_batches: 1.0
limit_val_batches: 1.0
load_hub: true
log_every_n_steps: 4
log_grad_norms: false
loss_comparison: matched
loss_token_idxs:
- 64
- 128
- 256
- 512
- 1024
lr: 0.0001
lr_scheduler: null
manual_save_epochs: null
manual_save_steps: 8192
max_epochs: 512
max_hidden_layers: null
max_problems: null
max_seq_len: 1024
max_steps: -1
model_name: meta-llama/Llama-3.2-1B-Instruct
name: no_batch-attention-lr0.0001-bs32-d8192-new
num_sanity_val_steps: null
num_workers: 0
objective: cross_entropy
output_dir: null
overfit_batches: 0.0
precision: bf16
reload_dataloaders_every_n_epochs: 0
run_dir: null
run_id: null
samples_per_batch: 32
save_intermediates: true
script_id: null
seed: 42
train_batch_size: 1
train_data_path: ScalingIntelligence/math-train-l3.2-3Bi-meta-n128
use_wandb: false
val_batch_size: 1
val_check_interval: 64
val_data_path: ScalingIntelligence/math-test-l3.2-3Bi-meta-n128
val_rollout_data_path: null
val_samples_per_batch: 32
validate_before_train: true
wandb:
  _config_type: haystacks.embeddings.train_batch.WandbLoggerConfig
  group: ''
  id: null
  job_type: train
  kwargs: {}
  log_model: false
  mode: online
  name: null
  prefix: ''
  project: haystacks
  save_dir: .
  tags: []
  target:
    _is_type: true
    name: pytorch_lightning.loggers.wandb.WandbLogger
weight_decay: 0.1
weights_only: true