{ "_class_name": "UNetSpatioTemporalConditionModel", "_diffusers_version": "0.24.0", "addition_time_embed_dim": 1, "block_out_channels": [ 128, 256, 256, 512 ], "cross_attention_dim": 1, "decay": 0.9999, "down_block_types": [ "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal" ], "in_channels": 8, "inv_gamma": 1.0, "layers_per_block": 2, "min_decay": 0.0, "num_attention_heads": [ 8, 16, 16, 32 ], "num_frames": 64, "optimization_step": 80000, "out_channels": 4, "power": 0.6666666666666666, "projection_class_embeddings_input_dim": 1, "sample_size": 14, "transformer_layers_per_block": 1, "up_block_types": [ "UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal" ], "update_after_step": 0, "use_ema_warmup": false }