disco-eth
/

sao-instruct

Model card Files Files and versions

sao-instruct / config.json

ungersboeck's picture

Initial commit

570d967 2 months ago

history blame contribute delete

6.07 kB

	{
		"model_type": "diffusion_cond",
		"sample_size": 2097152,
		"sample_rate": 44100,
		"audio_channels": 2,
		"model": {
			"pretransform": {
				"type": "autoencoder",
				"iterate_batch": true,
				"config": {
					"encoder": {
						"type": "oobleck",
						"requires_grad": false,
						"config": {
							"in_channels": 2,
							"channels": 128,
							"c_mults": [1, 2, 4, 8, 16],
							"strides": [2, 4, 4, 8, 8],
							"latent_dim": 128,
							"use_snake": true
						}
					},
					"decoder": {
						"type": "oobleck",
						"requires_grad": false,
						"config": {
							"out_channels": 2,
							"channels": 128,
							"c_mults": [1, 2, 4, 8, 16],
							"strides": [2, 4, 4, 8, 8],
							"latent_dim": 64,
							"use_snake": true,
							"final_tanh": false
						}
					},
					"bottleneck": {
						"type": "vae"
					},
					"latent_dim": 64,
					"downsampling_ratio": 2048,
					"io_channels": 2
				}
			},
			"conditioning": {
				"configs": [
					{
						"id": "prompt",
						"type": "t5",
						"config": {
							"t5_model_name": "t5-base",
							"max_length": 128
						}
					},
					{
						"id": "seconds_start",
						"type": "number",
						"config": {
							"min_val": 0,
							"max_val": 512
						}
					},
					{
						"id": "seconds_total",
						"type": "number",
						"config": {
							"min_val": 0,
							"max_val": 512
						}
					},
					{
						"id": "input_audio",
						"type": "pretransform",
						"config": {
							"sample_rate": 44100,
							"pretransform_ckpt_path": "vae_model.ckpt",
							"pretransform_config": {
								"type": "autoencoder",
								"iterate_batch": true,
								"config": {
									"encoder": {
										"type": "oobleck",
										"requires_grad": false,
										"config": {
											"in_channels": 2,
											"channels": 128,
											"c_mults": [1, 2, 4, 8, 16],
											"strides": [2, 4, 4, 8, 8],
											"latent_dim": 128,
											"use_snake": true
										}
									},
									"decoder": {
										"type": "oobleck",
										"requires_grad": false,
										"config": {
											"out_channels": 2,
											"channels": 128,
											"c_mults": [1, 2, 4, 8, 16],
											"strides": [2, 4, 4, 8, 8],
											"latent_dim": 64,
											"use_snake": true,
											"final_tanh": false
										}
									},
									"bottleneck": {
										"type": "vae"
									},
									"latent_dim": 64,
									"downsampling_ratio": 2048,
									"io_channels": 2
								}
							}
						}
					}
				],
				"cond_dim": 768
			},
			"diffusion": {
				"cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
				"global_cond_ids": ["seconds_start", "seconds_total"],
				"input_concat_ids": ["input_audio"],
				"type": "dit",
				"config": {
					"io_channels": 64,
					"embed_dim": 1536,
					"depth": 24,
					"num_heads": 24,
					"cond_token_dim": 768,
					"global_cond_dim": 1536,
					"project_cond_tokens": false,
					"transformer_type": "continuous_transformer",
					"input_concat_dim": 64
				}
			},
			"io_channels": 64
		},
		"training": {
			"use_ema": true,
			"log_loss_info": false,
			"optimizer_configs": {
				"diffusion": {
					"optimizer": {
						"type": "AdamW",
						"config": {
							"lr": 5e-5,
							"betas": [0.9, 0.999],
							"weight_decay": 1e-3
						}
					},
					"scheduler": {
						"type": "InverseLR",
						"config": {
							"inv_gamma": 1000000,
							"power": 0.5,
							"warmup": 0.99
						}
					}
				}
			},
			"demo": {
				"demo_every": 500,
				"demo_steps": 50,
				"num_demos": 6,
				"demo_cond_from_batch": true,
				"demo_cfg_scales": [6]
			}
		}
	}