# Mirror of https://github.com/index-tts/index-tts.git
# Synced 2025-11-28 02:10:23 +08:00
# Last commit: indextts2; update lfs for audio files
# Co-authored-by: wangyining02 <wangyining02@bilibili.com>
# File: 120 lines, 2.8 KiB, YAML
# Dataset section: a BPE model file name, the audio sample rate, and a nested
# `mel` mapping with spectrogram parameters.
dataset:
  # File name only; presumably resolved relative to the model directory by
  # the loader -- TODO confirm.
  bpe_model: bpe.model
  sample_rate: 24000
  squeeze: false
  # `mel` must be its own mapping: it carries a second `sample_rate` key,
  # which would be a duplicate if flattened into `dataset`.
  mel:
    sample_rate: 24000  # same value as dataset.sample_rate
    n_fft: 1024
    hop_length: 256
    win_length: 1024
    n_mels: 100
    mel_fmin: 0
    normalize: false

# GPT section: model size, token limits, special token IDs, and two nested
# conditioning sub-modules that share the same key set.
gpt:
  model_dim: 1280
  max_mel_tokens: 1815
  max_text_tokens: 600
  heads: 20  # 1280 / 20 = 64 -- presumably the per-head width; confirm
  use_mel_codes_as_input: true
  mel_length_compression: 1024
  layers: 24
  number_text_tokens: 12000
  # 8194 = 8192 + 2. NOTE(review): presumably semantic_codec.codebook_size
  # (8192) plus the start/stop tokens below -- confirm before changing any
  # of these three values independently.
  number_mel_codes: 8194
  start_mel_token: 8192
  stop_mel_token: 8193
  start_text_token: 0
  stop_text_token: 1
  train_solo_embeddings: false
  condition_type: "conformer_perceiver"
  # The two mappings below repeat the same keys (output_size, linear_units,
  # attention_heads, num_blocks, input_layer, perceiver_mult), so each must
  # stay nested under its own parent key.
  condition_module:
    output_size: 512
    linear_units: 2048
    attention_heads: 8
    num_blocks: 6
    input_layer: "conv2d2"
    perceiver_mult: 2
  emo_condition_module:
    output_size: 512
    linear_units: 1024
    attention_heads: 4
    num_blocks: 4
    input_layer: "conv2d2"
    perceiver_mult: 2

# Semantic codec section: codebook dimensions plus `vocos_*` sizes.
semantic_codec:
  # 8192 matches gpt.start_mel_token -- presumably the codec's code IDs are
  # 0..8191 and the GPT special tokens follow them; confirm.
  codebook_size: 8192
  hidden_size: 1024
  codebook_dim: 8
  vocos_dim: 384
  vocos_intermediate_dim: 2048
  vocos_num_layers: 12

# s2mel section: preprocessing parameters followed by the sub-module configs
# (style_encoder, length_regulator, DiT, wavenet).
# NOTE(review): nesting was reconstructed from a flattened copy. Repeated keys
# (n_fft, in_channels, hidden_dim, f0_condition, style_condition, ...) prove
# the sub-mappings exist; that `dit_type` .. `wavenet` live under `s2mel`
# rather than at top level is an assumption -- verify against the consumer.
s2mel:
  preprocess_params:
    sr: 22050
    spect_params:
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      n_mels: 80
      fmin: 0
      # This is the quoted string "None", not YAML null. NOTE(review):
      # presumably the loader compares against this literal -- do not
      # replace with `null` without checking the consumer.
      fmax: "None"

  dit_type: "DiT"
  reg_loss_type: "l1"
  style_encoder:
    dim: 192
  length_regulator:
    channels: 512
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512
  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    # 80 equals spect_params.n_mels above -- presumably these must agree
    # because `target` is 'mel'; confirm.
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'
    # 512 equals length_regulator.channels -- presumably must agree; confirm.
    content_dim: 512
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false
  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

# Top-level artifact references. All values are bare file or directory names;
# presumably they are resolved relative to the model directory by the loader
# -- TODO confirm.
gpt_checkpoint: gpt.pth
w2v_stat: wav2vec2bert_stats.pt
s2mel_checkpoint: s2mel.pth
emo_matrix: feat2.pt
spk_matrix: feat1.pt
# Eight integers summing to 73. NOTE(review): presumably per-category row
# counts used to split `emo_matrix` -- confirm against the consumer.
emo_num: [3, 17, 2, 8, 4, 5, 10, 24]
# Trailing slash kept as in the original: this names a directory.
qwen_emo_path: qwen0.6bemo4-merge/
# Vocoder selection: a type tag and a model identifier.
vocoder:
  type: "bigvgan"
  # The identifier encodes 22khz / 80band / 256x, which lines up with
  # s2mel.preprocess_params (sr: 22050, n_mels: 80, hop_length: 256).
  # NOTE(review): presumably these must be changed together -- confirm.
  name: "nvidia/bigvgan_v2_22khz_80band_256x"
# Config/model version. Unquoted, so YAML parsers load this as the float 2.0
# rather than the string "2.0". NOTE(review): left unquoted to avoid changing
# the type the consumer sees; quote it only after checking how it is read.
# Its placement at top level (not under `vocoder`) is reconstructed from a
# flattened copy -- verify against the consumer.
version: 2.0