Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ obj/
# 打包产物
*.tlx
/build/
/tools/CheckModel
116 changes: 84 additions & 32 deletions DiffSingerDeclarations.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ public static class DiffSingerDeclarations
public const string KeySpeed = "speed";
public const string KeySpeaker = "speaker";
public const string KeyLanguage = "language";
public const string KeyMixPrefix = "mix:"; // 说话人混合轨 key 前缀:mix:<suffix>
public const string KeyMix = "speaker_mix"; // part 属性:说话人混合变长键控容器(ExtensibleObjectConfig),条目键 = suffix
public const string KeyMixPrefix = "mix:"; // 说话人混合自动化轨 key 前缀:mix:<suffix>

// Gender(GENC) / Speed(VELC) 连续轨:忠实采 OpenUtau 原生 UI 量程(非半音/倍率),convert 据此逐字移植。
// · GENC ∈ [-100,100],基线 0 = 不移位,正 = formant 下移;增广范围 KeyShift* 仅入 convert 缩放,不做轨边界。
Expand Down Expand Up @@ -52,64 +53,117 @@ public readonly record struct VarianceSpec(
new("tension", "Tension", "#A573E5", c => c.UseTensionEmbed, c => c.PredictTension, -100, 100, 0, -10, 10, (x, y) => x + y / 20),
};

// 可编辑曲线:variance 量(连续 delta,基线=中性值=纯预测)+ Gender/Speed(连续,基线为中性值)。
// 可编辑曲线 = 固定轨(variance / Gender / Speed,与 part 属性无关)+ 已启用的说话人混合轨(f(part 属性))。
// 必须连续(非分段):宿主仅把连续轨接进合成;中性基线上画偏移正是 delta 的天然形态。
public static OrderedMap<string, AutomationConfig> BuildAutomationConfigs(VoicebankConfig config)
// mix 轨是动态集:用户在 part 面板 speaker_mix 容器(ExtensibleObjectConfig)+ 一个 speaker 才出现其曲线、删除即消失;
// 宿主既有 OnPartPropertiesModified→RebuildAutomationConfigs→AutomationConfigsModified 链驱动曲线按钮自动增减(免新增 wiring)。
public static OrderedMap<PropertyKey, AutomationConfig> BuildAutomationConfigs(VoicebankConfig config, PropertyObject partProperties)
{
var map = new OrderedMap<string, AutomationConfig>();
var map = BuildFixedAutomationConfigs(config);
foreach (var (suffix, color) in SelectedMixTracks(config, partProperties))
map.Add((KeyMixPrefix + suffix, suffix), Continuous(color, 0, 0, 100)); // [0,100]、基线 0;轨名 = suffix(非 UI 文案、不过 L.Tr)
return map;
}

// 固定轨(与 part 属性无关):variance(按声库能力)+ Gender/Speed。会话据此 key 集在构造期一次性订阅(恒定不随属性变)。
public static OrderedMap<PropertyKey, AutomationConfig> BuildFixedAutomationConfigs(VoicebankConfig config)
{
var map = new OrderedMap<PropertyKey, AutomationConfig>();
foreach (var v in Variances)
if (v.Use(config))
map.Add(v.Key, Continuous(v.Display, v.Color, v.Neutral, v.EditMin, v.EditMax));
map.Add((v.Key, L.Tr(v.Display)), Continuous(v.Color, v.Neutral, v.EditMin, v.EditMax));

if (config.UseKeyShiftEmbed)
map.Add(KeyGender, Continuous("Gender", "#E5A573", GenderBaseline, GenderMin, GenderMax));
map.Add((KeyGender, L.Tr("Gender")), Continuous("#E5A573", GenderBaseline, GenderMin, GenderMax));
if (config.UseSpeedEmbed)
map.Add(KeySpeed, Continuous("Speed", "#73B5E5", SpeedBaseline, SpeedMin, SpeedMax));
map.Add((KeySpeed, L.Tr("Speed")), Continuous("#73B5E5", SpeedBaseline, SpeedMin, SpeedMax));
return map;
}

// Suffix of the primary / fallback speaker for a part.
// Reads the part property KeySpeaker; when it is absent the voicebank's first speaker is used
// (or the empty string if the voicebank lists no speakers), and the result is passed through Suffix().
// Per the original design note: this speaker always holds a seat in the mix (defaultWeight) and is
// therefore not a mix candidate — the candidate list, the selected list and the automation tracks all
// exclude it so it is never counted twice. The value is not constant: it follows KeySpeaker.
public static string DefaultSuffix(VoicebankConfig config, PropertyObject partProperties)
{
    // Fallback is computed up front, exactly as the inline ternary did.
    var fallbackSpeaker = config.Speakers.Count > 0 ? config.Speakers[0] : string.Empty;
    var chosenSpeaker = partProperties.GetString(KeySpeaker, fallbackSpeaker);
    return Suffix(chosenSpeaker);
}

// 说话人混合:多说话人时每 speaker 一条逐帧权重轨(连续、[0,100]、基线 0),mix:<suffix>。
// 不画时全权重落到 part 级 KeySpeaker(默认 suffix)⇒ 等价单说话人广播;画上即逐帧混入该 speaker。
int mixColorIndex = 0;
// 已启用混合的说话人 (suffix, 稳定配色):遍历全量去重 speaker 表(配色按其固定索引轮转,保证同一 speaker
// 不论何时加入颜色一致),排除主/兜底 speaker,过滤出 part 属性 speaker_mix 容器里已存在的键(present = 用户已 + 该 speaker)。
public static IEnumerable<(string Suffix, string Color)> SelectedMixTracks(VoicebankConfig config, PropertyObject partProperties)
{
var selected = partProperties.GetObject(KeyMix);
var defaultSuffix = DefaultSuffix(config, partProperties);
int i = 0;
foreach (var (key, suffix) in SpeakerMixTracks(config))
map.Add(key, Continuous(suffix, MixColors[mixColorIndex++ % MixColors.Length], 0, 0, 100));
return map;
{
var color = MixColors[i++ % MixColors.Length]; // i 覆盖全量(含默认)→ 颜色不随默认变更漂移
if (suffix == defaultSuffix)
continue;
if (selected.Map.ContainsKey(suffix))
yield return (suffix, color);
}
}

// 只读回显轨:仅当声学接受该量为输入且方差器能产基线时——显示方差器纯预测(内容感知基线,真实声学单位)。
public static OrderedMap<string, AutomationConfig> BuildReadbackConfigs(VoicebankConfig config)
public static OrderedMap<PropertyKey, AutomationConfig> BuildReadbackConfigs(VoicebankConfig config)
{
var map = new OrderedMap<string, AutomationConfig>();
var map = new OrderedMap<PropertyKey, AutomationConfig>();
foreach (var v in Variances)
if (v.Use(config) && v.Predict(config))
map.Add(v.Key, Piecewise(v.Display, v.Color, v.AcousticMin, v.AcousticMax));
map.Add((v.Key, L.Tr(v.Display)), Piecewise(v.Color, v.AcousticMin, v.AcousticMax));
return map;
}

// part 级面板:多说话人暴露说话人选择、多语言暴露 part 默认语言。
public static ObjectConfig BuildPartConfig(VoicebankConfig config)
// part 级面板:多说话人暴露「主/兜底 speaker 选择 + 说话人混合容器」、多语言暴露 part 默认语言。
public static ObjectConfig BuildPartConfig(VoicebankConfig config, IPartPropertyContext context)
{
var properties = new OrderedMap<string, IControllerConfig>();
var properties = new OrderedMap<PropertyKey, IControllerConfig>();

if (config.Speakers.Count > 1)
properties.Add(KeySpeaker, new ComboBoxConfig
{
// 主/兜底 speaker:不画任何混合时的单说话人,且逐帧混合权重和不足 1 时由它补足(见 DiffSingerSpeakerMix)。
properties.Add((KeySpeaker, L.Tr("Speaker")), new ComboBoxConfig
{
DisplayText = L.Tr("Speaker"),
Options = SpeakerOptions(config.Speakers),
});
// 说话人混合:变长键控容器(ExtensibleObjectConfig)。用户从 + 菜单挑 speaker 加入(纯开关、空对象条目),
// 加入即出现该 speaker 的 [0,100] 逐帧混合曲线、删除即消失——免一次平铺所有 speaker 的曲线。
properties.Add((KeyMix, L.Tr("Speaker mix")), BuildSpeakerMixConfig(config, context.PartProperties));
}

if (HasLanguageChoice(config))
properties.Add(KeyLanguage, LanguageCombo(config, config.Languages[0]));
properties.Add((KeyLanguage, L.Tr("Language")), LanguageCombo(config, config.Languages[0]));

return new ObjectConfig { Properties = properties };
}

// Speaker-mix container config.
// · Properties      = speakers currently enabled for mixing (keys present in the part's speaker_mix object).
// · AddableElements = every de-duplicated candidate speaker (per the original note, the host hides
//                     already-present keys in its "+" menu).
// Every entry is pure presence (an empty ObjectConfig): adding enables mixing for that speaker,
// removing disables it; the actual weight comes from the automation curve.
static ExtensibleObjectConfig BuildSpeakerMixConfig(VoicebankConfig config, PropertyObject partProperties)
{
    var enabled = partProperties.GetObject(KeyMix);
    var primary = DefaultSuffix(config, partProperties);
    var present = new OrderedMap<PropertyKey, IControllerConfig>();
    var candidates = new List<AddableKey>();

    // The suffix serves as both the entry key id and its display text; the track key is not needed here.
    foreach (var (_, suffix) in SpeakerMixTracks(config))
    {
        // The primary / fallback speaker is never a mix candidate (it already holds a seat).
        // A stale entry for it left in the container (selected earlier, later made the default)
        // is excluded here as well and therefore not rendered.
        if (suffix != primary)
        {
            if (enabled.Map.ContainsKey(suffix))
            {
                present.Add((suffix, suffix), EmptyEntry());
            }

            candidates.Add(new AddableKey((suffix, suffix), EmptyEntry()));
        }
    }

    return new ExtensibleObjectConfig { Properties = present, AddableElements = candidates };
}

// Creates a fresh ObjectConfig with an empty property map (a new instance on every call).
static ObjectConfig EmptyEntry()
{
    return new ObjectConfig { Properties = new OrderedMap<PropertyKey, IControllerConfig>() };
}

// note 级面板:多语言声库暴露 per-note 语言覆盖;默认值取 part 当前默认语言(依赖 part 值 ⇒ 逐次构建)。
public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INotePropertyContext context)
{
var properties = new OrderedMap<string, IControllerConfig>();
var properties = new OrderedMap<PropertyKey, IControllerConfig>();
if (HasLanguageChoice(config))
{
var partDefault = context.PartProperties.GetString(KeyLanguage, config.Languages[0]);
properties.Add(KeyLanguage, LanguageCombo(config, partDefault));
properties.Add((KeyLanguage, L.Tr("Language")), LanguageCombo(config, partDefault));
}
return new ObjectConfig { Properties = properties };
}
Expand All @@ -118,16 +172,15 @@ public static ObjectConfig BuildNoteConfig(VoicebankConfig config, INoteProperty

static ComboBoxConfig LanguageCombo(VoicebankConfig config, string defaultValue) => new()
{
DisplayText = L.Tr("Language"),
Options = ToOptions(config.Languages),
DefaultOption = PropertyValue.Create(defaultValue),
};

static List<ComboBoxOption> ToOptions(IReadOnlyList<string> values)
static List<ComboBoxOption> LanguageOptions(IReadOnlyList<string> languages)
{
var options = new List<ComboBoxOption>(values.Count);
foreach (var value in values)
options.Add(value); // 隐式转换:string → ComboBoxOption(值即显示文本)
var options = new List<ComboBoxOption> { new(PropertyValue.Create(string.Empty), "default") };
foreach (var lang in languages)
options.Add(lang);
return options;
}

Expand Down Expand Up @@ -168,18 +221,17 @@ public static string Suffix(string speaker)
"#4DD0E1", "#4DB6AC", "#81C784", "#DCE775", "#FFD54F", "#FFB74D",
};

static AutomationConfig Piecewise(string display, string color, double min, double max) => new()
// 轨名(DisplayText)随 PropertyKey 走、不属 config 本身(SDK 改动)——故此处只产纯量程/配色 config。
static AutomationConfig Piecewise(string color, double min, double max) => new()
{
DisplayText = L.Tr(display),
DefaultValue = double.NaN, // 分段:无基线、段间断开(NaN 自由区)
MinValue = min,
MaxValue = max,
Color = color,
};

static AutomationConfig Continuous(string display, string color, double baseline, double min, double max) => new()
static AutomationConfig Continuous(string color, double baseline, double min, double max) => new()
{
DisplayText = L.Tr(display),
DefaultValue = baseline,
MinValue = min,
MaxValue = max,
Expand Down
4 changes: 4 additions & 0 deletions DiffSingerForTuneLab.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
{
"name": "TuneLab (参考:SDK/docs/范例)",
"path": "../TuneLab"
},
{
"name": "OpenUtau-lunai (参考:OpenUtau.Core)",
"path": "../OpenUtau-lunai"
}
],
"settings": {
Expand Down
3 changes: 3 additions & 0 deletions DiffSingerModels.cs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ public sealed class VoiceModels : IDisposable
readonly Dictionary<string, DiffSingerPredictor?> mPredictors = new(StringComparer.Ordinal);
readonly object mPredictorLock = new();

readonly object mAcousticLock = new();
readonly object mVocoderLock = new();

public InferenceSession Acoustic { get; }
public InferenceSession Vocoder { get; }

Expand Down
92 changes: 88 additions & 4 deletions DiffSingerPhonemizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ public static class DiffSingerPhonemizer
const string Pause = "SP";
const double PaddingSec = 0.5; // 短语首辅音前导空间(OpenUtau padding=500ms)

// Slur (tenuto) note detection.
// TuneLab's native slur lyric is "-"; any lyric starting with '+' (e.g. "+", "+~", "+*") is also
// accepted for projects imported from OpenUtau.
// Per the original design note: a slur note carries no phonemes of its own — the vowel of the previous
// voiced note is extended over it, and its pitch lives only on the pitch timeline (note_midi), so the
// acoustic / phoneme side simply skips it. See DiffSingerPitch.BuildNotes for the symmetric handling.
// Returns false for null and for the empty string.
public static bool IsSlur(string? lyric)
    // Char overload = ordinal comparison; the string overload is culture-sensitive and can match
    // lyrics that merely begin with an ignorable character followed by '+'.
    => lyric != null && (lyric == "-" || lyric.StartsWith('+'));

sealed class Group
{
public double Pos; // 组起点(秒);元音组=note 起点,首组=首 note 起点-padding,哨兵=末 note 终点
Expand All @@ -48,9 +54,49 @@ public static List<PhonemeSpan> Phonemize(
for (int i = 0; i < notes.Count; i++)
{
var note = notes[i];

// 延音符:不产音素、不建组——前一组(前元音)会自然伸展到下一发声 note 起点(对齐终点跨过本 note)。
// notePhIndex 记空区间(本 note 无音素,NoteOf 不会落到它)。首 note 即延音符则无前可沿,退化为常规 G2P。
if (i > 0 && IsSlur(note.Lyric))
{
pinned[i] = false;
noteSymbolCount[i] = 0;
notePhIndex.Add(notePhIndex[^1]);
continue;
}

string[] symbols = GetSymbols(dur, note, noteLang[i], out pinned[i]);
noteSymbolCount[i] = symbols.Length;

// 连音符:不产生新音素,只延展前音的时长(通过 group 自然吸收)
// 连音符:- 不产生边界(前组自然吸收),+ 拆前组末音素到独立组强制 dur 边界
if (symbols.Length == 0 && (lyric == "-" || lyric == "+"))
{
if (lyric == "+")
{
// 把前一个非空组的最后一个音素拆分到 + 组(仅当 >1 音素,否则退化为空组边界)
string moved = "AP";
bool splitted = false;
for (int gi = groups.Count - 1; gi >= 0; gi--)
{
if (groups[gi].Phonemes.Count > 1)
{
int lastIdx = groups[gi].Phonemes.Count - 1;
moved = groups[gi].Phonemes[lastIdx];
groups[gi].Phonemes.RemoveAt(lastIdx);
splitted = true;
break;
}
}
var g = new Group(note.StartTime, note.Pitch);
if (splitted) g.Phonemes.Add(moved);
groups.Add(g);
}
// - 则完全不做任何事,前组自然吸收时长,不影响 dur
notePhIndex.Add(notePhIndex[^1]);
continue;
}

var wordGroups = ProcessWord(dur, note, symbols);
groups[^1].Phonemes.AddRange(wordGroups[0].Phonemes); // 前置辅音并入前一组(侵入前一 note 尾)
groups.AddRange(wordGroups.Skip(1)); // 韵核组(起点=note 起点)
Expand Down Expand Up @@ -152,17 +198,56 @@ static List<Group> ProcessWord(DiffSingerPredictor dur, SynthesisNoteSnapshot no
return wordGroups;
}

// 取音素符号串:钉死=用 note.Phonemes 符号;否则 G2P。过滤到「类型已定义 且 dur 表可 tokenize」;空则 [SP]。
// 取音素符号串:钉死/编辑器→用已有 phonemes 或 _phonemes 属性;连音符→空(slur 延展前音素);否则 G2P。
// 空结果且非连音符→ [SP] 兜底。
static string[] GetSymbols(DiffSingerPredictor dur, SynthesisNoteSnapshot note, string lang, out bool pinned)
{
string lyric = note.Lyric ?? string.Empty;
if (lyric == "-" || lyric == "+")
{
pinned = false;
return Array.Empty<string>();
}

// 优先使用 _phonemes 属性(音素编辑器写入)
var phonemesProp = note.Properties.GetString("_phonemes", "");
if (!string.IsNullOrEmpty(phonemesProp) && phonemesProp != "[]")
{
pinned = true;
return ParsePhonemesProperty(phonemesProp).Where(s => !string.IsNullOrEmpty(s) && dur.TryPhoneme(s, out _)).ToArray();
}

// 其次使用钉死音素
pinned = note.Phonemes.Count > 0;
IEnumerable<string> raw = pinned
? note.Phonemes.Select(p => p.Symbol)
: dur.G2P(note.Lyric ?? string.Empty, lang);
: dur.G2P(lyric, lang);
var symbols = raw.Where(s => dur.IsKnownSymbol(s) && dur.TryPhoneme(s, out _)).ToArray();
return symbols.Length > 0 ? symbols : new[] { Pause };
}

// Extracts the phoneme symbols from the "_phonemes" JSON string,
// e.g. [{"s":"ja/b","v":false},...] -> ["ja/b", ...].
// This is a lightweight scan for every "s":"<symbol>" pair, not a full JSON parser:
// escaped quotes inside a symbol are not supported.
// · null / empty / too-short input yields an empty array.
// · Empty symbols ("s":"") are skipped.
// · An unterminated symbol (no closing quote) ends the scan; symbols found so far are returned.
static string[] ParsePhonemesProperty(string json)
{
    const string marker = "\"s\":\"";

    var symbols = new List<string>();
    if (string.IsNullOrEmpty(json) || json.Length < 2)
    {
        return symbols.ToArray();
    }

    int cursor = 0;
    while (true)
    {
        // Ordinal: this is a machine-readable token, not linguistic text.
        int markerAt = json.IndexOf(marker, cursor, StringComparison.Ordinal);
        if (markerAt < 0)
        {
            break;
        }

        int start = markerAt + marker.Length;
        int end = json.IndexOf('"', start);
        if (end < 0)
        {
            // No closing quote. Continuing from (end + 1) == 0 would re-find the same marker
            // forever, so stop here instead.
            break;
        }

        if (end > start)
        {
            symbols.Add(json.Substring(start, end - start));
        }

        cursor = end + 1;
    }

    return symbols.ToArray();
}

// OpenUtau stretch:source[from..from+count) 的帧时长按 ratio 缩放、终点对齐 endPos,返回各音素起点秒。
static IEnumerable<double> Stretch(IReadOnlyList<double> source, int from, int count, double ratio, double endPos)
{
Expand Down Expand Up @@ -218,8 +303,7 @@ static double[] RunDur(DiffSingerPredictor dur, long[] tokens, long[] langs,
var spk = new float[nTokens * hidden];
for (int i = 0; i < nTokens; i++) Array.Copy(emb, 0, spk, i * hidden, hidden);

var durModel = dur.Model("dur");
var durInputs = new List<NamedOnnxValue>
using var durOut = dur.RunModel("dur", new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("encoder_out", encDense),
NamedOnnxValue.CreateFromTensor("x_masks", maskDense),
Expand Down
Loading