Skip to content

Commit f086e6b

Browse files
committed
update: 语音合成新增音色与音调调节功能
1 parent f93c4b4 commit f086e6b

File tree

13 files changed

+252
-35
lines changed

13 files changed

+252
-35
lines changed

db/2025_10_27.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- 为角色表添加语音合成高级参数字段
2+
ALTER TABLE `xiaozhi`.`sys_role`
3+
ADD COLUMN `ttsPitch` FLOAT DEFAULT 1.0 COMMENT '语音音调(0.5-2.0, 默认1.0)' AFTER `voiceName`,
4+
ADD COLUMN `ttsSpeed` FLOAT DEFAULT 1.0 COMMENT '语音语速(0.5-2.0, 默认1.0)' AFTER `ttsPitch`;
5+

src/main/java/com/xiaozhi/communication/common/MessageHandler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ public void afterConnection(ChatSession chatSession, String deviceIdAuth) {
122122
if (role.getTtsId() != null) {
123123
SysConfig ttsConfig = configService.selectConfigById(role.getTtsId());
124124
if (ttsConfig != null) {
125-
ttsFactory.getTtsService(ttsConfig, role.getVoiceName());// 提前初始化,加速后续使用
125+
ttsFactory.getTtsService(ttsConfig, role.getVoiceName(), role.getTtsPitch(), role.getTtsSpeed());// 提前初始化,加速后续使用
126126
}
127127
}
128128
if (role.getModelId() != null) {

src/main/java/com/xiaozhi/controller/RoleController.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,15 @@ public AjaxResult testAudio(
109109
@Parameter(description = "消息文本") String message,
110110
@Parameter(description = "语音合成提供方") String provider,
111111
@Parameter(description = "TTS ID") Integer ttsId,
112-
@Parameter(description = "音色名称") String voiceName) {
112+
@Parameter(description = "音色名称") String voiceName,
113+
@Parameter(description = "语音音调(0.5-2.0)") Float ttsPitch,
114+
@Parameter(description = "语音语速(0.5-2.0)") Float ttsSpeed) {
113115
SysConfig config = null;
114116
try {
115117
if (!provider.equals("edge")) {
116118
config = configService.selectConfigById(ttsId);
117119
}
118-
String audioFilePath = ttsService.getTtsService(config, voiceName).textToSpeech(message);
120+
String audioFilePath = ttsService.getTtsService(config, voiceName, ttsPitch, ttsSpeed).textToSpeech(message);
119121
AjaxResult result = AjaxResult.success();
120122
result.put("data", audioFilePath);
121123
return result;

src/main/java/com/xiaozhi/dialogue/service/DialogueService.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,14 +231,16 @@ private static class TtsTask implements Comparable<TtsTask> {
231231
private final boolean isLast;
232232
private final SysConfig ttsConfig;
233233
private final String voiceName;
234+
private final Float ttsPitch;
235+
private final Float ttsSpeed;
234236
private final ChatSession session;
235237
private final long createTime;
236238
private int retryCount = 0;
237239
private boolean isRetry = false;
238240

239241
public TtsTask(ChatSession session, String sessionId, Sentence sentence,
240242
EmoSentence emoSentence, boolean isFirst, boolean isLast,
241-
SysConfig ttsConfig, String voiceName) {
243+
SysConfig ttsConfig, String voiceName, Float ttsPitch, Float ttsSpeed) {
242244
this.session = session;
243245
this.sessionId = sessionId;
244246
this.sentence = sentence;
@@ -247,6 +249,8 @@ public TtsTask(ChatSession session, String sessionId, Sentence sentence,
247249
this.isLast = isLast;
248250
this.ttsConfig = ttsConfig;
249251
this.voiceName = voiceName;
252+
this.ttsPitch = ttsPitch;
253+
this.ttsSpeed = ttsSpeed;
250254
this.createTime = System.currentTimeMillis();
251255
}
252256

@@ -576,7 +580,7 @@ private void handleSentence(
576580

577581
// 使用虚拟线程异步生成音频文件
578582
Thread.startVirtualThread(() -> {
579-
generateAudio(session, sessionId, sentence, emoSentence, isFirst, isLast, ttsConfig, voiceName);
583+
generateAudio(session, sessionId, sentence, emoSentence, isFirst, isLast, ttsConfig, voiceName, role.getTtsPitch(), role.getTtsSpeed());
580584
});
581585
}
582586

@@ -592,11 +596,13 @@ private void generateAudio(
592596
boolean isFirst,
593597
boolean isLast,
594598
SysConfig ttsConfig,
595-
String voiceName) {
599+
String voiceName,
600+
Float ttsPitch,
601+
Float ttsSpeed) {
596602

597603
// 创建TTS任务
598604
TtsTask task = new TtsTask(session, sessionId, sentence, emoSentence,
599-
isFirst, isLast, ttsConfig, voiceName);
605+
isFirst, isLast, ttsConfig, voiceName, ttsPitch, ttsSpeed);
600606

601607
// 提交任务到队列
602608
submitTtsTask(task);
@@ -658,7 +664,7 @@ private void executeTtsTask(TtsTask task) {
658664
CompletableFuture<String> future = CompletableFuture.supplyAsync(() -> {
659665
try {
660666
long ttsStartTime = System.currentTimeMillis();
661-
String audioPath = ttsFactory.getTtsService(task.ttsConfig, task.voiceName)
667+
String audioPath = ttsFactory.getTtsService(task.ttsConfig, task.voiceName, task.ttsPitch, task.ttsSpeed)
662668
.textToSpeech(task.emoSentence.getTtsSentence());
663669
long ttsDuration = System.currentTimeMillis() - ttsStartTime;
664670

@@ -752,7 +758,9 @@ private void handleTtsFailure(TtsTask task, String reason) {
752758
task.isFirst,
753759
task.isLast,
754760
task.ttsConfig,
755-
task.voiceName
761+
task.voiceName,
762+
task.ttsPitch,
763+
task.ttsSpeed
756764
);
757765
retryTask.retryCount = task.retryCount;
758766
retryTask.isRetry = true;

src/main/java/com/xiaozhi/dialogue/tts/factory/TtsServiceFactory.java

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,53 +35,54 @@ public class TtsServiceFactory {
3535
*/
3636
public TtsService getDefaultTtsService() {
3737
var config = new SysConfig().setProvider(DEFAULT_PROVIDER);
38-
return getTtsService(config, TtsServiceFactory.DEFAULT_VOICE);
38+
return getTtsService(config, TtsServiceFactory.DEFAULT_VOICE, 1.0f, 1.0f);
3939
}
4040

41-
// 创建缓存键,包含provider、configId和voiceName,确保音色变化时创建新的服务实例
42-
private String createCacheKey(SysConfig config, String provider, String voiceName) {
41+
// 创建缓存键(包含pitch和speed)
42+
private String createCacheKey(SysConfig config, String provider, String voiceName, Float pitch, Float speed) {
4343
Integer configId = -1;
4444
if (config != null && config.getConfigId() != null) {
4545
configId = config.getConfigId();
4646
}
47-
return provider + ":" + configId + ":" + voiceName;
47+
return provider + ":" + configId + ":" + voiceName + ":" + pitch + ":" + speed;
4848
}
4949

5050
/**
51-
* 根据配置获取TTS服务
51+
* 根据配置获取TTS服务(带pitch和speed参数)
5252
*/
53-
public TtsService getTtsService(SysConfig config, String voiceName) {
53+
public TtsService getTtsService(SysConfig config, String voiceName, Float pitch, Float speed) {
5454

5555
config = !ObjectUtils.isEmpty(config) ? config : new SysConfig().setProvider(DEFAULT_PROVIDER);
5656

5757
// 如果提供商为空,则使用默认提供商
5858
var provider = config.getProvider();
59-
var cacheKey = createCacheKey(config, provider, voiceName);
59+
60+
var cacheKey = createCacheKey(config, provider, voiceName, pitch, speed);
6061

6162
// 检查是否已有该配置的服务实例
6263
if (serviceCache.containsKey(cacheKey)) {
6364
return serviceCache.get(cacheKey);
6465
}
6566

66-
var service = createApiService(config, voiceName);
67+
var service = createApiService(config, voiceName, pitch, speed);
6768
serviceCache.put(cacheKey, service);
6869
return service;
6970
}
7071

7172
/**
72-
* 根据配置创建API类型的TTS服务
73+
* 根据配置创建API类型的TTS服务(带pitch和speed参数)
7374
*/
74-
private TtsService createApiService(SysConfig config, String voiceName) {
75+
private TtsService createApiService(SysConfig config, String voiceName, Float pitch, Float speed) {
7576
// Make sure output dir exists
7677
String outputPath = OUTPUT_PATH;
7778
ensureOutputPath(outputPath);
7879

7980
return switch (config.getProvider()) {
80-
case "aliyun" -> new AliyunTtsService(config, voiceName, outputPath);
81-
case "volcengine" -> new VolcengineTtsService(config, voiceName, outputPath);
82-
case "xfyun" -> new XfyunTtsService(config, voiceName, outputPath);
83-
case "minimax" -> new MiniMaxTtsService(config, voiceName, outputPath);
84-
default -> new EdgeTtsService(voiceName, outputPath);
81+
case "aliyun" -> new AliyunTtsService(config, voiceName, pitch, speed, outputPath);
82+
case "volcengine" -> new VolcengineTtsService(config, voiceName, pitch, speed, outputPath);
83+
case "xfyun" -> new XfyunTtsService(config, voiceName, pitch, speed, outputPath);
84+
case "minimax" -> new MiniMaxTtsService(config, voiceName, pitch, speed, outputPath);
85+
default -> new EdgeTtsService(voiceName, pitch, speed, outputPath);
8586
};
8687
}
8788

src/main/java/com/xiaozhi/dialogue/tts/providers/AliyunTtsService.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,17 @@ public class AliyunTtsService implements TtsService {
4646
private final String apiKey;
4747
private final String voiceName;
4848
private final String outputPath;
49+
50+
// 语音参数
51+
private final Float pitch;
52+
private final Float speed;
4953

5054
public AliyunTtsService(SysConfig config,
51-
String voiceName, String outputPath) {
55+
String voiceName, Float pitch, Float speed, String outputPath) {
5256
this.apiKey = config.getApiKey();
5357
this.voiceName = voiceName;
58+
this.pitch = pitch;
59+
this.speed = speed;
5460
this.outputPath = outputPath;
5561
}
5662

@@ -210,6 +216,8 @@ private String ttsCosyvoice(String text) {
210216
.apiKey(apiKey)
211217
.model("cosyvoice-v2")
212218
.voice(voiceName)
219+
.speechRate(speed)
220+
.pitchRate(pitch)
213221
.format(com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat.WAV_16000HZ_MONO_16BIT)
214222
.build();
215223

@@ -289,6 +297,8 @@ public String ttsSambert(String text) {
289297
.apiKey(apiKey)
290298
.model(voiceName)
291299
.text(text)
300+
.rate(speed)
301+
.pitch(pitch)
292302
.sampleRate(AudioUtils.SAMPLE_RATE)
293303
.format(SpeechSynthesisAudioFormat.WAV)
294304
.build();

src/main/java/com/xiaozhi/dialogue/tts/providers/EdgeTtsService.java

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,17 @@ public class EdgeTtsService implements TtsService {
2424

2525
// 音频输出路径
2626
private String outputPath;
27-
28-
public EdgeTtsService(String voiceName, String outputPath) {
27+
28+
// 语音音调 (0.5-2.0)
29+
private Float pitch;
30+
31+
// 语音语速 (0.5-2.0)
32+
private Float speed;
33+
34+
public EdgeTtsService(String voiceName, Float pitch, Float speed, String outputPath) {
2935
this.voiceName = voiceName;
36+
this.pitch = pitch;
37+
this.speed = speed;
3038
this.outputPath = outputPath;
3139
}
3240

@@ -48,12 +56,23 @@ public String textToSpeech(String text) throws Exception {
4856
.collect(Collectors.toList()).get(0);
4957

5058
TTS ttsEngine = new TTS(voiceObj, text);
59+
60+
// 计算Edge TTS的rate参数 (将0.5-2.0映射到-50%到+100%)
61+
// speed=0.5 -> rate=-50%, speed=1.0 -> rate=+0%, speed=2.0 -> rate=+100%
62+
int ratePercent = (int)((speed - 1.0f) * 100);
63+
64+
// 计算Edge TTS的pitch参数 (将0.5-2.0映射到-25Hz到+50Hz)
65+
// pitch=0.5 -> -25Hz, pitch=1.0 -> 0Hz, pitch=2.0 -> +50Hz
66+
int pitchHz = (int)((pitch - 1.0f) * 50);
67+
5168
// 执行TTS转换获取音频文件
5269
String audioFilePath = ttsEngine.findHeadHook()
5370
.storage(outputPath)
5471
.fileName(getAudioFileName().split("\\.")[0])
5572
.isRateLimited(true)
5673
.overwrite(false)
74+
.voicePitch(pitchHz + "Hz")
75+
.voiceRate(ratePercent + "%")
5776
.formatMp3()
5877
.trans();
5978

src/main/java/com/xiaozhi/dialogue/tts/providers/MiniMaxTtsService.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,20 @@ public class MiniMaxTtsService implements TtsService {
3030

3131
private final String outputPath;
3232
private final String voiceName;
33+
34+
// 语音参数
35+
private final Float pitch;
36+
private final Float speed;
3337

3438
private final OkHttpClient client = HttpUtil.client;
3539
private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");
3640

37-
public MiniMaxTtsService(SysConfig config, String voiceName, String outputPath) {
41+
public MiniMaxTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
3842
this.groupId = config.getAppId();
3943
this.apiKey = config.getApiKey();
4044
this.voiceName = voiceName;
45+
this.pitch = pitch;
46+
this.speed = speed;
4147
this.outputPath = outputPath;
4248
}
4349

@@ -59,7 +65,19 @@ public String textToSpeech(String text) throws Exception {
5965
}
6066

6167
private void sendRequest(String text, String filepath) {
68+
// 创建请求参数
6269
var params = new Text2AudioParams(voiceName, text);
70+
71+
// 设置语速(MiniMax范围 [0.5, 2],与我们的范围一致,直接使用)
72+
params.voiceSetting.setSpeed(speed);
73+
74+
// 设置音调(需要映射:我们的 [0.5, 2] → MiniMax的 [-12, 12])
75+
// 映射公式:minimax_pitch = (our_pitch - 1.0) × 24(0.5 → -12,1.0 → 0,1.5 → 12;大于1.5的值会被下方的范围限制截断为12)
76+
int minimaxPitch = (int)Math.round((pitch - 1.0f) * 24);
77+
// 确保值在有效范围内
78+
minimaxPitch = Math.max(-12, Math.min(12, minimaxPitch));
79+
params.voiceSetting.setPitch(minimaxPitch);
80+
6381
var request = new Request.Builder()
6482
.url("https://api.minimaxi.com/v1/t2a_v2?Groupid=%s".formatted(groupId))
6583
.addHeader("Content-Type", "application/json")

src/main/java/com/xiaozhi/dialogue/tts/providers/VolcengineTtsService.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,17 @@ public class VolcengineTtsService implements TtsService {
3232
// API相关
3333
private String appId;
3434
private String accessToken; // 对应 apiKey
35+
36+
// 语音参数
37+
private Float pitch;
38+
private Float speed;
3539

3640
private final OkHttpClient client = HttpUtil.client;
3741

38-
public VolcengineTtsService(SysConfig config, String voiceName, String outputPath) {
42+
public VolcengineTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
3943
this.voiceName = voiceName;
44+
this.pitch = pitch;
45+
this.speed = speed;
4046
this.outputPath = outputPath;
4147
this.appId = config.getAppId();
4248
this.accessToken = config.getApiKey();
@@ -97,9 +103,9 @@ private boolean sendRequest(String text, String audioFilePath) throws Exception
97103
JsonObject audio = new JsonObject();
98104
audio.addProperty("voice_type", voiceName);
99105
audio.addProperty("encoding", "wav");
100-
audio.addProperty("speed_ratio", 1.0);
106+
audio.addProperty("speed_ratio", speed);
101107
audio.addProperty("volume_ratio", 1.0);
102-
audio.addProperty("pitch_ratio", 1.0);
108+
audio.addProperty("pitch_ratio", pitch);
103109
audio.addProperty("rate", AudioUtils.SAMPLE_RATE);
104110
requestJson.add("audio", audio);
105111

src/main/java/com/xiaozhi/dialogue/tts/providers/XfyunTtsService.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,15 @@ public class XfyunTtsService implements TtsService {
3333
private String appId;
3434
private String apiKey;
3535
private String apiSecret;
36+
37+
// 语音参数
38+
private Float pitch;
39+
private Float speed;
3640

37-
public XfyunTtsService(SysConfig config, String voiceName, String outputPath) {
41+
public XfyunTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
3842
this.voiceName = voiceName;
43+
this.pitch = pitch;
44+
this.speed = speed;
3945
this.outputPath = outputPath;
4046
this.appId = config.getAppId();
4147
this.apiKey = config.getApiKey();
@@ -84,11 +90,23 @@ public String textToSpeech(String text) throws Exception {
8490
private boolean sendRequest(String text, File file) throws Exception {
8591
CountDownLatch recognitionLatch = new CountDownLatch(1);
8692
try {
93+
// 将我们的参数(0.5-2.0)映射到讯飞的参数(0-100)
94+
// 讯飞规则:0对应0.5倍,100对应2倍
95+
// 映射公式:xfyunValue = (ourValue - 0.5) * 100 / 1.5(例如 0.5 → 0,1.0 → 33,2.0 → 100;注意:默认值1.0得到33,而讯飞参数的默认值似乎为50,待确认)
96+
int xfyunSpeed = (int)Math.round((speed - 0.5f) * 100f / 1.5f);
97+
int xfyunPitch = (int)Math.round((pitch - 0.5f) * 100f / 1.5f);
98+
99+
// 确保值在有效范围内
100+
xfyunSpeed = Math.max(0, Math.min(100, xfyunSpeed));
101+
xfyunPitch = Math.max(0, Math.min(100, xfyunPitch));
102+
87103
// 设置合成参数
88104
TtsClient ttsClient = new TtsClient.Builder()
89105
.signature(appId, apiKey, apiSecret)
90106
.aue("lame")
91107
.vcn(voiceName)
108+
.speed(xfyunSpeed)
109+
.pitch(xfyunPitch)
92110
.build();
93111
ttsClient.send(text, new AbstractTtsWebSocketListener() {
94112
//返回格式为音频文件的二进制数组bytes

0 commit comments

Comments
 (0)