update: 语音合成新增音调与语速调节功能

joey-zhou · joey-zhou · commit f086e6b018a4 · 2025-10-27T10:44:19.000+08:00
diff --git a/db/2025_10_27.sql b/db/2025_10_27.sql
@@ -0,0 +1,5 @@
+-- 为角色表添加语音合成高级参数字段
+ALTER TABLE `xiaozhi`.`sys_role` 
+ADD COLUMN `ttsPitch` FLOAT DEFAULT 1.0 COMMENT '语音音调(0.5-2.0, 默认1.0)' AFTER `voiceName`,
+ADD COLUMN `ttsSpeed` FLOAT DEFAULT 1.0 COMMENT '语音语速(0.5-2.0, 默认1.0)' AFTER `ttsPitch`;
+
diff --git a/src/main/java/com/xiaozhi/communication/common/MessageHandler.java b/src/main/java/com/xiaozhi/communication/common/MessageHandler.java
@@ -122,7 +122,7 @@ public void afterConnection(ChatSession chatSession, String deviceIdAuth) {
                     if (role.getTtsId() != null) {
                         SysConfig ttsConfig = configService.selectConfigById(role.getTtsId());
                         if (ttsConfig != null) {
-                            ttsFactory.getTtsService(ttsConfig, role.getVoiceName());// 提前初始化，加速后续使用
+                            ttsFactory.getTtsService(ttsConfig, role.getVoiceName(), role.getTtsPitch(), role.getTtsSpeed());// 提前初始化，加速后续使用
                         }
                     }
                     if (role.getModelId() != null) {
diff --git a/src/main/java/com/xiaozhi/controller/RoleController.java b/src/main/java/com/xiaozhi/controller/RoleController.java
@@ -109,13 +109,15 @@ public AjaxResult testAudio(
         @Parameter(description = "消息文本") String message, 
         @Parameter(description = "语音合成提供方") String provider, 
         @Parameter(description = "TTS ID") Integer ttsId, 
-        @Parameter(description = "音色名称") String voiceName) {
+        @Parameter(description = "音色名称") String voiceName,
+        @Parameter(description = "语音音调(0.5-2.0)") Float ttsPitch,
+        @Parameter(description = "语音语速(0.5-2.0)") Float ttsSpeed) {
         SysConfig config = null;
         try {
             if (!provider.equals("edge")) {
                 config = configService.selectConfigById(ttsId);
             }
-            String audioFilePath = ttsService.getTtsService(config, voiceName).textToSpeech(message);
+            String audioFilePath = ttsService.getTtsService(config, voiceName, ttsPitch, ttsSpeed).textToSpeech(message);
             AjaxResult result = AjaxResult.success();
             result.put("data", audioFilePath);
             return result;
diff --git a/src/main/java/com/xiaozhi/dialogue/service/DialogueService.java b/src/main/java/com/xiaozhi/dialogue/service/DialogueService.java
@@ -231,14 +231,16 @@ private static class TtsTask implements Comparable<TtsTask> {
         private final boolean isLast;
         private final SysConfig ttsConfig;
         private final String voiceName;
+        private final Float ttsPitch;
+        private final Float ttsSpeed;
         private final ChatSession session;
         private final long createTime;
         private int retryCount = 0;
         private boolean isRetry = false;
 
         public TtsTask(ChatSession session, String sessionId, Sentence sentence,
                 EmoSentence emoSentence, boolean isFirst, boolean isLast,
-                SysConfig ttsConfig, String voiceName) {
+                SysConfig ttsConfig, String voiceName, Float ttsPitch, Float ttsSpeed) {
             this.session = session;
             this.sessionId = sessionId;
             this.sentence = sentence;
@@ -247,6 +249,8 @@ public TtsTask(ChatSession session, String sessionId, Sentence sentence,
             this.isLast = isLast;
             this.ttsConfig = ttsConfig;
             this.voiceName = voiceName;
+            this.ttsPitch = ttsPitch;
+            this.ttsSpeed = ttsSpeed;
             this.createTime = System.currentTimeMillis();
         }
 
@@ -576,7 +580,7 @@ private void handleSentence(
 
         // 使用虚拟线程异步生成音频文件
         Thread.startVirtualThread(() -> {
-            generateAudio(session, sessionId, sentence, emoSentence, isFirst, isLast, ttsConfig, voiceName);
+            generateAudio(session, sessionId, sentence, emoSentence, isFirst, isLast, ttsConfig, voiceName, role.getTtsPitch(), role.getTtsSpeed());
         });
     }
 
@@ -592,11 +596,13 @@ private void generateAudio(
             boolean isFirst,
             boolean isLast,
             SysConfig ttsConfig,
-            String voiceName) {
+            String voiceName,
+            Float ttsPitch,
+            Float ttsSpeed) {
 
         // 创建TTS任务
         TtsTask task = new TtsTask(session, sessionId, sentence, emoSentence,
-                isFirst, isLast, ttsConfig, voiceName);
+                isFirst, isLast, ttsConfig, voiceName, ttsPitch, ttsSpeed);
 
         // 提交任务到队列
         submitTtsTask(task);
@@ -658,7 +664,7 @@ private void executeTtsTask(TtsTask task) {
         CompletableFuture<String> future = CompletableFuture.supplyAsync(() -> {
             try {
                 long ttsStartTime = System.currentTimeMillis();
-                String audioPath = ttsFactory.getTtsService(task.ttsConfig, task.voiceName)
+                String audioPath = ttsFactory.getTtsService(task.ttsConfig, task.voiceName, task.ttsPitch, task.ttsSpeed)
                         .textToSpeech(task.emoSentence.getTtsSentence());
                 long ttsDuration = System.currentTimeMillis() - ttsStartTime;
 
@@ -752,7 +758,9 @@ private void handleTtsFailure(TtsTask task, String reason) {
                 task.isFirst, 
                 task.isLast, 
                 task.ttsConfig, 
-                task.voiceName
+                task.voiceName,
+                task.ttsPitch,
+                task.ttsSpeed
             );
             retryTask.retryCount = task.retryCount;
             retryTask.isRetry = true;
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/factory/TtsServiceFactory.java b/src/main/java/com/xiaozhi/dialogue/tts/factory/TtsServiceFactory.java
@@ -35,53 +35,54 @@ public class TtsServiceFactory {
      */
     public TtsService getDefaultTtsService() {
         var config = new SysConfig().setProvider(DEFAULT_PROVIDER);
-        return getTtsService(config, TtsServiceFactory.DEFAULT_VOICE);
+        return getTtsService(config, TtsServiceFactory.DEFAULT_VOICE, 1.0f, 1.0f);
     }
 
-    // 创建缓存键，包含provider、configId和voiceName，确保音色变化时创建新的服务实例
-    private String createCacheKey(SysConfig config, String provider, String voiceName) {
+    // 创建缓存键（包含pitch和speed）
+    private String createCacheKey(SysConfig config, String provider, String voiceName, Float pitch, Float speed) {
         Integer configId = -1;
         if (config != null && config.getConfigId() != null) {
             configId = config.getConfigId();
         }
-        return provider + ":" + configId + ":" + voiceName;
+        return provider + ":" + configId + ":" + voiceName + ":" + pitch + ":" + speed;
     }
 
     /**
-     * 根据配置获取TTS服务
+     * 根据配置获取TTS服务（带pitch和speed参数）
      */
-    public TtsService getTtsService(SysConfig config, String voiceName) {
+    public TtsService getTtsService(SysConfig config, String voiceName, Float pitch, Float speed) {
         
         config = !ObjectUtils.isEmpty(config) ? config : new SysConfig().setProvider(DEFAULT_PROVIDER);
 
         // 如果提供商为空，则使用默认提供商
         var provider = config.getProvider();
-        var cacheKey = createCacheKey(config, provider, voiceName);
+        
+        var cacheKey = createCacheKey(config, provider, voiceName, pitch, speed);
 
         // 检查是否已有该配置的服务实例
         if (serviceCache.containsKey(cacheKey)) {
             return serviceCache.get(cacheKey);
         }
 
-        var service = createApiService(config, voiceName);
+        var service = createApiService(config, voiceName, pitch, speed);
         serviceCache.put(cacheKey, service);
         return service;
     }
 
     /**
-     * 根据配置创建API类型的TTS服务
+     * 根据配置创建API类型的TTS服务（带pitch和speed参数）
      */
-    private TtsService createApiService(SysConfig config, String voiceName) {
+    private TtsService createApiService(SysConfig config, String voiceName, Float pitch, Float speed) {
         // Make sure output dir exists
         String outputPath = OUTPUT_PATH;
         ensureOutputPath(outputPath);
 
         return switch (config.getProvider()) {
-            case "aliyun" -> new AliyunTtsService(config, voiceName, outputPath);
-            case "volcengine" -> new VolcengineTtsService(config, voiceName, outputPath);
-            case "xfyun" -> new XfyunTtsService(config, voiceName, outputPath);
-            case "minimax" -> new MiniMaxTtsService(config, voiceName, outputPath);
-            default -> new EdgeTtsService(voiceName, outputPath);
+            case "aliyun" -> new AliyunTtsService(config, voiceName, pitch, speed, outputPath);
+            case "volcengine" -> new VolcengineTtsService(config, voiceName, pitch, speed, outputPath);
+            case "xfyun" -> new XfyunTtsService(config, voiceName, pitch, speed, outputPath);
+            case "minimax" -> new MiniMaxTtsService(config, voiceName, pitch, speed, outputPath);
+            default -> new EdgeTtsService(voiceName, pitch, speed, outputPath);
         };
     }
 
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/providers/AliyunTtsService.java b/src/main/java/com/xiaozhi/dialogue/tts/providers/AliyunTtsService.java
@@ -46,11 +46,17 @@ public class AliyunTtsService implements TtsService {
     private final String apiKey;
     private final String voiceName;
     private final String outputPath;
+    
+    // 语音参数
+    private final Float pitch;
+    private final Float speed;
 
     public AliyunTtsService(SysConfig config,
-            String voiceName, String outputPath) {
+            String voiceName, Float pitch, Float speed, String outputPath) {
         this.apiKey = config.getApiKey();
         this.voiceName = voiceName;
+        this.pitch = pitch;
+        this.speed = speed;
         this.outputPath = outputPath;
     }
 
@@ -210,6 +216,8 @@ private String ttsCosyvoice(String text) {
                                 .apiKey(apiKey)
                                 .model("cosyvoice-v2")
                                 .voice(voiceName)
+                                .speechRate(speed)
+                                .pitchRate(pitch)
                                 .format(com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat.WAV_16000HZ_MONO_16BIT)
                                 .build();
                 
@@ -289,6 +297,8 @@ public String ttsSambert(String text) {
                         .apiKey(apiKey)
                         .model(voiceName)
                         .text(text)
+                        .rate(speed)
+                        .pitch(pitch)
                         .sampleRate(AudioUtils.SAMPLE_RATE)
                         .format(SpeechSynthesisAudioFormat.WAV)
                         .build();
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/providers/EdgeTtsService.java b/src/main/java/com/xiaozhi/dialogue/tts/providers/EdgeTtsService.java
@@ -24,9 +24,17 @@ public class EdgeTtsService implements TtsService {
 
     // 音频输出路径
     private String outputPath;
-
-    public EdgeTtsService(String voiceName, String outputPath) {
+    
+    // 语音音调 (0.5-2.0)
+    private Float pitch;
+    
+    // 语音语速 (0.5-2.0)
+    private Float speed;
+
+    public EdgeTtsService(String voiceName, Float pitch, Float speed, String outputPath) {
         this.voiceName = voiceName;
+        this.pitch = pitch;
+        this.speed = speed;
         this.outputPath = outputPath;
     }
 
@@ -48,12 +56,23 @@ public String textToSpeech(String text) throws Exception {
                 .collect(Collectors.toList()).get(0);
 
         TTS ttsEngine = new TTS(voiceObj, text);
+        
+        // 计算Edge TTS的rate参数 (将0.5-2.0映射到-50%到+100%)
+        // speed=0.5 -> rate=-50%, speed=1.0 -> rate=+0%, speed=2.0 -> rate=+100%
+        int ratePercent = (int)((speed - 1.0f) * 100);
+        
+        // 计算Edge TTS的pitch参数 (按 (pitch - 1.0) * 50 将0.5-2.0映射到-25Hz到+50Hz)
+        // pitch=0.5 -> -25Hz, pitch=1.0 -> 0Hz, pitch=2.0 -> +50Hz
+        int pitchHz = (int)((pitch - 1.0f) * 50);
+        
         // 执行TTS转换获取音频文件
         String audioFilePath = ttsEngine.findHeadHook()
                 .storage(outputPath)
                 .fileName(getAudioFileName().split("\\.")[0])
                 .isRateLimited(true)
                 .overwrite(false)
+                .voicePitch(pitchHz + "Hz")
+                .voiceRate(ratePercent + "%")
                 .formatMp3()
                 .trans();
 
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/providers/MiniMaxTtsService.java b/src/main/java/com/xiaozhi/dialogue/tts/providers/MiniMaxTtsService.java
@@ -30,14 +30,20 @@ public class MiniMaxTtsService implements TtsService {
 
     private final String outputPath;
     private final String voiceName;
+    
+    // 语音参数
+    private final Float pitch;
+    private final Float speed;
 
     private final OkHttpClient client = HttpUtil.client;
     private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");
 
-    public MiniMaxTtsService(SysConfig config, String voiceName, String outputPath) {
+    public MiniMaxTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
         this.groupId = config.getAppId();
         this.apiKey = config.getApiKey();
         this.voiceName = voiceName;
+        this.pitch = pitch;
+        this.speed = speed;
         this.outputPath = outputPath;
     }
 
@@ -59,7 +65,19 @@ public String textToSpeech(String text) throws Exception {
     }
 
     private void sendRequest(String text, String filepath) {
+        // 创建请求参数
         var params = new Text2AudioParams(voiceName, text);
+        
+        // 设置语速（MiniMax范围 [0.5, 2]，与我们的范围一致，直接使用）
+        params.voiceSetting.setSpeed(speed);
+        
+        // 设置音调（需要映射：我们的 [0.5, 2] → MiniMax的 [-12, 12]）
+        // 映射公式：minimax_pitch = (our_pitch - 1.0) × 24，即 0.5 → -12、1.0 → 0、1.5 → 12；大于 1.5 的值经下方截断后仍为 12
+        int minimaxPitch = (int)Math.round((pitch - 1.0f) * 24);
+        // 确保值在有效范围内
+        minimaxPitch = Math.max(-12, Math.min(12, minimaxPitch));
+        params.voiceSetting.setPitch(minimaxPitch);
+        
         var request = new Request.Builder()
                 .url("https://api.minimaxi.com/v1/t2a_v2?Groupid=%s".formatted(groupId))
                 .addHeader("Content-Type", "application/json")
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/providers/VolcengineTtsService.java b/src/main/java/com/xiaozhi/dialogue/tts/providers/VolcengineTtsService.java
@@ -32,11 +32,17 @@ public class VolcengineTtsService implements TtsService {
     // API相关
     private String appId;
     private String accessToken; // 对应 apiKey
+    
+    // 语音参数
+    private Float pitch;
+    private Float speed;
 
     private final OkHttpClient client = HttpUtil.client;
 
-    public VolcengineTtsService(SysConfig config, String voiceName, String outputPath) {
+    public VolcengineTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
         this.voiceName = voiceName;
+        this.pitch = pitch;
+        this.speed = speed;
         this.outputPath = outputPath;
         this.appId = config.getAppId();
         this.accessToken = config.getApiKey();
@@ -97,9 +103,9 @@ private boolean sendRequest(String text, String audioFilePath) throws Exception
             JsonObject audio = new JsonObject();
             audio.addProperty("voice_type", voiceName);
             audio.addProperty("encoding", "wav");
-            audio.addProperty("speed_ratio", 1.0);
+            audio.addProperty("speed_ratio", speed);
             audio.addProperty("volume_ratio", 1.0);
-            audio.addProperty("pitch_ratio", 1.0);
+            audio.addProperty("pitch_ratio", pitch);
             audio.addProperty("rate", AudioUtils.SAMPLE_RATE);
             requestJson.add("audio", audio);
 
diff --git a/src/main/java/com/xiaozhi/dialogue/tts/providers/XfyunTtsService.java b/src/main/java/com/xiaozhi/dialogue/tts/providers/XfyunTtsService.java
@@ -33,9 +33,15 @@ public class XfyunTtsService implements TtsService {
     private String appId;
     private String apiKey;
     private String apiSecret;
+    
+    // 语音参数
+    private Float pitch;
+    private Float speed;
 
-    public XfyunTtsService(SysConfig config, String voiceName, String outputPath) {
+    public XfyunTtsService(SysConfig config, String voiceName, Float pitch, Float speed, String outputPath) {
         this.voiceName = voiceName;
+        this.pitch = pitch;
+        this.speed = speed;
         this.outputPath = outputPath;
         this.appId = config.getAppId();
         this.apiKey = config.getApiKey();
@@ -84,11 +90,23 @@ public String textToSpeech(String text) throws Exception {
     private boolean sendRequest(String text, File file) throws Exception {
         CountDownLatch recognitionLatch = new CountDownLatch(1);
         try {
+            // 将我们的参数（0.5-2.0）映射到讯飞的参数（0-100）
+            // 讯飞规则：0对应0.5倍，100对应2倍（待与讯飞官方文档核对）
+            // 映射公式：xfyunValue = (ourValue - 0.5) * 100 / 1.5，即 0.5 → 0、1.0 → 33、2.0 → 100（注意默认值 1.0 对应 33 而非中点 50，需确认是否符合预期）
+            int xfyunSpeed = (int)Math.round((speed - 0.5f) * 100f / 1.5f);
+            int xfyunPitch = (int)Math.round((pitch - 0.5f) * 100f / 1.5f);
+            
+            // 确保值在有效范围内
+            xfyunSpeed = Math.max(0, Math.min(100, xfyunSpeed));
+            xfyunPitch = Math.max(0, Math.min(100, xfyunPitch));
+            
             // 设置合成参数
             TtsClient ttsClient = new TtsClient.Builder()
                     .signature(appId, apiKey, apiSecret)
                     .aue("lame")
                     .vcn(voiceName)
+                    .speed(xfyunSpeed)
+                    .pitch(xfyunPitch)
                     .build();
             ttsClient.send(text, new AbstractTtsWebSocketListener() {
                 //返回格式为音频文件的二进制数组bytes
diff --git a/src/main/java/com/xiaozhi/entity/SysRole.java b/src/main/java/com/xiaozhi/entity/SysRole.java
@@ -49,6 +49,18 @@ public class SysRole extends Base<SysRole> {
     @Schema(description = "语音名称")
     private String voiceName;
 
+    /**
+     * 语音音调(0.5-2.0, 默认1.0)
+     */
+    @Schema(description = "语音音调")
+    private Float ttsPitch = 1.0f;
+
+    /**
+     * 语音语速(0.5-2.0, 默认1.0)
+     */
+    @Schema(description = "语音语速")
+    private Float ttsSpeed = 1.0f;
+
     /**
      * 状态(1启用 0禁用)
      */
diff --git a/src/main/java/com/xiaozhi/mapper/RoleMapper.xml b/src/main/java/com/xiaozhi/mapper/RoleMapper.xml
diff --git a/web/src/views/page/Role.vue b/web/src/views/page/Role.vue

Original file line number	Diff line number	Diff line change
`@@ -122,7 +122,7 @@ public void afterConnection(ChatSession chatSession, String deviceIdAuth) {`
`122`	`122`	`if (role.getTtsId() != null) {`
`123`	`123`	`SysConfig ttsConfig = configService.selectConfigById(role.getTtsId());`
`124`	`124`	`if (ttsConfig != null) {`
`125`		`- ttsFactory.getTtsService(ttsConfig, role.getVoiceName());// 提前初始化，加速后续使用`
	`125`	`+ ttsFactory.getTtsService(ttsConfig, role.getVoiceName(), role.getTtsPitch(), role.getTtsSpeed());// 提前初始化，加速后续使用`
`126`	`126`	`}`
`127`	`127`	`}`
`128`	`128`	`if (role.getModelId() != null) {`