feat(ai-nutritionist): Coze TTS and streaming robustness

- Add Coze TTS endpoint and service; expose binary MP3 from controller.
- Bypass ResponseFilter for /audio/speech so MP3 bodies are not UTF-8 wrapped.
- UniApp: cozeTextToSpeech, TTS UI and play flow; SSE HTTP errors and diagnostics.
- Document TTS in docs/features.md; extend test-0325-1 with curl verification.

Made-with: Cursor
This commit is contained in:
msh-agent
2026-03-31 07:07:21 +08:00
parent 35052d655f
commit 2facd355ab
8 changed files with 433 additions and 351 deletions

View File

@@ -22,6 +22,7 @@ import org.springframework.web.servlet.mvc.method.annotation.SseEmitter;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
import java.util.Map;
/**
* Coze API 控制器
@@ -132,4 +133,23 @@ public class CozeController {
public CozeBaseResponse<Object> workflowResume(@RequestBody CozeWorkflowResumeRequest request) {
    // Pure pass-through: the service layer performs the resume and its
    // response is returned to the client unchanged.
    final CozeBaseResponse<Object> resumeResult = toolCozeService.workflowResume(request);
    return resumeResult;
}
/**
 * Text-to-speech (TTS) endpoint.
 *
 * <p>Reads the synthesis options from the JSON request body, asks
 * {@code toolCozeService} to synthesize the audio, and writes the resulting
 * bytes directly to the servlet response (the method returns {@code void},
 * so no JSON envelope is produced).
 *
 * <p>Recognized body keys:
 * <ul>
 *   <li>{@code input} - text to synthesize; required, must be a non-blank string</li>
 *   <li>{@code voiceId} - optional voice ID; non-string JSON values are converted to text,
 *       blank values are treated as absent</li>
 *   <li>{@code format} - optional audio format; absent or blank means {@code "mp3"}</li>
 *   <li>{@code speed} - optional speech rate, given as a JSON number or a numeric string</li>
 * </ul>
 *
 * <p>Malformed {@code input} or {@code speed} values are answered with HTTP 400
 * instead of surfacing as a {@code ClassCastException}.
 *
 * @param params   decoded JSON request body
 * @param response servlet response that receives the audio bytes
 * @throws IOException if writing to the response fails
 */
@ApiOperation(value = "文本转语音", notes = "调用 Coze TTS 将文本合成为 MP3 音频并直接返回二进制流")
@PostMapping("/audio/speech")
public void textToSpeech(@RequestBody Map<String, Object> params, HttpServletResponse response) throws IOException {
    Object rawInput = params.get("input");
    if (!(rawInput instanceof String) || ((String) rawInput).trim().isEmpty()) {
        response.sendError(HttpServletResponse.SC_BAD_REQUEST, "input must be a non-empty string");
        return;
    }
    String input = (String) rawInput;

    // A null voiceId lets the service pick its own default voice.
    String voiceId = ttsOptionalText(params.get("voiceId"));

    String requestedFormat = ttsOptionalText(params.get("format"));
    String format = requestedFormat != null ? requestedFormat : "mp3";

    Float speed = null;
    Object rawSpeed = params.get("speed");
    if (rawSpeed instanceof Number) {
        speed = ((Number) rawSpeed).floatValue();
    } else if (rawSpeed != null) {
        // Tolerate clients that serialize the number as a string, e.g. "1.2".
        try {
            speed = Float.parseFloat(String.valueOf(rawSpeed).trim());
        } catch (NumberFormatException e) {
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "speed must be a number");
            return;
        }
    }

    byte[] audioData = toolCozeService.textToSpeech(input, voiceId, format, speed);

    // Derive the headers from the requested format so that non-MP3 audio is
    // not mislabeled. The header values come from a fixed table, never from
    // the raw client string, so the request cannot inject header content.
    String formatKey = format.toLowerCase(java.util.Locale.ROOT);
    response.setContentType(ttsContentType(formatKey));
    response.setHeader("Content-Disposition", "inline; filename=speech." + ttsFileExtension(formatKey));
    response.setContentLength(audioData.length);
    java.io.OutputStream out = response.getOutputStream();
    out.write(audioData);
    out.flush();
}

/** Returns the value as trimmed text, or {@code null} when it is absent or blank. */
private static String ttsOptionalText(Object value) {
    if (value == null) {
        return null;
    }
    String text = String.valueOf(value).trim();
    return text.isEmpty() ? null : text;
}

/**
 * Maps a lower-cased audio format name to the MIME type sent to the client.
 * NOTE(review): the names "wav" and "ogg_opus" are assumed from Coze's
 * documented response formats - confirm against the SDK's AudioFormat.
 */
private static String ttsContentType(String formatKey) {
    switch (formatKey) {
        case "mp3":
            return "audio/mpeg";
        case "wav":
            return "audio/wav";
        case "ogg_opus":
            return "audio/ogg";
        default:
            // Raw PCM and unrecognized formats have no specific container type.
            return "application/octet-stream";
    }
}

/** Maps a lower-cased audio format name to the file extension used in Content-Disposition. */
private static String ttsFileExtension(String formatKey) {
    switch (formatKey) {
        case "mp3":
            return "mp3";
        case "wav":
            return "wav";
        case "ogg_opus":
            return "ogg";
        case "pcm":
            return "pcm";
        default:
            return "bin";
    }
}
}

View File

@@ -27,6 +27,17 @@ public class ResponseFilter implements Filter {
@Override
public void doFilter(ServletRequest request, ServletResponse response, FilterChain filterChain)
throws IOException, ServletException {
HttpServletRequest httpRequest = (HttpServletRequest) request;
String uri = httpRequest.getRequestURI();
String accept = httpRequest.getHeader("Accept");
// SSE 流式响应和二进制音频响应不能被缓冲,直接透传
boolean isSseStream = uri != null && uri.contains("/stream");
boolean acceptsSse = accept != null && accept.contains("text/event-stream");
boolean isAudioResponse = uri != null && uri.contains("/audio/speech");
if (isSseStream || acceptsSse || isAudioResponse) {
filterChain.doFilter(request, response);
return;
}
ResponseWrapper wrapperResponse = new ResponseWrapper((HttpServletResponse) response);//转换成代理类
// 这里只拦截返回,直接让请求过去,如果在请求前有处理,可以在这里处理
filterChain.doFilter(request, wrapperResponse);

View File

@@ -1,5 +1,8 @@
package com.zbkj.service.service.impl.tool;
import com.coze.openapi.client.audio.common.AudioFormat;
import com.coze.openapi.client.audio.speech.CreateSpeechReq;
import com.coze.openapi.client.audio.speech.CreateSpeechResp;
import com.coze.openapi.client.chat.CreateChatReq;
import com.coze.openapi.client.chat.CreateChatResp;
import com.coze.openapi.client.chat.RetrieveChatReq;
@@ -437,6 +440,27 @@ public class ToolCozeServiceImpl implements ToolCozeService {
}
}
/** Voice ID sent to Coze when the caller does not supply one. */
private static final String DEFAULT_VOICE_ID = "7468518753626652709";

/**
 * Synthesizes speech for the given text through the Coze audio API and
 * returns the whole audio payload as a byte array.
 *
 * @param input   text to synthesize; must not be null or blank
 * @param voiceId voice ID; null or blank selects {@code DEFAULT_VOICE_ID}
 * @param format  audio format name such as "mp3"; null or blank selects MP3
 * @param speed   speech rate; null selects 1.0
 * @return the synthesized audio bytes
 * @throws IllegalArgumentException if {@code input} is null or blank
 * @throws RuntimeException         if building or executing the Coze request fails;
 *                                  the original exception is kept as the cause
 */
@Override
public byte[] textToSpeech(String input, String voiceId, String format, Float speed) {
    // Fail fast with a clear message rather than letting an empty text
    // surface later as an opaque builder or remote-call failure.
    if (input == null || input.trim().isEmpty()) {
        throw new IllegalArgumentException("语音合成失败: 文本内容不能为空");
    }
    // Blank strings (e.g. an empty form field) are treated like null so the
    // documented defaults still apply.
    boolean hasVoiceId = voiceId != null && !voiceId.trim().isEmpty();
    boolean hasFormat = format != null && !format.trim().isEmpty();
    try {
        CozeAPI client = getClient();
        AudioFormat audioFormat = hasFormat ? AudioFormat.fromString(format.trim()) : AudioFormat.MP3;
        CreateSpeechReq req = CreateSpeechReq.builder()
                .input(input)
                .voiceID(hasVoiceId ? voiceId.trim() : DEFAULT_VOICE_ID)
                .responseFormat(audioFormat)
                .speed(speed != null ? speed : 1.0f)
                .build();
        CreateSpeechResp resp = client.audio().speech().create(req);
        return resp.getResponse().bytes();
    } catch (Exception e) {
        logger.error("Coze TTS error", e);
        throw new RuntimeException("语音合成失败: " + e.getMessage(), e);
    }
}
/**
* 获取访问令牌
*/

View File

@@ -79,4 +79,15 @@ public interface ToolCozeService {
* @return 恢复结果
*/
CozeBaseResponse<Object> workflowResume(CozeWorkflowResumeRequest request);
/**
 * Text-to-speech (TTS).
 *
 * @param input   text to synthesize
 * @param voiceId voice ID; when null the default Chinese voice is used
 * @param format  audio format such as "mp3"; when null it defaults to mp3
 * @param speed   speech rate, where 1.0 is normal speed; when null the default value is used
 * @return the audio as binary data
 */
byte[] textToSpeech(String input, String voiceId, String format, Float speed);
}