/* * Copyright 2023-2024 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.alibaba.cloud.ai.tongyi.audio.speech; import com.alibaba.cloud.ai.tongyi.audio.AudioSpeechModels; import com.alibaba.cloud.ai.tongyi.audio.speech.api.*; import com.alibaba.cloud.ai.tongyi.metadata.audio.TongYiAudioSpeechResponseMetadata; import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam; import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult; import com.alibaba.dashscope.audio.tts.SpeechSynthesizer; import com.alibaba.dashscope.common.ResultCallback; import io.reactivex.Flowable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.Assert; import reactor.core.publisher.Flux; import reactor.core.scheduler.Schedulers; import java.nio.ByteBuffer; /** * TongYiAudioSpeechClient is a client for TongYi audio speech service for Spring Cloud Alibaba AI. * * @author yuluo * @author yuluo * @since 2023.0.1.0 */ public class TongYiAudioSpeechModel implements SpeechModel, SpeechStreamModel { private final Logger logger = LoggerFactory.getLogger(getClass()); /** * Default speed rate. */ private static final float SPEED_RATE = 1.0f; /** * TongYi models api. */ private final SpeechSynthesizer speechSynthesizer; /** * TongYi models options. */ private final TongYiAudioSpeechOptions defaultOptions; /** * TongYiAudioSpeechClient constructor. * @param speechSynthesizer the speech synthesizer */ public TongYiAudioSpeechModel(SpeechSynthesizer speechSynthesizer) { this(speechSynthesizer, null); } /** * TongYiAudioSpeechClient constructor. * @param speechSynthesizer the speech synthesizer * @param tongYiAudioOptions the tongYi audio options */ public TongYiAudioSpeechModel(SpeechSynthesizer speechSynthesizer, TongYiAudioSpeechOptions tongYiAudioOptions) { Assert.notNull(speechSynthesizer, "speechSynthesizer must not be null"); Assert.notNull(tongYiAudioOptions, "tongYiAudioOptions must not be null"); this.speechSynthesizer = speechSynthesizer; this.defaultOptions = tongYiAudioOptions; } /** * Call the TongYi audio speech service. * @param text the text message to be converted to audio. * @return the audio byte buffer. */ @Override public ByteBuffer call(String text) { var speechRequest = new SpeechPrompt(text); return call(speechRequest).getResult().getOutput(); } /** * Call the TongYi audio speech service. * @param prompt the speech prompt. * @return the speech response. */ @Override public SpeechResponse call(SpeechPrompt prompt) { var SCASpeechParam = merge(prompt.getOptions()); var speechSynthesisParams = toSpeechSynthesisParams(SCASpeechParam); speechSynthesisParams.setText(prompt.getInstructions().getText()); logger.info(speechSynthesisParams.toString()); var res = speechSynthesizer.call(speechSynthesisParams); return convert(res, null); } /** * Call the TongYi audio speech service. * @param prompt the speech prompt. * @param callback the result callback. * {@link SpeechSynthesizer#call(SpeechSynthesisParam, ResultCallback)} */ public void call(SpeechPrompt prompt, ResultCallback callback) { var SCASpeechParam = merge(prompt.getOptions()); var speechSynthesisParams = toSpeechSynthesisParams(SCASpeechParam); speechSynthesisParams.setText(prompt.getInstructions().getText()); speechSynthesizer.call(speechSynthesisParams, callback); } /** * Stream the TongYi audio speech service. * @param prompt the speech prompt. * @return the speech response. * {@link SpeechSynthesizer#streamCall(SpeechSynthesisParam)} */ @Override public Flux stream(SpeechPrompt prompt) { var SCASpeechParam = merge(prompt.getOptions()); Flowable resultFlowable = speechSynthesizer .streamCall(toSpeechSynthesisParams(SCASpeechParam)); return Flux.from(resultFlowable) .flatMap( res -> Flux.just(res.getAudioFrame()) .map(audio -> { var speech = new Speech(audio); var respMetadata = TongYiAudioSpeechResponseMetadata.from(res); return new SpeechResponse(speech, respMetadata); }) ).publishOn(Schedulers.parallel()); } public TongYiAudioSpeechOptions merge(TongYiAudioSpeechOptions target) { var mergeBuilder = TongYiAudioSpeechOptions.builder(); mergeBuilder.withModel(defaultOptions.getModel() != null ? defaultOptions.getModel() : target.getModel()); mergeBuilder.withPitch(defaultOptions.getPitch() != null ? defaultOptions.getPitch() : target.getPitch()); mergeBuilder.withRate(defaultOptions.getRate() != null ? defaultOptions.getRate() : target.getRate()); mergeBuilder.withFormat(defaultOptions.getFormat() != null ? defaultOptions.getFormat() : target.getFormat()); mergeBuilder.withSampleRate(defaultOptions.getSampleRate() != null ? defaultOptions.getSampleRate() : target.getSampleRate()); mergeBuilder.withTextType(defaultOptions.getTextType() != null ? defaultOptions.getTextType() : target.getTextType()); mergeBuilder.withVolume(defaultOptions.getVolume() != null ? defaultOptions.getVolume() : target.getVolume()); mergeBuilder.withEnablePhonemeTimestamp(defaultOptions.isEnablePhonemeTimestamp() != null ? defaultOptions.isEnablePhonemeTimestamp() : target.isEnablePhonemeTimestamp()); mergeBuilder.withEnableWordTimestamp(defaultOptions.isEnableWordTimestamp() != null ? defaultOptions.isEnableWordTimestamp() : target.isEnableWordTimestamp()); return mergeBuilder.build(); } public SpeechSynthesisParam toSpeechSynthesisParams(TongYiAudioSpeechOptions source) { var mergeBuilder = SpeechSynthesisParam.builder(); mergeBuilder.model(source.getModel() != null ? source.getModel() : AudioSpeechModels.SAMBERT_ZHICHU_V1); mergeBuilder.text(source.getText() != null ? source.getText() : ""); if (source.getFormat() != null) { mergeBuilder.format(source.getFormat()); } if (source.getRate() != null) { mergeBuilder.rate(source.getRate()); } if (source.getPitch() != null) { mergeBuilder.pitch(source.getPitch()); } if (source.getTextType() != null) { mergeBuilder.textType(source.getTextType()); } if (source.getSampleRate() != null) { mergeBuilder.sampleRate(source.getSampleRate()); } if (source.isEnablePhonemeTimestamp() != null) { mergeBuilder.enablePhonemeTimestamp(source.isEnablePhonemeTimestamp()); } if (source.isEnableWordTimestamp() != null) { mergeBuilder.enableWordTimestamp(source.isEnableWordTimestamp()); } if (source.getVolume() != null) { mergeBuilder.volume(source.getVolume()); } return mergeBuilder.build(); } /** * Convert the TongYi audio speech service result to the speech response. * @param result the audio byte buffer. * @param synthesisResult the synthesis result. * @return the speech response. */ private SpeechResponse convert(ByteBuffer result, SpeechSynthesisResult synthesisResult) { if (synthesisResult == null) { return new SpeechResponse(new Speech(result)); } var responseMetadata = TongYiAudioSpeechResponseMetadata.from(synthesisResult); var speech = new Speech(synthesisResult.getAudioFrame()); return new SpeechResponse(speech, responseMetadata); } }