/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.alibaba.cloud.ai.tongyi.audio.speech;
import com.alibaba.cloud.ai.tongyi.audio.AudioSpeechModels;
import com.alibaba.cloud.ai.tongyi.audio.speech.api.*;
import com.alibaba.cloud.ai.tongyi.metadata.audio.TongYiAudioSpeechResponseMetadata;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.common.ResultCallback;
import io.reactivex.Flowable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.Assert;
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Schedulers;
import java.nio.ByteBuffer;
/**
* TongYiAudioSpeechModel is the TongYi audio speech (text-to-speech) model for Spring Cloud Alibaba AI.
*
* @author yuluo
* @since 2023.0.1.0
*/
public class TongYiAudioSpeechModel implements SpeechModel, SpeechStreamModel {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Default speed rate.
     */
    private static final float SPEED_RATE = 1.0f;

    /**
     * TongYi (DashScope) speech synthesis client that every request is delegated to.
     */
    private final SpeechSynthesizer speechSynthesizer;

    /**
     * Options configured on this model; combined with the per-request options in
     * {@link #merge(TongYiAudioSpeechOptions)}.
     */
    private final TongYiAudioSpeechOptions defaultOptions;

    /**
     * Creates a model with empty default options.
     *
     * @param speechSynthesizer the speech synthesizer, must not be {@code null}
     */
    public TongYiAudioSpeechModel(SpeechSynthesizer speechSynthesizer) {
        // Passing null here would always be rejected by the assertion in the
        // two-argument constructor, so start from an empty options object instead.
        this(speechSynthesizer, TongYiAudioSpeechOptions.builder().build());
    }

    /**
     * Creates a model with the given default options.
     *
     * @param speechSynthesizer the speech synthesizer, must not be {@code null}
     * @param tongYiAudioOptions the default options, must not be {@code null}
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public TongYiAudioSpeechModel(SpeechSynthesizer speechSynthesizer, TongYiAudioSpeechOptions tongYiAudioOptions) {
        Assert.notNull(speechSynthesizer, "speechSynthesizer must not be null");
        Assert.notNull(tongYiAudioOptions, "tongYiAudioOptions must not be null");
        this.speechSynthesizer = speechSynthesizer;
        this.defaultOptions = tongYiAudioOptions;
    }

    /**
     * Converts the given text to audio using a prompt built from the text alone.
     *
     * @param text the text message to be converted to audio
     * @return the audio byte buffer
     */
    @Override
    public ByteBuffer call(String text) {
        return call(new SpeechPrompt(text)).getResult().getOutput();
    }

    /**
     * Synthesizes the prompt in a single blocking call.
     *
     * @param prompt the speech prompt
     * @return the speech response wrapping the returned audio (no response metadata)
     */
    @Override
    public SpeechResponse call(SpeechPrompt prompt) {
        var speechSynthesisParams = buildParams(prompt);
        // The parameter object carries the full input text, so keep it out of
        // the INFO log and let SLF4J format it lazily.
        logger.debug("TongYi speech synthesis request: {}", speechSynthesisParams);
        ByteBuffer audio = speechSynthesizer.call(speechSynthesisParams);
        return convert(audio, null);
    }

    /**
     * Synthesizes the prompt and hands the results to the given callback.
     *
     * @param prompt the speech prompt
     * @param callback the result callback
     * {@link SpeechSynthesizer#call(SpeechSynthesisParam, ResultCallback)}
     */
    public void call(SpeechPrompt prompt, ResultCallback<SpeechSynthesisResult> callback) {
        speechSynthesizer.call(buildParams(prompt), callback);
    }

    /**
     * Synthesizes the prompt as a stream, emitting one response per synthesis result.
     * Downstream signals are published on {@link Schedulers#parallel()}.
     *
     * @param prompt the speech prompt
     * @return a stream of speech responses, each with audio frame and response metadata
     * {@link SpeechSynthesizer#streamCall(SpeechSynthesisParam)}
     */
    @Override
    public Flux<SpeechResponse> stream(SpeechPrompt prompt) {
        // buildParams also applies the prompt text; without it the streamed
        // request would be sent with the empty-string placeholder text.
        Flowable<SpeechSynthesisResult> resultFlowable = speechSynthesizer.streamCall(buildParams(prompt));
        return Flux.from(resultFlowable)
                .map(synthesisResult -> convert(null, synthesisResult))
                .publishOn(Schedulers.parallel());
    }

    /**
     * Combines the default options of this model with the given request options.
     * For every field the default value is used when it is non-null; the value from
     * {@code target} is only used when the default is {@code null}. The text is not
     * part of the merged options.
     * <p>
     * NOTE(review): defaults taking precedence over per-request options looks inverted
     * compared with the usual "request overrides default" convention - confirm the
     * intended order before changing it, since callers may rely on the current one.
     *
     * @param target the request options; {@code null} is treated as "no request options"
     * @return a new options object holding the merged values
     */
    public TongYiAudioSpeechOptions merge(TongYiAudioSpeechOptions target) {
        var requested = target != null ? target : TongYiAudioSpeechOptions.builder().build();
        var defaults = this.defaultOptions;

        var merged = TongYiAudioSpeechOptions.builder();
        merged.withModel(firstNonNull(defaults.getModel(), requested.getModel()));
        merged.withPitch(firstNonNull(defaults.getPitch(), requested.getPitch()));
        merged.withRate(firstNonNull(defaults.getRate(), requested.getRate()));
        merged.withFormat(firstNonNull(defaults.getFormat(), requested.getFormat()));
        merged.withSampleRate(firstNonNull(defaults.getSampleRate(), requested.getSampleRate()));
        merged.withTextType(firstNonNull(defaults.getTextType(), requested.getTextType()));
        merged.withVolume(firstNonNull(defaults.getVolume(), requested.getVolume()));
        merged.withEnablePhonemeTimestamp(
                firstNonNull(defaults.isEnablePhonemeTimestamp(), requested.isEnablePhonemeTimestamp()));
        merged.withEnableWordTimestamp(
                firstNonNull(defaults.isEnableWordTimestamp(), requested.isEnableWordTimestamp()));
        return merged.build();
    }

    /**
     * Translates the given options into SDK synthesis parameters. The model falls back to
     * {@link AudioSpeechModels#SAMBERT_ZHICHU_V1} and the text to the empty string when
     * unset; every other option is only applied when it is non-null.
     *
     * @param source the options to translate
     * @return the synthesis parameters
     */
    public SpeechSynthesisParam toSpeechSynthesisParams(TongYiAudioSpeechOptions source) {
        var paramBuilder = SpeechSynthesisParam.builder();

        paramBuilder.model(source.getModel() != null ? source.getModel() : AudioSpeechModels.SAMBERT_ZHICHU_V1);
        paramBuilder.text(source.getText() != null ? source.getText() : "");

        if (source.getFormat() != null) {
            paramBuilder.format(source.getFormat());
        }
        if (source.getRate() != null) {
            paramBuilder.rate(source.getRate());
        }
        if (source.getPitch() != null) {
            paramBuilder.pitch(source.getPitch());
        }
        if (source.getTextType() != null) {
            paramBuilder.textType(source.getTextType());
        }
        if (source.getSampleRate() != null) {
            paramBuilder.sampleRate(source.getSampleRate());
        }
        if (source.isEnablePhonemeTimestamp() != null) {
            paramBuilder.enablePhonemeTimestamp(source.isEnablePhonemeTimestamp());
        }
        if (source.isEnableWordTimestamp() != null) {
            paramBuilder.enableWordTimestamp(source.isEnableWordTimestamp());
        }
        if (source.getVolume() != null) {
            paramBuilder.volume(source.getVolume());
        }

        return paramBuilder.build();
    }

    /**
     * Builds the SDK parameters for a prompt: merged options plus the prompt text.
     *
     * @param prompt the speech prompt
     * @return the synthesis parameters with the prompt text applied
     */
    private SpeechSynthesisParam buildParams(SpeechPrompt prompt) {
        var speechSynthesisParams = toSpeechSynthesisParams(merge(prompt.getOptions()));
        speechSynthesisParams.setText(prompt.getInstructions().getText());
        return speechSynthesisParams;
    }

    /**
     * Returns {@code preferred} unless it is {@code null}, in which case {@code fallback} is returned.
     */
    private static <T> T firstNonNull(T preferred, T fallback) {
        return preferred != null ? preferred : fallback;
    }

    /**
     * Convert the TongYi audio speech service result to the speech response.
     *
     * @param result the audio byte buffer; only used when {@code synthesisResult} is {@code null}
     * @param synthesisResult the synthesis result; when non-null its audio frame and
     * metadata are used instead of {@code result}
     * @return the speech response
     */
    private SpeechResponse convert(ByteBuffer result, SpeechSynthesisResult synthesisResult) {
        if (synthesisResult == null) {
            return new SpeechResponse(new Speech(result));
        }
        var responseMetadata = TongYiAudioSpeechResponseMetadata.from(synthesisResult);
        var speech = new Speech(synthesisResult.getAudioFrame());
        return new SpeechResponse(speech, responseMetadata);
    }
}