// dramaling-vocab-learning/backend/DramaLing.Api/Services/Media/Audio/AzureSpeechService.cs

using DramaLing.Api.Models.Dtos;
using System.Text;
using System.Security.Cryptography;

namespace DramaLing.Api.Services;

/// <summary>
/// Speech operations offered to the rest of the API: text-to-speech generation and
/// pronunciation evaluation.
/// </summary>
public interface IAzureSpeechService
{
    /// <summary>
    /// Produces audio for the given text-to-speech request.
    /// </summary>
    /// <param name="request">The text-to-speech request.</param>
    /// <returns>
    /// A <see cref="TTSResponse"/>. The <c>AzureSpeechService</c> implementation sets its
    /// <c>Error</c> property when the service is unconfigured or an exception is caught.
    /// </returns>
    Task<TTSResponse> GenerateAudioAsync(TTSRequest request);

    /// <summary>
    /// Evaluates pronunciation for the target text described by <paramref name="request"/>.
    /// </summary>
    /// <param name="audioStream">
    /// Presumably the recorded speech to evaluate; the <c>AzureSpeechService</c> implementation
    /// does not read it.
    /// </param>
    /// <param name="request">The pronunciation request.</param>
    /// <returns>
    /// A <see cref="PronunciationResponse"/>. The <c>AzureSpeechService</c> implementation sets its
    /// <c>Error</c> property when the service is unconfigured or an exception is caught.
    /// </returns>
    Task<PronunciationResponse> EvaluatePronunciationAsync(Stream audioStream, PronunciationRequest request);
}

/// <summary>
/// Placeholder implementation of <see cref="IAzureSpeechService"/>. It verifies that the Azure
/// Speech settings are present but never calls Azure: both public operations wait for a fixed
/// delay and then return mock data.
/// </summary>
public class AzureSpeechService : IAzureSpeechService
{
    private readonly IConfiguration _configuration;
    private readonly ILogger<AzureSpeechService> _logger;
    private readonly bool _isConfigured;

    /// <summary>
    /// Reads <c>Azure:Speech:SubscriptionKey</c> and <c>Azure:Speech:Region</c> from configuration.
    /// When either is missing the service stays usable but every operation returns an error response.
    /// </summary>
    /// <param name="configuration">Application configuration holding the Azure Speech settings.</param>
    /// <param name="logger">Logger used for configuration and error diagnostics.</param>
    public AzureSpeechService(IConfiguration configuration, ILogger<AzureSpeechService> logger)
    {
        _configuration = configuration;
        _logger = logger;

        var subscriptionKey = _configuration["Azure:Speech:SubscriptionKey"];
        var region = _configuration["Azure:Speech:Region"];

        if (string.IsNullOrEmpty(subscriptionKey) || string.IsNullOrEmpty(region))
        {
            _logger.LogWarning("Azure Speech configuration is missing. TTS functionality will be disabled.");
            _isConfigured = false;
            return;
        }

        _isConfigured = true;
        _logger.LogInformation("Azure Speech service configured for region: {Region}", region);
    }

    /// <summary>
    /// Returns a mock text-to-speech result for <paramref name="request"/>.
    /// </summary>
    /// <param name="request">The request whose <c>Text</c> length drives the estimated duration.</param>
    /// <returns>
    /// A response with a data-URL audio stub and an estimated duration, or a response whose
    /// <c>Error</c> is set when the service is unconfigured or an exception is caught.
    /// </returns>
    public async Task<TTSResponse> GenerateAudioAsync(TTSRequest request)
    {
        try
        {
            if (!_isConfigured)
            {
                return new TTSResponse
                {
                    Error = "Azure Speech service is not configured"
                };
            }

            // Simulate TTS processing; only mock data is returned.
            await Task.Delay(500); // simulated API latency

            // Mock base64 audio payload: a 10-byte stub that starts with an MP3 frame header.
            var mockAudioData = Convert.ToBase64String(new byte[] {
                0xFF, 0xFB, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
            });
            var audioUrl = $"data:audio/mp3;base64,{mockAudioData}";

            return new TTSResponse
            {
                AudioUrl = audioUrl,
                Duration = CalculateAudioDuration(request.Text.Length),
                CacheHit = false
            };
        }
        catch (Exception ex)
        {
            // request may itself be the null that caused the exception, so it must not be
            // dereferenced unconditionally here or the handler would throw again.
            _logger.LogError(ex, "Error generating audio for text: {Text}", request?.Text);
            return new TTSResponse
            {
                Error = "Internal error generating audio"
            };
        }
    }

    /// <summary>
    /// Returns a mock pronunciation assessment for the target text in <paramref name="request"/>.
    /// </summary>
    /// <param name="audioStream">Not read by this mock implementation.</param>
    /// <param name="request">The request whose <c>TargetText</c> seeds the mock phoneme scores.</param>
    /// <returns>
    /// A response filled with random scores, or a response whose <c>Error</c> is set when the
    /// service is unconfigured or an exception is caught.
    /// </returns>
    public async Task<PronunciationResponse> EvaluatePronunciationAsync(Stream audioStream, PronunciationRequest request)
    {
        try
        {
            if (!_isConfigured)
            {
                return new PronunciationResponse
                {
                    Error = "Azure Speech service is not configured"
                };
            }

            // Simulate the pronunciation assessment call.
            await Task.Delay(2000); // simulated API latency

            // Mock scoring data. Random.Shared is thread-safe and avoids allocating a generator per call.
            var random = Random.Shared;
            var overallScore = random.Next(75, 95);

            return new PronunciationResponse
            {
                OverallScore = overallScore,
                Accuracy = (float)(random.NextDouble() * 20 + 75),
                Fluency = (float)(random.NextDouble() * 20 + 75),
                Completeness = (float)(random.NextDouble() * 20 + 75),
                Prosody = (float)(random.NextDouble() * 20 + 75),
                PhonemeScores = GenerateMockPhonemeScores(request.TargetText),
                Suggestions = GenerateMockSuggestions(overallScore)
            };
        }
        catch (Exception ex)
        {
            // Same reasoning as in GenerateAudioAsync: never dereference a possibly-null request here.
            _logger.LogError(ex, "Error evaluating pronunciation for text: {Text}", request?.TargetText);
            return new PronunciationResponse
            {
                Error = "Internal error evaluating pronunciation"
            };
        }
    }

    /// <summary>
    /// Builds one mock phoneme score for each of the first three space-separated words of
    /// <paramref name="text"/>, using the word's first character as the "phoneme".
    /// </summary>
    private List<PhonemeScore> GenerateMockPhonemeScores(string text)
    {
        var phonemes = new List<PhonemeScore>();
        var words = text.Split(' ', StringSplitOptions.RemoveEmptyEntries);

        foreach (var word in words.Take(3)) // only the first three words are scored
        {
            phonemes.Add(new PhonemeScore
            {
                // RemoveEmptyEntries guarantees word is non-empty, so word[0] is safe.
                Phoneme = $"/{word[0]}/",
                Score = Random.Shared.Next(70, 95),
                // Roughly one word in three gets a suggestion.
                Suggestion = Random.Shared.Next(0, 3) == 0 ? $"注意 {word} 的發音" : null
            });
        }

        return phonemes;
    }

    /// <summary>
    /// Picks canned suggestions from score thresholds: below 85, below 80, and 90 or above.
    /// </summary>
    private List<string> GenerateMockSuggestions(int overallScore)
    {
        var suggestions = new List<string>();

        if (overallScore < 85)
        {
            suggestions.Add("注意單詞的重音位置");
        }

        if (overallScore < 80)
        {
            suggestions.Add("發音可以更清晰一些");
            suggestions.Add("嘗試放慢語速，確保每個音都發準");
        }

        if (overallScore >= 90)
        {
            suggestions.Add("發音很棒！繼續保持");
        }

        return suggestions;
    }

    /// <summary>
    /// Maps an accent code ("uk" or "us", case-insensitive) to a neural voice name. Any other
    /// value, including null, yields the US voice. <paramref name="voicePreference"/> is
    /// currently not consulted.
    /// </summary>
    private string GetVoiceName(string accent, string voicePreference)
    {
        // Invariant casing keeps the match independent of the server's current culture.
        return accent?.ToLowerInvariant() switch
        {
            "uk" => "en-GB-SoniaNeural",
            "us" => "en-US-AriaNeural",
            _ => "en-US-AriaNeural"
        };
    }

    /// <summary>
    /// Builds an SSML document that speaks <paramref name="text"/> with <paramref name="voice"/>.
    /// Speeds below 0.8 map to "slow", above 1.2 to "fast", everything else to "medium".
    /// </summary>
    private string CreateSSML(string text, string voice, float speed)
    {
        var rate = speed switch
        {
            < 0.8f => "slow",
            > 1.2f => "fast",
            _ => "medium"
        };

        // Escape XML metacharacters so characters such as & < > ' " in the input cannot break
        // the document or inject SSML markup.
        var safeVoice = System.Security.SecurityElement.Escape(voice);
        var safeText = System.Security.SecurityElement.Escape(text);

        return $@"
        <speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
            <voice name='{safeVoice}'>
                <prosody rate='{rate}'>
                    {safeText}
                </prosody>
            </voice>
        </speak>";
    }

    /// <summary>
    /// Estimates audio length in seconds from the character count: 0.1 second per character,
    /// never less than 1 second.
    /// </summary>
    private float CalculateAudioDuration(int textLength)
    {
        return Math.Max(1.0f, textLength * 0.1f);
    }
}