dramaling-vocab-learning/backend/DramaLing.Api/Services/AI/GeminiImageDescriptionServi...

250 lines
10 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using DramaLing.Api.Models.DTOs;
using DramaLing.Api.Models.Entities;
using DramaLing.Api.Models.Configuration;
using Microsoft.Extensions.Options;
using System.Diagnostics;
using System.Text.Json;
using System.Text;
namespace DramaLing.Api.Services.AI;
/// <summary>
/// Builds an image-description prompt for a flashcard's example sentence, sends it to the
/// Gemini <c>generateContent</c> endpoint over HTTP, and post-processes the returned text
/// into a prompt for a downstream image generator.
/// </summary>
public class GeminiImageDescriptionService : IGeminiImageDescriptionService
{
// HTTP client used for every Gemini request; its timeout and User-Agent are set in the constructor.
private readonly HttpClient _httpClient;
// Gemini settings; this class reads TimeoutSeconds, Temperature, MaxOutputTokens, BaseUrl, Model and ApiKey.
private readonly GeminiOptions _options;
// Logger for start/success/failure messages of the generation flow.
private readonly ILogger<GeminiImageDescriptionService> _logger;
/// <summary>
/// Creates the service and applies the configured timeout and User-Agent to the HTTP client.
/// </summary>
/// <param name="httpClient">Client used for the Gemini REST calls.</param>
/// <param name="options">Options wrapper whose <c>Value</c> holds the Gemini settings.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when any argument is null, or when <paramref name="options"/> carries a null value.
/// </exception>
public GeminiImageDescriptionService(
    HttpClient httpClient,
    IOptions<GeminiOptions> options,
    ILogger<GeminiImageDescriptionService> logger)
{
    _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
    // `options?.Value` so that a null wrapper raises ArgumentNullException instead of NullReferenceException.
    _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // HttpClient.Timeout cannot be modified once the client has sent a request. Skipping the
    // assignment when the value is already in place keeps construction safe if the same
    // client instance is handed to more than one service instance.
    var timeout = TimeSpan.FromSeconds(_options.TimeoutSeconds);
    if (_httpClient.Timeout != timeout)
    {
        _httpClient.Timeout = timeout;
    }

    // Adding the header unconditionally would append a duplicate product token on every
    // construction with a reused client.
    var hasUserAgent = _httpClient.DefaultRequestHeaders.UserAgent
        .Any(value => value.Product?.Name == "DramaLing");
    if (!hasUserAgent)
    {
        _httpClient.DefaultRequestHeaders.Add("User-Agent", "DramaLing/1.0");
    }
}
/// <summary>
/// Generates an image description for the flashcard's example sentence via Gemini and
/// derives an optimized prompt from it.
/// </summary>
/// <param name="flashcard">Flashcard whose example sentence is described.</param>
/// <param name="options">Generation options, forwarded to the prompt builder and optimizer.</param>
/// <returns>
/// A result with <c>Success = true</c>, the description, the optimized prompt, an estimated
/// cost and the elapsed time; or, if any step throws, a result with <c>Success = false</c>,
/// the exception message in <c>Error</c> and the elapsed time.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="flashcard"/> is null.</exception>
public async Task<ImageDescriptionResult> GenerateDescriptionAsync(
    Flashcard flashcard,
    GenerationOptionsDto options)
{
    // Fail fast: the catch block below reads flashcard.Id, so a null flashcard would
    // otherwise escape the handler as a NullReferenceException.
    if (flashcard is null)
    {
        throw new ArgumentNullException(nameof(flashcard));
    }

    var stopwatch = Stopwatch.StartNew();
    try
    {
        _logger.LogInformation("Starting image description generation for flashcard {FlashcardId}", flashcard.Id);

        var prompt = BuildImageDescriptionPrompt(flashcard, options);

        // Call the Gemini API directly.
        var response = await CallGeminiAPIDirectly(prompt);
        if (string.IsNullOrWhiteSpace(response))
        {
            throw new InvalidOperationException("Gemini API returned empty response");
        }

        var description = ExtractDescription(response);
        var optimizedPrompt = OptimizeForReplicate(description, options);

        stopwatch.Stop();
        var result = new ImageDescriptionResult
        {
            Success = true,
            Description = description,
            OptimizedPrompt = optimizedPrompt,
            // The estimate is derived from the request prompt only.
            Cost = CalculateGeminiCost(prompt),
            ProcessingTimeMs = (int)stopwatch.ElapsedMilliseconds
        };

        _logger.LogInformation("Image description generated successfully in {ElapsedMs}ms", stopwatch.ElapsedMilliseconds);
        return result;
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: failures are reported through the result object
        // rather than propagated to the caller.
        stopwatch.Stop();
        _logger.LogError(ex, "Gemini description generation failed for flashcard {FlashcardId}", flashcard.Id);
        return new ImageDescriptionResult
        {
            Success = false,
            Error = ex.Message,
            ProcessingTimeMs = (int)stopwatch.ElapsedMilliseconds
        };
    }
}
/// <summary>
/// Builds the instruction prompt sent to Gemini. The prompt is a fixed Chinese-language
/// template (role, SOP, scene rules, style guide) with the flashcard's example sentence
/// interpolated into it.
/// </summary>
/// <param name="flashcard">Flashcard whose <c>Example</c> is inserted into the template.</param>
/// <param name="options">Not read by this method.</param>
/// <returns>The complete prompt text.</returns>
private string BuildImageDescriptionPrompt(Flashcard flashcard, GenerationOptionsDto options)
{
// The template is a verbatim interpolated string: its line breaks and characters are sent
// to the model exactly as written, and "" inside it is an escaped double quote.
return $@"# 總覽
你是一位專業插畫設計師兼職英文老師,專門為英語學習教材製作插畫圖卡,用來幫助學生理解英文例句的意思。
# 例句資訊
例句:{flashcard.Example}
# SOP
1. 根據上述英文例句請撰寫一段圖像描述提示詞用於提供圖片生成AI作為生成圖片的提示詞
2. 請將下方「風格指南」的所有要求加入提示詞中
3. 並於圖片提示詞最後加上「Absolutely no visible text, characters, letters, numbers, symbols, handwriting, labels, or any form of writing anywhere in the image — including on signs, books, clothing, screens, or backgrounds.」
# 圖片提示詞規範
## 情境清楚
1. 角色描述具體清楚
- 明確指出圖中有哪些人物,包含性別、年齡、外觀特徵或服裝
- 如有兩人以上,需說明他們彼此的關係或互動狀態(如:母女、朋友、陌生人等)
2. 動作明確具象
- 說明主角正在做的動作,須是能被具體畫出來的動作(如:喝咖啡、講電話、跑步)
- 若動作帶有情緒(如:生氣地講電話、緊張地看著別人),請加入情緒描述以利傳達語意
- 人物比例正常、表情自然、生動但不誇張
3. 場景明確具體
- 指出事件發生的地點(如:公園、教室、咖啡廳、城市街道)
- 可補充時間(如:早上、傍晚)與天氣(如:下雨、晴天),幫助構圖更清楚
4. 物品明確具體
- 若例句中包含物品(如:書、手機、餐點、雨傘等),必須清楚描述物品的種類、外觀特徵、位置與用途
- 避免模糊詞(如 ""some stuff""、""a thing""),應具體指出是什麼物品
- 若物品為主題核心,請描述其使用情境或與人物的互動方式
- 若出現多個物品,需明確指示其關係與空間位置
- 所有物品須為日常生活中常見物件,避免使用過於抽象或符號化的圖像
5. 語意需與原句一致
- 提示詞必須忠實呈現英文句子的核心意思
- 若英文句含有抽象概念或隱喻,請轉化為對應的具象場景
6. 避免過於抽象或象徵性符號
- 圖片必須用生活中常見的情境、物體或角色表現,避免使用抽象圖形來傳達語意
- 圖片中不要出現任何文字
## 風格指南
- 風格類型扁平插畫Flat Illustration
- 線條特徵無描邊線條outline-less
- 色調:暖色調、柔和、低飽和
- 人物樣式:簡化卡通人物,表情自然,不誇張
- 背景構成:圖形簡化(如樹、草地),使用色塊區分層次
- 整體氛圍:溫馨、平靜、適合教育情境
- 技術風格:無紋理、無漸層、無光影寫實感
請根據以上規範,為這個英文例句生成圖片描述提示詞,並確保完全符合風格指南要求。";
}
/// <summary>
/// Extracts the image description from the Gemini response text, removing a surrounding
/// Markdown code fence if the response starts with one.
/// </summary>
/// <param name="geminiResponse">Raw text returned by Gemini.</param>
/// <returns>The trimmed description without fence markers.</returns>
private string ExtractDescription(string geminiResponse)
{
    var description = (geminiResponse ?? string.Empty).Trim();

    // Ordinal comparison: the fence marker is a literal token, not linguistic text.
    if (!description.StartsWith("```", StringComparison.Ordinal))
    {
        return description;
    }

    var firstLineEnd = description.IndexOf('\n');
    if (firstLineEnd < 0)
    {
        // Fence and content share one line (e.g. ```text```): strip just the backticks
        // instead of discarding the whole line.
        return description.Trim('`').Trim();
    }

    // Drop the opening fence line (it may carry a language tag).
    var body = description[(firstLineEnd + 1)..].TrimEnd();

    // Remove the closing fence only when it is actually present, so an unterminated
    // fence does not cost the last line of real content.
    if (body.EndsWith("```", StringComparison.Ordinal))
    {
        body = body[..^3];
    }

    return body.Trim();
}
/// <summary>
/// Ensures the description carries the flat-illustration style guide and the
/// "no visible text" rule, appending whichever is missing.
/// </summary>
/// <param name="description">Description text extracted from the Gemini response.</param>
/// <param name="options">Not read by this method.</param>
/// <returns>The description, followed by any appended style and no-text sentences.</returns>
private string OptimizeForReplicate(string description, GenerationOptionsDto options)
{
    var optimizedPrompt = description;

    // Make sure the flat-illustration style requirement is present. The match ignores case
    // because the generated text may capitalize the phrase ("Flat Illustration"), which the
    // previous case-sensitive check missed and answered with a duplicate style guide.
    if (!optimizedPrompt.Contains("flat illustration", StringComparison.OrdinalIgnoreCase))
    {
        optimizedPrompt += ". Style guide: flat illustration style, outline-less shapes, warm and soft color tones, low saturation, cartoon-style characters with natural expressions, simplified background with color blocks, cozy and educational atmosphere, no texture, no gradients, no photorealism, no fantasy elements.";
    }

    // Always enforce the rule forbidding text in the image.
    if (!optimizedPrompt.Contains("Absolutely no visible text", StringComparison.OrdinalIgnoreCase))
    {
        optimizedPrompt += " Absolutely no visible text, characters, letters, numbers, symbols, handwriting, labels, or any form of writing anywhere in the image — including on signs, books, clothing, screens, or backgrounds.";
    }

    return optimizedPrompt;
}
/// <summary>
/// Produces a rough cost estimate for one Gemini call from the prompt length.
/// </summary>
/// <param name="prompt">Prompt text sent to Gemini.</param>
/// <returns>Estimated input cost plus a fixed output cost for an assumed 500 output tokens.</returns>
private decimal CalculateGeminiCost(string prompt)
{
    // Rough heuristic: one token per four characters (integer division).
    const int CharsPerToken = 4;
    // Per-token input rate; the original author labelled it as the Gemini 1.5 Flash input cost.
    const decimal InputCostPerToken = 0.000001m;
    // Output size is not measured; 500 tokens are assumed.
    const int AssumedOutputTokens = 500;
    const decimal OutputCostPerToken = 0.000003m;

    int inputTokens = prompt.Length / CharsPerToken;
    return (inputTokens * InputCostPerToken) + (AssumedOutputTokens * OutputCostPerToken);
}
/// <summary>
/// Posts the prompt to the Gemini <c>generateContent</c> endpoint and returns the text
/// extracted from the response.
/// </summary>
/// <param name="prompt">Prompt text placed in the single request part.</param>
/// <returns>The text extracted by <c>ExtractTextFromResponse</c>; may be empty.</returns>
/// <exception cref="InvalidOperationException">Thrown when no API key is configured.</exception>
/// <exception cref="HttpRequestException">Thrown when the response status is not successful.</exception>
private async Task<string> CallGeminiAPIDirectly(string prompt)
{
    try
    {
        if (string.IsNullOrWhiteSpace(_options.ApiKey))
        {
            throw new InvalidOperationException("Gemini API key is not configured");
        }

        var requestBody = new
        {
            contents = new[]
            {
                new
                {
                    parts = new[]
                    {
                        new { text = prompt }
                    }
                }
            },
            generationConfig = new
            {
                temperature = _options.Temperature,
                topK = 40,
                topP = 0.95,
                maxOutputTokens = _options.MaxOutputTokens
            }
        };

        // The key travels in the x-goog-api-key header rather than the query string so that
        // it does not appear in request URLs, which HTTP logging and proxies commonly record.
        using var request = new HttpRequestMessage(
            HttpMethod.Post,
            $"{_options.BaseUrl}/v1beta/models/{_options.Model}:generateContent");
        request.Headers.Add("x-goog-api-key", _options.ApiKey);
        request.Content = new StringContent(
            JsonSerializer.Serialize(requestBody),
            Encoding.UTF8,
            "application/json");

        // Disposing the response (and, through the request, its content) releases the
        // underlying connection resources promptly.
        using var response = await _httpClient.SendAsync(request);
        response.EnsureSuccessStatusCode();

        var responseJson = await response.Content.ReadAsStringAsync();
        return ExtractTextFromResponse(responseJson);
    }
    catch (Exception ex)
    {
        // Log here for call-site context, then let the caller decide how to report it.
        _logger.LogError(ex, "Gemini API call failed");
        throw;
    }
}
/// <summary>
/// Reads <c>candidates[0].content.parts[0].text</c> from a Gemini response document.
/// </summary>
/// <param name="responseJson">Raw JSON body of the Gemini response.</param>
/// <returns>
/// The text of the first part of the first candidate, or an empty string when any element
/// on that path is missing, empty, or of an unexpected JSON kind.
/// </returns>
/// <exception cref="JsonException">Thrown when <paramref name="responseJson"/> is not valid JSON.</exception>
private string ExtractTextFromResponse(string responseJson)
{
    using var document = JsonDocument.Parse(responseJson);
    var root = document.RootElement;

    // TryGetProperty throws InvalidOperationException on anything that is not a JSON
    // object, so every hop checks ValueKind before descending.
    if (root.ValueKind != JsonValueKind.Object ||
        !root.TryGetProperty("candidates", out var candidates) ||
        candidates.ValueKind != JsonValueKind.Array)
    {
        return string.Empty;
    }

    // FirstOrDefault yields default(JsonElement) for an empty array; its ValueKind is
    // Undefined, which the Object check below rejects.
    var firstCandidate = candidates.EnumerateArray().FirstOrDefault();
    if (firstCandidate.ValueKind != JsonValueKind.Object ||
        !firstCandidate.TryGetProperty("content", out var content) ||
        content.ValueKind != JsonValueKind.Object ||
        !content.TryGetProperty("parts", out var parts) ||
        parts.ValueKind != JsonValueKind.Array)
    {
        return string.Empty;
    }

    // An empty parts array previously reached TryGetProperty on a default element and threw.
    var firstPart = parts.EnumerateArray().FirstOrDefault();
    if (firstPart.ValueKind != JsonValueKind.Object ||
        !firstPart.TryGetProperty("text", out var text) ||
        text.ValueKind != JsonValueKind.String)
    {
        return string.Empty;
    }

    return text.GetString() ?? string.Empty;
}
}