250 lines
10 KiB
C#
250 lines
10 KiB
C#
using DramaLing.Api.Models.DTOs;
|
||
using DramaLing.Api.Models.Entities;
|
||
using DramaLing.Api.Models.Configuration;
|
||
using Microsoft.Extensions.Options;
|
||
using System.Diagnostics;
|
||
using System.Text.Json;
|
||
using System.Text;
|
||
|
||
namespace DramaLing.Api.Services.AI;
|
||
|
||
/// <summary>
/// Builds an image-generation prompt for a flashcard's example sentence.
/// The service sends an instruction prompt to the Gemini <c>generateContent</c> REST endpoint,
/// strips optional markdown fencing from the answer, and appends the fixed style-guide and
/// "no visible text" rules when the answer does not already contain them.
/// </summary>
public class GeminiImageDescriptionService : IGeminiImageDescriptionService
{
    private const string UserAgent = "DramaLing/1.0";
    private const string CodeFence = "```";

    // Sampling parameters sent with every request.
    private const int TopK = 40;
    private const double TopP = 0.95;

    // Rough cost model used by CalculateGeminiCost. The rates are hard-coded estimates
    // (the original comment labels the input rate as "Gemini 1.5 Flash input cost").
    private const int CharsPerEstimatedToken = 4;
    private const decimal InputCostPerToken = 0.000001m;
    private const decimal OutputCostPerToken = 0.000003m;
    private const int AssumedOutputTokens = 500;

    private const string StyleGuideMarker = "flat illustration";
    private const string StyleGuideSentence =
        "Style guide: flat illustration style, outline-less shapes, warm and soft color tones, low saturation, cartoon-style characters with natural expressions, simplified background with color blocks, cozy and educational atmosphere, no texture, no gradients, no photorealism, no fantasy elements.";

    private const string NoTextMarker = "Absolutely no visible text";
    private const string NoTextSentence =
        "Absolutely no visible text, characters, letters, numbers, symbols, handwriting, labels, or any form of writing anywhere in the image — including on signs, books, clothing, screens, or backgrounds.";

    private readonly HttpClient _httpClient;
    private readonly GeminiOptions _options;
    private readonly ILogger<GeminiImageDescriptionService> _logger;

    /// <summary>
    /// Creates the service and applies the configured timeout and User-Agent to <paramref name="httpClient"/>.
    /// </summary>
    /// <param name="httpClient">Client used for the Gemini REST call.</param>
    /// <param name="options">Gemini settings (base URL, model, API key, timeout, temperature, max output tokens).</param>
    /// <param name="logger">Logger for progress and failure messages.</param>
    /// <exception cref="ArgumentNullException">Any argument, or <c>options.Value</c>, is null.</exception>
    public GeminiImageDescriptionService(
        HttpClient httpClient,
        IOptions<GeminiOptions> options,
        ILogger<GeminiImageDescriptionService> logger)
    {
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
        // `options?.Value` so that a null `options` yields ArgumentNullException instead of NullReferenceException.
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        _httpClient.Timeout = TimeSpan.FromSeconds(_options.TimeoutSeconds);

        // Add the product token only once: if the same HttpClient instance is handed to several
        // service instances, an unconditional Add would keep appending duplicates to the header.
        if (!_httpClient.DefaultRequestHeaders.UserAgent.ToString().Contains(UserAgent, StringComparison.Ordinal))
        {
            _httpClient.DefaultRequestHeaders.Add("User-Agent", UserAgent);
        }
    }

    /// <summary>
    /// Generates an image description and the final image prompt for <paramref name="flashcard"/>.
    /// </summary>
    /// <param name="flashcard">Flashcard whose <c>Example</c> sentence is illustrated.</param>
    /// <param name="options">Generation options; passed through to the prompt helpers, which currently do not read them.</param>
    /// <returns>
    /// A result with <c>Success = true</c>, the description, the optimized prompt, an estimated cost and the elapsed time;
    /// or <c>Success = false</c> with the exception message when any step fails.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="flashcard"/> is null.</exception>
    public async Task<ImageDescriptionResult> GenerateDescriptionAsync(
        Flashcard flashcard,
        GenerationOptionsDto options)
    {
        // Fail fast: the error handler below reads flashcard.Id, so a null flashcard would
        // otherwise surface as a NullReferenceException thrown from inside the catch block.
        if (flashcard is null)
        {
            throw new ArgumentNullException(nameof(flashcard));
        }

        var stopwatch = Stopwatch.StartNew();

        try
        {
            _logger.LogInformation("Starting image description generation for flashcard {FlashcardId}", flashcard.Id);

            var prompt = BuildImageDescriptionPrompt(flashcard, options);

            // Call the Gemini API directly.
            var response = await CallGeminiAPIDirectly(prompt);

            if (string.IsNullOrWhiteSpace(response))
            {
                throw new InvalidOperationException("Gemini API returned empty response");
            }

            var description = ExtractDescription(response);
            var optimizedPrompt = OptimizeForReplicate(description, options);

            stopwatch.Stop();

            var result = new ImageDescriptionResult
            {
                Success = true,
                Description = description,
                OptimizedPrompt = optimizedPrompt,
                Cost = CalculateGeminiCost(prompt),
                ProcessingTimeMs = (int)stopwatch.ElapsedMilliseconds
            };

            _logger.LogInformation("Image description generated successfully in {ElapsedMs}ms", stopwatch.ElapsedMilliseconds);

            return result;
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: every failure is reported through the result object rather than thrown.
            stopwatch.Stop();
            _logger.LogError(ex, "Gemini description generation failed for flashcard {FlashcardId}", flashcard.Id);

            return new ImageDescriptionResult
            {
                Success = false,
                Error = ex.Message,
                ProcessingTimeMs = (int)stopwatch.ElapsedMilliseconds
            };
        }
    }

    /// <summary>
    /// Returns the instruction prompt sent to Gemini, with the flashcard's example sentence interpolated.
    /// The prompt text is runtime data and is kept in its original language.
    /// </summary>
    /// <param name="flashcard">Source of the example sentence.</param>
    /// <param name="options">Currently unused.</param>
    private string BuildImageDescriptionPrompt(Flashcard flashcard, GenerationOptionsDto options)
    {
        return $@"# 總覽
你是一位專業插畫設計師兼職英文老師,專門為英語學習教材製作插畫圖卡,用來幫助學生理解英文例句的意思。

# 例句資訊
例句:{flashcard.Example}

# SOP
1. 根據上述英文例句,請撰寫一段圖像描述提示詞,用於提供圖片生成AI作為生成圖片的提示詞
2. 請將下方「風格指南」的所有要求加入提示詞中
3. 並於圖片提示詞最後加上:「Absolutely no visible text, characters, letters, numbers, symbols, handwriting, labels, or any form of writing anywhere in the image — including on signs, books, clothing, screens, or backgrounds.」

# 圖片提示詞規範

## 情境清楚
1. 角色描述具體清楚
- 明確指出圖中有哪些人物,包含性別、年齡、外觀特徵或服裝
- 如有兩人以上,需說明他們彼此的關係或互動狀態(如:母女、朋友、陌生人等)

2. 動作明確具象
- 說明主角正在做的動作,須是能被具體畫出來的動作(如:喝咖啡、講電話、跑步)
- 若動作帶有情緒(如:生氣地講電話、緊張地看著別人),請加入情緒描述以利傳達語意
- 人物比例正常、表情自然、生動但不誇張

3. 場景明確具體
- 指出事件發生的地點(如:公園、教室、咖啡廳、城市街道)
- 可補充時間(如:早上、傍晚)與天氣(如:下雨、晴天),幫助構圖更清楚

4. 物品明確具體
- 若例句中包含物品(如:書、手機、餐點、雨傘等),必須清楚描述物品的種類、外觀特徵、位置與用途
- 避免模糊詞(如 ""some stuff""、""a thing""),應具體指出是什麼物品
- 若物品為主題核心,請描述其使用情境或與人物的互動方式
- 若出現多個物品,需明確指示其關係與空間位置
- 所有物品須為日常生活中常見物件,避免使用過於抽象或符號化的圖像

5. 語意需與原句一致
- 提示詞必須忠實呈現英文句子的核心意思
- 若英文句含有抽象概念或隱喻,請轉化為對應的具象場景

6. 避免過於抽象或象徵性符號
- 圖片必須用生活中常見的情境、物體或角色表現,避免使用抽象圖形來傳達語意
- 圖片中不要出現任何文字

## 風格指南
- 風格類型:扁平插畫(Flat Illustration)
- 線條特徵:無描邊線條(outline-less)
- 色調:暖色調、柔和、低飽和
- 人物樣式:簡化卡通人物,表情自然,不誇張
- 背景構成:圖形簡化(如樹、草地),使用色塊區分層次
- 整體氛圍:溫馨、平靜、適合教育情境
- 技術風格:無紋理、無漸層、無光影寫實感

請根據以上規範,為這個英文例句生成圖片描述提示詞,並確保完全符合風格指南要求。";
    }

    /// <summary>
    /// Extracts the image description from the Gemini answer, removing a surrounding markdown code fence if present.
    /// </summary>
    /// <param name="geminiResponse">Raw text returned by Gemini.</param>
    /// <returns>The trimmed description without fence markers.</returns>
    private static string ExtractDescription(string geminiResponse)
    {
        var description = geminiResponse.Trim();

        if (!description.StartsWith(CodeFence, StringComparison.Ordinal))
        {
            return description;
        }

        // Multi-line answer: drop the whole opening fence line (it may carry a language tag).
        // Single-line answer: drop only the fence characters so the content on that line survives.
        var firstLineBreak = description.IndexOf('\n');
        var inner = firstLineBreak >= 0
            ? description[(firstLineBreak + 1)..]
            : description[CodeFence.Length..];

        // Remove the closing fence only when it is really there; a truncated answer without a
        // closing fence must not lose its last line of content.
        inner = inner.TrimEnd();
        if (inner.EndsWith(CodeFence, StringComparison.Ordinal))
        {
            inner = inner[..^CodeFence.Length];
        }

        return inner.Trim();
    }

    /// <summary>
    /// Produces the final image prompt: the description plus the flat-illustration style guide and the
    /// "no visible text" rule, each appended only when not already present (compared case-insensitively).
    /// </summary>
    /// <param name="description">Description extracted from the Gemini answer.</param>
    /// <param name="options">Currently unused.</param>
    private static string OptimizeForReplicate(string description, GenerationOptionsDto options)
    {
        var optimizedPrompt = description;

        // Make sure the flat-illustration style requirements are included. The instruction prompt spells the
        // style as "Flat Illustration", so the check must ignore case to avoid appending the guide twice.
        if (!optimizedPrompt.Contains(StyleGuideMarker, StringComparison.OrdinalIgnoreCase))
        {
            optimizedPrompt = AppendSentence(optimizedPrompt, StyleGuideSentence);
        }

        // Always enforce the rule that forbids text in the image.
        if (!optimizedPrompt.Contains(NoTextMarker, StringComparison.OrdinalIgnoreCase))
        {
            optimizedPrompt = AppendSentence(optimizedPrompt, NoTextSentence);
        }

        return optimizedPrompt;
    }

    /// <summary>
    /// Appends <paramref name="sentence"/> to <paramref name="text"/>, inserting ". " or " " so that the
    /// result never contains a doubled period or a leading separator.
    /// </summary>
    private static string AppendSentence(string text, string sentence)
    {
        var trimmed = text.TrimEnd();
        if (trimmed.Length == 0)
        {
            return sentence;
        }

        var separator = trimmed[^1] is '.' or '!' or '?' ? " " : ". ";
        return trimmed + separator + sentence;
    }

    /// <summary>
    /// Roughly estimates the cost of one request: prompt length divided by four as the input token count,
    /// plus a fixed assumption of 500 output tokens.
    /// </summary>
    /// <param name="prompt">Prompt that was sent to Gemini.</param>
    /// <returns>Estimated cost as the sum of the input and output estimates.</returns>
    private static decimal CalculateGeminiCost(string prompt)
    {
        // Integer division on purpose: this is a coarse estimate, not a tokenizer.
        var estimatedInputTokens = prompt.Length / CharsPerEstimatedToken;
        var inputCost = estimatedInputTokens * InputCostPerToken;
        var outputCost = AssumedOutputTokens * OutputCostPerToken;

        return inputCost + outputCost;
    }

    /// <summary>
    /// Posts <paramref name="prompt"/> to the configured model's <c>generateContent</c> endpoint and returns
    /// the text of the first part of the first candidate (empty when the response has none).
    /// </summary>
    /// <param name="prompt">Instruction prompt to send.</param>
    /// <exception cref="HttpRequestException">The request failed or the response status was not successful.</exception>
    /// <exception cref="JsonException">The response body was not valid JSON.</exception>
    private async Task<string> CallGeminiAPIDirectly(string prompt)
    {
        var requestBody = new
        {
            contents = new[]
            {
                new
                {
                    parts = new[]
                    {
                        new { text = prompt }
                    }
                }
            },
            generationConfig = new
            {
                temperature = _options.Temperature,
                topK = TopK,
                topP = TopP,
                maxOutputTokens = _options.MaxOutputTokens
            }
        };

        // Tolerate a trailing slash in the configured base URL.
        var baseUrl = $"{_options.BaseUrl}".TrimEnd('/');
        var endpoint = $"{baseUrl}/v1beta/models/{_options.Model}:generateContent";

        using var request = new HttpRequestMessage(HttpMethod.Post, endpoint)
        {
            Content = new StringContent(JsonSerializer.Serialize(requestBody), Encoding.UTF8, "application/json")
        };

        // Send the key as a header instead of a `?key=` query parameter so it does not end up in
        // request-URI logs or diagnostics.
        request.Headers.Add("x-goog-api-key", _options.ApiKey);

        using var response = await _httpClient.SendAsync(request);

        if (!response.IsSuccessStatusCode)
        {
            // EnsureSuccessStatusCode discards the body, which is where the API explains the failure.
            var errorBody = await response.Content.ReadAsStringAsync();
            _logger.LogError(
                "Gemini API call failed with status {StatusCode}: {ResponseBody}",
                (int)response.StatusCode,
                errorBody);
        }

        response.EnsureSuccessStatusCode();

        var responseJson = await response.Content.ReadAsStringAsync();
        return ExtractTextFromResponse(responseJson);
    }

    /// <summary>
    /// Reads <c>candidates[0].content.parts[0].text</c> from a <c>generateContent</c> response.
    /// </summary>
    /// <param name="responseJson">Raw JSON response body.</param>
    /// <returns>The text value, or an empty string when any node on that path is missing or has an unexpected type.</returns>
    /// <exception cref="JsonException"><paramref name="responseJson"/> is not valid JSON.</exception>
    private static string ExtractTextFromResponse(string responseJson)
    {
        using var document = JsonDocument.Parse(responseJson);
        var root = document.RootElement;

        // TryGetProperty throws InvalidOperationException on anything that is not a JSON object
        // (including the default JsonElement returned by FirstOrDefault on an empty array),
        // so every hop checks ValueKind first.
        if (root.ValueKind != JsonValueKind.Object ||
            !root.TryGetProperty("candidates", out var candidatesElement) ||
            candidatesElement.ValueKind != JsonValueKind.Array)
        {
            return string.Empty;
        }

        var firstCandidate = candidatesElement.EnumerateArray().FirstOrDefault();
        if (firstCandidate.ValueKind != JsonValueKind.Object ||
            !firstCandidate.TryGetProperty("content", out var contentElement) ||
            contentElement.ValueKind != JsonValueKind.Object ||
            !contentElement.TryGetProperty("parts", out var partsElement) ||
            partsElement.ValueKind != JsonValueKind.Array)
        {
            return string.Empty;
        }

        var firstPart = partsElement.EnumerateArray().FirstOrDefault();
        if (firstPart.ValueKind == JsonValueKind.Object &&
            firstPart.TryGetProperty("text", out var textElement) &&
            textElement.ValueKind == JsonValueKind.String)
        {
            return textElement.GetString() ?? string.Empty;
        }

        return string.Empty;
    }
}