joel momoent
This commit is contained in:
@@ -84,3 +84,4 @@ export type { AiProvider, AiResponse, MessageStyle } from "./types";
|
||||
export type { ToolContext, ToolCall, ToolResult } from "./tools";
|
||||
export { JOEL_TOOLS, MEMORY_EXTRACTION_TOOLS } from "./tools";
|
||||
export { getEmbeddingService, EmbeddingService } from "./embeddings";
|
||||
export { getVisionService, VisionService, type Attachment, type VisionAnalysis } from "./vision";
|
||||
|
||||
315
src/services/ai/vision.ts
Normal file
315
src/services/ai/vision.ts
Normal file
@@ -0,0 +1,315 @@
|
||||
/**
|
||||
* Vision AI service for analyzing image and video attachments
|
||||
* Uses a vision-capable model to describe attachments, which can then be passed to the main AI
|
||||
*/
|
||||
|
||||
import OpenAI from "openai";
|
||||
import { config } from "../../core/config";
|
||||
import { createLogger } from "../../core/logger";
|
||||
|
||||
const logger = createLogger("AI:Vision");
|
||||
|
||||
/**
 * Categories an attachment can fall into for vision analysis.
 * `"unknown"` covers everything that is neither an image nor a video.
 */
export type AttachmentType =
  | "image"
  | "video"
  | "unknown";
|
||||
|
||||
/**
 * Minimal description of a message attachment handed to the vision service.
 */
export interface Attachment {
  /** Location of the file; passed as-is to the vision model as the media URL. */
  url: string;

  /** File name; appears in log entries and in the labels built for the prompt. */
  name: string;

  /** MIME content type used to classify the attachment, or `null` when none is available. */
  contentType: string | null;

  /** File size, compared against the byte-denominated size limits in this module. */
  size: number;
}
|
||||
|
||||
/**
 * Outcome of analyzing one attachment.
 */
export interface VisionAnalysis {
  /** Text produced by the vision model, or a placeholder when analysis failed. */
  description: string;

  /** Name of the attachment this description belongs to. */
  attachmentName: string;

  /** Category the attachment was analyzed as. */
  type: AttachmentType;
}
|
||||
|
||||
/**
 * Model identifier sent with every vision request in this module, for images
 * and videos alike.
 * Per the original author's note, Gemini 2.0 Flash supports both media kinds.
 */
const VISION_MODEL = "google/gemini-2.0-flash-001";
|
||||
|
||||
/**
 * Largest attachment sizes, in bytes, that will be sent for analysis.
 * Attachments above these limits are filtered out before any request is made.
 */
const MAX_IMAGE_SIZE = 10 * 1024 ** 2; // 10MB for images
const MAX_VIDEO_SIZE = 50 * 1024 ** 2; // 50MB for videos (Gemini supports larger but Discord limits)
|
||||
|
||||
/**
 * Longest video, in seconds, that the service intends to analyze.
 * The original note says Gemini can handle longer clips but quick responses are preferred.
 * NOTE(review): not referenced anywhere in the visible part of this file.
 */
const MAX_VIDEO_DURATION_HINT = 60; // 1 minute
|
||||
|
||||
/**
 * Classify an attachment from its MIME content type.
 *
 * MIME type names are case-insensitive, so the value is trimmed and
 * lower-cased before the prefix check. `"IMAGE/PNG"` and `" video/mp4"` are
 * therefore recognised instead of falling through to `"unknown"`.
 *
 * @param contentType - MIME type reported for the attachment, or `null` when none is available.
 * @returns `"image"` for `image/*`, `"video"` for `video/*`, and `"unknown"`
 *   for anything else, including `null` and the empty string.
 */
export function getAttachmentType(contentType: string | null): AttachmentType {
  if (!contentType) return "unknown";

  const normalized = contentType.trim().toLowerCase();

  if (normalized.startsWith("image/")) return "image";
  if (normalized.startsWith("video/")) return "video";

  return "unknown";
}
|
||||
|
||||
/**
 * Keep only the attachments the vision model should be asked about.
 *
 * Images and videos are kept unless they exceed their respective size limit;
 * every other content type is dropped. Each rejection is logged at debug level.
 *
 * @param attachments - Candidate attachments.
 * @returns A new array with the analyzable attachments, in their original order.
 */
export function filterAnalyzableAttachments(attachments: Attachment[]): Attachment[] {
  return attachments.filter((candidate) => {
    const { name, size, contentType } = candidate;

    switch (getAttachmentType(contentType)) {
      case "image": {
        const oversized = size > MAX_IMAGE_SIZE;
        if (oversized) {
          logger.debug("Skipping large image", { name, size });
        }
        return !oversized;
      }

      case "video": {
        const oversized = size > MAX_VIDEO_SIZE;
        if (oversized) {
          logger.debug("Skipping large video", { name, size });
        }
        return !oversized;
      }

      default:
        logger.debug("Skipping unsupported attachment", { name, type: contentType });
        return false;
    }
  });
}
|
||||
|
||||
/**
 * Ask the vision model to describe a single image attachment.
 *
 * The instructions and the image URL are sent together in one user-role
 * message. Errors thrown by the request are caught and logged; the function
 * then resolves with a placeholder description instead of rejecting.
 *
 * @param client - OpenAI SDK client used to issue the chat completion.
 * @param attachment - Image to describe; its `url` is passed to the model unchanged.
 * @param context - Optional text of the user's message, appended to the instructions.
 * @returns The model's description (trimmed), `"Unable to analyze image"` when
 *   the reply is missing or blank, or a bracketed placeholder when the request fails.
 */
async function analyzeImage(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  const systemPrompt = `You are analyzing an image attached to a Discord message.
Provide a concise but detailed description of what's in the image.
Include:
- Main subjects/objects
- Actions happening
- Text visible in the image (if any)
- Mood/tone of the image
- Any memes, jokes, or references you recognize

Keep your description to 2-3 sentences unless the image contains important text or complex content.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    const completion = await client.chat.completions.create({
      model: VISION_MODEL,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              type: "image_url",
              image_url: {
                url: attachment.url,
                detail: "auto", // Let the model decide on resolution
              },
            },
          ],
        },
      ],
      max_tokens: 300,
      temperature: 0.3,
    });

    // `??` alone only covers null/undefined: an empty or whitespace-only reply
    // would be returned as a blank description. Treat it like a missing reply.
    const reply = completion.choices[0]?.message?.content?.trim();
    const description = reply ? reply : "Unable to analyze image";

    logger.debug("Image analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });

    return {
      description,
      attachmentName: attachment.name,
      type: "image",
    };
  } catch (error) {
    logger.error("Failed to analyze image", { name: attachment.name, error });

    // Best-effort: callers still get an entry acknowledging the attachment.
    return {
      description: `[Image: ${attachment.name} - could not be analyzed]`,
      attachmentName: attachment.name,
      type: "image",
    };
  }
}
|
||||
|
||||
/**
 * Ask the vision model to describe a single video attachment.
 *
 * The instructions and the video URL are sent together in one user-role
 * message. Errors thrown by the request are caught and logged; the function
 * then resolves with a placeholder description instead of rejecting.
 *
 * @param client - OpenAI SDK client used to issue the chat completion.
 * @param attachment - Video to describe; its `url` is passed to the model unchanged.
 * @param context - Optional text of the user's message, appended to the instructions.
 * @returns The model's description (trimmed), `"Unable to analyze video"` when
 *   the reply is missing or blank, or a bracketed placeholder when the request fails.
 */
async function analyzeVideo(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  const systemPrompt = `You are analyzing a video attached to a Discord message.
Provide a concise but detailed description of what happens in the video.
Include:
- What's shown/happening in the video
- Any people, characters, or notable objects
- The overall mood or tone
- Any text, speech, or audio content you can identify
- Memes, references, or jokes you recognize
- Key moments or the "punchline" if it's a funny video

Keep your description to 3-4 sentences. Focus on what makes this video interesting or shareworthy.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    // The URL is handed over directly, so it must be reachable by the model provider.
    const completion = await client.chat.completions.create({
      model: VISION_MODEL,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              // Original author's note: Gemini accepts video URLs in the same
              // part format as images and auto-detects the content type.
              // NOTE(review): confirm this against the provider's documentation.
              type: "image_url",
              image_url: {
                url: attachment.url,
              },
            },
          ],
        },
      ],
      max_tokens: 400,
      temperature: 0.3,
    });

    // `??` alone only covers null/undefined: an empty or whitespace-only reply
    // would be returned as a blank description. Treat it like a missing reply.
    const reply = completion.choices[0]?.message?.content?.trim();
    const description = reply ? reply : "Unable to analyze video";

    logger.debug("Video analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });

    return {
      description,
      attachmentName: attachment.name,
      type: "video",
    };
  } catch (error) {
    logger.error("Failed to analyze video", { name: attachment.name, error });

    // Provide a fallback that at least acknowledges the video exists
    return {
      description: `[Video: ${attachment.name} - could not be analyzed. The user shared a video file.]`,
      attachmentName: attachment.name,
      type: "video",
    };
  }
}
|
||||
|
||||
/**
 * Route one attachment to the matching analyzer.
 *
 * Videos go to `analyzeVideo`; every other content type is handled by
 * `analyzeImage`.
 *
 * @param client - OpenAI SDK client forwarded to the analyzer.
 * @param attachment - Attachment to analyze.
 * @param context - Optional text of the user's message, forwarded unchanged.
 * @returns The analysis produced by the selected analyzer.
 */
async function analyzeAttachment(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  const isVideo = getAttachmentType(attachment.contentType) === "video";
  const analyze = isVideo ? analyzeVideo : analyzeImage;

  return analyze(client, attachment, context);
}
|
||||
|
||||
/**
 * Vision service: describes image/video attachments with a vision-capable
 * model and formats the descriptions for inclusion in another prompt.
 */
export class VisionService {
  /** OpenAI SDK client pointed at the OpenRouter endpoint; never reassigned. */
  private readonly client: OpenAI;

  /**
   * Build the client with the OpenRouter base URL, the API key from the
   * application config, and the identifying request headers.
   */
  constructor() {
    this.client = new OpenAI({
      baseURL: "https://openrouter.ai/api/v1",
      apiKey: config.ai.openRouterApiKey,
      defaultHeaders: {
        "HTTP-Referer": "https://github.com/crunk-bun",
        "X-Title": "Joel Discord Bot - Vision",
      },
    });
  }

  /**
   * Analyze every analyzable attachment and return one description per attachment.
   *
   * Attachments that are unsupported or too large are filtered out first.
   * The remaining ones are analyzed concurrently.
   *
   * @param attachments - Attachments taken from a message.
   * @param messageContext - Optional text of the message, passed to each analysis.
   * @returns Analyses in the same order as the filtered attachments; an empty
   *   array when nothing is analyzable.
   */
  async analyzeAttachments(
    attachments: Attachment[],
    messageContext?: string
  ): Promise<VisionAnalysis[]> {
    const analyzable = filterAnalyzableAttachments(attachments);

    if (analyzable.length === 0) {
      return [];
    }

    logger.debug("Analyzing attachments", {
      count: analyzable.length,
      types: analyzable.map(a => getAttachmentType(a.contentType)),
    });

    // Analyze all attachments in parallel
    const results = await Promise.all(
      analyzable.map(att => analyzeAttachment(this.client, att, messageContext))
    );

    return results;
  }

  /**
   * Format vision analysis results for inclusion in a prompt.
   *
   * A single analysis is labelled `[Attached <Type>: <name>]`; several are
   * numbered as `[Attachment <n> (<Type>): <name>]`. An analysis whose type is
   * `"unknown"` gets the neutral label "File" rather than being called an image.
   *
   * @param analyses - Results to render.
   * @returns An empty string when there is nothing to render, otherwise a block
   *   that starts with two newlines and the `=== ATTACHED MEDIA ===` header.
   */
  formatForPrompt(analyses: VisionAnalysis[]): string {
    if (analyses.length === 0) return "";

    const numbered = analyses.length > 1;

    const formatted = analyses.map((a, i) => {
      const typeLabel =
        a.type === "video" ? "Video" : a.type === "unknown" ? "File" : "Image";
      const label = numbered
        ? `[Attachment ${i + 1} (${typeLabel}): ${a.attachmentName}]`
        : `[Attached ${typeLabel}: ${a.attachmentName}]`;
      return `${label}\n${a.description}`;
    }).join("\n\n");

    return `\n\n=== ATTACHED MEDIA ===\n${formatted}`;
  }
}
|
||||
|
||||
// Lazily created instance shared by all callers of getVisionService()
let visionService: VisionService | null = null;

/**
 * Return the shared `VisionService`, constructing it on the first call.
 *
 * @returns The same instance on every call after the first.
 */
export function getVisionService(): VisionService {
  const existing = visionService;
  if (existing) {
    return existing;
  }

  const created = new VisionService();
  visionService = created;
  return created;
}
|
||||
Reference in New Issue
Block a user