joel moment

This commit is contained in:
2026-02-01 19:28:30 +01:00
parent 032d25c9af
commit 09143a0638
8 changed files with 565 additions and 2 deletions

View File

@@ -84,3 +84,4 @@ export type { AiProvider, AiResponse, MessageStyle } from "./types";
export type { ToolContext, ToolCall, ToolResult } from "./tools";
export { JOEL_TOOLS, MEMORY_EXTRACTION_TOOLS } from "./tools";
export { getEmbeddingService, EmbeddingService } from "./embeddings";
export { getVisionService, VisionService, type Attachment, type VisionAnalysis } from "./vision";

315
src/services/ai/vision.ts Normal file
View File

@@ -0,0 +1,315 @@
/**
* Vision AI service for analyzing image and video attachments
* Uses a vision-capable model to describe attachments, which can then be passed to the main AI
*/
import OpenAI from "openai";
import { config } from "../../core/config";
import { createLogger } from "../../core/logger";
const logger = createLogger("AI:Vision");
/**
 * Supported attachment types for vision analysis
 */
export type AttachmentType = "image" | "video" | "unknown";

/**
 * Represents an attachment to be analyzed
 *
 * Mirrors the attachment fields this module actually reads: a fetchable URL,
 * a display name, the reported MIME type, and the byte size.
 */
export interface Attachment {
  // Publicly fetchable URL; passed straight through to the vision model
  url: string;
  // Original file name, used in logs and prompt labels
  name: string;
  // MIME type as reported by the source; null when unknown
  contentType: string | null;
  // Size in bytes, checked against the MAX_*_SIZE limits below
  size: number;
}

/**
 * Result of vision analysis
 */
export interface VisionAnalysis {
  // Model-generated description, or a bracketed placeholder when analysis failed
  description: string;
  // File name of the analyzed attachment
  attachmentName: string;
  // Which analyzer produced this result ("image" or "video")
  type: AttachmentType;
}

/**
 * Vision model to use for image analysis
 * Gemini 2.0 Flash supports both images and videos
 */
const VISION_MODEL = "google/gemini-2.0-flash-001";

/**
 * Maximum file sizes for analysis
 */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB for images
const MAX_VIDEO_SIZE = 50 * 1024 * 1024; // 50MB for videos (Gemini supports larger but Discord limits)

/**
 * Maximum video duration we'll attempt to analyze (in seconds)
 * Gemini can handle longer but we want quick responses
 * NOTE(review): not referenced anywhere in this file's visible code — confirm a caller uses it or remove
 */
const MAX_VIDEO_DURATION_HINT = 60; // 1 minute
/**
* Determine the type of attachment based on content type
*/
export function getAttachmentType(contentType: string | null): AttachmentType {
  // Optional chaining collapses the null check: a null contentType can never
  // match either prefix, so it falls through to "unknown" exactly as before.
  if (contentType?.startsWith("image/")) return "image";
  if (contentType?.startsWith("video/")) return "video";
  return "unknown";
}
/**
* Filter attachments to only include those we can analyze
*/
export function filterAnalyzableAttachments(attachments: Attachment[]): Attachment[] {
  // Keep only images/videos under their respective size caps; everything
  // else is dropped with a debug log explaining why.
  return attachments.filter((att) => {
    switch (getAttachmentType(att.contentType)) {
      case "image": {
        const withinLimit = att.size <= MAX_IMAGE_SIZE;
        if (!withinLimit) {
          logger.debug("Skipping large image", { name: att.name, size: att.size });
        }
        return withinLimit;
      }
      case "video": {
        const withinLimit = att.size <= MAX_VIDEO_SIZE;
        if (!withinLimit) {
          logger.debug("Skipping large video", { name: att.name, size: att.size });
        }
        return withinLimit;
      }
      default:
        logger.debug("Skipping unsupported attachment", { name: att.name, type: att.contentType });
        return false;
    }
  });
}
/**
* Analyze a single image attachment using vision AI
*/
/**
 * Analyze a single image attachment using vision AI.
 *
 * Sends the image URL plus an instruction prompt to the vision model and
 * returns the model's textual description. Failures are caught and reported
 * as a bracketed placeholder description, so callers never see a rejection.
 *
 * @param client - OpenAI-compatible client pointed at the vision provider
 * @param attachment - The image attachment to describe
 * @param context - Optional text from the user's message, woven into the prompt
 */
async function analyzeImage(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  // Runtime prompt text — the trailing context line is appended only when
  // the user's message supplied surrounding text.
  const systemPrompt = `You are analyzing an image attached to a Discord message.
Provide a concise but detailed description of what's in the image.
Include:
- Main subjects/objects
- Actions happening
- Text visible in the image (if any)
- Mood/tone of the image
- Any memes, jokes, or references you recognize
Keep your description to 2-3 sentences unless the image contains important text or complex content.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    const response = await client.chat.completions.create({
      model: VISION_MODEL,
      max_tokens: 300,
      temperature: 0.3,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              type: "image_url",
              // "auto" lets the model decide on resolution
              image_url: { url: attachment.url, detail: "auto" },
            },
          ],
        },
      ],
    });

    const description =
      response.choices[0]?.message?.content ?? "Unable to analyze image";
    logger.debug("Image analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });
    return { description, attachmentName: attachment.name, type: "image" };
  } catch (error) {
    logger.error("Failed to analyze image", { name: attachment.name, error });
    return {
      description: `[Image: ${attachment.name} - could not be analyzed]`,
      attachmentName: attachment.name,
      type: "image",
    };
  }
}
/**
* Analyze a video attachment using vision AI
* Gemini models support video analysis natively
*/
/**
 * Analyze a video attachment using vision AI.
 *
 * Gemini models accept video URLs through the same content-part shape as
 * images, so the video URL is passed directly and the provider fetches it.
 * Failures are caught and reported as a bracketed placeholder description.
 *
 * @param client - OpenAI-compatible client pointed at the vision provider
 * @param attachment - The video attachment to describe
 * @param context - Optional text from the user's message, woven into the prompt
 */
async function analyzeVideo(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  // Runtime prompt text — the trailing context line is appended only when
  // the user's message supplied surrounding text.
  const systemPrompt = `You are analyzing a video attached to a Discord message.
Provide a concise but detailed description of what happens in the video.
Include:
- What's shown/happening in the video
- Any people, characters, or notable objects
- The overall mood or tone
- Any text, speech, or audio content you can identify
- Memes, references, or jokes you recognize
- Key moments or the "punchline" if it's a funny video
Keep your description to 3-4 sentences. Focus on what makes this video interesting or shareworthy.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    // The URL must be publicly reachable for the provider to fetch it.
    const response = await client.chat.completions.create({
      model: VISION_MODEL,
      max_tokens: 400,
      temperature: 0.3,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              // Gemini accepts video URLs in the same format as images;
              // the model auto-detects the content type.
              type: "image_url",
              image_url: { url: attachment.url },
            },
          ],
        },
      ],
    });

    const description =
      response.choices[0]?.message?.content ?? "Unable to analyze video";
    logger.debug("Video analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });
    return { description, attachmentName: attachment.name, type: "video" };
  } catch (error) {
    logger.error("Failed to analyze video", { name: attachment.name, error });
    // Fallback that at least acknowledges the video exists.
    return {
      description: `[Video: ${attachment.name} - could not be analyzed. The user shared a video file.]`,
      attachmentName: attachment.name,
      type: "video",
    };
  }
}
/**
* Analyze a single attachment based on its type
*/
/**
 * Dispatch an attachment to the matching analyzer based on its content type.
 * Videos go to analyzeVideo; everything else (images, plus any "unknown"
 * that reaches this point) is handled by analyzeImage.
 */
async function analyzeAttachment(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  const isVideo = getAttachmentType(attachment.contentType) === "video";
  return isVideo
    ? analyzeVideo(client, attachment, context)
    : analyzeImage(client, attachment, context);
}
/**
* Vision service for analyzing attachments
*/
/**
 * Vision service for analyzing attachments.
 *
 * Wraps an OpenRouter-backed OpenAI client and exposes two operations:
 * analyzing a batch of attachments and formatting the results for a prompt.
 */
export class VisionService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      baseURL: "https://openrouter.ai/api/v1",
      apiKey: config.ai.openRouterApiKey,
      defaultHeaders: {
        "HTTP-Referer": "https://github.com/crunk-bun",
        "X-Title": "Joel Discord Bot - Vision",
      },
    });
  }

  /**
   * Analyze multiple attachments and return descriptions.
   * Unsupported/oversized attachments are filtered out first; the remaining
   * ones are analyzed in parallel.
   */
  async analyzeAttachments(
    attachments: Attachment[],
    messageContext?: string
  ): Promise<VisionAnalysis[]> {
    const eligible = filterAnalyzableAttachments(attachments);
    if (eligible.length === 0) return [];

    logger.debug("Analyzing attachments", {
      count: eligible.length,
      types: eligible.map(a => getAttachmentType(a.contentType))
    });

    // Each analyzer catches its own errors, so Promise.all cannot reject here.
    return Promise.all(
      eligible.map(att => analyzeAttachment(this.client, att, messageContext))
    );
  }

  /**
   * Format vision analysis results for inclusion in a prompt.
   * Single attachments get an "[Attached …]" label; multiple attachments are
   * numbered. Returns "" when there is nothing to format.
   */
  formatForPrompt(analyses: VisionAnalysis[]): string {
    if (analyses.length === 0) return "";

    const sections: string[] = [];
    analyses.forEach((analysis, index) => {
      const typeLabel = analysis.type === "video" ? "Video" : "Image";
      const header =
        analyses.length > 1
          ? `[Attachment ${index + 1} (${typeLabel}): ${analysis.attachmentName}]`
          : `[Attached ${typeLabel}: ${analysis.attachmentName}]`;
      sections.push(`${header}\n${analysis.description}`);
    });

    return `\n\n=== ATTACHED MEDIA ===\n${sections.join("\n\n")}`;
  }
}
// Singleton instance, created on first use
let visionService: VisionService | null = null;

/**
 * Lazily construct and return the shared VisionService instance.
 */
export function getVisionService(): VisionService {
  visionService ??= new VisionService();
  return visionService;
}