joel moment

This commit is contained in:
2026-02-01 19:28:30 +01:00
parent 032d25c9af
commit 09143a0638
8 changed files with 565 additions and 2 deletions

View File

@@ -84,3 +84,4 @@ export type { AiProvider, AiResponse, MessageStyle } from "./types";
export type { ToolContext, ToolCall, ToolResult } from "./tools";
export { JOEL_TOOLS, MEMORY_EXTRACTION_TOOLS } from "./tools";
export { getEmbeddingService, EmbeddingService } from "./embeddings";
export { getVisionService, VisionService, type Attachment, type VisionAnalysis } from "./vision";

315
src/services/ai/vision.ts Normal file
View File

@@ -0,0 +1,315 @@
/**
* Vision AI service for analyzing image and video attachments
* Uses a vision-capable model to describe attachments, which can then be passed to the main AI
*/
import OpenAI from "openai";
import { config } from "../../core/config";
import { createLogger } from "../../core/logger";
const logger = createLogger("AI:Vision");
/**
 * Supported attachment types for vision analysis
 */
export type AttachmentType = "image" | "video" | "unknown";

/**
 * Represents an attachment to be analyzed
 *
 * Mirrors the attachment fields this module actually reads: a fetchable URL,
 * a display name, the reported MIME type, and the byte size.
 */
export interface Attachment {
  // Publicly fetchable URL; passed straight through to the vision model
  url: string;
  // Original file name, used in logs and prompt labels
  name: string;
  // MIME type as reported by the source; null when unknown
  contentType: string | null;
  // Size in bytes, checked against the MAX_*_SIZE limits below
  size: number;
}

/**
 * Result of vision analysis
 */
export interface VisionAnalysis {
  // Model-generated description, or a bracketed placeholder when analysis failed
  description: string;
  // File name of the analyzed attachment
  attachmentName: string;
  // Which analyzer produced this result ("image" or "video")
  type: AttachmentType;
}

/**
 * Vision model to use for image analysis
 * Gemini 2.0 Flash supports both images and videos
 */
const VISION_MODEL = "google/gemini-2.0-flash-001";

/**
 * Maximum file sizes for analysis
 */
const MAX_IMAGE_SIZE = 10 * 1024 * 1024; // 10MB for images
const MAX_VIDEO_SIZE = 50 * 1024 * 1024; // 50MB for videos (Gemini supports larger but Discord limits)

/**
 * Maximum video duration we'll attempt to analyze (in seconds)
 * Gemini can handle longer but we want quick responses
 * NOTE(review): not referenced anywhere in this file's visible code — confirm a caller uses it or remove
 */
const MAX_VIDEO_DURATION_HINT = 60; // 1 minute
/**
* Determine the type of attachment based on content type
*/
export function getAttachmentType(contentType: string | null): AttachmentType {
  // Optional chaining collapses the null check: a null contentType can never
  // match either prefix, so it falls through to "unknown" exactly as before.
  if (contentType?.startsWith("image/")) return "image";
  if (contentType?.startsWith("video/")) return "video";
  return "unknown";
}
/**
* Filter attachments to only include those we can analyze
*/
export function filterAnalyzableAttachments(attachments: Attachment[]): Attachment[] {
  // Keep only images/videos under their respective size caps; everything
  // else is dropped with a debug log explaining why.
  return attachments.filter((att) => {
    switch (getAttachmentType(att.contentType)) {
      case "image": {
        const withinLimit = att.size <= MAX_IMAGE_SIZE;
        if (!withinLimit) {
          logger.debug("Skipping large image", { name: att.name, size: att.size });
        }
        return withinLimit;
      }
      case "video": {
        const withinLimit = att.size <= MAX_VIDEO_SIZE;
        if (!withinLimit) {
          logger.debug("Skipping large video", { name: att.name, size: att.size });
        }
        return withinLimit;
      }
      default:
        logger.debug("Skipping unsupported attachment", { name: att.name, type: att.contentType });
        return false;
    }
  });
}
/**
* Analyze a single image attachment using vision AI
*/
/**
 * Analyze a single image attachment using vision AI.
 *
 * Sends the image URL plus an instruction prompt to the vision model and
 * returns the model's textual description. Failures are caught and reported
 * as a bracketed placeholder description, so callers never see a rejection.
 *
 * @param client - OpenAI-compatible client pointed at the vision provider
 * @param attachment - The image attachment to describe
 * @param context - Optional text from the user's message, woven into the prompt
 */
async function analyzeImage(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  // Runtime prompt text — the trailing context line is appended only when
  // the user's message supplied surrounding text.
  const systemPrompt = `You are analyzing an image attached to a Discord message.
Provide a concise but detailed description of what's in the image.
Include:
- Main subjects/objects
- Actions happening
- Text visible in the image (if any)
- Mood/tone of the image
- Any memes, jokes, or references you recognize
Keep your description to 2-3 sentences unless the image contains important text or complex content.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    const response = await client.chat.completions.create({
      model: VISION_MODEL,
      max_tokens: 300,
      temperature: 0.3,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              type: "image_url",
              // "auto" lets the model decide on resolution
              image_url: { url: attachment.url, detail: "auto" },
            },
          ],
        },
      ],
    });

    const description =
      response.choices[0]?.message?.content ?? "Unable to analyze image";
    logger.debug("Image analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });
    return { description, attachmentName: attachment.name, type: "image" };
  } catch (error) {
    logger.error("Failed to analyze image", { name: attachment.name, error });
    return {
      description: `[Image: ${attachment.name} - could not be analyzed]`,
      attachmentName: attachment.name,
      type: "image",
    };
  }
}
/**
* Analyze a video attachment using vision AI
* Gemini models support video analysis natively
*/
/**
 * Analyze a video attachment using vision AI.
 *
 * Gemini models accept video URLs through the same content-part shape as
 * images, so the video URL is passed directly and the provider fetches it.
 * Failures are caught and reported as a bracketed placeholder description.
 *
 * @param client - OpenAI-compatible client pointed at the vision provider
 * @param attachment - The video attachment to describe
 * @param context - Optional text from the user's message, woven into the prompt
 */
async function analyzeVideo(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  // Runtime prompt text — the trailing context line is appended only when
  // the user's message supplied surrounding text.
  const systemPrompt = `You are analyzing a video attached to a Discord message.
Provide a concise but detailed description of what happens in the video.
Include:
- What's shown/happening in the video
- Any people, characters, or notable objects
- The overall mood or tone
- Any text, speech, or audio content you can identify
- Memes, references, or jokes you recognize
- Key moments or the "punchline" if it's a funny video
Keep your description to 3-4 sentences. Focus on what makes this video interesting or shareworthy.
${context ? `\nContext from the user's message: "${context}"` : ""}`;

  try {
    // The URL must be publicly reachable for the provider to fetch it.
    const response = await client.chat.completions.create({
      model: VISION_MODEL,
      max_tokens: 400,
      temperature: 0.3,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: systemPrompt },
            {
              // Gemini accepts video URLs in the same format as images;
              // the model auto-detects the content type.
              type: "image_url",
              image_url: { url: attachment.url },
            },
          ],
        },
      ],
    });

    const description =
      response.choices[0]?.message?.content ?? "Unable to analyze video";
    logger.debug("Video analyzed", {
      name: attachment.name,
      descriptionLength: description.length,
    });
    return { description, attachmentName: attachment.name, type: "video" };
  } catch (error) {
    logger.error("Failed to analyze video", { name: attachment.name, error });
    // Fallback that at least acknowledges the video exists.
    return {
      description: `[Video: ${attachment.name} - could not be analyzed. The user shared a video file.]`,
      attachmentName: attachment.name,
      type: "video",
    };
  }
}
/**
* Analyze a single attachment based on its type
*/
/**
 * Dispatch an attachment to the matching analyzer based on its content type.
 * Videos go to analyzeVideo; everything else (images, plus any "unknown"
 * that reaches this point) is handled by analyzeImage.
 */
async function analyzeAttachment(
  client: OpenAI,
  attachment: Attachment,
  context?: string
): Promise<VisionAnalysis> {
  const isVideo = getAttachmentType(attachment.contentType) === "video";
  return isVideo
    ? analyzeVideo(client, attachment, context)
    : analyzeImage(client, attachment, context);
}
/**
* Vision service for analyzing attachments
*/
/**
 * Vision service for analyzing attachments.
 *
 * Wraps an OpenRouter-backed OpenAI client and exposes two operations:
 * analyzing a batch of attachments and formatting the results for a prompt.
 */
export class VisionService {
  private client: OpenAI;

  constructor() {
    this.client = new OpenAI({
      baseURL: "https://openrouter.ai/api/v1",
      apiKey: config.ai.openRouterApiKey,
      defaultHeaders: {
        "HTTP-Referer": "https://github.com/crunk-bun",
        "X-Title": "Joel Discord Bot - Vision",
      },
    });
  }

  /**
   * Analyze multiple attachments and return descriptions.
   * Unsupported/oversized attachments are filtered out first; the remaining
   * ones are analyzed in parallel.
   */
  async analyzeAttachments(
    attachments: Attachment[],
    messageContext?: string
  ): Promise<VisionAnalysis[]> {
    const eligible = filterAnalyzableAttachments(attachments);
    if (eligible.length === 0) return [];

    logger.debug("Analyzing attachments", {
      count: eligible.length,
      types: eligible.map(a => getAttachmentType(a.contentType))
    });

    // Each analyzer catches its own errors, so Promise.all cannot reject here.
    return Promise.all(
      eligible.map(att => analyzeAttachment(this.client, att, messageContext))
    );
  }

  /**
   * Format vision analysis results for inclusion in a prompt.
   * Single attachments get an "[Attached …]" label; multiple attachments are
   * numbered. Returns "" when there is nothing to format.
   */
  formatForPrompt(analyses: VisionAnalysis[]): string {
    if (analyses.length === 0) return "";

    const sections: string[] = [];
    analyses.forEach((analysis, index) => {
      const typeLabel = analysis.type === "video" ? "Video" : "Image";
      const header =
        analyses.length > 1
          ? `[Attachment ${index + 1} (${typeLabel}): ${analysis.attachmentName}]`
          : `[Attached ${typeLabel}: ${analysis.attachmentName}]`;
      sections.push(`${header}\n${analysis.description}`);
    });

    return `\n\n=== ATTACHED MEDIA ===\n${sections.join("\n\n")}`;
  }
}
// Singleton instance, created on first use
let visionService: VisionService | null = null;

/**
 * Lazily construct and return the shared VisionService instance.
 */
export function getVisionService(): VisionService {
  visionService ??= new VisionService();
  return visionService;
}