## CHANGES

- Fixed image attachment support for Codex and OpenCode agents 🖼️
- Added file-based image arguments for non-stream agents 📁
- Implemented temp file creation for image attachments 💾
- Enhanced integration tests with image upload capabilities 🧪
- Improved cleanup of temporary image files on exit 🧹
- Fixed lightbox keyboard navigation after image deletion ⌨️
- Repositioned thinking status dropdown to prevent overflow 📍
- Added proper image argument builders for each agent 🔧
- Streamlined image handling across different agent types 🎯
- Enhanced process manager with multi-image support 🚀
2026-03-10 08:31:19 +00:00 · 2025-12-19 13:55:07 -06:00
parent 55e0ad51e2
commit f634dc43ef
6 changed files with 163 additions and 6 deletions
--- a/src/tests/integration/provider-integration.test.ts
+++ b/src/tests/integration/provider-integration.test.ts
@@ -26,10 +26,16 @@ import { describe, it, expect, beforeAll, afterAll } from 'vitest';
 import { spawn, ChildProcess } from 'child_process';
 import { promisify } from 'util';
 import { exec } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
 import { getAgentCapabilities } from '../../main/agent-capabilities';

 const execAsync = promisify(exec);

+// Path to test image fixture
+const TEST_IMAGE_PATH = path.join(__dirname, '../fixtures/maestro-test-image.png');
+
 // Skip integration tests by default - they make real API calls and may incur costs.
 // Set RUN_INTEGRATION_TESTS=true to enable them.
 const SKIP_INTEGRATION = process.env.RUN_INTEGRATION_TESTS !== 'true';
@@ -55,6 +61,10 @@ interface ProviderConfig {
   * - process-manager.ts (--input-format stream-json for images)
   */
  buildInitialArgs: (prompt: string, options?: { images?: string[] }) => string[];
+  /** Build args for message with image (file path) - for agents that use file-based image args */
+  buildImageArgs?: (prompt: string, imagePath: string) => string[];
+  /** Build stdin content for stream-json mode (for Claude Code) */
+  buildStreamJsonInput?: (prompt: string, imageBase64: string, mediaType: string) => string;
  /** Build args for follow-up message (with session) */
  buildResumeArgs: (sessionId: string, prompt: string) => string[];
  /** Parse session ID from output */
@@ -131,6 +141,33 @@ const PROVIDERS: ProviderConfig[] = [
    isSuccessful: (output: string, exitCode: number) => {
      return exitCode === 0;
    },
+    /**
+     * Build stream-json input for Claude Code with image.
+     * This mirrors buildStreamJsonMessage() in process-manager.ts
+     */
+    buildStreamJsonInput: (prompt: string, imageBase64: string, mediaType: string) => {
+      const message = {
+        type: 'user',
+        message: {
+          role: 'user',
+          content: [
+            {
+              type: 'image',
+              source: {
+                type: 'base64',
+                media_type: mediaType,
+                data: imageBase64,
+              },
+            },
+            {
+              type: 'text',
+              text: prompt,
+            },
+          ],
+        },
+      };
+      return JSON.stringify(message);
+    },
  },
  {
    name: 'Codex',
@@ -232,6 +269,20 @@ const PROVIDERS: ProviderConfig[] = [
      }
      return false;
    },
+    /**
+     * Build args with image file path for Codex.
+     * Mirrors agent-detector.ts: imageArgs: (imagePath) => ['-i', imagePath]
+     */
+    buildImageArgs: (prompt: string, imagePath: string) => [
+      'exec',
+      '--dangerously-bypass-approvals-and-sandbox',
+      '--skip-git-repo-check',
+      '--json',
+      '-C', TEST_CWD,
+      '-i', imagePath,
+      '--',
+      prompt,
+    ],
  },
  {
    name: 'OpenCode',
@@ -308,6 +359,17 @@ const PROVIDERS: ProviderConfig[] = [
    isSuccessful: (output: string, exitCode: number) => {
      return exitCode === 0;
    },
+    /**
+     * Build args with image file path for OpenCode.
+     * Mirrors agent-detector.ts: imageArgs: (imagePath) => ['-f', imagePath]
+     */
+    buildImageArgs: (prompt: string, imagePath: string) => [
+      'run',
+      '--format', 'json',
+      '-f', imagePath,
+      '--',
+      prompt,
+    ],
  },
 ];

@@ -325,11 +387,13 @@ async function isProviderAvailable(provider: ProviderConfig): Promise<boolean> {

 /**
 * Run a provider command and capture output
+ * @param stdinContent - Optional content written to stdin (with a trailing newline appended) before stdin is closed (for stream-json mode)
 */
 function runProvider(
  provider: ProviderConfig,
  args: string[],
-  cwd: string = TEST_CWD
+  cwd: string = TEST_CWD,
+  stdinContent?: string
 ): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  return new Promise((resolve) => {
    let stdout = '';
@@ -342,7 +406,11 @@ function runProvider(
      stdio: ['pipe', 'pipe', 'pipe'],
    });

-    // Close stdin immediately to signal EOF (prevents processes waiting for input)
+    // If we have stdin content, write it and then close
+    if (stdinContent) {
+      proc.stdin?.write(stdinContent + '\n');
+    }
+    // Close stdin to signal EOF (prevents processes waiting for input)
    proc.stdin?.end();

    proc.stdout?.on('data', (data) => {
--- a/src/main/agent-detector.ts
+++ b/src/main/agent-detector.ts
@@ -42,6 +42,7 @@ export interface AgentConfig {
  modelArgs?: (modelId: string) => string[]; // Function to build model selection args (e.g., ['--model', modelId])
  yoloModeArgs?: string[]; // Args for YOLO/full-access mode (e.g., ['--dangerously-bypass-approvals-and-sandbox'])
  workingDirArgs?: (dir: string) => string[]; // Function to build working directory args (e.g., ['-C', dir])
+  imageArgs?: (imagePath: string) => string[]; // Function to build image attachment args (e.g., ['-i', imagePath] for Codex)
 }

 const AGENT_DEFINITIONS: Omit<AgentConfig, 'available' | 'path' | 'capabilities'>[] = [
@@ -83,6 +84,7 @@ const AGENT_DEFINITIONS: Omit<AgentConfig, 'available' | 'path' | 'capabilities'
    readOnlyArgs: ['--sandbox', 'read-only'], // Read-only/plan mode
    yoloModeArgs: ['--dangerously-bypass-approvals-and-sandbox'], // Full access mode
    workingDirArgs: (dir: string) => ['-C', dir], // Set working directory
+    imageArgs: (imagePath: string) => ['-i', imagePath], // Image attachment: codex exec -i /path/to/image.png
    // Agent-specific configuration options shown in UI
    configOptions: [
      {
@@ -123,6 +125,7 @@ const AGENT_DEFINITIONS: Omit<AgentConfig, 'available' | 'path' | 'capabilities'
    readOnlyArgs: ['--agent', 'plan'], // Read-only/plan mode
    modelArgs: (modelId: string) => ['--model', modelId], // Model selection (e.g., 'ollama/qwen3:8b')
    yoloModeArgs: ['run'], // 'run' subcommand auto-approves all permissions (YOLO mode is implicit)
+    imageArgs: (imagePath: string) => ['-f', imagePath], // Image/file attachment: opencode run -f /path/to/image.png
    // Agent-specific configuration options shown in UI
    configOptions: [
      {
--- a/src/main/ipc/handlers/process.ts
+++ b/src/main/ipc/handlers/process.ts
@@ -293,6 +293,7 @@ export function registerProcessHandlers(deps: ProcessHandlerDependencies): void
        shellEnvVars: shellEnvVars,      // Shell-specific env vars (for terminal sessions)
        contextWindow, // Pass configured context window to process manager
        customEnvVars: effectiveCustomEnvVars, // Pass custom env vars (session-level or agent-level)
+        imageArgs: agent?.imageArgs,     // Function to build image CLI args (for Codex, OpenCode)
      });

      logger.info(`Process spawned successfully`, LOG_CONTEXT, {
--- a/src/main/process-manager.ts
+++ b/src/main/process-manager.ts
@@ -1,6 +1,9 @@
 import { spawn, ChildProcess } from 'child_process';
 import { EventEmitter } from 'events';
 import * as pty from 'node-pty';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
 import { stripControlSequences, stripAllAnsiCodes } from './utils/terminalFilter';
 import { logger } from './utils/logger';
 import { getOutputParser, type ParsedEvent, type AgentOutputParser } from './parsers';
@@ -50,7 +53,8 @@ interface ProcessConfig {
  shell?: string; // Shell to use for terminal sessions (e.g., 'zsh', 'bash', 'fish', or full path)
  shellArgs?: string; // Additional CLI arguments for shell sessions (e.g., '--login')
  shellEnvVars?: Record<string, string>; // Environment variables for shell sessions
-  images?: string[]; // Base64 data URLs for images (passed via stream-json input)
+  images?: string[]; // Base64 data URLs for images (passed via stream-json input or file args)
+  imageArgs?: (imagePath: string) => string[]; // Function to build image CLI args (e.g., ['-i', path] for Codex)
  contextWindow?: number; // Configured context window size (0 or undefined = not configured, hide UI)
  customEnvVars?: Record<string, string>; // Custom environment variables from user configuration
 }
@@ -76,6 +80,7 @@ interface ManagedProcess {
  stdoutBuffer?: string; // Buffer for accumulating stdout output (for error detection at exit)
  streamedText?: string; // Buffer for accumulating streamed text from partial events (OpenCode, Codex)
  contextWindow?: number; // Configured context window size (0 or undefined = not configured)
+  tempImageFiles?: string[]; // Temp files to clean up when process exits (for file-based image args)
 }

 /**
@@ -138,6 +143,50 @@ function buildStreamJsonMessage(prompt: string, images: string[]): string {
  return JSON.stringify(message);
 }

+/**
+ * Save a base64 data URL image to a temp file.
+ * Returns the full path to the temp file, or null if the data URL cannot be parsed or the file cannot be written.
+ */
+function saveImageToTempFile(dataUrl: string, index: number): string | null {
+  const parsed = parseDataUrl(dataUrl);
+  if (!parsed) {
+    logger.warn('[ProcessManager] Failed to parse data URL for temp file', 'ProcessManager');
+    return null;
+  }
+
+  // Determine file extension from the media type's subtype (falls back to 'png' when the subtype is missing)
+  const ext = parsed.mediaType.split('/')[1] || 'png';
+  const filename = `maestro-image-${Date.now()}-${index}.${ext}`;
+  const tempPath = path.join(os.tmpdir(), filename);
+
+  try {
+    // Convert base64 to buffer and write to file
+    const buffer = Buffer.from(parsed.base64, 'base64');
+    fs.writeFileSync(tempPath, buffer);
+    logger.debug('[ProcessManager] Saved image to temp file', 'ProcessManager', { tempPath, size: buffer.length });
+    return tempPath;
+  } catch (error) {
+    logger.error('[ProcessManager] Failed to save image to temp file', 'ProcessManager', { error: String(error) });
+    return null;
+  }
+}
+
+/**
+ * Delete the given temp image files if they still exist; a failed delete is logged as a warning rather than thrown.
+ */
+function cleanupTempFiles(files: string[]): void {
+  for (const file of files) {
+    try {
+      if (fs.existsSync(file)) {
+        fs.unlinkSync(file);
+        logger.debug('[ProcessManager] Cleaned up temp file', 'ProcessManager', { file });
+      }
+    } catch (error) {
+      logger.warn('[ProcessManager] Failed to clean up temp file', 'ProcessManager', { file, error: String(error) });
+    }
+  }
+}
+
 export class ProcessManager extends EventEmitter {
  private processes: Map<string, ManagedProcess> = new Map();

@@ -145,18 +194,38 @@ export class ProcessManager extends EventEmitter {
   * Spawn a new process for a session
   */
  spawn(config: ProcessConfig): { pid: number; success: boolean } {
-    const { sessionId, toolType, cwd, command, args, requiresPty, prompt, shell, shellArgs, shellEnvVars, images, contextWindow, customEnvVars } = config;
+    const { sessionId, toolType, cwd, command, args, requiresPty, prompt, shell, shellArgs, shellEnvVars, images, imageArgs, contextWindow, customEnvVars } = config;

    // For batch mode with images, use stream-json mode and send message via stdin
    // For batch mode without images, append prompt to args with -- separator
    const hasImages = images && images.length > 0;
    const capabilities = getAgentCapabilities(toolType);
    let finalArgs: string[];
+    let tempImageFiles: string[] = [];

    if (hasImages && prompt && capabilities.supportsStreamJsonInput) {
      // For agents that support stream-json input (like Claude Code), add the flag
      // The prompt will be sent via stdin as a JSON message with image data
      finalArgs = [...args, '--input-format', 'stream-json'];
+    } else if (hasImages && prompt && imageArgs) {
+      // For agents that use file-based image args (like Codex, OpenCode),
+      // save images to temp files and add CLI args
+      finalArgs = [...args]; // Start with base args
+      tempImageFiles = [];
+      for (let i = 0; i < images.length; i++) {
+        const tempPath = saveImageToTempFile(images[i], i);
+        if (tempPath) {
+          tempImageFiles.push(tempPath);
+          finalArgs = [...finalArgs, ...imageArgs(tempPath)];
+        }
+      }
+      // Add the prompt at the end, after a -- separator so it is treated as a positional arg
+      finalArgs = [...finalArgs, '--', prompt];
+      logger.debug('[ProcessManager] Using file-based image args', 'ProcessManager', {
+        sessionId,
+        imageCount: images.length,
+        tempFiles: tempImageFiles,
+      });
    } else if (prompt) {
      // Regular batch mode - prompt as CLI arg
      // The -- ensures prompt is treated as positional arg, not a flag (even if it starts with --)
@@ -170,6 +239,8 @@ export class ProcessManager extends EventEmitter {
      toolType,
      hasPrompt: !!prompt,
      hasImages,
+      hasImageArgs: !!imageArgs,
+      tempImageFilesCount: tempImageFiles.length,
      promptValue: prompt,
      baseArgs: args,
      finalArgs
@@ -402,6 +473,7 @@ export class ProcessManager extends EventEmitter {
          stderrBuffer: '', // Initialize stderr buffer for error detection at exit
          stdoutBuffer: '', // Initialize stdout buffer for error detection at exit
          contextWindow, // User-configured context window size (0 = not configured)
+          tempImageFiles: tempImageFiles.length > 0 ? tempImageFiles : undefined, // Temp files to clean up on exit
        };

        this.processes.set(sessionId, managedProcess);
@@ -681,6 +753,11 @@ export class ProcessManager extends EventEmitter {
            }
          }

+          // Clean up temp image files if any
+          if (managedProcess.tempImageFiles && managedProcess.tempImageFiles.length > 0) {
+            cleanupTempFiles(managedProcess.tempImageFiles);
+          }
+
          this.emit('exit', sessionId, code || 0);
          this.processes.delete(sessionId);
        });
@@ -705,6 +782,11 @@ export class ProcessManager extends EventEmitter {
            this.emit('agent-error', sessionId, agentError);
          }

+          // Clean up temp image files if any
+          if (managedProcess.tempImageFiles && managedProcess.tempImageFiles.length > 0) {
+            cleanupTempFiles(managedProcess.tempImageFiles);
+          }
+
          this.emit('data', sessionId, `[error] ${error.message}`);
          this.emit('exit', sessionId, 1); // Ensure exit is emitted on error
          this.processes.delete(sessionId);
--- a/src/renderer/components/LightboxModal.tsx
+++ b/src/renderer/components/LightboxModal.tsx
@@ -120,6 +120,9 @@ export function LightboxModal({ image, stagedImages, onClose, onNavigate, onDele
      const newList = stagedImages.filter(img => img !== image);
      onNavigate(newList[currentIndex]);
    }
+
+    // Refocus the lightbox after deletion so keyboard navigation continues working
+    setTimeout(() => lightboxRef.current?.focus(), 0);
  }, [image, stagedImages, currentIndex, onDelete, onNavigate, onClose]);

  // Default theme for ConfirmModal if not provided
--- a/src/renderer/components/ThinkingStatusPill.tsx
+++ b/src/renderer/components/ThinkingStatusPill.tsx
@@ -447,9 +447,9 @@ function ThinkingStatusPillInner({ sessions, theme, onSessionClick, namedSession
              </span>
            </div>

-            {/* Expanded dropdown - uses padding to create hover bridge between trigger and dropdown */}
+            {/* Expanded dropdown - positioned above to avoid going off-screen */}
            {isExpanded && (
-              <div className="absolute right-0 top-full pt-1 z-50">
+              <div className="absolute right-0 bottom-full pb-1 z-50">
                <div
                  className="min-w-[320px] rounded-lg shadow-xl overflow-hidden"
                  style={{