
Add image question and DALL-E generation tools - v0.2.0

master · Matteo Benedetto · 3 months ago · commit 51ab8b277a
4 changed files:
  1. README.md (41 lines changed)
  2. image_recognition_server/server.py (154 lines changed)
  3. pyproject.toml (4 lines changed)
  4. setup.py (4 lines changed)

README.md (41 lines changed)

@@ -77,15 +77,54 @@ Analyzes an image from a file path using OpenAI's GPT-4 Vision.
**Supported formats:** JPEG, PNG, GIF, WebP (automatically detected from file extension)
### 3. ask_image_question
Ask a specific question about an image using AI vision.
**Parameters:**
- `file_path` (string, required): Path to the image file
- `prompt` (string, required): The question or instruction about the image
**Returns:** AI response to the specific question about the image
**Example usage:** "What color is the car in this image?", "How many people are in this photo?", "What text is visible in this image?"
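For a quick local check, the tool function can also be exercised directly from Python. This is a minimal sketch, assuming the package is importable as `image_recognition_server` (the path used in this repository) and that `OPENAI_API_KEY` is exported; the file path and question are placeholders.
```python
# Hypothetical smoke test: call the tool function directly, bypassing MCP.
# Assumes @mcp.tool() leaves the underlying function callable and that
# OPENAI_API_KEY is set in the environment.
from image_recognition_server.server import ask_image_question

answer = ask_image_question(
    file_path="/path/to/photo.jpg",   # any JPEG, PNG, GIF, or WebP image
    prompt="What color is the car in this image?",
)
print(answer)  # AI answer, or an error/metadata string if no API key is configured
```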
### 4. generate_image_dalle
Generate images using OpenAI's DALL-E API.
**Parameters:**
- `prompt` (string, required): Description of the image to generate
- `size` (string, optional): Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
- `quality` (string, optional): Image quality - options: "standard", "hd" (default: "standard")
- `style` (string, optional): Image style - options: "vivid", "natural" (default: "vivid")
- `n` (integer, optional): Number of images to generate (1-10, default: 1)
**Returns:** Generated image URLs and metadata
**Example prompts:** "A futuristic city skyline at sunset", "A cute robot playing with a cat", "Abstract art with blue and gold colors"
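Beyond natural-language use in Kilocode (next section), the tool can also be invoked programmatically with the official MCP Python client SDK. The sketch below assumes the server can be launched with `python -m image_recognition_server.server`; adjust the command to match your installation method.
```python
# Hedged sketch: call generate_image_dalle over stdio using the MCP Python SDK.
# The launch command below is an assumption; use whatever starts this server
# in your setup (see Installation Methods).
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    server = StdioServerParameters(
        command="python", args=["-m", "image_recognition_server.server"]
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "generate_image_dalle",
                {
                    "prompt": "A futuristic city skyline at sunset",
                    "size": "1792x1024",
                    "quality": "hd",
                },
            )
            for item in result.content:
                # Text content items carry the formatted URL/metadata string
                print(getattr(item, "text", item))


asyncio.run(main())
```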
## Example Usage
Once configured in Kilocode with a valid OpenAI API key:
**Image Analysis:**
```
Can you analyze the image at /path/to/image.jpg?
```
The AI will use the `describe_image_from_file` tool to provide a detailed description.
**Ask Specific Questions:**
```
What color is the car in /path/to/photo.jpg?
How many people are visible in /path/to/group_photo.png?
What text can you read in /path/to/document.jpg?
```
**Generate Images:**
```
Generate an image: "A peaceful mountain landscape at sunrise"
Create a high-quality image of "A futuristic robot in a cyberpunk city" in 1792x1024 size
```
The AI will use the appropriate tools (`describe_image_from_file`, `ask_image_question`, or `generate_image_dalle`) to provide detailed responses.
## Installation Methods

image_recognition_server/server.py (154 lines changed)

@@ -127,6 +127,160 @@ def describe_image_from_file(file_path: str) -> str:
        logger.error(f"Error reading image file: {str(e)}", exc_info=True)
        return f"Error reading image file: {str(e)}"


@mcp.tool()
def ask_image_question(file_path: str, prompt: str) -> str:
    """
    Ask a specific question about an image using AI vision

    Args:
        file_path: Path to the image file
        prompt: The question or instruction about the image

    Returns:
        AI response to the question about the image
    """
    try:
        logger.debug(f"Asking question about image: {file_path}")
        logger.debug(f"Question: {prompt}")

        # Open and encode the image file
        with open(file_path, 'rb') as image_file:
            image_data = base64.b64encode(image_file.read()).decode('utf-8')

        # Determine MIME type from file extension
        mime_type = 'image/jpeg'
        if file_path.lower().endswith('.png'):
            mime_type = 'image/png'
        elif file_path.lower().endswith('.gif'):
            mime_type = 'image/gif'
        elif file_path.lower().endswith('.webp'):
            mime_type = 'image/webp'

        # Load image for basic metadata fallback
        image_bytes = base64.b64decode(image_data)
        image = Image.open(io.BytesIO(image_bytes))

        # If OpenAI is available, use Vision API with custom prompt
        if HAS_OPENAI:
            try:
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_data}"
                                    }
                                }
                            ]
                        }
                    ],
                    max_tokens=500
                )
                answer = response.choices[0].message.content
                logger.debug(f"OpenAI response: {answer}")
                return answer
            except Exception as e:
                logger.error(f"OpenAI API error: {str(e)}", exc_info=True)
                return f"OpenAI API error: {str(e)}\n\nNote: Configure OPENAI_API_KEY for AI-powered image analysis."

        # Return error if no OpenAI
        return f"AI image analysis not available. Please configure OPENAI_API_KEY.\n\nImage metadata:\n- Size: {image.size[0]}x{image.size[1]} pixels\n- Mode: {image.mode}\n- Format: {image.format or 'Unknown'}"

    except Exception as e:
        logger.error(f"Error processing image question: {str(e)}", exc_info=True)
        return f"Error processing image question: {str(e)}"


@mcp.tool()
def generate_image_dalle(prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", n: int = 1) -> str:
    """
    Generate an image using DALL-E API

    Args:
        prompt: Description of the image to generate
        size: Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
        quality: Image quality - options: "standard", "hd" (default: "standard")
        style: Image style - options: "vivid", "natural" (default: "vivid")
        n: Number of images to generate (1-10, default: 1)

    Returns:
        JSON response with generated image URLs and metadata
    """
    try:
        logger.debug(f"Generating image with DALL-E")
        logger.debug(f"Prompt: {prompt}")
        logger.debug(f"Size: {size}, Quality: {quality}, Style: {style}, Count: {n}")

        # Validate parameters
        valid_sizes = ["1024x1024", "1792x1024", "1024x1792"]
        if size not in valid_sizes:
            return f"Error: Invalid size '{size}'. Valid options: {', '.join(valid_sizes)}"

        valid_qualities = ["standard", "hd"]
        if quality not in valid_qualities:
            return f"Error: Invalid quality '{quality}'. Valid options: {', '.join(valid_qualities)}"

        valid_styles = ["vivid", "natural"]
        if style not in valid_styles:
            return f"Error: Invalid style '{style}'. Valid options: {', '.join(valid_styles)}"

        if not (1 <= n <= 10):
            return "Error: Number of images must be between 1 and 10"

        # Check if OpenAI is available
        if not HAS_OPENAI:
            return "Error: OpenAI API key not configured. Please set OPENAI_API_KEY to use DALL-E image generation."

        try:
            # Generate image using DALL-E 3
            response = openai.images.generate(
                model="dall-e-3",
                prompt=prompt,
                size=size,
                quality=quality,
                style=style,
                n=n
            )

            # Format response
            result = {
                "prompt": prompt,
                "parameters": {
                    "size": size,
                    "quality": quality,
                    "style": style,
                    "count": n
                },
                "images": []
            }

            for i, image_data in enumerate(response.data):
                result["images"].append({
                    "index": i + 1,
                    "url": image_data.url,
                    "revised_prompt": getattr(image_data, 'revised_prompt', None)
                })

            logger.info(f"Successfully generated {len(response.data)} image(s)")
            return f"Successfully generated {len(response.data)} image(s):\n\n" + "\n".join([
                f"Image {img['index']}:\n URL: {img['url']}\n Revised prompt: {img['revised_prompt'] or 'N/A'}"
                for img in result["images"]
            ])

        except Exception as e:
            logger.error(f"DALL-E API error: {str(e)}", exc_info=True)
            return f"DALL-E API error: {str(e)}"

    except Exception as e:
        logger.error(f"Error generating image: {str(e)}", exc_info=True)
        return f"Error generating image: {str(e)}"


def main():
    """Main entry point for the MCP server."""
    logger.info("Starting MCP Image Recognition Server")

pyproject.toml (4 lines changed)

@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "image-recognition-mcp"
-version = "0.1.0"
-description = "An MCP server for AI-powered image analysis using OpenAI Vision API"
+version = "0.2.0"
+description = "An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E"
 readme = "README.md"
 requires-python = ">=3.8"
 license = {text = "MIT"}

setup.py (4 lines changed)

@@ -21,10 +21,10 @@ class PostInstallCommand(install):
 setup(
     name="image-recognition-mcp",
-    version="0.1.0",
+    version="0.2.0",
     author="Your Name",
     author_email="your.email@example.com",
-    description="An MCP server for AI-powered image analysis using OpenAI Vision API",
+    description="An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/yourusername/image-recognition-mcp",
