@@ -127,6 +127,160 @@ def describe_image_from_file(file_path: str) -> str:
        logger.error(f"Error reading image file: {str(e)}", exc_info=True)
        return f"Error reading image file: {str(e)}"


@mcp.tool()
def ask_image_question(file_path: str, prompt: str) -> str:
    """
    Ask a specific question about an image using AI vision

    Args:
        file_path: Path to the image file
        prompt: The question or instruction about the image

    Returns:
        AI response to the question about the image
    """
    try:
        logger.debug(f"Asking question about image: {file_path}")
        logger.debug(f"Question: {prompt}")

        # Open and encode the image file
        with open(file_path, 'rb') as image_file:
            image_data = base64.b64encode(image_file.read()).decode('utf-8')

        # Determine MIME type from file extension
        mime_type = 'image/jpeg'
        if file_path.lower().endswith('.png'):
            mime_type = 'image/png'
        elif file_path.lower().endswith('.gif'):
            mime_type = 'image/gif'
        elif file_path.lower().endswith('.webp'):
            mime_type = 'image/webp'
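
        # A more general alternative here would be the standard library's
        # mimetypes module, which handles any extension and can fall back to
        # JPEG when the type is unknown (sketch only, not used below):
        #   import mimetypes
        #   mime_type = mimetypes.guess_type(file_path)[0] or 'image/jpeg'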

        # Load image for basic metadata fallback
        image_bytes = base64.b64decode(image_data)
        image = Image.open(io.BytesIO(image_bytes))
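
        # The base64 round-trip above (encode, then decode again for PIL) could be
        # avoided by reading the file bytes once and reusing them (sketch only):
        #   raw = open(file_path, 'rb').read()
        #   image_data = base64.b64encode(raw).decode('utf-8')
        #   image = Image.open(io.BytesIO(raw))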

        # If OpenAI is available, use Vision API with custom prompt
        if HAS_OPENAI:
            try:
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_data}"
                                    }
                                }
                            ]
                        }
                    ],
                    max_tokens=500
                )

                answer = response.choices[0].message.content
                logger.debug(f"OpenAI response: {answer}")
                return answer

            except Exception as e:
                logger.error(f"OpenAI API error: {str(e)}", exc_info=True)
                return f"OpenAI API error: {str(e)}\n\nNote: Configure OPENAI_API_KEY for AI-powered image analysis."

        # Fall back to basic image metadata when OpenAI is not configured
        return f"AI image analysis not available. Please configure OPENAI_API_KEY.\n\nImage metadata:\n- Size: {image.size[0]}x{image.size[1]} pixels\n- Mode: {image.mode}\n- Format: {image.format or 'Unknown'}"

    except Exception as e:
        logger.error(f"Error processing image question: {str(e)}", exc_info=True)
        return f"Error processing image question: {str(e)}"


@mcp.tool()
def generate_image_dalle(prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", n: int = 1) -> str:
    """
    Generate an image using DALL-E API

    Args:
        prompt: Description of the image to generate
        size: Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
        quality: Image quality - options: "standard", "hd" (default: "standard")
        style: Image style - options: "vivid", "natural" (default: "vivid")
        n: Number of images to generate (1-10, default: 1)

    Returns:
        Formatted summary of the generated image URLs and metadata
    """
    try:
        logger.debug("Generating image with DALL-E")
        logger.debug(f"Prompt: {prompt}")
        logger.debug(f"Size: {size}, Quality: {quality}, Style: {style}, Count: {n}")

        # Validate parameters
        valid_sizes = ["1024x1024", "1792x1024", "1024x1792"]
        if size not in valid_sizes:
            return f"Error: Invalid size '{size}'. Valid options: {', '.join(valid_sizes)}"

        valid_qualities = ["standard", "hd"]
        if quality not in valid_qualities:
            return f"Error: Invalid quality '{quality}'. Valid options: {', '.join(valid_qualities)}"

        valid_styles = ["vivid", "natural"]
        if style not in valid_styles:
            return f"Error: Invalid style '{style}'. Valid options: {', '.join(valid_styles)}"

        if not (1 <= n <= 10):
            return "Error: Number of images must be between 1 and 10"
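
        # Caveat: the dall-e-3 model requested below accepts only n=1 at the API
        # level, so values above 1 that pass this check may still be rejected.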

        # Check if OpenAI is available
        if not HAS_OPENAI:
            return "Error: OpenAI API key not configured. Please set OPENAI_API_KEY to use DALL-E image generation."

        try:
            # Generate image using DALL-E 3
            response = openai.images.generate(
                model="dall-e-3",
                prompt=prompt,
                size=size,
                quality=quality,
                style=style,
                n=n
            )

            # Format response
            result = {
                "prompt": prompt,
                "parameters": {
                    "size": size,
                    "quality": quality,
                    "style": style,
                    "count": n
                },
                "images": []
            }

            for i, image_data in enumerate(response.data):
                result["images"].append({
                    "index": i + 1,
                    "url": image_data.url,
                    "revised_prompt": getattr(image_data, 'revised_prompt', None)
                })
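
            # If a machine-readable payload is preferred over the text summary
            # returned below, the assembled dict could be serialized instead
            # (sketch only):
            #   import json
            #   return json.dumps(result, indent=2)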
logger.info(f"Successfully generated {len(response.data)} image(s)") |
|
|
|
|
|
|
|
return f"Successfully generated {len(response.data)} image(s):\n\n" + "\n".join([ |
|
|
|
|
|
|
|
f"Image {img['index']}:\n URL: {img['url']}\n Revised prompt: {img['revised_prompt'] or 'N/A'}" |
|
|
|
|
|
|
|
for img in result["images"] |
|
|
|
|
|
|
|
]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
logger.error(f"DALL-E API error: {str(e)}", exc_info=True) |
|
|
|
|
|
|
|
return f"DALL-E API error: {str(e)}" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
|
|
logger.error(f"Error generating image: {str(e)}", exc_info=True) |
|
|
|
|
|
|
|
return f"Error generating image: {str(e)}" |
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Main entry point for the MCP server."""
    logger.info("Starting MCP Image Recognition Server")