diff --git a/README.md b/README.md
index 6969303..9ac0376 100644
--- a/README.md
+++ b/README.md
@@ -77,15 +77,54 @@ Analyzes an image from a file path using OpenAI's GPT-4 Vision.
 
 **Supported formats:** JPEG, PNG, GIF, WebP (automatically detected from file extension)
 
+### 3. ask_image_question
+Ask a specific question about an image using AI vision.
+
+**Parameters:**
+- `file_path` (string, required): Path to the image file
+- `prompt` (string, required): The question or instruction about the image
+
+**Returns:** AI response to the specific question about the image
+
+**Example usage:** "What color is the car in this image?", "How many people are in this photo?", "What text is visible in this image?"
+
+### 4. generate_image_dalle
+Generate images using OpenAI's DALL-E API.
+
+**Parameters:**
+- `prompt` (string, required): Description of the image to generate
+- `size` (string, optional): Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
+- `quality` (string, optional): Image quality - options: "standard", "hd" (default: "standard")
+- `style` (string, optional): Image style - options: "vivid", "natural" (default: "vivid")
+- `n` (integer, optional): Number of images to generate (1-10, default: 1)
+
+**Returns:** Generated image URLs and metadata
+
+**Example prompts:** "A futuristic city skyline at sunset", "A cute robot playing with a cat", "Abstract art with blue and gold colors"
+
 ## Example Usage
 
 Once configured in Kilocode with a valid OpenAI API key:
 
+**Image Analysis:**
 ```
 Can you analyze the image at /path/to/image.jpg?
 ```
 
-The AI will use the `describe_image_from_file` tool to provide a detailed description.
+**Ask Specific Questions:**
+```
+What color is the car in /path/to/photo.jpg?
+How many people are visible in /path/to/group_photo.png?
+What text can you read in /path/to/document.jpg?
+```
+
+**Generate Images:**
+```
+Generate an image: "A peaceful mountain landscape at sunrise"
+Create a high-quality image of "A futuristic robot in a cyberpunk city" in 1792x1024 size
+```
+
+The AI will use the appropriate tools (`describe_image_from_file`, `ask_image_question`, or `generate_image_dalle`) to provide detailed responses.
 
 ## Installation Methods
 
diff --git a/image_recognition_server/server.py b/image_recognition_server/server.py
index 2865cdb..ba94a9f 100644
--- a/image_recognition_server/server.py
+++ b/image_recognition_server/server.py
@@ -127,6 +127,150 @@ def describe_image_from_file(file_path: str) -> str:
         logger.error(f"Error reading image file: {str(e)}", exc_info=True)
         return f"Error reading image file: {str(e)}"
 
+@mcp.tool()
+def ask_image_question(file_path: str, prompt: str) -> str:
+    """
+    Ask a specific question about an image using AI vision
+
+    The image is sent base64-encoded, together with the prompt, to the OpenAI
+    chat completions API. Failures are returned as error strings, not raised.
+
+    Args:
+        file_path: Path to the image file
+        prompt: The question or instruction about the image
+
+    Returns:
+        AI response to the question about the image, or an error message
+    """
+    try:
+        logger.debug(f"Asking question about image: {file_path}")
+        logger.debug(f"Question: {prompt}")
+
+        # Read once; the same bytes feed the API payload and Pillow
+        with open(file_path, 'rb') as image_file:
+            image_bytes = image_file.read()
+        image_data = base64.b64encode(image_bytes).decode('utf-8')
+
+        # Determine MIME type from file extension (JPEG is the default)
+        mime_type = 'image/jpeg'
+        lowered_path = file_path.lower()
+        known_types = (('.png', 'image/png'), ('.gif', 'image/gif'), ('.webp', 'image/webp'))
+        for extension, candidate in known_types:
+            if lowered_path.endswith(extension):
+                mime_type = candidate
+                break
+
+        # Load image for basic metadata fallback
+        image = Image.open(io.BytesIO(image_bytes))
+
+        # Without OpenAI, only basic metadata can be reported
+        if not HAS_OPENAI:
+            return f"AI image analysis not available. Please configure OPENAI_API_KEY.\n\nImage metadata:\n- Size: {image.size[0]}x{image.size[1]} pixels\n- Mode: {image.mode}\n- Format: {image.format or 'Unknown'}"
+
+        # Use Vision API with custom prompt
+        try:
+            response = openai.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{mime_type};base64,{image_data}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                max_tokens=500
+            )
+
+            answer = response.choices[0].message.content
+            logger.debug(f"OpenAI response: {answer}")
+            # content may be None; keep the declared str return type
+            return answer or ""
+
+        except Exception as e:
+            logger.error(f"OpenAI API error: {str(e)}", exc_info=True)
+            return f"OpenAI API error: {str(e)}\n\nNote: Configure OPENAI_API_KEY for AI-powered image analysis."
+
+    except Exception as e:
+        logger.error(f"Error processing image question: {str(e)}", exc_info=True)
+        return f"Error processing image question: {str(e)}"
+
+@mcp.tool()
+def generate_image_dalle(prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", n: int = 1) -> str:
+    """
+    Generate an image using DALL-E API
+
+    Args:
+        prompt: Description of the image to generate
+        size: Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
+        quality: Image quality - options: "standard", "hd" (default: "standard")
+        style: Image style - options: "vivid", "natural" (default: "vivid")
+        n: Number of images to generate (1-10, default: 1). DALL-E 3 accepts
+            only one image per request, so n separate requests are issued.
+
+    Returns:
+        Text listing each generated image's URL and revised prompt, or an
+        error message
+    """
+    try:
+        logger.debug("Generating image with DALL-E")
+        logger.debug(f"Prompt: {prompt}")
+        logger.debug(f"Size: {size}, Quality: {quality}, Style: {style}, Count: {n}")
+
+        # Validate parameters
+        valid_sizes = ["1024x1024", "1792x1024", "1024x1792"]
+        if size not in valid_sizes:
+            return f"Error: Invalid size '{size}'. Valid options: {', '.join(valid_sizes)}"
+
+        valid_qualities = ["standard", "hd"]
+        if quality not in valid_qualities:
+            return f"Error: Invalid quality '{quality}'. Valid options: {', '.join(valid_qualities)}"
+
+        valid_styles = ["vivid", "natural"]
+        if style not in valid_styles:
+            return f"Error: Invalid style '{style}'. Valid options: {', '.join(valid_styles)}"
+
+        if not (1 <= n <= 10):
+            return "Error: Number of images must be between 1 and 10"
+
+        # Check if OpenAI is available
+        if not HAS_OPENAI:
+            return "Error: OpenAI API key not configured. Please set OPENAI_API_KEY to use DALL-E image generation."
+
+        try:
+            # dall-e-3 rejects n > 1, so issue one request per image
+            images = []
+            for _ in range(n):
+                response = openai.images.generate(
+                    model="dall-e-3",
+                    prompt=prompt,
+                    size=size,
+                    quality=quality,
+                    style=style,
+                    n=1
+                )
+                images.extend(response.data)
+
+            logger.info(f"Successfully generated {len(images)} image(s)")
+            return f"Successfully generated {len(images)} image(s):\n\n" + "\n".join([
+                f"Image {index}:\n URL: {image_data.url}\n Revised prompt: {getattr(image_data, 'revised_prompt', None) or 'N/A'}"
+                for index, image_data in enumerate(images, start=1)
+            ])
+
+        except Exception as e:
+            logger.error(f"DALL-E API error: {str(e)}", exc_info=True)
+            return f"DALL-E API error: {str(e)}"
+
+    except Exception as e:
+        logger.error(f"Error generating image: {str(e)}", exc_info=True)
+        return f"Error generating image: {str(e)}"
+
 def main():
     """Main entry point for the MCP server."""
     logger.info("Starting MCP Image Recognition Server")
diff --git a/pyproject.toml b/pyproject.toml
index c079d38..bb77d60 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "image-recognition-mcp"
-version = "0.1.0"
-description = "An MCP server for AI-powered image analysis using OpenAI Vision API"
+version = "0.2.0"
+description = "An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E"
 readme = "README.md"
 requires-python = ">=3.8"
 license = {text = "MIT"}
diff --git a/setup.py b/setup.py
index 18a1871..b61196a 100644
--- a/setup.py
+++ b/setup.py
@@ -21,10 +21,10 @@ class PostInstallCommand(install):
 
 setup(
     name="image-recognition-mcp",
-    version="0.1.0",
+    version="0.2.0",
     author="Your Name",
     author_email="your.email@example.com",
-    description="An MCP server for AI-powered image analysis using OpenAI Vision API",
+    description="An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/yourusername/image-recognition-mcp",