
Add image question and DALL-E generation tools - v0.2.0

master · Matteo Benedetto · 3 months ago · commit 51ab8b277a
4 changed files:
  1. README.md (41 lines changed)
  2. image_recognition_server/server.py (154 lines changed)
  3. pyproject.toml (4 lines changed)
  4. setup.py (4 lines changed)

README.md (41 lines changed)

@@ -77,15 +77,54 @@ Analyzes an image from a file path using OpenAI's GPT-4 Vision.
**Supported formats:** JPEG, PNG, GIF, WebP (automatically detected from file extension)
### 3. ask_image_question
Ask a specific question about an image using AI vision.
**Parameters:**
- `file_path` (string, required): Path to the image file
- `prompt` (string, required): The question or instruction about the image
**Returns:** AI response to the specific question about the image
**Example usage:** "What color is the car in this image?", "How many people are in this photo?", "What text is visible in this image?"
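For a quick local check, the tool function can also be exercised directly from Python. This is a minimal sketch, assuming the package is importable as `image_recognition_server` (the path used in this repository) and that `OPENAI_API_KEY` is exported; the file path and question are placeholders.
```python
# Hypothetical smoke test: call the tool function directly, bypassing MCP.
# Assumes @mcp.tool() leaves the underlying function callable and that
# OPENAI_API_KEY is set in the environment.
from image_recognition_server.server import ask_image_question

answer = ask_image_question(
    file_path="/path/to/photo.jpg",   # any JPEG, PNG, GIF, or WebP image
    prompt="What color is the car in this image?",
)
print(answer)  # AI answer, or an error/metadata string if no API key is configured
```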
### 4. generate_image_dalle
Generate images using OpenAI's DALL-E API.
**Parameters:**
- `prompt` (string, required): Description of the image to generate
- `size` (string, optional): Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
- `quality` (string, optional): Image quality - options: "standard", "hd" (default: "standard")
- `style` (string, optional): Image style - options: "vivid", "natural" (default: "vivid")
- `n` (integer, optional): Number of images to generate (1-10, default: 1)
**Returns:** Generated image URLs and metadata
**Example prompts:** "A futuristic city skyline at sunset", "A cute robot playing with a cat", "Abstract art with blue and gold colors"
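Beyond natural-language use in Kilocode (next section), the tool can also be invoked programmatically with the official MCP Python client SDK. The sketch below assumes the server can be launched with `python -m image_recognition_server.server`; adjust the command to match your installation method.
```python
# Hedged sketch: call generate_image_dalle over stdio using the MCP Python SDK.
# The launch command below is an assumption; use whatever starts this server
# in your setup (see Installation Methods).
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    server = StdioServerParameters(
        command="python", args=["-m", "image_recognition_server.server"]
    )
    async with stdio_client(server) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "generate_image_dalle",
                {
                    "prompt": "A futuristic city skyline at sunset",
                    "size": "1792x1024",
                    "quality": "hd",
                },
            )
            for item in result.content:
                # Text content items carry the formatted URL/metadata string
                print(getattr(item, "text", item))


asyncio.run(main())
```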
## Example Usage
Once configured in Kilocode with a valid OpenAI API key:
**Image Analysis:**
```
Can you analyze the image at /path/to/image.jpg?
```
The AI will use the `describe_image_from_file` tool to provide a detailed description.
**Ask Specific Questions:**
```
What color is the car in /path/to/photo.jpg?
How many people are visible in /path/to/group_photo.png?
What text can you read in /path/to/document.jpg?
```
**Generate Images:**
```
Generate an image: "A peaceful mountain landscape at sunrise"
Create a high-quality image of "A futuristic robot in a cyberpunk city" in 1792x1024 size
```
The AI will use the appropriate tools (`describe_image_from_file`, `ask_image_question`, or `generate_image_dalle`) to provide detailed responses.
## Installation Methods

image_recognition_server/server.py (154 lines changed)

@@ -127,6 +127,160 @@ def describe_image_from_file(file_path: str) -> str:
        logger.error(f"Error reading image file: {str(e)}", exc_info=True)
        return f"Error reading image file: {str(e)}"


@mcp.tool()
def ask_image_question(file_path: str, prompt: str) -> str:
    """
    Ask a specific question about an image using AI vision

    Args:
        file_path: Path to the image file
        prompt: The question or instruction about the image

    Returns:
        AI response to the question about the image
    """
    try:
        logger.debug(f"Asking question about image: {file_path}")
        logger.debug(f"Question: {prompt}")

        # Open and encode the image file
        with open(file_path, 'rb') as image_file:
            image_data = base64.b64encode(image_file.read()).decode('utf-8')

        # Determine MIME type from file extension
        mime_type = 'image/jpeg'
        if file_path.lower().endswith('.png'):
            mime_type = 'image/png'
        elif file_path.lower().endswith('.gif'):
            mime_type = 'image/gif'
        elif file_path.lower().endswith('.webp'):
            mime_type = 'image/webp'

        # Load image for basic metadata fallback
        image_bytes = base64.b64decode(image_data)
        image = Image.open(io.BytesIO(image_bytes))

        # If OpenAI is available, use Vision API with custom prompt
        if HAS_OPENAI:
            try:
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:{mime_type};base64,{image_data}"
                                    }
                                }
                            ]
                        }
                    ],
                    max_tokens=500
                )
                answer = response.choices[0].message.content
                logger.debug(f"OpenAI response: {answer}")
                return answer
            except Exception as e:
                logger.error(f"OpenAI API error: {str(e)}", exc_info=True)
                return f"OpenAI API error: {str(e)}\n\nNote: Configure OPENAI_API_KEY for AI-powered image analysis."

        # Return error if no OpenAI
        return f"AI image analysis not available. Please configure OPENAI_API_KEY.\n\nImage metadata:\n- Size: {image.size[0]}x{image.size[1]} pixels\n- Mode: {image.mode}\n- Format: {image.format or 'Unknown'}"

    except Exception as e:
        logger.error(f"Error processing image question: {str(e)}", exc_info=True)
        return f"Error processing image question: {str(e)}"


@mcp.tool()
def generate_image_dalle(prompt: str, size: str = "1024x1024", quality: str = "standard", style: str = "vivid", n: int = 1) -> str:
    """
    Generate an image using DALL-E API

    Args:
        prompt: Description of the image to generate
        size: Image size - options: "1024x1024", "1792x1024", "1024x1792" (default: "1024x1024")
        quality: Image quality - options: "standard", "hd" (default: "standard")
        style: Image style - options: "vivid", "natural" (default: "vivid")
        n: Number of images to generate (1-10, default: 1)

    Returns:
        JSON response with generated image URLs and metadata
    """
    try:
        logger.debug(f"Generating image with DALL-E")
        logger.debug(f"Prompt: {prompt}")
        logger.debug(f"Size: {size}, Quality: {quality}, Style: {style}, Count: {n}")

        # Validate parameters
        valid_sizes = ["1024x1024", "1792x1024", "1024x1792"]
        if size not in valid_sizes:
            return f"Error: Invalid size '{size}'. Valid options: {', '.join(valid_sizes)}"

        valid_qualities = ["standard", "hd"]
        if quality not in valid_qualities:
            return f"Error: Invalid quality '{quality}'. Valid options: {', '.join(valid_qualities)}"

        valid_styles = ["vivid", "natural"]
        if style not in valid_styles:
            return f"Error: Invalid style '{style}'. Valid options: {', '.join(valid_styles)}"

        if not (1 <= n <= 10):
            return "Error: Number of images must be between 1 and 10"

        # Check if OpenAI is available
        if not HAS_OPENAI:
            return "Error: OpenAI API key not configured. Please set OPENAI_API_KEY to use DALL-E image generation."

        try:
            # Generate image using DALL-E 3
            response = openai.images.generate(
                model="dall-e-3",
                prompt=prompt,
                size=size,
                quality=quality,
                style=style,
                n=n
            )

            # Format response
            result = {
                "prompt": prompt,
                "parameters": {
                    "size": size,
                    "quality": quality,
                    "style": style,
                    "count": n
                },
                "images": []
            }

            for i, image_data in enumerate(response.data):
                result["images"].append({
                    "index": i + 1,
                    "url": image_data.url,
                    "revised_prompt": getattr(image_data, 'revised_prompt', None)
                })

            logger.info(f"Successfully generated {len(response.data)} image(s)")
            return f"Successfully generated {len(response.data)} image(s):\n\n" + "\n".join([
                f"Image {img['index']}:\n URL: {img['url']}\n Revised prompt: {img['revised_prompt'] or 'N/A'}"
                for img in result["images"]
            ])

        except Exception as e:
            logger.error(f"DALL-E API error: {str(e)}", exc_info=True)
            return f"DALL-E API error: {str(e)}"

    except Exception as e:
        logger.error(f"Error generating image: {str(e)}", exc_info=True)
        return f"Error generating image: {str(e)}"


def main():
    """Main entry point for the MCP server."""
    logger.info("Starting MCP Image Recognition Server")

pyproject.toml (4 lines changed)

@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "image-recognition-mcp"
-version = "0.1.0"
-description = "An MCP server for AI-powered image analysis using OpenAI Vision API"
+version = "0.2.0"
+description = "An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E"
 readme = "README.md"
 requires-python = ">=3.8"
 license = {text = "MIT"}

setup.py (4 lines changed)

@@ -21,10 +21,10 @@ class PostInstallCommand(install):
 setup(
     name="image-recognition-mcp",
-    version="0.1.0",
+    version="0.2.0",
     author="Your Name",
     author_email="your.email@example.com",
-    description="An MCP server for AI-powered image analysis using OpenAI Vision API",
+    description="An MCP server for AI-powered image analysis and generation using OpenAI Vision API and DALL-E",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/yourusername/image-recognition-mcp",
