mllm_client.py
#!/usr/bin/env python3
import argparse
import os
import subprocess
import tempfile
from anthropic import Anthropic
from openai import OpenAI
import sys, shutil
import boto3
from botocore.config import Config
import json, logging, time
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Generator, TypedDict, Literal
import hashlib

JSONSchema = Any

model_mapping: Dict[str, str] = {
    'claude-sonnet': 'claude-3-5-sonnet-latest',
    'claude-haiku': 'claude-3-5-haiku-latest',
    'gemini': 'gemini-exp-1206',
}
@dataclass
class FunctionDefinition:
    name: str
    description: str
    parameters: Dict[str, JSONSchema]

    def to_openai_dict(self) -> Dict[str, Any]:
        return {
            "name": self.name,
            "description": self.description,
            "parameters": self.parameters,
        }


@dataclass
class FunctionCall:
    name: str
    arguments: Dict[str, Any]


class Message(TypedDict):
    content: str
    role: Literal['user', 'assistant']
def stream_llm_output_uncached(
    messages: List[Message],
    model: str = 'claude-3-5-sonnet-latest',
    tokens: Optional[int] = None,
    functions: Optional[List[FunctionDefinition]] = None,
    function_call: Optional[str] = None,
    prediction: Optional[str] = None,
) -> Generator[str | FunctionCall, None, None]:
    """
    Stream a completion from the selected model, yielding text chunks and,
    for function-calling responses, FunctionCall objects.

    Parameters beyond the basics:
    - functions: list of function definitions the model may call
    - function_call: force a specific function by name
    - prediction: predicted-output content passed through to OpenAI
    """
    assert len(messages) > 0
    model = model_mapping.get(model, model)

    # Resolve the provider from the model name prefix.
    if model.startswith('claude'):
        provider = 'anthropic'
    elif model.startswith('gemini'):
        provider = 'google'
    elif model.startswith('bedrock'):
        provider = 'bedrock'
    else:
        provider = 'openai'

    logging.info('LLM call (model = %s, characters = %d)', model, sum(len(m['content']) for m in messages))
    start_time = time.time()

    if tokens is None:
        # Default output budget; reasoning (o1) models get a larger cap.
        if model.startswith('o1'):
            tokens = 1024 * 30
        else:
            tokens = 1024 * 8
    if provider == 'google':
        from google.generativeai import GenerativeModel
        import google.generativeai as genai
        gemini_api_key = open(os.path.expanduser('~/kbox/gemini-api-key')).read().strip()
        genai.configure(api_key=gemini_api_key)
        model_inst = GenerativeModel(model)
        # Flatten the conversation into a single role-prefixed prompt for Gemini.
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        response = model_inst.generate_content(prompt, stream=True)
        for chunk in response:
            if chunk.text:
                yield chunk.text
    elif provider == 'openai':
        if 'OPENAI_API_KEY' not in os.environ:
            os.environ['OPENAI_API_KEY'] = open(os.path.expanduser('~/kbox/openai')).read().strip().split('=')[1]
        client = OpenAI()
        kwargs: dict[str, Any] = {}

        # Allow an 'o1:low|medium|high' suffix to select reasoning effort.
        effort = 'medium'
        if model.startswith('o1:'):
            base, suffix = model.split(':', 1)
            suffix = suffix.strip()
            if suffix in ('low', 'medium', 'high'):
                effort = suffix
            model = base
        if model == 'o1':
            kwargs['reasoning_effort'] = effort

        if functions:
            kwargs["functions"] = [f.to_openai_dict() for f in functions]
        if function_call:
            kwargs["function_call"] = {"name": function_call}

        # Pass a predicted-output payload if given; otherwise cap completion tokens.
        if prediction is not None:
            kwargs['prediction'] = {
                "type": "content",
                "content": prediction
            }
        else:
            kwargs['max_completion_tokens'] = tokens

        # Stream plain text responses; function calls and o1 models are handled non-streaming.
        stream = (functions is None or len(functions) == 0) and not model.startswith('o1')
        response = client.chat.completions.create(  # type: ignore
            model=model,
            messages=messages,  # type: ignore
            stream=stream,
            **kwargs)
        if stream:
            for chunk in response:
                delta = chunk.choices[0].delta
                if hasattr(delta, 'content') and delta.content is not None:
                    yield delta.content
        else:
            choice = response.choices[0].message
            if choice.function_call:
                function_call_data = choice.function_call
                yield FunctionCall(
                    name=function_call_data.name,
                    arguments=json.loads(function_call_data.arguments)
                )
            else:
                yield choice.content
    elif provider == 'anthropic':
        if 'ANTHROPIC_API_KEY' not in os.environ:
            os.environ['ANTHROPIC_API_KEY'] = open(os.path.expanduser('~/kbox/claude')).read().strip()
        anthropic_client = Anthropic()
        kwargs = {
            "model": model,
            "max_tokens": tokens,
            "messages": messages,
        }
        if functions:
            # Anthropic tools use an input_schema field rather than OpenAI's parameters.
            kwargs["tools"] = [{
                "name": f.name,
                "description": f.description,
                "input_schema": f.parameters,
            } for f in functions]
        stream1 = anthropic_client.messages.stream(**kwargs)
        with stream1 as stream2:
            for text in stream2.text_stream:
                yield text
            # Tool-call handling is currently disabled (guarded by `if False`).
            if False and stream2.tool_calls:
                for tool_call in stream2.tool_calls:
                    yield FunctionCall(name=tool_call.function.name,
                                       arguments=tool_call.function.arguments)
    else:
        assert False, provider

    logging.info('LLM call finished (model = %s, characters = %d, time = %.1f)',
                 model, sum(len(m['content']) for m in messages),
                 time.time() - start_time)
def stream_llm_output(
    messages: List[Message],
    model: str = 'claude-3-5-sonnet-latest',
    tokens: Optional[int] = None,
    functions: Optional[List[FunctionDefinition]] = None,
    function_call: Optional[str] = None,
    prediction: Optional[str] = None,
) -> Generator[str | FunctionCall, None, None]:
    """
    Wrapper around stream_llm_output_uncached that adds on-disk caching when
    MLLM_DISK_CACHE_LOCATION is set.
    """
    cache_location = os.environ.get('MLLM_DISK_CACHE_LOCATION')
    if cache_location:
        # Derive the cache key from a hash of all inputs.
        cache_key_data = {
            'messages': messages,
            'model': model,
            'tokens': tokens,
            'functions': [f.to_openai_dict() for f in functions] if functions else None,
            'function_call': function_call
        }
        if prediction is not None:
            cache_key_data['prediction'] = prediction
        cache_key_json = json.dumps(cache_key_data, sort_keys=True)
        cache_hash = hashlib.sha256(cache_key_json.encode('utf-8')).hexdigest()
        cache_file = os.path.join(cache_location, f"{cache_hash}.json")

        if os.path.exists(cache_file):
            # Cache hit: replay the stored outputs.
            with open(cache_file, 'r') as f:
                cached_output = json.load(f)
            for item in cached_output:
                if isinstance(item, dict) and 'FunctionCall' in item:
                    function_call_data = item['FunctionCall']
                    yield FunctionCall(**function_call_data)
                else:
                    yield item
            return
        else:
            # Cache miss: generate, stream through, and record the outputs.
            outputs = []
            for output in stream_llm_output_uncached(messages, model, tokens, functions, function_call, prediction):
                outputs.append(output)
                try:
                    yield output
                except GeneratorExit:
                    # Consumer closed the stream early; whatever was produced so far gets cached.
                    break
            # Serialize outputs (FunctionCall objects become tagged dicts) and write the cache file.
            serializable_outputs = []
            for item in outputs:
                if isinstance(item, FunctionCall):
                    serializable_outputs.append({'FunctionCall': item.__dict__})
                else:
                    serializable_outputs.append(item)
            os.makedirs(cache_location, exist_ok=True)
            with open(cache_file, 'w') as f:
                json.dump(serializable_outputs, f)
    else:
        # No caching configured; delegate directly.
        yield from stream_llm_output_uncached(messages, model, tokens, functions, function_call, prediction)
def llm_output(*args, **kwargs) -> str:
    """Collect the full streamed text output as a single string."""
    result = []
    for part in stream_llm_output(*args, **kwargs):
        assert not isinstance(part, FunctionCall)
        result.append(part)
    return ''.join(result)


def llm_one_function_call(*args, **kwargs) -> FunctionCall:
    """Return the first FunctionCall yielded by the model, or raise if none occurs."""
    for part in stream_llm_output(*args, **kwargs):
        if isinstance(part, FunctionCall):
            return part
    raise Exception('no function called')
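

# Minimal usage sketch, not part of the client itself: it assumes the API key files
# referenced above exist (or the corresponding environment variables are set), and the
# model names and the get_weather tool below are illustrative examples only. Setting
# MLLM_DISK_CACHE_LOCATION makes repeated identical calls replay from the disk cache.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Plain streamed completion.
    for part in stream_llm_output(
        [{'role': 'user', 'content': 'Say hello in one short sentence.'}],
        model='claude-sonnet',
    ):
        print(part, end='', flush=True)
    print()

    # Function calling via an OpenAI model, forcing a specific (hypothetical) tool.
    weather_fn = FunctionDefinition(
        name='get_weather',
        description='Look up the current weather for a city.',
        parameters={
            'type': 'object',
            'properties': {'city': {'type': 'string'}},
            'required': ['city'],
        },
    )
    call = llm_one_function_call(
        [{'role': 'user', 'content': 'What is the weather in Paris?'}],
        model='gpt-4o',
        functions=[weather_fn],
        function_call='get_weather',
    )
    print(call.name, call.arguments)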