Spaces:

ar08
/

zzz

Running

File size: 11,681 Bytes

246d201

##################################################################################################
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/run_eval.py
##################################################################################################

import asyncio
import base64
import json
import os
import shutil
import tempfile
from typing import List

import yaml
from browsing import pre_login

from openhands.controller.state.state import State
from openhands.core.config import (
    AppConfig,
    LLMConfig,
    SandboxConfig,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import BrowserOutputObservation, CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def get_config(

    base_container_image: str,

    task_short_name: str,

    mount_path_on_host: str,

    llm_config: LLMConfig,

) -> AppConfig:
    config = AppConfig(
        run_as_openhands=False,
        max_budget_per_task=4,
        max_iterations=100,
        save_trajectory_path=os.path.join(
            mount_path_on_host, f'traj_{task_short_name}.json'
        ),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            enable_auto_lint=True,
            # using host network to access the host machine from the container
            use_host_network=True,
            # large enough timeout, since some testcases take very long to run
            timeout=300,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
        ),
        # we mount trajectories path so that trajectories, generated by OpenHands
        # controller, can be accessible to the evaluator file in the runtime container
        workspace_mount_path=mount_path_on_host,
        workspace_mount_path_in_sandbox='/outputs',
    )
    config.set_llm_config(llm_config)
    return config


def load_dependencies(runtime: Runtime) -> List[str]:
    """

    Every task has a dependencies.yml file, which lists all the services that the

    task depends on. This function loads the file and returns all dependent service names.

    """
    command = 'cat /utils/dependencies.yml'
    action = CmdRunAction(command=command)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs: CmdOutputObservation = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    dependencies = yaml.safe_load(obs.content)
    if dependencies is None:
        dependencies = []
    return dependencies


def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig):
    command = (
        f'SERVER_HOSTNAME={hostname} '
        f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} '
        f'LITELLM_BASE_URL={env_llm_config.base_url} '
        f'LITELLM_MODEL={env_llm_config.model} '
        'bash /utils/init.sh'
    )
    action = CmdRunAction(command=command)
    action.set_hard_timeout(900)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0


def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please finish the interaction.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )

    if state.history:
        # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
        user_msgs = [
            event
            for event in state.history
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # let the agent know that it can give up when it has tried 3 times
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg


def run_solver(

    runtime: Runtime,

    task_name: str,

    config: AppConfig,

    dependencies: List[str],

    save_final_state: bool,

    state_dir: str,

    save_screenshots: bool,

    screenshots_dir: str,

) -> State:
    instruction = 'Complete the task in /instruction/task.md'

    if 'gitlab' in dependencies:
        instruction += "\n\nGitlab username is 'root' and password is 'theagentcompany'"

    state: State | None = asyncio.run(
        run_controller(
            config=config,
            sid=task_name,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=codeact_user_response,
        )
    )
    logger.info(state)

    if save_screenshots:
        screenshots_dir = os.path.join(screenshots_dir, task_name)
        os.makedirs(screenshots_dir, exist_ok=True)
        for image_id, obs in enumerate(state.history):
            if isinstance(obs, BrowserOutputObservation):
                image_data = base64.b64decode(obs.screenshot)
                with open(
                    os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
                ) as file:
                    file.write(image_data)

    if save_final_state:
        os.makedirs(state_dir, exist_ok=True)
        with open(os.path.join(state_dir, f'state_{task_name}.json'), 'w') as file:
            json.dump(str(state), file)

    return state


def run_evaluator(

    runtime: Runtime, env_llm_config: LLMConfig, trajectory_path: str, result_path: str

):
    command = (
        f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} '
        f'LITELLM_BASE_URL={env_llm_config.base_url} '
        f'LITELLM_MODEL={env_llm_config.model} '
        f"DECRYPTION_KEY='theagentcompany is all you need' "  # Hardcoded Key
        f'python_default /utils/eval.py --trajectory_path {trajectory_path} --result_path {result_path}'
    )
    action = CmdRunAction(command=command)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--task-image-name',
        type=str,
        default='ghcr.io/theagentcompany/example-image:1.0.0',
        help='Task image name',
    )
    parser.add_argument(
        '--outputs-path',
        type=str,
        default='./outputs',
        help='Folder path to save trajectories and evaluation results',
    )
    parser.add_argument(
        '--server-hostname',
        type=str,
        default='localhost',
        help='Server hostname, e.g. localhost to access the host machine from the container, '
        'assuming the task docker container is run with `--network host` flag',
    )
    parser.add_argument(
        '--agent-llm-config',
        type=str,
        default=None,
        help='LLM config for agent',
    )
    parser.add_argument(
        '--env-llm-config',
        type=str,
        default=None,
        help='LLM config for evaluation environment (NPC & llm-based evaluator)',
    )
    args, _ = parser.parse_known_args()

    agent_llm_config: LLMConfig | None = None
    if args.agent_llm_config:
        agent_llm_config = get_llm_config_arg(args.agent_llm_config)

    if agent_llm_config is None:
        raise ValueError(
            f'Could not find LLM config for agent: --agent-llm-config {args.agent_llm_config}'
        )

    if agent_llm_config.api_key is None:
        raise ValueError('LLM API key is not set for agent')

    env_llm_config: LLMConfig | None = None
    if args.env_llm_config:
        env_llm_config = get_llm_config_arg(args.env_llm_config)

    if env_llm_config is None:
        raise ValueError(
            f'Could not find LLM config for evaluation environment: --env-llm-config {args.env_llm_config}'
        )

    if env_llm_config.api_key is None:
        raise ValueError('LLM API key is not set for evaluation environment')

    task_short_name = args.task_image_name.split('/')[-1].split(':')[0]
    logger.info(
        f'Task image name is {args.task_image_name}, short name is {task_short_name}'
    )

    # mount a temporary directory to pass trajectory from host to container, and to
    # pass the evaluation result from container to host
    # 1) trajectory is dumped by OpenHands library (on host machine), but it's needed by
    # evaluator (in container), so we mount a temporary directory to pass it in
    # 2) evaluation result is written by evaluator (in container), but we need to persist
    # it on host machine, so we mount a temporary directory to pass it out
    if os.getenv('TMPDIR') and os.path.exists(os.getenv('TMPDIR')):
        temp_dir = os.path.abspath(os.getenv('TMPDIR'))
    else:
        temp_dir = tempfile.mkdtemp()
    config: AppConfig = get_config(
        args.task_image_name, task_short_name, temp_dir, agent_llm_config
    )
    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    init_task_env(runtime, args.server_hostname, env_llm_config)

    dependencies = load_dependencies(runtime)
    logger.info(f'Service dependencies: {dependencies}')

    try:
        pre_login(
            runtime,
            dependencies,
            save_screenshots=True,
            screenshots_dir=os.path.join(
                os.path.abspath(args.outputs_path), 'screenshots'
            ),
        )
    except Exception as e:
        logger.error(f'Failed to pre-login: {e}')

        # before giving up, let's try to init and login again
        init_task_env(runtime, args.server_hostname, env_llm_config)
        pre_login(
            runtime,
            dependencies,
            save_screenshots=True,
            screenshots_dir=os.path.join(
                os.path.abspath(args.outputs_path), 'screenshots'
            ),
        )

    state = run_solver(
        runtime,
        task_short_name,
        config,
        dependencies,
        save_final_state=True,
        state_dir=os.path.abspath(args.outputs_path),
        save_screenshots=True,
        screenshots_dir=os.path.join(os.path.abspath(args.outputs_path), 'screenshots'),
    )

    # this path is the absolute path in the runtime container
    trajectory_path = f'/outputs/traj_{task_short_name}.json'
    result_path = f'/outputs/eval_{task_short_name}.json'

    run_evaluator(runtime, env_llm_config, trajectory_path, result_path)

    # finally, move trajectory file and evaluation result from mount path on host (temp dir) to outputs path
    shutil.move(
        os.path.join(temp_dir, f'traj_{task_short_name}.json'),
        os.path.join(
            os.path.abspath(args.outputs_path), f'traj_{task_short_name}.json'
        ),
    )
    shutil.move(
        os.path.join(temp_dir, f'eval_{task_short_name}.json'),
        os.path.join(
            os.path.abspath(args.outputs_path), f'eval_{task_short_name}.json'
        ),
    )