Upload 1040 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .dockerignore +5 -0
- .gitattributes +1 -35
- .gitignore +235 -0
- .nvmrc +1 -0
- .openhands/microagents/repo.md +42 -0
- CODE_OF_CONDUCT.md +147 -0
- COMMUNITY.md +43 -0
- CONTRIBUTING.md +123 -0
- CREDITS.md +312 -0
- Development.md +128 -0
- Dockerfile +98 -0
- ISSUE_TRIAGE.md +25 -0
- LICENSE +25 -0
- MANIFEST.in +5 -0
- Makefile +332 -0
- build.sh +5 -0
- config.sh +4 -0
- config.template.toml +290 -0
- containers/README.md +12 -0
- containers/app/config.sh +4 -0
- containers/app/entrypoint.sh +0 -0
- containers/build.sh +156 -0
- containers/dev/Dockerfile +124 -0
- containers/dev/README.md +57 -0
- containers/dev/compose.yml +38 -0
- containers/dev/dev.sh +39 -0
- containers/e2b-sandbox/Dockerfile +19 -0
- containers/e2b-sandbox/README.md +15 -0
- containers/e2b-sandbox/e2b.toml +14 -0
- containers/runtime/README.md +12 -0
- containers/runtime/config.sh +7 -0
- dev_config/python/.pre-commit-config.yaml +43 -0
- dev_config/python/mypy.ini +9 -0
- dev_config/python/ruff.toml +26 -0
- docker-compose.yml +23 -0
- entrypoint.sh +69 -0
- evaluation/README.md +94 -0
- evaluation/__init__.py +0 -0
- evaluation/benchmarks/EDA/README.md +46 -0
- evaluation/benchmarks/EDA/game.py +203 -0
- evaluation/benchmarks/EDA/run_infer.py +238 -0
- evaluation/benchmarks/EDA/scripts/run_infer.sh +60 -0
- evaluation/benchmarks/agent_bench/README.md +56 -0
- evaluation/benchmarks/agent_bench/__init__.py +0 -0
- evaluation/benchmarks/agent_bench/helper.py +77 -0
- evaluation/benchmarks/agent_bench/run_infer.py +329 -0
- evaluation/benchmarks/agent_bench/scripts/run_infer.sh +42 -0
- evaluation/benchmarks/agent_bench/scripts/summarise_results.py +37 -0
- evaluation/benchmarks/aider_bench/README.md +95 -0
- evaluation/benchmarks/aider_bench/create_dataset.py +47 -0
.dockerignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
frontend/node_modules
|
2 |
+
config.toml
|
3 |
+
.envrc
|
4 |
+
.env
|
5 |
+
.git
|
.gitattributes
CHANGED
@@ -1,35 +1 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.ipynb linguist-vendored
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
./lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
requirements.txt
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
cover/
|
54 |
+
|
55 |
+
# Translations
|
56 |
+
*.mo
|
57 |
+
*.pot
|
58 |
+
|
59 |
+
# Django stuff:
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
.python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
# poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
frontend/.env
|
125 |
+
.venv
|
126 |
+
env/
|
127 |
+
venv/
|
128 |
+
ENV/
|
129 |
+
env.bak/
|
130 |
+
.env.bak
|
131 |
+
venv.bak/
|
132 |
+
*venv/
|
133 |
+
|
134 |
+
# Spyder project settings
|
135 |
+
.spyderproject
|
136 |
+
.spyproject
|
137 |
+
|
138 |
+
# Rope project settings
|
139 |
+
.ropeproject
|
140 |
+
|
141 |
+
# mkdocs documentation
|
142 |
+
/site
|
143 |
+
|
144 |
+
# mypy
|
145 |
+
.mypy_cache/
|
146 |
+
.dmypy.json
|
147 |
+
dmypy.json
|
148 |
+
|
149 |
+
# Pyre type checker
|
150 |
+
.pyre/
|
151 |
+
|
152 |
+
# pytype static type analyzer
|
153 |
+
.pytype/
|
154 |
+
|
155 |
+
# Cython debug symbols
|
156 |
+
cython_debug/
|
157 |
+
|
158 |
+
# PyCharm
|
159 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
160 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
161 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
162 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
163 |
+
.idea/
|
164 |
+
.vscode/
|
165 |
+
.cursorignore
|
166 |
+
|
167 |
+
# evaluation
|
168 |
+
evaluation/evaluation_outputs
|
169 |
+
evaluation/outputs
|
170 |
+
evaluation/swe_bench/eval_workspace*
|
171 |
+
evaluation/SWE-bench/data
|
172 |
+
evaluation/webarena/scripts/webarena_env.sh
|
173 |
+
evaluation/bird/data
|
174 |
+
evaluation/gaia/data
|
175 |
+
evaluation/gorilla/data
|
176 |
+
evaluation/toolqa/data
|
177 |
+
evaluation/scienceagentbench/benchmark
|
178 |
+
evaluation/commit0_bench/repos
|
179 |
+
|
180 |
+
# openhands resolver
|
181 |
+
output/
|
182 |
+
|
183 |
+
# frontend
|
184 |
+
|
185 |
+
# dependencies
|
186 |
+
frontend/.pnp
|
187 |
+
frontend/bun.lockb
|
188 |
+
frontend/yarn.lock
|
189 |
+
.pnp.js
|
190 |
+
|
191 |
+
# testing
|
192 |
+
frontend/coverage
|
193 |
+
test_results*
|
194 |
+
/_test_files_tmp/
|
195 |
+
|
196 |
+
# production
|
197 |
+
frontend/build
|
198 |
+
frontend/dist
|
199 |
+
|
200 |
+
# misc
|
201 |
+
.DS_Store
|
202 |
+
.env.local
|
203 |
+
.env.development.local
|
204 |
+
.env.test.local
|
205 |
+
.env.production.local
|
206 |
+
|
207 |
+
npm-debug.log*
|
208 |
+
yarn-debug.log*
|
209 |
+
yarn-error.log*
|
210 |
+
|
211 |
+
logs
|
212 |
+
|
213 |
+
# agent
|
214 |
+
.envrc
|
215 |
+
/workspace
|
216 |
+
/_test_workspace
|
217 |
+
/debug
|
218 |
+
cache
|
219 |
+
|
220 |
+
# configuration
|
221 |
+
config.toml
|
222 |
+
config.toml_
|
223 |
+
config.toml.bak
|
224 |
+
|
225 |
+
# swe-bench-eval
|
226 |
+
image_build_logs
|
227 |
+
run_instance_logs
|
228 |
+
|
229 |
+
runtime_*.tar
|
230 |
+
|
231 |
+
# docker build
|
232 |
+
containers/runtime/Dockerfile
|
233 |
+
containers/runtime/project.tar.gz
|
234 |
+
containers/runtime/code
|
235 |
+
**/node_modules/
|
.nvmrc
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
22
|
.openhands/microagents/repo.md
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
name: repo
|
3 |
+
type: repo
|
4 |
+
agent: CodeActAgent
|
5 |
+
---
|
6 |
+
This repository contains the code for OpenHands, an automated AI software engineer. It has a Python backend
|
7 |
+
(in the `openhands` directory) and React frontend (in the `frontend` directory).
|
8 |
+
|
9 |
+
## General Setup:
|
10 |
+
To set up the entire repo, including frontend and backend, run `make build`.
|
11 |
+
You don't need to do this unless the user asks you to, or if you're trying to run the entire application.
|
12 |
+
|
13 |
+
Before pushing any changes, you should ensure that any lint errors or simple test errors have been fixed.
|
14 |
+
|
15 |
+
* If you've made changes to the backend, you should run `pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
|
16 |
+
* If you've made changes to the frontend, you should run `cd frontend && npm run lint:fix && npm run build ; cd ..`
|
17 |
+
|
18 |
+
If either command fails, it may have automatically fixed some issues. You should fix any issues that weren't automatically fixed,
|
19 |
+
then re-run the command to ensure it passes.
|
20 |
+
|
21 |
+
## Repository Structure
|
22 |
+
Backend:
|
23 |
+
- Located in the `openhands` directory
|
24 |
+
- Testing:
|
25 |
+
- All tests are in `tests/unit/test_*.py`
|
26 |
+
- To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
|
27 |
+
- Write all tests with pytest
|
28 |
+
|
29 |
+
Frontend:
|
30 |
+
- Located in the `frontend` directory
|
31 |
+
- Prerequisites: A recent version of NodeJS / NPM
|
32 |
+
- Setup: Run `npm install` in the frontend directory
|
33 |
+
- Testing:
|
34 |
+
- Run tests: `npm run test`
|
35 |
+
- To run specific tests: `npm run test -- -t "TestName"`
|
36 |
+
- Building:
|
37 |
+
- Build for production: `npm run build`
|
38 |
+
- Environment Variables:
|
39 |
+
- Set in `frontend/.env` or as environment variables
|
40 |
+
- Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
|
41 |
+
- Internationalization:
|
42 |
+
- Generate i18n declaration file: `npm run make-i18n`
|
CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Contributor Covenant Code of Conduct
|
3 |
+
|
4 |
+
## Our Pledge
|
5 |
+
|
6 |
+
We as members, contributors, and leaders pledge to make participation in our
|
7 |
+
community a harassment-free experience for everyone, regardless of age, body
|
8 |
+
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
9 |
+
identity and expression, level of experience, education, socio-economic status,
|
10 |
+
nationality, personal appearance, race, caste, color, religion, or sexual
|
11 |
+
identity and orientation.
|
12 |
+
|
13 |
+
We pledge to act and interact in ways that contribute to an open, welcoming,
|
14 |
+
diverse, inclusive, and healthy community.
|
15 |
+
|
16 |
+
## Our Standards
|
17 |
+
|
18 |
+
Examples of behavior that contributes to a positive environment for our
|
19 |
+
community include:
|
20 |
+
|
21 |
+
* Demonstrating empathy and kindness toward other people.
|
22 |
+
* Being respectful of differing opinions, viewpoints, and experiences.
|
23 |
+
* Giving and gracefully accepting constructive feedback.
|
24 |
+
* Accepting responsibility and apologizing to those affected by our mistakes,
|
25 |
+
and learning from the experience.
|
26 |
+
* Focusing on what is best not just for us as individuals, but for the overall
|
27 |
+
community.
|
28 |
+
|
29 |
+
Examples of unacceptable behavior include:
|
30 |
+
|
31 |
+
* The use of sexualized language or imagery, and sexual attention or advances of
|
32 |
+
any kind.
|
33 |
+
* Trolling, insulting or derogatory comments, and personal or political attacks.
|
34 |
+
* Public or private harassment.
|
35 |
+
* Publishing others' private information, such as a physical or email address,
|
36 |
+
without their explicit permission.
|
37 |
+
* Other conduct which could reasonably be considered inappropriate in a
|
38 |
+
professional setting.
|
39 |
+
|
40 |
+
## Enforcement Responsibilities
|
41 |
+
|
42 |
+
Community leaders are responsible for clarifying and enforcing our standards of
|
43 |
+
acceptable behavior and will take appropriate and fair corrective action in
|
44 |
+
response to any behavior that they deem inappropriate, threatening, offensive,
|
45 |
+
or harmful.
|
46 |
+
|
47 |
+
Community leaders have the right and responsibility to remove, edit, or reject
|
48 |
+
comments, commits, code, wiki edits, issues, and other contributions that are
|
49 |
+
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
50 |
+
decisions when appropriate.
|
51 |
+
|
52 |
+
## Scope
|
53 |
+
|
54 |
+
This Code of Conduct applies within all community spaces, and also applies when
|
55 |
+
an individual is officially representing the community in public spaces.
|
56 |
+
Examples of representing our community include using an official email address,
|
57 |
+
posting via an official social media account, or acting as an appointed
|
58 |
+
representative at an online or offline event.
|
59 |
+
|
60 |
+
## Enforcement
|
61 |
+
|
62 |
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
63 |
+
reported to the community leaders responsible for enforcement at
|
64 | |
65 |
+
All complaints will be reviewed and investigated promptly and fairly.
|
66 |
+
|
67 |
+
All community leaders are obligated to respect the privacy and security of the
|
68 |
+
reporter of any incident.
|
69 |
+
|
70 |
+
## Enforcement Guidelines
|
71 |
+
|
72 |
+
Community leaders will follow these Community Impact Guidelines in determining
|
73 |
+
the consequences for any action they deem in violation of this Code of Conduct:
|
74 |
+
|
75 |
+
### 1. Correction
|
76 |
+
|
77 |
+
**Community Impact**: Use of inappropriate language or other behavior deemed
|
78 |
+
unprofessional or unwelcome in the community.
|
79 |
+
|
80 |
+
**Consequence**: A private, written warning from community leaders, providing
|
81 |
+
clarity around the nature of the violation and an explanation of why the
|
82 |
+
behavior was inappropriate. A public apology may be requested.
|
83 |
+
|
84 |
+
### 2. Warning
|
85 |
+
|
86 |
+
**Community Impact**: A violation through a single incident or series of
|
87 |
+
actions.
|
88 |
+
|
89 |
+
**Consequence**: A warning with consequences for continued behavior. No
|
90 |
+
interaction with the people involved, including unsolicited interaction with
|
91 |
+
those enforcing the Code of Conduct, for a specified period of time. This
|
92 |
+
includes avoiding interactions in community spaces as well as external channels
|
93 |
+
like social media. Violating these terms may lead to a temporary or permanent
|
94 |
+
ban.
|
95 |
+
|
96 |
+
### 3. Temporary Ban
|
97 |
+
|
98 |
+
**Community Impact**: A serious violation of community standards, including
|
99 |
+
sustained inappropriate behavior.
|
100 |
+
|
101 |
+
**Consequence**: A temporary ban from any sort of interaction or public
|
102 |
+
communication with the community for a specified period of time. No public or
|
103 |
+
private interaction with the people involved, including unsolicited interaction
|
104 |
+
with those enforcing the Code of Conduct, is allowed during this period.
|
105 |
+
Violating these terms may lead to a permanent ban.
|
106 |
+
|
107 |
+
### 4. Permanent Ban
|
108 |
+
|
109 |
+
**Community Impact**: Demonstrating a pattern of violation of community
|
110 |
+
standards, including sustained inappropriate behavior, harassment of an
|
111 |
+
individual, or aggression toward or disparagement of classes of individuals.
|
112 |
+
|
113 |
+
**Consequence**: A permanent ban from any sort of public interaction within the
|
114 |
+
community.
|
115 |
+
|
116 |
+
### Slack and Discord Etiquettes
|
117 |
+
|
118 |
+
These Slack and Discord etiquette guidelines are designed to foster an inclusive, respectful, and productive environment for all community members. By following these best practices, we ensure effective communication and collaboration while minimizing disruptions. Let’s work together to build a supportive and welcoming community!
|
119 |
+
|
120 |
+
- Communicate respectfully and professionally, avoiding sarcasm or harsh language, and remember that tone can be difficult to interpret in text.
|
121 |
+
- Use threads for specific discussions to keep channels organized and easier to follow.
|
122 |
+
- Tag others only when their input is critical or urgent, and use @here, @channel or @everyone sparingly to minimize disruptions.
|
123 |
+
- Be patient, as open-source contributors and maintainers often have other commitments and may need time to respond.
|
124 |
+
- Post questions or discussions in the most relevant channel (e.g., for [slack - #general](https://app.slack.com/client/T06P212QSEA/C06P5NCGSFP) for general topics, [slack - #questions](https://openhands-ai.slack.com/archives/C06U8UTKSAD) for queries/questions, [discord - #general](https://discord.com/channels/1222935860639563850/1222935861386018885)).
|
125 |
+
- When asking for help or raising issues, include necessary details like links, screenshots, or clear explanations to provide context.
|
126 |
+
- Keep discussions in public channels whenever possible to allow others to benefit from the conversation, unless the matter is sensitive or private.
|
127 |
+
- Always adhere to [our standards](https://github.com/All-Hands-AI/OpenHands/blob/main/CODE_OF_CONDUCT.md#our-standards) to ensure a welcoming and collaborative environment.
|
128 |
+
- If you choose to mute a channel, consider setting up alerts for topics that still interest you to stay engaged. For Slack, Go to Settings → Notifications → My Keywords to add specific keywords that will notify you when mentioned. For example, if you're here for discussions about LLMs, mute the channel if it’s too busy, but set notifications to alert you only when “LLMs” appears in messages. Also for Discord, go to the channel notifications and choose the option that best describes your need.
|
129 |
+
|
130 |
+
## Attribution
|
131 |
+
|
132 |
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
133 |
+
version 2.1, available at
|
134 |
+
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
|
135 |
+
|
136 |
+
Community Impact Guidelines were inspired by
|
137 |
+
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
138 |
+
|
139 |
+
For answers to common questions about this code of conduct, see the FAQ at
|
140 |
+
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
|
141 |
+
[https://www.contributor-covenant.org/translations][translations].
|
142 |
+
|
143 |
+
[homepage]: https://www.contributor-covenant.org
|
144 |
+
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
|
145 |
+
[Mozilla CoC]: https://github.com/mozilla/diversity
|
146 |
+
[FAQ]: https://www.contributor-covenant.org/faq
|
147 |
+
[translations]: https://www.contributor-covenant.org/translations
|
COMMUNITY.md
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 🙌 The OpenHands Community
|
2 |
+
|
3 |
+
The OpenHands community is built around the belief that (1) AI and AI agents are going to fundamentally change the way
|
4 |
+
we build software, and (2) if this is true, we should do everything we can to make sure that the benefits provided by
|
5 |
+
such powerful technology are accessible to everyone.
|
6 |
+
|
7 |
+
If this resonates with you, we'd love to have you join us in our quest!
|
8 |
+
|
9 |
+
## 🤝 How to Join
|
10 |
+
|
11 |
+
Check out our [How to Join the Community section.](https://github.com/All-Hands-AI/OpenHands?tab=readme-ov-file#-how-to-join-the-community)
|
12 |
+
|
13 |
+
## 💪 Becoming a Contributor
|
14 |
+
|
15 |
+
We welcome contributions from everyone! Whether you're a developer, a researcher, or simply enthusiastic about advancing
|
16 |
+
the field of software engineering with AI, there are many ways to get involved:
|
17 |
+
|
18 |
+
- **Code Contributions:** Help us develop new core functionality, improve our agents, improve the frontend and other
|
19 |
+
interfaces, or anything else that would help make OpenHands better.
|
20 |
+
- **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in
|
21 |
+
evaluating the models, or suggest improvements.
|
22 |
+
- **Feedback and Testing:** Use the OpenHands toolset, report bugs, suggest features, or provide feedback on usability.
|
23 |
+
|
24 |
+
For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
|
25 |
+
|
26 |
+
## Code of Conduct
|
27 |
+
|
28 |
+
We have a [Code of Conduct](./CODE_OF_CONDUCT.md) that we expect all contributors to adhere to.
|
29 |
+
Long story short, we are aiming for an open, welcoming, diverse, inclusive, and healthy community.
|
30 |
+
All contributors are expected to contribute to building this sort of community.
|
31 |
+
|
32 |
+
## 🛠️ Becoming a Maintainer
|
33 |
+
|
34 |
+
For contributors who have made significant and sustained contributions to the project, there is a possibility of joining
|
35 |
+
the maintainer team. The process for this is as follows:
|
36 |
+
|
37 |
+
1. Any contributor who has made sustained and high-quality contributions to the codebase can be nominated by any
|
38 |
+
maintainer. If you feel that you may qualify you can reach out to any of the maintainers that have reviewed your PRs and ask if you can be nominated.
|
39 |
+
2. Once a maintainer nominates a new maintainer, there will be a discussion period among the maintainers for at least 3 days.
|
40 |
+
3. If no concerns are raised the nomination will be accepted by acclamation, and if concerns are raised there will be a discussion and possible vote.
|
41 |
+
|
42 |
+
Note that just making many PRs does not immediately imply that you will become a maintainer. We will be looking
|
43 |
+
at sustained high-quality contributions over a period of time, as well as good teamwork and adherence to our [Code of Conduct](./CODE_OF_CONDUCT.md).
|
CONTRIBUTING.md
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Contributing
|
2 |
+
|
3 |
+
Thanks for your interest in contributing to OpenHands! We welcome and appreciate contributions.
|
4 |
+
|
5 |
+
## Understanding OpenHands's CodeBase
|
6 |
+
|
7 |
+
To understand the codebase, please refer to the README in each module:
|
8 |
+
- [frontend](./frontend/README.md)
|
9 |
+
- [evaluation](./evaluation/README.md)
|
10 |
+
- [openhands](./openhands/README.md)
|
11 |
+
- [agenthub](./openhands/agenthub/README.md)
|
12 |
+
- [server](./openhands/server/README.md)
|
13 |
+
|
14 |
+
## Setting up Your Development Environment
|
15 |
+
|
16 |
+
We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
|
17 |
+
|
18 |
+
## How Can I Contribute?
|
19 |
+
|
20 |
+
There are many ways that you can contribute:
|
21 |
+
|
22 |
+
1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
|
23 |
+
2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
|
24 |
+
3. **Improve the Codebase** by sending [PRs](#sending-pull-requests-to-openhands) (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
|
25 |
+
|
26 |
+
## What Can I Build?
|
27 |
+
Here are a few ways you can help improve the codebase.
|
28 |
+
|
29 |
+
#### UI/UX
|
30 |
+
We're always looking to improve the look and feel of the application. If you've got a small fix
|
31 |
+
for something that's bugging you, feel free to open up a PR that changes the [`./frontend`](./frontend) directory.
|
32 |
+
|
33 |
+
If you're looking to make a bigger change, add a new UI element, or significantly alter the style
|
34 |
+
of the application, please open an issue first, or better, join the #frontend channel in our Slack
|
35 |
+
to gather consensus from our design team first.
|
36 |
+
|
37 |
+
#### Improving the agent
|
38 |
+
Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).
|
39 |
+
|
40 |
+
Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
|
41 |
+
You can try modifying the prompts to see how they change the behavior of the agent as you use the app
|
42 |
+
locally, but we will need to do an end-to-end evaluation of any changes here to ensure that the agent
|
43 |
+
is getting better over time.
|
44 |
+
|
45 |
+
We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. You can join the #evaluation
|
46 |
+
channel in Slack to learn more.
|
47 |
+
|
48 |
+
#### Adding a new agent
|
49 |
+
You may want to experiment with building new types of agents. You can add an agent to [`openhands/agenthub`](./openhands/agenthub)
|
50 |
+
to help expand the capabilities of OpenHands.
|
51 |
+
|
52 |
+
#### Adding a new runtime
|
53 |
+
The agent needs a place to run code and commands. When you run OpenHands on your laptop, it uses a Docker container
|
54 |
+
to do this by default. But there are other ways of creating a sandbox for the agent.
|
55 |
+
|
56 |
+
If you work for a company that provides a cloud-based runtime, you could help us add support for that runtime
|
57 |
+
by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/base.py).
|
58 |
+
|
59 |
+
#### Testing
|
60 |
+
When you write code, it is also good to write tests. Please navigate to the [`./tests`](./tests) folder to see existing test suites.
|
61 |
+
At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integration`](./evaluation/integration_tests). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
|
62 |
+
|
63 |
+
## Sending Pull Requests to OpenHands
|
64 |
+
|
65 |
+
You'll need to fork our repository to send us a Pull Request. You can learn more
|
66 |
+
about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).
|
67 |
+
|
68 |
+
### Pull Request title
|
69 |
+
As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:
|
70 |
+
|
71 |
+
- `feat`: A new feature
|
72 |
+
- `fix`: A bug fix
|
73 |
+
- `docs`: Documentation only changes
|
74 |
+
- `style`: Changes that do not affect the meaning of the code (white space, formatting, missing semicolons, etc.)
|
75 |
+
- `refactor`: A code change that neither fixes a bug nor adds a feature
|
76 |
+
- `perf`: A code change that improves performance
|
77 |
+
- `test`: Adding missing tests or correcting existing tests
|
78 |
+
- `build`: Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm)
|
79 |
+
- `ci`: Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs)
|
80 |
+
- `chore`: Other changes that don't modify src or test files
|
81 |
+
- `revert`: Reverts a previous commit
|
82 |
+
|
83 |
+
For example, a PR title could be:
|
84 |
+
- `refactor: modify package path`
|
85 |
+
- `feat(frontend): xxxx`, where `(frontend)` means that this PR mainly focuses on the frontend component.
|
86 |
+
|
87 |
+
You may also check out previous PRs in the [PR list](https://github.com/All-Hands-AI/OpenHands/pulls).
|
88 |
+
|
89 |
+
### Pull Request description
|
90 |
+
- If your PR is small (such as a typo fix), you can go brief.
|
91 |
+
- If it contains a lot of changes, it's better to write more details.
|
92 |
+
|
93 |
+
If your changes are user-facing (e.g. a new feature in the UI, a change in behavior, or a bugfix)
|
94 |
+
please include a short message that we can add to our changelog.
|
95 |
+
|
96 |
+
## How to Make Effective Contributions
|
97 |
+
|
98 |
+
### Opening Issues
|
99 |
+
|
100 |
+
If you notice any bugs or have any feature requests please open them via the [issues page](https://github.com/All-Hands-AI/OpenHands/issues). We will triage based on how critical the bug is or how potentially useful the improvement is, discuss, and implement the ones that the community has interest/effort for.
|
101 |
+
|
102 |
+
Further, if you see an issue you like, please leave a "thumbs-up" or a comment, which will help us prioritize.
|
103 |
+
|
104 |
+
### Making Pull Requests
|
105 |
+
|
106 |
+
We're generally happy to consider all pull requests with the evaluation process varying based on the type of change:
|
107 |
+
|
108 |
+
#### For Small Improvements
|
109 |
+
|
110 |
+
Small improvements with few downsides are typically reviewed and approved quickly.
|
111 |
+
One thing to check when making changes is to ensure that all continuous integration tests pass, which you can check before getting a review.
|
112 |
+
|
113 |
+
#### For Core Agent Changes
|
114 |
+
|
115 |
+
We need to be more careful with changes to the core agent, as it is imperative to maintain high quality. These PRs are evaluated based on three key metrics:
|
116 |
+
|
117 |
+
1. **Accuracy**
|
118 |
+
2. **Efficiency**
|
119 |
+
3. **Code Complexity**
|
120 |
+
|
121 |
+
If it improves accuracy, efficiency, or both with only a minimal change to code quality, that's great we're happy to merge it in!
|
122 |
+
If there are bigger tradeoffs (e.g. helping efficiency a lot and hurting accuracy a little) we might want to put it behind a feature flag.
|
123 |
+
Either way, please feel free to discuss on github issues or slack, and we will give guidance and preliminary feedback.
|
CREDITS.md
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Credits
|
2 |
+
|
3 |
+
## Contributors
|
4 |
+
|
5 |
+
We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. We greatly appreciate your dedication and hard work.
|
6 |
+
|
7 |
+
## Open Source Projects
|
8 |
+
|
9 |
+
OpenHands includes and adapts the following open source projects. We are grateful for their contributions to the open source community:
|
10 |
+
|
11 |
+
#### [SWE Agent](https://github.com/princeton-nlp/swe-agent)
|
12 |
+
- License: MIT License
|
13 |
+
- Description: Adapted for use in OpenHands's agent hub
|
14 |
+
|
15 |
+
#### [Aider](https://github.com/paul-gauthier/aider)
|
16 |
+
- License: Apache License 2.0
|
17 |
+
- Description: AI pair programming tool. OpenHands has adapted and integrated its linter module for code-related tasks in [`agentskills utilities`](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/agent_skills/utils/aider)
|
18 |
+
|
19 |
+
#### [BrowserGym](https://github.com/ServiceNow/BrowserGym)
|
20 |
+
- License: Apache License 2.0
|
21 |
+
- Description: Adapted in implementing the browsing agent
|
22 |
+
|
23 |
+
|
24 |
+
### Reference Implementations for Evaluation Benchmarks
|
25 |
+
OpenHands integrates code of the reference implementations for the following agent evaluation benchmarks:
|
26 |
+
|
27 |
+
#### [HumanEval](https://github.com/openai/human-eval)
|
28 |
+
- License: MIT License
|
29 |
+
|
30 |
+
#### [DSP](https://github.com/microsoft/DataScienceProblems)
|
31 |
+
- License: MIT License
|
32 |
+
|
33 |
+
#### [HumanEvalPack](https://github.com/bigcode-project/bigcode-evaluation-harness)
|
34 |
+
- License: Apache License 2.0
|
35 |
+
|
36 |
+
#### [AgentBench](https://github.com/THUDM/AgentBench)
|
37 |
+
- License: Apache License 2.0
|
38 |
+
|
39 |
+
#### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench)
|
40 |
+
- License: MIT License
|
41 |
+
|
42 |
+
#### [BIRD](https://bird-bench.github.io/)
|
43 |
+
- License: MIT License
|
44 |
+
- Dataset: CC-BY-SA 4.0
|
45 |
+
|
46 |
+
#### [Gorilla APIBench](https://github.com/ShishirPatil/gorilla)
|
47 |
+
- License: Apache License 2.0
|
48 |
+
|
49 |
+
#### [GPQA](https://github.com/idavidrein/gpqa)
|
50 |
+
- License: MIT License
|
51 |
+
|
52 |
+
#### [ProntoQA](https://github.com/asaparov/prontoqa)
|
53 |
+
- License: Apache License 2.0
|
54 |
+
|
55 |
+
|
56 |
+
## Open Source licenses
|
57 |
+
|
58 |
+
### MIT License
|
59 |
+
|
60 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
61 |
+
|
62 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
63 |
+
|
64 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
65 |
+
|
66 |
+
### BSD 3-Clause License
|
67 |
+
|
68 |
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
69 |
+
|
70 |
+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
71 |
+
|
72 |
+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
73 |
+
|
74 |
+
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
75 |
+
|
76 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
77 |
+
|
78 |
+
### Apache License 2.0
|
79 |
+
|
80 |
+
|
81 |
+
Apache License
|
82 |
+
Version 2.0, January 2004
|
83 |
+
http://www.apache.org/licenses/
|
84 |
+
|
85 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
86 |
+
|
87 |
+
1. Definitions.
|
88 |
+
|
89 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
90 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
91 |
+
|
92 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
93 |
+
the copyright owner that is granting the License.
|
94 |
+
|
95 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
96 |
+
other entities that control, are controlled by, or are under common
|
97 |
+
control with that entity. For the purposes of this definition,
|
98 |
+
"control" means (i) the power, direct or indirect, to cause the
|
99 |
+
direction or management of such entity, whether by contract or
|
100 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
101 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
102 |
+
|
103 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
104 |
+
exercising permissions granted by this License.
|
105 |
+
|
106 |
+
"Source" form shall mean the preferred form for making modifications,
|
107 |
+
including but not limited to software source code, documentation
|
108 |
+
source, and configuration files.
|
109 |
+
|
110 |
+
"Object" form shall mean any form resulting from mechanical
|
111 |
+
transformation or translation of a Source form, including but
|
112 |
+
not limited to compiled object code, generated documentation,
|
113 |
+
and conversions to other media types.
|
114 |
+
|
115 |
+
"Work" shall mean the work of authorship, whether in Source or
|
116 |
+
Object form, made available under the License, as indicated by a
|
117 |
+
copyright notice that is included in or attached to the work
|
118 |
+
(an example is provided in the Appendix below).
|
119 |
+
|
120 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
121 |
+
form, that is based on (or derived from) the Work and for which the
|
122 |
+
editorial revisions, annotations, elaborations, or other modifications
|
123 |
+
represent, as a whole, an original work of authorship. For the purposes
|
124 |
+
of this License, Derivative Works shall not include works that remain
|
125 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
126 |
+
the Work and Derivative Works thereof.
|
127 |
+
|
128 |
+
"Contribution" shall mean any work of authorship, including
|
129 |
+
the original version of the Work and any modifications or additions
|
130 |
+
to that Work or Derivative Works thereof, that is intentionally
|
131 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
132 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
133 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
134 |
+
means any form of electronic, verbal, or written communication sent
|
135 |
+
to the Licensor or its representatives, including but not limited to
|
136 |
+
communication on electronic mailing lists, source code control systems,
|
137 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
138 |
+
Licensor for the purpose of discussing and improving the Work, but
|
139 |
+
excluding communication that is conspicuously marked or otherwise
|
140 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
141 |
+
|
142 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
143 |
+
on behalf of whom a Contribution has been received by Licensor and
|
144 |
+
subsequently incorporated within the Work.
|
145 |
+
|
146 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
147 |
+
this License, each Contributor hereby grants to You a perpetual,
|
148 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
149 |
+
copyright license to reproduce, prepare Derivative Works of,
|
150 |
+
publicly display, publicly perform, sublicense, and distribute the
|
151 |
+
Work and such Derivative Works in Source or Object form.
|
152 |
+
|
153 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
154 |
+
this License, each Contributor hereby grants to You a perpetual,
|
155 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
156 |
+
(except as stated in this section) patent license to make, have made,
|
157 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
158 |
+
where such license applies only to those patent claims licensable
|
159 |
+
by such Contributor that are necessarily infringed by their
|
160 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
161 |
+
with the Work to which such Contribution(s) was submitted. If You
|
162 |
+
institute patent litigation against any entity (including a
|
163 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
164 |
+
or a Contribution incorporated within the Work constitutes direct
|
165 |
+
or contributory patent infringement, then any patent licenses
|
166 |
+
granted to You under this License for that Work shall terminate
|
167 |
+
as of the date such litigation is filed.
|
168 |
+
|
169 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
170 |
+
Work or Derivative Works thereof in any medium, with or without
|
171 |
+
modifications, and in Source or Object form, provided that You
|
172 |
+
meet the following conditions:
|
173 |
+
|
174 |
+
(a) You must give any other recipients of the Work or
|
175 |
+
Derivative Works a copy of this License; and
|
176 |
+
|
177 |
+
(b) You must cause any modified files to carry prominent notices
|
178 |
+
stating that You changed the files; and
|
179 |
+
|
180 |
+
(c) You must retain, in the Source form of any Derivative Works
|
181 |
+
that You distribute, all copyright, patent, trademark, and
|
182 |
+
attribution notices from the Source form of the Work,
|
183 |
+
excluding those notices that do not pertain to any part of
|
184 |
+
the Derivative Works; and
|
185 |
+
|
186 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
187 |
+
distribution, then any Derivative Works that You distribute must
|
188 |
+
include a readable copy of the attribution notices contained
|
189 |
+
within such NOTICE file, excluding those notices that do not
|
190 |
+
pertain to any part of the Derivative Works, in at least one
|
191 |
+
of the following places: within a NOTICE text file distributed
|
192 |
+
as part of the Derivative Works; within the Source form or
|
193 |
+
documentation, if provided along with the Derivative Works; or,
|
194 |
+
within a display generated by the Derivative Works, if and
|
195 |
+
wherever such third-party notices normally appear. The contents
|
196 |
+
of the NOTICE file are for informational purposes only and
|
197 |
+
do not modify the License. You may add Your own attribution
|
198 |
+
notices within Derivative Works that You distribute, alongside
|
199 |
+
or as an addendum to the NOTICE text from the Work, provided
|
200 |
+
that such additional attribution notices cannot be construed
|
201 |
+
as modifying the License.
|
202 |
+
|
203 |
+
You may add Your own copyright statement to Your modifications and
|
204 |
+
may provide additional or different license terms and conditions
|
205 |
+
for use, reproduction, or distribution of Your modifications, or
|
206 |
+
for any such Derivative Works as a whole, provided Your use,
|
207 |
+
reproduction, and distribution of the Work otherwise complies with
|
208 |
+
the conditions stated in this License.
|
209 |
+
|
210 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
211 |
+
any Contribution intentionally submitted for inclusion in the Work
|
212 |
+
by You to the Licensor shall be under the terms and conditions of
|
213 |
+
this License, without any additional terms or conditions.
|
214 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
215 |
+
the terms of any separate license agreement you may have executed
|
216 |
+
with Licensor regarding such Contributions.
|
217 |
+
|
218 |
+
6. Trademarks. This License does not grant permission to use the trade
|
219 |
+
names, trademarks, service marks, or product names of the Licensor,
|
220 |
+
except as required for reasonable and customary use in describing the
|
221 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
222 |
+
|
223 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
224 |
+
agreed to in writing, Licensor provides the Work (and each
|
225 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
226 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
227 |
+
implied, including, without limitation, any warranties or conditions
|
228 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
229 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
230 |
+
appropriateness of using or redistributing the Work and assume any
|
231 |
+
risks associated with Your exercise of permissions under this License.
|
232 |
+
|
233 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
234 |
+
whether in tort (including negligence), contract, or otherwise,
|
235 |
+
unless required by applicable law (such as deliberate and grossly
|
236 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
237 |
+
liable to You for damages, including any direct, indirect, special,
|
238 |
+
incidental, or consequential damages of any character arising as a
|
239 |
+
result of this License or out of the use or inability to use the
|
240 |
+
Work (including but not limited to damages for loss of goodwill,
|
241 |
+
work stoppage, computer failure or malfunction, or any and all
|
242 |
+
other commercial damages or losses), even if such Contributor
|
243 |
+
has been advised of the possibility of such damages.
|
244 |
+
|
245 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
246 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
247 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
248 |
+
or other liability obligations and/or rights consistent with this
|
249 |
+
License. However, in accepting such obligations, You may act only
|
250 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
251 |
+
of any other Contributor, and only if You agree to indemnify,
|
252 |
+
defend, and hold each Contributor harmless for any liability
|
253 |
+
incurred by, or claims asserted against, such Contributor by reason
|
254 |
+
of your accepting any such warranty or additional liability.
|
255 |
+
|
256 |
+
END OF TERMS AND CONDITIONS
|
257 |
+
|
258 |
+
APPENDIX: How to apply the Apache License to your work.
|
259 |
+
|
260 |
+
To apply the Apache License to your work, attach the following
|
261 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
262 |
+
replaced with your own identifying information. (Don't include
|
263 |
+
the brackets!) The text should be enclosed in the appropriate
|
264 |
+
comment syntax for the file format. We also recommend that a
|
265 |
+
file or class name and description of purpose be included on the
|
266 |
+
same "printed page" as the copyright notice for easier
|
267 |
+
identification within third-party archives.
|
268 |
+
|
269 |
+
Copyright [yyyy] [name of copyright owner]
|
270 |
+
|
271 |
+
|
272 |
+
|
273 |
+
### Non-Open Source Reference Implementations
|
274 |
+
|
275 |
+
#### [MultiPL-E](https://github.com/nuprl/MultiPL-E)
|
276 |
+
- License: BSD 3-Clause License with Machine Learning Restriction
|
277 |
+
|
278 |
+
BSD 3-Clause License with Machine Learning Restriction
|
279 |
+
|
280 |
+
Copyright (c) 2022, Northeastern University, Oberlin College, Roblox Inc,
|
281 |
+
Stevens Institute of Technology, University of Massachusetts Amherst, and
|
282 |
+
Wellesley College.
|
283 |
+
|
284 |
+
All rights reserved.
|
285 |
+
|
286 |
+
Redistribution and use in source and binary forms, with or without
|
287 |
+
modification, are permitted provided that the following conditions are met:
|
288 |
+
|
289 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
290 |
+
list of conditions and the following disclaimer.
|
291 |
+
|
292 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
293 |
+
this list of conditions and the following disclaimer in the documentation
|
294 |
+
and/or other materials provided with the distribution.
|
295 |
+
|
296 |
+
3. Neither the name of the copyright holder nor the names of its
|
297 |
+
contributors may be used to endorse or promote products derived from
|
298 |
+
this software without specific prior written permission.
|
299 |
+
|
300 |
+
4. The contents of this repository may not be used as training data for any
|
301 |
+
machine learning model, including but not limited to neural networks.
|
302 |
+
|
303 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
304 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
305 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
306 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
307 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
308 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
309 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
310 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
311 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
312 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
Development.md
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Development Guide
|
2 |
+
This guide is for people working on OpenHands and editing the source code.
|
3 |
+
If you wish to contribute your changes, check out the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) on how to clone and set up the project initially before moving on.
|
4 |
+
Otherwise, you can clone the OpenHands project directly.
|
5 |
+
|
6 |
+
## Start the Server for Development
|
7 |
+
### 1. Requirements
|
8 |
+
* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [Ubuntu >= 22.04]
|
9 |
+
* [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
|
10 |
+
* [Python](https://www.python.org/downloads/) = 3.12
|
11 |
+
* [NodeJS](https://nodejs.org/en/download/package-manager) >= 20.x
|
12 |
+
* [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
|
13 |
+
* OS-specific dependencies:
|
14 |
+
- Ubuntu: build-essential => `sudo apt-get install build-essential`
|
15 |
+
- WSL: netcat => `sudo apt-get install netcat`
|
16 |
+
|
17 |
+
Make sure you have all these dependencies installed before moving on to `make build`.
|
18 |
+
|
19 |
+
#### Develop without sudo access
|
20 |
+
If you want to develop without system admin/sudo access to upgrade/install `Python` and/or `NodeJs`, you can use `conda` or `mamba` to manage the packages for you:
|
21 |
+
|
22 |
+
```bash
|
23 |
+
# Download and install Mamba (a faster version of conda)
|
24 |
+
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
|
25 |
+
bash Miniforge3-$(uname)-$(uname -m).sh
|
26 |
+
|
27 |
+
# Install Python 3.12, nodejs, and poetry
|
28 |
+
mamba install python=3.12
|
29 |
+
mamba install conda-forge::nodejs
|
30 |
+
mamba install conda-forge::poetry
|
31 |
+
```
|
32 |
+
|
33 |
+
### 2. Build and Setup The Environment
|
34 |
+
Begin by building the project which includes setting up the environment and installing dependencies. This step ensures that OpenHands is ready to run on your system:
|
35 |
+
|
36 |
+
```bash
|
37 |
+
make build
|
38 |
+
```
|
39 |
+
|
40 |
+
### 3. Configuring the Language Model
|
41 |
+
OpenHands supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library.
|
42 |
+
By default, we've chosen Claude Sonnet 3.5 as our go-to model, but the world is your oyster! You can unleash the
|
43 |
+
potential of any other LM that piques your interest.
|
44 |
+
|
45 |
+
To configure the LM of your choice, run:
|
46 |
+
|
47 |
+
```bash
|
48 |
+
make setup-config
|
49 |
+
```
|
50 |
+
|
51 |
+
This command will prompt you to enter the LLM API key, model name, and other variables, ensuring that OpenHands is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
|
52 |
+
|
53 |
+
Note: If you have previously run OpenHands using the docker command, you may have already set some environmental variables in your terminal. The final configurations are set from highest to lowest priority:
|
54 |
+
Environment variables > config.toml variables > default variables
|
55 |
+
|
56 |
+
**Note on Alternative Models:**
|
57 |
+
See [our documentation](https://docs.all-hands.dev/modules/usage/llms) for recommended models.
|
58 |
+
|
59 |
+
### 4. Running the application
|
60 |
+
#### Option A: Run the Full Application
|
61 |
+
Once the setup is complete, this command starts both the backend and frontend servers, allowing you to interact with OpenHands:
|
62 |
+
```bash
|
63 |
+
make run
|
64 |
+
```
|
65 |
+
|
66 |
+
#### Option B: Individual Server Startup
|
67 |
+
- **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on backend-related tasks or configurations.
|
68 |
+
```bash
|
69 |
+
make start-backend
|
70 |
+
```
|
71 |
+
|
72 |
+
- **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related components or interface enhancements.
|
73 |
+
```bash
|
74 |
+
make start-frontend
|
75 |
+
```
|
76 |
+
|
77 |
+
### 6. LLM Debugging
|
78 |
+
If you encounter any issues with the Language Model (LM) or you're simply curious, export `DEBUG=1` in the environment and restart the backend.
|
79 |
+
OpenHands will log the prompts and responses in the logs/llm/CURRENT_DATE directory, allowing you to identify the causes.
|
80 |
+
|
81 |
+
### 7. Help
|
82 |
+
Need help or info on available targets and commands? Use the help command for all the guidance you need with OpenHands.
|
83 |
+
```bash
|
84 |
+
make help
|
85 |
+
```
|
86 |
+
|
87 |
+
### 8. Testing
|
88 |
+
To run tests, refer to the following:
|
89 |
+
#### Unit tests
|
90 |
+
|
91 |
+
```bash
|
92 |
+
poetry run pytest ./tests/unit/test_*.py
|
93 |
+
```
|
94 |
+
|
95 |
+
### 9. Add or update dependency
|
96 |
+
1. Add your dependency in `pyproject.toml` or use `poetry add xxx`.
|
97 |
+
2. Update the poetry.lock file via `poetry lock --no-update`.
|
98 |
+
|
99 |
+
### 10. Use existing Docker image
|
100 |
+
To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
|
101 |
+
setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
|
102 |
+
|
103 |
+
Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.20-nikolaik`
|
104 |
+
|
105 |
+
## Develop inside Docker container
|
106 |
+
|
107 |
+
TL;DR
|
108 |
+
|
109 |
+
```bash
|
110 |
+
make docker-dev
|
111 |
+
```
|
112 |
+
|
113 |
+
See more details [here](./containers/dev/README.md).
|
114 |
+
|
115 |
+
If you are just interested in running `OpenHands` without installing all the required tools on your host, run:
|
116 |
+
|
117 |
+
```bash
|
118 |
+
make docker-run
|
119 |
+
```
|
120 |
+
|
121 |
+
If you do not have `make` on your host, run:
|
122 |
+
|
123 |
+
```bash
|
124 |
+
cd ./containers/dev
|
125 |
+
./dev.sh
|
126 |
+
```
|
127 |
+
|
128 |
+
You do need [Docker](https://docs.docker.com/engine/install/) installed on your host though.
|
Dockerfile
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ARG OPENHANDS_BUILD_VERSION=dev
|
2 |
+
FROM node:21.7.2-bookworm-slim AS frontend-builder
|
3 |
+
|
4 |
+
WORKDIR /app
|
5 |
+
|
6 |
+
COPY ./frontend/package.json frontend/package-lock.json ./
|
7 |
+
RUN npm install -g [email protected]
|
8 |
+
RUN npm ci
|
9 |
+
|
10 |
+
COPY ./frontend ./
|
11 |
+
RUN npm run build
|
12 |
+
|
13 |
+
FROM python:3.12.3-slim AS backend-builder
|
14 |
+
|
15 |
+
WORKDIR /app
|
16 |
+
ENV PYTHONPATH='/app'
|
17 |
+
|
18 |
+
ENV POETRY_NO_INTERACTION=1 \
|
19 |
+
POETRY_VIRTUALENVS_IN_PROJECT=1 \
|
20 |
+
POETRY_VIRTUALENVS_CREATE=1 \
|
21 |
+
POETRY_CACHE_DIR=/tmp/poetry_cache
|
22 |
+
|
23 |
+
RUN apt-get update -y \
|
24 |
+
&& apt-get install -y curl make git build-essential \
|
25 |
+
&& python3 -m pip install poetry==1.8.2 --break-system-packages
|
26 |
+
|
27 |
+
COPY ./pyproject.toml ./poetry.lock ./
|
28 |
+
RUN touch README.md
|
29 |
+
RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR
|
30 |
+
|
31 |
+
FROM python:3.12.3-slim AS openhands-app
|
32 |
+
|
33 |
+
WORKDIR /app
|
34 |
+
|
35 |
+
ARG OPENHANDS_BUILD_VERSION #re-declare for this section
|
36 |
+
|
37 |
+
ENV RUN_AS_OPENHANDS=true
|
38 |
+
# A random number--we need this to be different from the user's UID on the host machine
|
39 |
+
ENV OPENHANDS_USER_ID=42420
|
40 |
+
ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
|
41 |
+
ENV USE_HOST_NETWORK=false
|
42 |
+
ENV WORKSPACE_BASE=/opt/workspace_base
|
43 |
+
ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
|
44 |
+
ENV SANDBOX_USER_ID=0
|
45 |
+
ENV FILE_STORE=local
|
46 |
+
ENV FILE_STORE_PATH=/.openhands-state
|
47 |
+
RUN mkdir -p $FILE_STORE_PATH
|
48 |
+
RUN mkdir -p $WORKSPACE_BASE
|
49 |
+
|
50 |
+
RUN apt-get update -y \
|
51 |
+
&& apt-get install -y curl ssh sudo
|
52 |
+
|
53 |
+
# Default is 1000, but OSX is often 501
|
54 |
+
RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
|
55 |
+
# Default is 60000, but we've seen up to 200000
|
56 |
+
RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs
|
57 |
+
|
58 |
+
RUN groupadd app
|
59 |
+
RUN useradd -l -m -u $OPENHANDS_USER_ID -s /bin/bash openhands && \
|
60 |
+
usermod -aG app openhands && \
|
61 |
+
usermod -aG sudo openhands && \
|
62 |
+
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
|
63 |
+
RUN chown -R openhands:app /app && chmod -R 770 /app
|
64 |
+
RUN sudo chown -R openhands:app $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
|
65 |
+
USER openhands
|
66 |
+
|
67 |
+
ENV VIRTUAL_ENV=/app/.venv \
|
68 |
+
PATH="/app/.venv/bin:$PATH" \
|
69 |
+
PYTHONPATH='/app'
|
70 |
+
|
71 |
+
COPY --chown=openhands:app --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
|
72 |
+
RUN playwright install --with-deps chromium
|
73 |
+
|
74 |
+
COPY --chown=openhands:app --chmod=770 ./microagents ./microagents
|
75 |
+
COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
|
76 |
+
COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
|
77 |
+
COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
|
78 |
+
COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
|
79 |
+
COPY --chown=openhands:app ./poetry.lock ./poetry.lock
|
80 |
+
COPY --chown=openhands:app ./README.md ./README.md
|
81 |
+
COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
|
82 |
+
COPY --chown=openhands:app ./LICENSE ./LICENSE
|
83 |
+
|
84 |
+
# This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
|
85 |
+
RUN python openhands/core/download.py # No-op to download assets
|
86 |
+
# Add this line to set group ownership of all files/directories not already in "app" group
|
87 |
+
# openhands:openhands -> openhands:app
|
88 |
+
RUN find /app \! -group app -exec chgrp app {} +
|
89 |
+
|
90 |
+
COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
|
91 |
+
COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
|
92 |
+
|
93 |
+
USER root
|
94 |
+
|
95 |
+
WORKDIR /app
|
96 |
+
|
97 |
+
ENTRYPOINT ["/app/entrypoint.sh"]
|
98 |
+
CMD ["uvicorn", "openhands.server.listen:app", "--host", "0.0.0.0", "--port", "3000"]
|
ISSUE_TRIAGE.md
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Issue Triage
|
2 |
+
These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.
|
3 |
+
|
4 |
+
## General
|
5 |
+
* Most issues must be tagged with **enhancement** or **bug**.
|
6 |
+
* Issues may be tagged with what they relate to (**backend**, **frontend**, **agent quality**, etc.).
|
7 |
+
|
8 |
+
## Severity
|
9 |
+
* **Low**: Minor issues or issues affecting a single user.
|
10 |
+
* **Medium**: Affecting multiple users.
|
11 |
+
* **Critical**: Affecting all users or potential security issues.
|
12 |
+
|
13 |
+
## Effort
|
14 |
+
* Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**).
|
15 |
+
|
16 |
+
## Difficulty
|
17 |
+
* Issues with low implementation difficulty may be tagged with **good first issue**.
|
18 |
+
|
19 |
+
## Not Enough Information
|
20 |
+
* User is asked to provide more information (logs, how to reproduce, etc.) when the issue is not clear.
|
21 |
+
* If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (Usually after a week).
|
22 |
+
|
23 |
+
## Multiple Requests/Fixes in One Issue
|
24 |
+
* These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed.
|
25 |
+
* Issues may be broken down into multiple issues if required.
|
LICENSE
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
The MIT License (MIT)
|
2 |
+
=====================
|
3 |
+
|
4 |
+
Copyright © 2023
|
5 |
+
|
6 |
+
Permission is hereby granted, free of charge, to any person
|
7 |
+
obtaining a copy of this software and associated documentation
|
8 |
+
files (the “Software”), to deal in the Software without
|
9 |
+
restriction, including without limitation the rights to use,
|
10 |
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11 |
+
copies of the Software, and to permit persons to whom the
|
12 |
+
Software is furnished to do so, subject to the following
|
13 |
+
conditions:
|
14 |
+
|
15 |
+
The above copyright notice and this permission notice shall be
|
16 |
+
included in all copies or substantial portions of the Software.
|
17 |
+
|
18 |
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
|
19 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
20 |
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21 |
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
22 |
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
23 |
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
24 |
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
25 |
+
OTHER DEALINGS IN THE SOFTWARE.
|
MANIFEST.in
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Exclude all Python bytecode files
|
2 |
+
global-exclude *.pyc
|
3 |
+
|
4 |
+
# Exclude Python cache directories
|
5 |
+
global-exclude __pycache__
|
Makefile
ADDED
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
SHELL=/bin/bash
|
2 |
+
# Makefile for OpenHands project
|
3 |
+
|
4 |
+
# Variables
|
5 |
+
BACKEND_HOST ?= "127.0.0.1"
|
6 |
+
BACKEND_PORT = 3000
|
7 |
+
BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
|
8 |
+
FRONTEND_PORT = 3001
|
9 |
+
DEFAULT_WORKSPACE_DIR = "./workspace"
|
10 |
+
DEFAULT_MODEL = "gpt-4o"
|
11 |
+
CONFIG_FILE = config.toml
|
12 |
+
PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
|
13 |
+
PYTHON_VERSION = 3.12
|
14 |
+
|
15 |
+
# ANSI color codes
|
16 |
+
GREEN=$(shell tput -Txterm setaf 2)
|
17 |
+
YELLOW=$(shell tput -Txterm setaf 3)
|
18 |
+
RED=$(shell tput -Txterm setaf 1)
|
19 |
+
BLUE=$(shell tput -Txterm setaf 6)
|
20 |
+
RESET=$(shell tput -Txterm sgr0)
|
21 |
+
|
22 |
+
# Build
|
23 |
+
build:
|
24 |
+
@echo "$(GREEN)Building project...$(RESET)"
|
25 |
+
@$(MAKE) -s check-dependencies
|
26 |
+
@$(MAKE) -s install-python-dependencies
|
27 |
+
@$(MAKE) -s install-frontend-dependencies
|
28 |
+
@$(MAKE) -s install-pre-commit-hooks
|
29 |
+
@$(MAKE) -s build-frontend
|
30 |
+
@echo "$(GREEN)Build completed successfully.$(RESET)"
|
31 |
+
|
32 |
+
check-dependencies:
|
33 |
+
@echo "$(YELLOW)Checking dependencies...$(RESET)"
|
34 |
+
@$(MAKE) -s check-system
|
35 |
+
@$(MAKE) -s check-python
|
36 |
+
@$(MAKE) -s check-npm
|
37 |
+
@$(MAKE) -s check-nodejs
|
38 |
+
ifeq ($(INSTALL_DOCKER),)
|
39 |
+
@$(MAKE) -s check-docker
|
40 |
+
endif
|
41 |
+
@$(MAKE) -s check-poetry
|
42 |
+
@echo "$(GREEN)Dependencies checked successfully.$(RESET)"
|
43 |
+
|
44 |
+
check-system:
|
45 |
+
@echo "$(YELLOW)Checking system...$(RESET)"
|
46 |
+
@if [ "$(shell uname)" = "Darwin" ]; then \
|
47 |
+
echo "$(BLUE)macOS detected.$(RESET)"; \
|
48 |
+
elif [ "$(shell uname)" = "Linux" ]; then \
|
49 |
+
if [ -f "/etc/manjaro-release" ]; then \
|
50 |
+
echo "$(BLUE)Manjaro Linux detected.$(RESET)"; \
|
51 |
+
else \
|
52 |
+
echo "$(BLUE)Linux detected.$(RESET)"; \
|
53 |
+
fi; \
|
54 |
+
elif [ "$$(uname -r | grep -i microsoft)" ]; then \
|
55 |
+
echo "$(BLUE)Windows Subsystem for Linux detected.$(RESET)"; \
|
56 |
+
else \
|
57 |
+
echo "$(RED)Unsupported system detected. Please use macOS, Linux, or Windows Subsystem for Linux (WSL).$(RESET)"; \
|
58 |
+
exit 1; \
|
59 |
+
fi
|
60 |
+
|
61 |
+
check-python:
|
62 |
+
@echo "$(YELLOW)Checking Python installation...$(RESET)"
|
63 |
+
@if command -v python$(PYTHON_VERSION) > /dev/null; then \
|
64 |
+
echo "$(BLUE)$(shell python$(PYTHON_VERSION) --version) is already installed.$(RESET)"; \
|
65 |
+
else \
|
66 |
+
echo "$(RED)Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
|
67 |
+
exit 1; \
|
68 |
+
fi
|
69 |
+
|
70 |
+
check-npm:
|
71 |
+
@echo "$(YELLOW)Checking npm installation...$(RESET)"
|
72 |
+
@if command -v npm > /dev/null; then \
|
73 |
+
echo "$(BLUE)npm $(shell npm --version) is already installed.$(RESET)"; \
|
74 |
+
else \
|
75 |
+
echo "$(RED)npm is not installed. Please install Node.js to continue.$(RESET)"; \
|
76 |
+
exit 1; \
|
77 |
+
fi
|
78 |
+
|
79 |
+
check-nodejs:
|
80 |
+
@echo "$(YELLOW)Checking Node.js installation...$(RESET)"
|
81 |
+
@if command -v node > /dev/null; then \
|
82 |
+
NODE_VERSION=$(shell node --version | sed -E 's/v//g'); \
|
83 |
+
IFS='.' read -r -a NODE_VERSION_ARRAY <<< "$$NODE_VERSION"; \
|
84 |
+
if [ "$${NODE_VERSION_ARRAY[0]}" -ge 20 ]; then \
|
85 |
+
echo "$(BLUE)Node.js $$NODE_VERSION is already installed.$(RESET)"; \
|
86 |
+
else \
|
87 |
+
echo "$(RED)Node.js 20.x or later is required. Please install Node.js 20.x or later to continue.$(RESET)"; \
|
88 |
+
exit 1; \
|
89 |
+
fi; \
|
90 |
+
else \
|
91 |
+
echo "$(RED)Node.js is not installed. Please install Node.js to continue.$(RESET)"; \
|
92 |
+
exit 1; \
|
93 |
+
fi
|
94 |
+
|
95 |
+
check-docker:
|
96 |
+
@echo "$(YELLOW)Checking Docker installation...$(RESET)"
|
97 |
+
@if command -v docker > /dev/null; then \
|
98 |
+
echo "$(BLUE)$(shell docker --version) is already installed.$(RESET)"; \
|
99 |
+
else \
|
100 |
+
echo "$(RED)Docker is not installed. Please install Docker to continue.$(RESET)"; \
|
101 |
+
exit 1; \
|
102 |
+
fi
|
103 |
+
|
104 |
+
check-poetry:
|
105 |
+
@echo "$(YELLOW)Checking Poetry installation...$(RESET)"
|
106 |
+
@if command -v poetry > /dev/null; then \
|
107 |
+
POETRY_VERSION=$(shell poetry --version 2>&1 | sed -E 's/Poetry \(version ([0-9]+\.[0-9]+\.[0-9]+)\)/\1/'); \
|
108 |
+
IFS='.' read -r -a POETRY_VERSION_ARRAY <<< "$$POETRY_VERSION"; \
|
109 |
+
if [ $${POETRY_VERSION_ARRAY[0]} -gt 1 ] || ([ $${POETRY_VERSION_ARRAY[0]} -eq 1 ] && [ $${POETRY_VERSION_ARRAY[1]} -ge 8 ]); then \
|
110 |
+
echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"; \
|
111 |
+
else \
|
112 |
+
echo "$(RED)Poetry 1.8 or later is required. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
|
113 |
+
echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
|
114 |
+
echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
|
115 |
+
exit 1; \
|
116 |
+
fi; \
|
117 |
+
else \
|
118 |
+
echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
|
119 |
+
echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
|
120 |
+
echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
|
121 |
+
exit 1; \
|
122 |
+
fi
|
123 |
+
|
124 |
+
install-python-dependencies:
|
125 |
+
@echo "$(GREEN)Installing Python dependencies...$(RESET)"
|
126 |
+
@if [ -z "${TZ}" ]; then \
|
127 |
+
echo "Defaulting TZ (timezone) to UTC"; \
|
128 |
+
export TZ="UTC"; \
|
129 |
+
fi
|
130 |
+
poetry env use python$(PYTHON_VERSION)
|
131 |
+
@if [ "$(shell uname)" = "Darwin" ]; then \
|
132 |
+
echo "$(BLUE)Installing chroma-hnswlib...$(RESET)"; \
|
133 |
+
export HNSWLIB_NO_NATIVE=1; \
|
134 |
+
poetry run pip install chroma-hnswlib; \
|
135 |
+
fi
|
136 |
+
@poetry install --without llama-index
|
137 |
+
@if [ -f "/etc/manjaro-release" ]; then \
|
138 |
+
echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
|
139 |
+
poetry run pip install playwright; \
|
140 |
+
poetry run playwright install chromium; \
|
141 |
+
else \
|
142 |
+
if [ ! -f cache/playwright_chromium_is_installed.txt ]; then \
|
143 |
+
echo "Running playwright install --with-deps chromium..."; \
|
144 |
+
poetry run playwright install --with-deps chromium; \
|
145 |
+
mkdir -p cache; \
|
146 |
+
touch cache/playwright_chromium_is_installed.txt; \
|
147 |
+
else \
|
148 |
+
echo "Setup already done. Skipping playwright installation."; \
|
149 |
+
fi \
|
150 |
+
fi
|
151 |
+
@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"
|
152 |
+
|
153 |
+
install-frontend-dependencies:
|
154 |
+
@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
|
155 |
+
@echo "$(YELLOW)Detect Node.js version...$(RESET)"
|
156 |
+
@cd frontend && node ./scripts/detect-node-version.js
|
157 |
+
echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
|
158 |
+
@cd frontend && npm install
|
159 |
+
@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"
|
160 |
+
|
161 |
+
install-pre-commit-hooks:
|
162 |
+
@echo "$(YELLOW)Installing pre-commit hooks...$(RESET)"
|
163 |
+
@git config --unset-all core.hooksPath || true
|
164 |
+
@poetry run pre-commit install --config $(PRE_COMMIT_CONFIG_PATH)
|
165 |
+
@echo "$(GREEN)Pre-commit hooks installed successfully.$(RESET)"
|
166 |
+
|
167 |
+
lint-backend:
|
168 |
+
@echo "$(YELLOW)Running linters...$(RESET)"
|
169 |
+
@poetry run pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)
|
170 |
+
|
171 |
+
lint-frontend:
|
172 |
+
@echo "$(YELLOW)Running linters for frontend...$(RESET)"
|
173 |
+
@cd frontend && npm run lint
|
174 |
+
|
175 |
+
lint:
|
176 |
+
@$(MAKE) -s lint-frontend
|
177 |
+
@$(MAKE) -s lint-backend
|
178 |
+
|
179 |
+
test-frontend:
|
180 |
+
@echo "$(YELLOW)Running tests for frontend...$(RESET)"
|
181 |
+
@cd frontend && npm run test
|
182 |
+
|
183 |
+
test:
|
184 |
+
@$(MAKE) -s test-frontend
|
185 |
+
|
186 |
+
build-frontend:
|
187 |
+
@echo "$(YELLOW)Building frontend...$(RESET)"
|
188 |
+
@cd frontend && npm run build
|
189 |
+
|
190 |
+
# Start backend
|
191 |
+
start-backend:
|
192 |
+
@echo "$(YELLOW)Starting backend...$(RESET)"
|
193 |
+
@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "./workspace"
|
194 |
+
|
195 |
+
# Start frontend
|
196 |
+
start-frontend:
|
197 |
+
@echo "$(YELLOW)Starting frontend...$(RESET)"
|
198 |
+
@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run dev -- --port $(FRONTEND_PORT) --host $(BACKEND_HOST)
|
199 |
+
|
200 |
+
# Common setup for running the app (non-callable)
|
201 |
+
_run_setup:
|
202 |
+
@if [ "$(OS)" = "Windows_NT" ]; then \
|
203 |
+
echo "$(RED) Windows is not supported, use WSL instead!$(RESET)"; \
|
204 |
+
exit 1; \
|
205 |
+
fi
|
206 |
+
@mkdir -p logs
|
207 |
+
@echo "$(YELLOW)Starting backend server...$(RESET)"
|
208 |
+
@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) &
|
209 |
+
@echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
|
210 |
+
@until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
|
211 |
+
@echo "$(GREEN)Backend started successfully.$(RESET)"
|
212 |
+
|
213 |
+
# Run the app (standard mode)
|
214 |
+
run:
|
215 |
+
@echo "$(YELLOW)Running the app...$(RESET)"
|
216 |
+
@$(MAKE) -s _run_setup
|
217 |
+
@$(MAKE) -s start-frontend
|
218 |
+
@echo "$(GREEN)Application started successfully.$(RESET)"
|
219 |
+
|
220 |
+
# Run the app (in docker)
|
221 |
+
docker-run: WORKSPACE_BASE ?= $(PWD)/workspace
|
222 |
+
docker-run:
|
223 |
+
@if [ -f /.dockerenv ]; then \
|
224 |
+
echo "Running inside a Docker container. Exiting..."; \
|
225 |
+
exit 0; \
|
226 |
+
else \
|
227 |
+
echo "$(YELLOW)Running the app in Docker $(OPTIONS)...$(RESET)"; \
|
228 |
+
export WORKSPACE_BASE=${WORKSPACE_BASE}; \
|
229 |
+
export SANDBOX_USER_ID=$(shell id -u); \
|
230 |
+
export DATE=$(shell date +%Y%m%d%H%M%S); \
|
231 |
+
docker compose up $(OPTIONS); \
|
232 |
+
fi
|
233 |
+
|
234 |
+
# Run the app (WSL mode)
|
235 |
+
run-wsl:
|
236 |
+
@echo "$(YELLOW)Running the app in WSL mode...$(RESET)"
|
237 |
+
@$(MAKE) -s _run_setup
|
238 |
+
@cd frontend && echo "$(BLUE)Starting frontend with npm (WSL mode)...$(RESET)" && npm run dev_wsl -- --port $(FRONTEND_PORT)
|
239 |
+
@echo "$(GREEN)Application started successfully in WSL mode.$(RESET)"
|
240 |
+
|
241 |
+
# Setup config.toml
|
242 |
+
setup-config:
|
243 |
+
@echo "$(YELLOW)Setting up config.toml...$(RESET)"
|
244 |
+
@$(MAKE) setup-config-prompts
|
245 |
+
@mv $(CONFIG_FILE).tmp $(CONFIG_FILE)
|
246 |
+
@echo "$(GREEN)Config.toml setup completed.$(RESET)"
|
247 |
+
|
248 |
+
setup-config-prompts:
|
249 |
+
@echo "[core]" > $(CONFIG_FILE).tmp
|
250 |
+
|
251 |
+
@read -p "Enter your workspace directory (as absolute path) [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
|
252 |
+
workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
|
253 |
+
echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp
|
254 |
+
|
255 |
+
@echo "" >> $(CONFIG_FILE).tmp
|
256 |
+
|
257 |
+
@echo "[llm]" >> $(CONFIG_FILE).tmp
|
258 |
+
@read -p "Enter your LLM model name, used for running without UI. Set the model in the UI after you start the app. (see https://docs.litellm.ai/docs/providers for full list) [default: $(DEFAULT_MODEL)]: " llm_model; \
|
259 |
+
llm_model=$${llm_model:-$(DEFAULT_MODEL)}; \
|
260 |
+
echo "model=\"$$llm_model\"" >> $(CONFIG_FILE).tmp
|
261 |
+
|
262 |
+
@read -p "Enter your LLM api key: " llm_api_key; \
|
263 |
+
echo "api_key=\"$$llm_api_key\"" >> $(CONFIG_FILE).tmp
|
264 |
+
|
265 |
+
@read -p "Enter your LLM base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
|
266 |
+
if [[ ! -z "$$llm_base_url" ]]; then echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi
|
267 |
+
|
268 |
+
@echo "Enter your LLM Embedding Model"; \
|
269 |
+
echo "Choices are:"; \
|
270 |
+
echo " - openai"; \
|
271 |
+
echo " - azureopenai"; \
|
272 |
+
echo " - Embeddings available only with OllamaEmbedding:"; \
|
273 |
+
echo " - llama2"; \
|
274 |
+
echo " - mxbai-embed-large"; \
|
275 |
+
echo " - nomic-embed-text"; \
|
276 |
+
echo " - all-minilm"; \
|
277 |
+
echo " - stable-code"; \
|
278 |
+
echo " - bge-m3"; \
|
279 |
+
echo " - bge-large"; \
|
280 |
+
echo " - paraphrase-multilingual"; \
|
281 |
+
echo " - snowflake-arctic-embed"; \
|
282 |
+
echo " - Leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
|
283 |
+
read -p "> " llm_embedding_model; \
|
284 |
+
echo "embedding_model=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
|
285 |
+
if [ "$$llm_embedding_model" = "llama2" ] || [ "$$llm_embedding_model" = "mxbai-embed-large" ] || [ "$$llm_embedding_model" = "nomic-embed-text" ] || [ "$$llm_embedding_model" = "all-minilm" ] || [ "$$llm_embedding_model" = "stable-code" ]; then \
|
286 |
+
read -p "Enter the local model URL for the embedding model (will set llm.embedding_base_url): " llm_embedding_base_url; \
|
287 |
+
echo "embedding_base_url=\"$$llm_embedding_base_url\"" >> $(CONFIG_FILE).tmp; \
|
288 |
+
elif [ "$$llm_embedding_model" = "azureopenai" ]; then \
|
289 |
+
read -p "Enter the Azure endpoint URL (will overwrite llm.base_url): " llm_base_url; \
|
290 |
+
echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
|
291 |
+
read -p "Enter the Azure LLM Embedding Deployment Name: " llm_embedding_deployment_name; \
|
292 |
+
echo "embedding_deployment_name=\"$$llm_embedding_deployment_name\"" >> $(CONFIG_FILE).tmp; \
|
293 |
+
read -p "Enter the Azure API Version: " llm_api_version; \
|
294 |
+
echo "api_version=\"$$llm_api_version\"" >> $(CONFIG_FILE).tmp; \
|
295 |
+
fi
|
296 |
+
|
297 |
+
|
298 |
+
# Develop in container
|
299 |
+
docker-dev:
|
300 |
+
@if [ -f /.dockerenv ]; then \
|
301 |
+
echo "Running inside a Docker container. Exiting..."; \
|
302 |
+
exit 0; \
|
303 |
+
else \
|
304 |
+
echo "$(YELLOW)Build and run in Docker $(OPTIONS)...$(RESET)"; \
|
305 |
+
./containers/dev/dev.sh $(OPTIONS); \
|
306 |
+
fi
|
307 |
+
|
308 |
+
# Clean up all caches
|
309 |
+
clean:
|
310 |
+
@echo "$(YELLOW)Cleaning up caches...$(RESET)"
|
311 |
+
@rm -rf openhands/.cache
|
312 |
+
@echo "$(GREEN)Caches cleaned up successfully.$(RESET)"
|
313 |
+
|
314 |
+
# Help
|
315 |
+
help:
|
316 |
+
@echo "$(BLUE)Usage: make [target]$(RESET)"
|
317 |
+
@echo "Targets:"
|
318 |
+
@echo " $(GREEN)build$(RESET) - Build project, including environment setup and dependencies."
|
319 |
+
@echo " $(GREEN)lint$(RESET) - Run linters on the project."
|
320 |
+
@echo " $(GREEN)setup-config$(RESET) - Setup the configuration for OpenHands by providing LLM API key,"
|
321 |
+
@echo " LLM Model name, and workspace directory."
|
322 |
+
@echo " $(GREEN)start-backend$(RESET) - Start the backend server for the OpenHands project."
|
323 |
+
@echo " $(GREEN)start-frontend$(RESET) - Start the frontend server for the OpenHands project."
|
324 |
+
@echo " $(GREEN)run$(RESET) - Run the OpenHands application, starting both backend and frontend servers."
|
325 |
+
@echo " Backend Log file will be stored in the 'logs' directory."
|
326 |
+
@echo " $(GREEN)docker-dev$(RESET) - Build and run the OpenHands application in Docker."
|
327 |
+
@echo " $(GREEN)docker-run$(RESET) - Run the OpenHands application, starting both backend and frontend servers in Docker."
|
328 |
+
@echo " $(GREEN)help$(RESET) - Display this help message, providing information on available targets."
|
329 |
+
|
330 |
+
# Phony targets
|
331 |
+
.PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
|
332 |
+
.PHONY: docker-dev docker-run
|
build.sh
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
set -e
|
3 |
+
|
4 |
+
cp pyproject.toml poetry.lock openhands
|
5 |
+
poetry build -v
|
config.sh
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DOCKER_REGISTRY=ghcr.io
|
2 |
+
DOCKER_ORG=all-hands-ai
|
3 |
+
DOCKER_IMAGE=openhands
|
4 |
+
DOCKER_BASE_DIR="."
|
config.template.toml
ADDED
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
###################### OpenHands Configuration Example ######################
|
2 |
+
#
|
3 |
+
# All settings have default values, so you only need to uncomment and
|
4 |
+
# modify what you want to change
|
5 |
+
# The fields within each section are sorted in alphabetical order.
|
6 |
+
#
|
7 |
+
##############################################################################
|
8 |
+
|
9 |
+
#################################### Core ####################################
|
10 |
+
# General core configurations
|
11 |
+
##############################################################################
|
12 |
+
[core]
|
13 |
+
# API key for E2B
|
14 |
+
#e2b_api_key = ""
|
15 |
+
|
16 |
+
# API key for Modal
|
17 |
+
#modal_api_token_id = ""
|
18 |
+
#modal_api_token_secret = ""
|
19 |
+
|
20 |
+
# Base path for the workspace
|
21 |
+
workspace_base = "./workspace"
|
22 |
+
|
23 |
+
# Cache directory path
|
24 |
+
#cache_dir = "/tmp/cache"
|
25 |
+
|
26 |
+
# Reasoning effort for o1 models (low, medium, high, or not set)
|
27 |
+
#reasoning_effort = "medium"
|
28 |
+
|
29 |
+
# Debugging enabled
|
30 |
+
#debug = false
|
31 |
+
|
32 |
+
# Disable color in terminal output
|
33 |
+
#disable_color = false
|
34 |
+
|
35 |
+
# Enable saving and restoring the session when run from CLI
|
36 |
+
#enable_cli_session = false
|
37 |
+
|
38 |
+
# Path to store trajectories, can be a folder or a file
|
39 |
+
# If it's a folder, the session id will be used as the file name
|
40 |
+
#save_trajectory_path="./trajectories"
|
41 |
+
|
42 |
+
# Path to replay a trajectory, must be a file path
|
43 |
+
# If provided, trajectory will be loaded and replayed before the
|
44 |
+
# agent responds to any user instruction
|
45 |
+
#replay_trajectory_path = ""
|
46 |
+
|
47 |
+
# File store path
|
48 |
+
#file_store_path = "/tmp/file_store"
|
49 |
+
|
50 |
+
# File store type
|
51 |
+
#file_store = "memory"
|
52 |
+
|
53 |
+
# List of allowed file extensions for uploads
|
54 |
+
#file_uploads_allowed_extensions = [".*"]
|
55 |
+
|
56 |
+
# Maximum file size for uploads, in megabytes
|
57 |
+
#file_uploads_max_file_size_mb = 0
|
58 |
+
|
59 |
+
# Maximum budget per task, 0.0 means no limit
|
60 |
+
#max_budget_per_task = 0.0
|
61 |
+
|
62 |
+
# Maximum number of iterations
|
63 |
+
#max_iterations = 100
|
64 |
+
|
65 |
+
# Path to mount the workspace in the sandbox
|
66 |
+
#workspace_mount_path_in_sandbox = "/workspace"
|
67 |
+
|
68 |
+
# Path to mount the workspace
|
69 |
+
#workspace_mount_path = ""
|
70 |
+
|
71 |
+
# Path to rewrite the workspace mount path to
|
72 |
+
#workspace_mount_rewrite = ""
|
73 |
+
|
74 |
+
# Run as openhands
|
75 |
+
#run_as_openhands = true
|
76 |
+
|
77 |
+
# Runtime environment
|
78 |
+
#runtime = "eventstream"
|
79 |
+
|
80 |
+
# Name of the default agent
|
81 |
+
#default_agent = "CodeActAgent"
|
82 |
+
|
83 |
+
# JWT secret for authentication
|
84 |
+
#jwt_secret = ""
|
85 |
+
|
86 |
+
# Restrict file types for file uploads
|
87 |
+
#file_uploads_restrict_file_types = false
|
88 |
+
|
89 |
+
# List of allowed file extensions for uploads
|
90 |
+
#file_uploads_allowed_extensions = [".*"]
|
91 |
+
|
92 |
+
#################################### LLM #####################################
|
93 |
+
# Configuration for LLM models (group name starts with 'llm')
|
94 |
+
# use 'llm' for the default LLM config
|
95 |
+
##############################################################################
|
96 |
+
[llm]
|
97 |
+
# AWS access key ID
|
98 |
+
#aws_access_key_id = ""
|
99 |
+
|
100 |
+
# AWS region name
|
101 |
+
#aws_region_name = ""
|
102 |
+
|
103 |
+
# AWS secret access key
|
104 |
+
#aws_secret_access_key = ""
|
105 |
+
|
106 |
+
# API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
|
107 |
+
api_key = "your-api-key"
|
108 |
+
|
109 |
+
# API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
|
110 |
+
#base_url = ""
|
111 |
+
|
112 |
+
# API version
|
113 |
+
#api_version = ""
|
114 |
+
|
115 |
+
# Cost per input token
|
116 |
+
#input_cost_per_token = 0.0
|
117 |
+
|
118 |
+
# Cost per output token
|
119 |
+
#output_cost_per_token = 0.0
|
120 |
+
|
121 |
+
# Custom LLM provider
|
122 |
+
#custom_llm_provider = ""
|
123 |
+
|
124 |
+
# Embedding API base URL
|
125 |
+
#embedding_base_url = ""
|
126 |
+
|
127 |
+
# Embedding deployment name
|
128 |
+
#embedding_deployment_name = ""
|
129 |
+
|
130 |
+
# Embedding model to use
|
131 |
+
embedding_model = "local"
|
132 |
+
|
133 |
+
# Maximum number of characters in an observation's content
|
134 |
+
#max_message_chars = 10000
|
135 |
+
|
136 |
+
# Maximum number of input tokens
|
137 |
+
#max_input_tokens = 0
|
138 |
+
|
139 |
+
# Maximum number of output tokens
|
140 |
+
#max_output_tokens = 0
|
141 |
+
|
142 |
+
# Model to use. (For Headless / CLI only - In Web this is overridden by Session Init)
|
143 |
+
model = "gpt-4o"
|
144 |
+
|
145 |
+
# Number of retries to attempt when an operation fails with the LLM.
|
146 |
+
# Increase this value to allow more attempts before giving up
|
147 |
+
#num_retries = 8
|
148 |
+
|
149 |
+
# Maximum wait time (in seconds) between retry attempts
|
150 |
+
# This caps the exponential backoff to prevent excessively long waits between retries
|
151 |
+
#retry_max_wait = 120
|
152 |
+
|
153 |
+
# Minimum wait time (in seconds) between retry attempts
|
154 |
+
# This sets the initial delay before the first retry
|
155 |
+
#retry_min_wait = 15
|
156 |
+
|
157 |
+
# Multiplier for exponential backoff calculation
|
158 |
+
# The wait time increases by this factor after each failed attempt
|
159 |
+
# A value of 2.0 means each retry waits twice as long as the previous one
|
160 |
+
#retry_multiplier = 2.0
|
161 |
+
|
162 |
+
# Drop any unmapped (unsupported) params without causing an exception
|
163 |
+
#drop_params = false
|
164 |
+
|
165 |
+
# Modify params for litellm to do transformations like adding a default message, when a message is empty.
|
166 |
+
# Note: this setting is global, unlike drop_params, it cannot be overridden in each call to litellm.
|
167 |
+
#modify_params = true
|
168 |
+
|
169 |
+
# Using the prompt caching feature if provided by the LLM and supported
|
170 |
+
#caching_prompt = true
|
171 |
+
|
172 |
+
# Base URL for the OLLAMA API
|
173 |
+
#ollama_base_url = ""
|
174 |
+
|
175 |
+
# Temperature for the API
|
176 |
+
#temperature = 0.0
|
177 |
+
|
178 |
+
# Timeout for the API
|
179 |
+
#timeout = 0
|
180 |
+
|
181 |
+
# Top p for the API
|
182 |
+
#top_p = 1.0
|
183 |
+
|
184 |
+
# If model is vision capable, this option allows to disable image processing (useful for cost reduction).
|
185 |
+
#disable_vision = true
|
186 |
+
|
187 |
+
# Custom tokenizer to use for token counting
|
188 |
+
# https://docs.litellm.ai/docs/completion/token_usage
|
189 |
+
#custom_tokenizer = ""
|
190 |
+
|
191 |
+
# Whether to use native tool calling if supported by the model. Can be true, false, or None by default, which chooses the model's default behavior based on the evaluation.
|
192 |
+
# ATTENTION: Based on evaluation, enabling native function calling may lead to worse results
|
193 |
+
# in some scenarios. Use with caution and consider testing with your specific use case.
|
194 |
+
# https://github.com/All-Hands-AI/OpenHands/pull/4711
|
195 |
+
#native_tool_calling = None
|
196 |
+
|
197 |
+
[llm.gpt4o-mini]
|
198 |
+
api_key = "your-api-key"
|
199 |
+
model = "gpt-4o"
|
200 |
+
|
201 |
+
|
202 |
+
#################################### Agent ###################################
|
203 |
+
# Configuration for agents (group name starts with 'agent')
|
204 |
+
# Use 'agent' for the default agent config
|
205 |
+
# otherwise, group name must be `agent.<agent_name>` (case-sensitive), e.g.
|
206 |
+
# agent.CodeActAgent
|
207 |
+
##############################################################################
|
208 |
+
[agent]
|
209 |
+
|
210 |
+
# whether the browsing tool is enabled
|
211 |
+
codeact_enable_browsing = true
|
212 |
+
|
213 |
+
# whether the LLM draft editor is enabled
|
214 |
+
codeact_enable_llm_editor = false
|
215 |
+
|
216 |
+
# whether the IPython tool is enabled
|
217 |
+
codeact_enable_jupyter = true
|
218 |
+
|
219 |
+
# Name of the micro agent to use for this agent
|
220 |
+
#micro_agent_name = ""
|
221 |
+
|
222 |
+
# Memory enabled
|
223 |
+
#memory_enabled = false
|
224 |
+
|
225 |
+
# Memory maximum threads
|
226 |
+
#memory_max_threads = 3
|
227 |
+
|
228 |
+
# LLM config group to use
|
229 |
+
#llm_config = 'your-llm-config-group'
|
230 |
+
|
231 |
+
# Whether to use prompt extension (e.g., microagent, repo/runtime info) at all
|
232 |
+
#enable_prompt_extensions = true
|
233 |
+
|
234 |
+
# List of microagents to disable
|
235 |
+
#disabled_microagents = []
|
236 |
+
|
237 |
+
[agent.RepoExplorerAgent]
|
238 |
+
# Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
|
239 |
+
# useful when an agent doesn't demand high quality but uses a lot of tokens
|
240 |
+
llm_config = 'gpt3'
|
241 |
+
|
242 |
+
#################################### Sandbox ###################################
|
243 |
+
# Configuration for the sandbox
|
244 |
+
##############################################################################
|
245 |
+
[sandbox]
|
246 |
+
# Sandbox timeout in seconds
|
247 |
+
#timeout = 120
|
248 |
+
|
249 |
+
# Sandbox user ID
|
250 |
+
#user_id = 1000
|
251 |
+
|
252 |
+
# Container image to use for the sandbox
|
253 |
+
#base_container_image = "nikolaik/python-nodejs:python3.12-nodejs22"
|
254 |
+
|
255 |
+
# Use host network
|
256 |
+
#use_host_network = false
|
257 |
+
|
258 |
+
# runtime extra build args
|
259 |
+
#runtime_extra_build_args = ["--network=host", "--add-host=host.docker.internal:host-gateway"]
|
260 |
+
|
261 |
+
# Enable auto linting after editing
|
262 |
+
#enable_auto_lint = false
|
263 |
+
|
264 |
+
# Whether to initialize plugins
|
265 |
+
#initialize_plugins = true
|
266 |
+
|
267 |
+
# Extra dependencies to install in the runtime image
|
268 |
+
#runtime_extra_deps = ""
|
269 |
+
|
270 |
+
# Environment variables to set at the launch of the runtime
|
271 |
+
#runtime_startup_env_vars = {}
|
272 |
+
|
273 |
+
# BrowserGym environment to use for evaluation
|
274 |
+
#browsergym_eval_env = ""
|
275 |
+
|
276 |
+
#################################### Security ###################################
|
277 |
+
# Configuration for security features
|
278 |
+
##############################################################################
|
279 |
+
[security]
|
280 |
+
|
281 |
+
# Enable confirmation mode (For Headless / CLI only - In Web this is overridden by Session Init)
|
282 |
+
#confirmation_mode = false
|
283 |
+
|
284 |
+
# The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
|
285 |
+
#security_analyzer = ""
|
286 |
+
|
287 |
+
#################################### Eval ####################################
|
288 |
+
# Configuration for the evaluation, please refer to the specific evaluation
|
289 |
+
# plugin for the available options
|
290 |
+
##############################################################################
|
containers/README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Docker Containers
|
2 |
+
|
3 |
+
Each folder here contains a Dockerfile, and a config.sh describing how to build
|
4 |
+
the images and where to push them. These images are built and pushed in GitHub Actions
|
5 |
+
by the `ghcr.yml` workflow.
|
6 |
+
|
7 |
+
## Building Manually
|
8 |
+
|
9 |
+
```bash
|
10 |
+
docker build -f containers/app/Dockerfile -t openhands .
|
11 |
+
docker build -f containers/sandbox/Dockerfile -t sandbox .
|
12 |
+
```
|
containers/app/config.sh
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DOCKER_REGISTRY=ghcr.io
|
2 |
+
DOCKER_ORG=all-hands-ai
|
3 |
+
DOCKER_IMAGE=openhands
|
4 |
+
DOCKER_BASE_DIR="."
|
containers/app/entrypoint.sh
ADDED
File without changes
|
containers/build.sh
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
set -eo pipefail
|
3 |
+
|
4 |
+
# Initialize variables with default values
|
5 |
+
image_name=""
|
6 |
+
org_name=""
|
7 |
+
push=0
|
8 |
+
load=0
|
9 |
+
tag_suffix=""
|
10 |
+
|
11 |
+
# Function to display usage information
|
12 |
+
usage() {
|
13 |
+
echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
|
14 |
+
echo " -i: Image name (required)"
|
15 |
+
echo " -o: Organization name"
|
16 |
+
echo " --push: Push the image"
|
17 |
+
echo " --load: Load the image"
|
18 |
+
echo " -t: Tag suffix"
|
19 |
+
exit 1
|
20 |
+
}
|
21 |
+
|
22 |
+
# Parse command-line options
|
23 |
+
while [[ $# -gt 0 ]]; do
|
24 |
+
case $1 in
|
25 |
+
-i) image_name="$2"; shift 2 ;;
|
26 |
+
-o) org_name="$2"; shift 2 ;;
|
27 |
+
--push) push=1; shift ;;
|
28 |
+
--load) load=1; shift ;;
|
29 |
+
-t) tag_suffix="$2"; shift 2 ;;
|
30 |
+
*) usage ;;
|
31 |
+
esac
|
32 |
+
done
|
33 |
+
# Check if required arguments are provided
|
34 |
+
if [[ -z "$image_name" ]]; then
|
35 |
+
echo "Error: Image name is required."
|
36 |
+
usage
|
37 |
+
fi
|
38 |
+
|
39 |
+
echo "Building: $image_name"
|
40 |
+
tags=()
|
41 |
+
|
42 |
+
OPENHANDS_BUILD_VERSION="dev"
|
43 |
+
|
44 |
+
cache_tag_base="buildcache"
|
45 |
+
cache_tag="$cache_tag_base"
|
46 |
+
|
47 |
+
if [[ -n $RELEVANT_SHA ]]; then
|
48 |
+
git_hash=$(git rev-parse --short "$RELEVANT_SHA")
|
49 |
+
tags+=("$git_hash")
|
50 |
+
tags+=("$RELEVANT_SHA")
|
51 |
+
fi
|
52 |
+
|
53 |
+
if [[ -n $GITHUB_REF_NAME ]]; then
|
54 |
+
# check if ref name is a version number
|
55 |
+
if [[ $GITHUB_REF_NAME =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
56 |
+
major_version=$(echo "$GITHUB_REF_NAME" | cut -d. -f1)
|
57 |
+
minor_version=$(echo "$GITHUB_REF_NAME" | cut -d. -f1,2)
|
58 |
+
tags+=("$major_version" "$minor_version")
|
59 |
+
tags+=("latest")
|
60 |
+
fi
|
61 |
+
sanitized_ref_name=$(echo "$GITHUB_REF_NAME" | sed 's/[^a-zA-Z0-9.-]\+/-/g')
|
62 |
+
OPENHANDS_BUILD_VERSION=$sanitized_ref_name
|
63 |
+
sanitized_ref_name=$(echo "$sanitized_ref_name" | tr '[:upper:]' '[:lower:]') # lower case is required in tagging
|
64 |
+
tags+=("$sanitized_ref_name")
|
65 |
+
cache_tag+="-${sanitized_ref_name}"
|
66 |
+
fi
|
67 |
+
|
68 |
+
if [[ -n $tag_suffix ]]; then
|
69 |
+
cache_tag+="-${tag_suffix}"
|
70 |
+
for i in "${!tags[@]}"; do
|
71 |
+
tags[$i]="${tags[$i]}-$tag_suffix"
|
72 |
+
done
|
73 |
+
fi
|
74 |
+
|
75 |
+
echo "Tags: ${tags[@]}"
|
76 |
+
|
77 |
+
if [[ "$image_name" == "openhands" ]]; then
|
78 |
+
dir="./containers/app"
|
79 |
+
elif [[ "$image_name" == "runtime" ]]; then
|
80 |
+
dir="./containers/runtime"
|
81 |
+
else
|
82 |
+
dir="./containers/$image_name"
|
83 |
+
fi
|
84 |
+
|
85 |
+
if [[ (! -f "$dir/Dockerfile") && "$image_name" != "runtime" ]]; then
|
86 |
+
# Allow runtime to be built without a Dockerfile
|
87 |
+
echo "No Dockerfile found"
|
88 |
+
exit 1
|
89 |
+
fi
|
90 |
+
if [[ ! -f "$dir/config.sh" ]]; then
|
91 |
+
echo "No config.sh found for Dockerfile"
|
92 |
+
exit 1
|
93 |
+
fi
|
94 |
+
|
95 |
+
source "$dir/config.sh"
|
96 |
+
|
97 |
+
if [[ -n "$org_name" ]]; then
|
98 |
+
DOCKER_ORG="$org_name"
|
99 |
+
fi
|
100 |
+
|
101 |
+
# If $DOCKER_IMAGE_SOURCE_TAG is set, add it to the tags
|
102 |
+
if [[ -n "$DOCKER_IMAGE_SOURCE_TAG" ]]; then
|
103 |
+
tags+=("$DOCKER_IMAGE_SOURCE_TAG")
|
104 |
+
fi
|
105 |
+
# If $DOCKER_IMAGE_TAG is set, add it to the tags
|
106 |
+
if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
|
107 |
+
tags+=("$DOCKER_IMAGE_TAG")
|
108 |
+
fi
|
109 |
+
|
110 |
+
DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
|
111 |
+
DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
|
112 |
+
echo "Repo: $DOCKER_REPOSITORY"
|
113 |
+
echo "Base dir: $DOCKER_BASE_DIR"
|
114 |
+
|
115 |
+
args=""
|
116 |
+
for tag in "${tags[@]}"; do
|
117 |
+
args+=" -t $DOCKER_REPOSITORY:$tag"
|
118 |
+
done
|
119 |
+
|
120 |
+
if [[ $push -eq 1 ]]; then
|
121 |
+
args+=" --push"
|
122 |
+
args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
|
123 |
+
fi
|
124 |
+
|
125 |
+
if [[ $load -eq 1 ]]; then
|
126 |
+
args+=" --load"
|
127 |
+
fi
|
128 |
+
|
129 |
+
echo "Args: $args"
|
130 |
+
|
131 |
+
# Modify the platform selection based on --load flag
|
132 |
+
if [[ $load -eq 1 ]]; then
|
133 |
+
# When loading, build only for the current platform
|
134 |
+
platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
|
135 |
+
else
|
136 |
+
# For push or without load, build for multiple platforms
|
137 |
+
platform="linux/amd64,linux/arm64"
|
138 |
+
fi
|
139 |
+
|
140 |
+
echo "Building for platform(s): $platform"
|
141 |
+
|
142 |
+
docker buildx build \
|
143 |
+
$args \
|
144 |
+
--build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
|
145 |
+
--cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
|
146 |
+
--cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
|
147 |
+
--platform $platform \
|
148 |
+
--provenance=false \
|
149 |
+
-f "$dir/Dockerfile" \
|
150 |
+
"$DOCKER_BASE_DIR"
|
151 |
+
|
152 |
+
# If load was requested, print the loaded images
|
153 |
+
if [[ $load -eq 1 ]]; then
|
154 |
+
echo "Local images built:"
|
155 |
+
docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
|
156 |
+
fi
|
containers/dev/Dockerfile
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# syntax=docker/dockerfile:1
|
2 |
+
|
3 |
+
###
|
4 |
+
FROM ubuntu:22.04 AS dind
|
5 |
+
|
6 |
+
# https://docs.docker.com/engine/install/ubuntu/
|
7 |
+
RUN apt-get update && apt-get install -y \
|
8 |
+
ca-certificates \
|
9 |
+
curl \
|
10 |
+
&& install -m 0755 -d /etc/apt/keyrings \
|
11 |
+
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
|
12 |
+
&& chmod a+r /etc/apt/keyrings/docker.asc \
|
13 |
+
&& echo \
|
14 |
+
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
|
15 |
+
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
|
16 |
+
|
17 |
+
RUN apt-get update && apt-get install -y \
|
18 |
+
docker-ce \
|
19 |
+
docker-ce-cli \
|
20 |
+
containerd.io \
|
21 |
+
docker-buildx-plugin \
|
22 |
+
docker-compose-plugin \
|
23 |
+
&& rm -rf /var/lib/apt/lists/* \
|
24 |
+
&& apt-get clean \
|
25 |
+
&& apt-get autoremove -y
|
26 |
+
|
27 |
+
###
|
28 |
+
FROM dind AS openhands
|
29 |
+
|
30 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
31 |
+
|
32 |
+
#
|
33 |
+
RUN apt-get update && apt-get install -y \
|
34 |
+
bash \
|
35 |
+
build-essential \
|
36 |
+
curl \
|
37 |
+
git \
|
38 |
+
git-lfs \
|
39 |
+
software-properties-common \
|
40 |
+
make \
|
41 |
+
netcat \
|
42 |
+
sudo \
|
43 |
+
wget \
|
44 |
+
&& rm -rf /var/lib/apt/lists/* \
|
45 |
+
&& apt-get clean \
|
46 |
+
&& apt-get autoremove -y
|
47 |
+
|
48 |
+
# https://github.com/cli/cli/blob/trunk/docs/install_linux.md
|
49 |
+
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
|
50 |
+
&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
|
51 |
+
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
|
52 |
+
&& apt-get update && apt-get -y install \
|
53 |
+
gh \
|
54 |
+
&& rm -rf /var/lib/apt/lists/* \
|
55 |
+
&& apt-get clean \
|
56 |
+
&& apt-get autoremove -y
|
57 |
+
|
58 |
+
# Python 3.12
|
59 |
+
RUN add-apt-repository ppa:deadsnakes/ppa \
|
60 |
+
&& apt-get update \
|
61 |
+
&& apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
|
62 |
+
&& ln -s /usr/bin/python3.12 /usr/bin/python
|
63 |
+
|
64 |
+
# NodeJS >= 18.17.1
|
65 |
+
RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
|
66 |
+
&& apt-get install -y nodejs
|
67 |
+
|
68 |
+
# Poetry >= 1.8
|
69 |
+
RUN curl -fsSL https://install.python-poetry.org | python3.12 - \
|
70 |
+
&& ln -s ~/.local/bin/poetry /usr/local/bin/poetry
|
71 |
+
|
72 |
+
#
|
73 |
+
RUN <<EOF
|
74 |
+
#!/bin/bash
|
75 |
+
printf "#!/bin/bash
|
76 |
+
set +x
|
77 |
+
uname -a
|
78 |
+
docker --version
|
79 |
+
gh --version | head -n 1
|
80 |
+
git --version
|
81 |
+
#
|
82 |
+
python --version
|
83 |
+
echo node `node --version`
|
84 |
+
echo npm `npm --version`
|
85 |
+
poetry --version
|
86 |
+
netcat -h 2>&1 | head -n 1
|
87 |
+
" > /version.sh
|
88 |
+
chmod a+x /version.sh
|
89 |
+
EOF
|
90 |
+
|
91 |
+
###
|
92 |
+
FROM openhands AS dev
|
93 |
+
|
94 |
+
RUN apt-get update && apt-get install -y \
|
95 |
+
dnsutils \
|
96 |
+
file \
|
97 |
+
iproute2 \
|
98 |
+
jq \
|
99 |
+
lsof \
|
100 |
+
ripgrep \
|
101 |
+
silversearcher-ag \
|
102 |
+
vim \
|
103 |
+
&& rm -rf /var/lib/apt/lists/* \
|
104 |
+
&& apt-get clean \
|
105 |
+
&& apt-get autoremove -y
|
106 |
+
|
107 |
+
WORKDIR /app
|
108 |
+
|
109 |
+
# cache build dependencies
|
110 |
+
RUN \
|
111 |
+
--mount=type=bind,source=./,target=/app/ \
|
112 |
+
<<EOF
|
113 |
+
#!/bin/bash
|
114 |
+
make -s clean
|
115 |
+
make -s check-dependencies
|
116 |
+
make -s install-python-dependencies
|
117 |
+
|
118 |
+
# NOTE
|
119 |
+
# node_modules are .dockerignore-d therefore not mountable
|
120 |
+
# make -s install-frontend-dependencies
|
121 |
+
EOF
|
122 |
+
|
123 |
+
#
|
124 |
+
CMD ["bash"]
|
containers/dev/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Develop in Docker
|
2 |
+
|
3 |
+
> [!WARNING]
|
4 |
+
> This is not officially supported and may not work.
|
5 |
+
|
6 |
+
Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:
|
7 |
+
|
8 |
+
```bash
|
9 |
+
make docker-dev
|
10 |
+
# same as:
|
11 |
+
cd ./containers/dev
|
12 |
+
./dev.sh
|
13 |
+
```
|
14 |
+
|
15 |
+
It could take some time if you are running for the first time as Docker will pull all the tools required for building OpenHands. The next time you run again, it should be instant.
|
16 |
+
|
17 |
+
## Build and run
|
18 |
+
|
19 |
+
If everything goes well, you should be inside a container after Docker finishes building the `openhands:dev` image similar to the following:
|
20 |
+
|
21 |
+
```bash
|
22 |
+
Build and run in Docker ...
|
23 |
+
root@93fc0005fcd2:/app#
|
24 |
+
```
|
25 |
+
|
26 |
+
You may now proceed with the normal [build and run](../../Development.md) workflow as if you were on the host.
|
27 |
+
|
28 |
+
## Make changes
|
29 |
+
|
30 |
+
The source code on the host is mounted as `/app` inside docker. You may edit the files as usual either inside the Docker container or on your host with your favorite IDE/editors.
|
31 |
+
|
32 |
+
The following are also mapped as readonly from your host:
|
33 |
+
|
34 |
+
```yaml
|
35 |
+
# host credentials
|
36 |
+
- $HOME/.git-credentials:/root/.git-credentials:ro
|
37 |
+
- $HOME/.gitconfig:/root/.gitconfig:ro
|
38 |
+
- $HOME/.npmrc:/root/.npmrc:ro
|
39 |
+
```
|
40 |
+
|
41 |
+
## VSCode
|
42 |
+
|
43 |
+
Alternatively, if you use VSCode, you could also [attach to the running container](https://code.visualstudio.com/docs/devcontainers/attach-container).
|
44 |
+
|
45 |
+
See details for [developing in docker](https://code.visualstudio.com/docs/devcontainers/containers) or simply ask `OpenHands` ;-)
|
46 |
+
|
47 |
+
## Rebuild dev image
|
48 |
+
|
49 |
+
You could optionally pass additional options to the build script.
|
50 |
+
|
51 |
+
```bash
|
52 |
+
make docker-dev OPTIONS="--build"
|
53 |
+
# or
|
54 |
+
./containers/dev/dev.sh --build
|
55 |
+
```
|
56 |
+
|
57 |
+
See [docker compose run](https://docs.docker.com/reference/cli/docker/compose/run/) for more options.
|
containers/dev/compose.yml
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
services:
|
3 |
+
dev:
|
4 |
+
privileged: true
|
5 |
+
build:
|
6 |
+
context: ${OPENHANDS_WORKSPACE:-../../}
|
7 |
+
dockerfile: ./containers/dev/Dockerfile
|
8 |
+
image: openhands:dev
|
9 |
+
container_name: openhands-dev
|
10 |
+
environment:
|
11 |
+
- BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
|
12 |
+
- SANDBOX_API_HOSTNAME=host.docker.internal
|
13 |
+
#
|
14 |
+
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.20-nikolaik}
|
15 |
+
- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
|
16 |
+
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
17 |
+
ports:
|
18 |
+
- "3000:3000"
|
19 |
+
extra_hosts:
|
20 |
+
- "host.docker.internal:host-gateway"
|
21 |
+
volumes:
|
22 |
+
- /var/run/docker.sock:/var/run/docker.sock
|
23 |
+
- ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
|
24 |
+
# source code
|
25 |
+
- ${OPENHANDS_WORKSPACE:-../../}:/app
|
26 |
+
# host credentials
|
27 |
+
- $HOME/.git-credentials:/root/.git-credentials:ro
|
28 |
+
- $HOME/.gitconfig:/root/.gitconfig:ro
|
29 |
+
- $HOME/.npmrc:/root/.npmrc:ro
|
30 |
+
# cache
|
31 |
+
- cache-data:/root/.cache
|
32 |
+
pull_policy: never
|
33 |
+
stdin_open: true
|
34 |
+
tty: true
|
35 |
+
|
36 |
+
##
|
37 |
+
volumes:
|
38 |
+
cache-data:
|
containers/dev/dev.sh
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
set -o pipefail
|
3 |
+
|
4 |
+
function get_docker() {
|
5 |
+
echo "Docker is required to build and run OpenHands."
|
6 |
+
echo "https://docs.docker.com/get-started/get-docker/"
|
7 |
+
exit 1
|
8 |
+
}
|
9 |
+
|
10 |
+
function check_tools() {
|
11 |
+
command -v docker &>/dev/null || get_docker
|
12 |
+
}
|
13 |
+
|
14 |
+
function exit_if_indocker() {
|
15 |
+
if [ -f /.dockerenv ]; then
|
16 |
+
echo "Running inside a Docker container. Exiting..."
|
17 |
+
exit 1
|
18 |
+
fi
|
19 |
+
}
|
20 |
+
|
21 |
+
#
|
22 |
+
exit_if_indocker
|
23 |
+
|
24 |
+
check_tools
|
25 |
+
|
26 |
+
##
|
27 |
+
OPENHANDS_WORKSPACE=$(git rev-parse --show-toplevel)
|
28 |
+
|
29 |
+
cd "$OPENHANDS_WORKSPACE/containers/dev/" || exit 1
|
30 |
+
|
31 |
+
##
|
32 |
+
export BACKEND_HOST="0.0.0.0"
|
33 |
+
#
|
34 |
+
export SANDBOX_USER_ID=$(id -u)
|
35 |
+
export WORKSPACE_BASE=${WORKSPACE_BASE:-$OPENHANDS_WORKSPACE/workspace}
|
36 |
+
|
37 |
+
docker compose run --rm --service-ports "$@" dev
|
38 |
+
|
39 |
+
##
|
containers/e2b-sandbox/Dockerfile
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM ubuntu:22.04
|
2 |
+
|
3 |
+
# install basic packages
|
4 |
+
RUN apt-get update && apt-get install -y \
|
5 |
+
curl \
|
6 |
+
wget \
|
7 |
+
git \
|
8 |
+
vim \
|
9 |
+
nano \
|
10 |
+
unzip \
|
11 |
+
zip \
|
12 |
+
python3 \
|
13 |
+
python3-pip \
|
14 |
+
python3-venv \
|
15 |
+
python3-dev \
|
16 |
+
build-essential \
|
17 |
+
openssh-server \
|
18 |
+
sudo \
|
19 |
+
&& rm -rf /var/lib/apt/lists/*
|
containers/e2b-sandbox/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# How to build custom E2B sandbox for OpenHands
|
2 |
+
|
3 |
+
[E2B](https://e2b.dev) is an [open-source](https://github.com/e2b-dev/e2b) secure cloud environment (sandbox) made for running AI-generated code and agents. E2B offers [Python](https://pypi.org/project/e2b/) and [JS/TS](https://www.npmjs.com/package/e2b) SDK to spawn and control these sandboxes.
|
4 |
+
|
5 |
+
|
6 |
+
1. Install the CLI with NPM.
|
7 |
+
```sh
|
8 |
+
npm install -g @e2b/cli@latest
|
9 |
+
```
|
10 |
+
Full CLI API is [here](https://e2b.dev/docs/cli/installation).
|
11 |
+
|
12 |
+
1. Build the sandbox
|
13 |
+
```sh
|
14 |
+
e2b template build --dockerfile ./Dockerfile --name "openhands"
|
15 |
+
```
|
containers/e2b-sandbox/e2b.toml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This is a config for E2B sandbox template.
|
2 |
+
# You can use 'template_id' (785n69crgahmz0lkdw9h) or 'template_name (openhands) from this config to spawn a sandbox:
|
3 |
+
|
4 |
+
# Python SDK
|
5 |
+
# from e2b import Sandbox
|
6 |
+
# sandbox = Sandbox(template='openhands')
|
7 |
+
|
8 |
+
# JS SDK
|
9 |
+
# import { Sandbox } from 'e2b'
|
10 |
+
# const sandbox = await Sandbox.create({ template: 'openhands' })
|
11 |
+
|
12 |
+
dockerfile = "Dockerfile"
|
13 |
+
template_name = "openhands"
|
14 |
+
template_id = "785n69crgahmz0lkdw9h"
|
containers/runtime/README.md
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Dynamically constructed Dockerfile
|
2 |
+
|
3 |
+
This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
|
4 |
+
that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.
|
5 |
+
|
6 |
+
The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.12-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
|
7 |
+
|
8 |
+
```bash
|
9 |
+
poetry run python3 openhands/runtime/utils/runtime_build.py \
|
10 |
+
--base_image nikolaik/python-nodejs:python3.12-nodejs22 \
|
11 |
+
--build_folder containers/runtime
|
12 |
+
```
|
containers/runtime/config.sh
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DOCKER_REGISTRY=ghcr.io
|
2 |
+
DOCKER_ORG=all-hands-ai
|
3 |
+
DOCKER_BASE_DIR="./containers/runtime"
|
4 |
+
DOCKER_IMAGE=runtime
|
5 |
+
# These variables will be appended by the runtime_build.py script
|
6 |
+
# DOCKER_IMAGE_TAG=
|
7 |
+
# DOCKER_IMAGE_SOURCE_TAG=
|
dev_config/python/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
repos:
|
2 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
3 |
+
rev: v4.5.0
|
4 |
+
hooks:
|
5 |
+
- id: trailing-whitespace
|
6 |
+
exclude: docs/modules/python
|
7 |
+
- id: end-of-file-fixer
|
8 |
+
exclude: docs/modules/python
|
9 |
+
- id: check-yaml
|
10 |
+
- id: debug-statements
|
11 |
+
|
12 |
+
- repo: https://github.com/tox-dev/pyproject-fmt
|
13 |
+
rev: 1.7.0
|
14 |
+
hooks:
|
15 |
+
- id: pyproject-fmt
|
16 |
+
- repo: https://github.com/abravalheri/validate-pyproject
|
17 |
+
rev: v0.16
|
18 |
+
hooks:
|
19 |
+
- id: validate-pyproject
|
20 |
+
|
21 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
22 |
+
# Ruff version.
|
23 |
+
rev: v0.4.1
|
24 |
+
hooks:
|
25 |
+
# Run the linter.
|
26 |
+
- id: ruff
|
27 |
+
entry: ruff check --config dev_config/python/ruff.toml
|
28 |
+
types_or: [python, pyi, jupyter]
|
29 |
+
args: [--fix]
|
30 |
+
# Run the formatter.
|
31 |
+
- id: ruff-format
|
32 |
+
entry: ruff format --config dev_config/python/ruff.toml
|
33 |
+
types_or: [python, pyi, jupyter]
|
34 |
+
|
35 |
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
36 |
+
rev: v1.9.0
|
37 |
+
hooks:
|
38 |
+
- id: mypy
|
39 |
+
additional_dependencies:
|
40 |
+
[types-requests, types-setuptools, types-pyyaml, types-toml]
|
41 |
+
entry: mypy --config-file dev_config/python/mypy.ini openhands/
|
42 |
+
always_run: true
|
43 |
+
pass_filenames: false
|
dev_config/python/mypy.ini
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[mypy]
|
2 |
+
warn_unused_configs = True
|
3 |
+
ignore_missing_imports = True
|
4 |
+
check_untyped_defs = True
|
5 |
+
explicit_package_bases = True
|
6 |
+
warn_unreachable = True
|
7 |
+
warn_redundant_casts = True
|
8 |
+
no_implicit_optional = True
|
9 |
+
strict_optional = True
|
dev_config/python/ruff.toml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[lint]
|
2 |
+
select = [
|
3 |
+
"E",
|
4 |
+
"W",
|
5 |
+
"F",
|
6 |
+
"I",
|
7 |
+
"Q",
|
8 |
+
"B",
|
9 |
+
]
|
10 |
+
|
11 |
+
ignore = [
|
12 |
+
"E501",
|
13 |
+
"B003",
|
14 |
+
"B007",
|
15 |
+
"B009",
|
16 |
+
"B010",
|
17 |
+
"B904",
|
18 |
+
"B018",
|
19 |
+
]
|
20 |
+
|
21 |
+
[lint.flake8-quotes]
|
22 |
+
docstring-quotes = "double"
|
23 |
+
inline-quotes = "single"
|
24 |
+
|
25 |
+
[format]
|
26 |
+
quote-style = "single"
|
docker-compose.yml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
services:
|
3 |
+
openhands:
|
4 |
+
build:
|
5 |
+
context: ./
|
6 |
+
dockerfile: ./containers/app/Dockerfile
|
7 |
+
image: openhands:latest
|
8 |
+
container_name: openhands-app-${DATE:-}
|
9 |
+
environment:
|
10 |
+
- SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik}
|
11 |
+
#- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
|
12 |
+
- WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
|
13 |
+
ports:
|
14 |
+
- "3000:3000"
|
15 |
+
extra_hosts:
|
16 |
+
- "host.docker.internal:host-gateway"
|
17 |
+
volumes:
|
18 |
+
- /var/run/docker.sock:/var/run/docker.sock
|
19 |
+
- ~/.openhands-state:/.openhands-state
|
20 |
+
- ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
|
21 |
+
pull_policy: build
|
22 |
+
stdin_open: true
|
23 |
+
tty: true
|
entrypoint.sh
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
set -eo pipefail
|
3 |
+
|
4 |
+
echo "Starting OpenHands..."
|
5 |
+
if [[ $NO_SETUP == "true" ]]; then
|
6 |
+
echo "Skipping setup, running as $(whoami)"
|
7 |
+
"$@"
|
8 |
+
exit 0
|
9 |
+
fi
|
10 |
+
|
11 |
+
if [ "$(id -u)" -ne 0 ]; then
|
12 |
+
echo "The OpenHands entrypoint.sh must run as root"
|
13 |
+
exit 1
|
14 |
+
fi
|
15 |
+
|
16 |
+
if [ -z "$SANDBOX_USER_ID" ]; then
|
17 |
+
echo "SANDBOX_USER_ID is not set"
|
18 |
+
exit 1
|
19 |
+
fi
|
20 |
+
|
21 |
+
if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
|
22 |
+
# This is set to /opt/workspace in the Dockerfile. But if the user isn't mounting, we want to unset it so that OpenHands doesn't mount at all
|
23 |
+
unset WORKSPACE_BASE
|
24 |
+
fi
|
25 |
+
|
26 |
+
if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
|
27 |
+
echo "Running OpenHands as root"
|
28 |
+
export RUN_AS_OPENHANDS=false
|
29 |
+
mkdir -p /root/.cache/ms-playwright/
|
30 |
+
if [ -d "/home/openhands/.cache/ms-playwright/" ]; then
|
31 |
+
mv /home/openhands/.cache/ms-playwright/ /root/.cache/
|
32 |
+
fi
|
33 |
+
"$@"
|
34 |
+
else
|
35 |
+
echo "Setting up enduser with id $SANDBOX_USER_ID"
|
36 |
+
if id "enduser" &>/dev/null; then
|
37 |
+
echo "User enduser already exists. Skipping creation."
|
38 |
+
else
|
39 |
+
if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
|
40 |
+
echo "Failed to create user enduser with id $SANDBOX_USER_ID. Moving openhands user."
|
41 |
+
incremented_id=$(($SANDBOX_USER_ID + 1))
|
42 |
+
usermod -u $incremented_id openhands
|
43 |
+
if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
|
44 |
+
echo "Failed to create user enduser with id $SANDBOX_USER_ID for a second time. Exiting."
|
45 |
+
exit 1
|
46 |
+
fi
|
47 |
+
fi
|
48 |
+
fi
|
49 |
+
usermod -aG app enduser
|
50 |
+
# get the user group of /var/run/docker.sock and set openhands to that group
|
51 |
+
DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
|
52 |
+
echo "Docker socket group id: $DOCKER_SOCKET_GID"
|
53 |
+
if getent group $DOCKER_SOCKET_GID; then
|
54 |
+
echo "Group with id $DOCKER_SOCKET_GID already exists"
|
55 |
+
else
|
56 |
+
echo "Creating group with id $DOCKER_SOCKET_GID"
|
57 |
+
groupadd -g $DOCKER_SOCKET_GID docker
|
58 |
+
fi
|
59 |
+
|
60 |
+
mkdir -p /home/enduser/.cache/huggingface/hub/
|
61 |
+
mkdir -p /home/enduser/.cache/ms-playwright/
|
62 |
+
if [ -d "/home/openhands/.cache/ms-playwright/" ]; then
|
63 |
+
mv /home/openhands/.cache/ms-playwright/ /home/enduser/.cache/
|
64 |
+
fi
|
65 |
+
|
66 |
+
usermod -aG $DOCKER_SOCKET_GID enduser
|
67 |
+
echo "Running as enduser"
|
68 |
+
su enduser /bin/bash -c "${*@Q}" # This magically runs any arguments passed to the script as a command
|
69 |
+
fi
|
evaluation/README.md
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Evaluation
|
2 |
+
|
3 |
+
This folder contains code and resources to run experiments and evaluations.
|
4 |
+
|
5 |
+
## For Benchmark Users
|
6 |
+
|
7 |
+
### Setup
|
8 |
+
|
9 |
+
Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
|
10 |
+
|
11 |
+
Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the [evaluation directory](#supported-benchmarks).
|
12 |
+
Generally these will involve running `run_infer.py` to perform inference with the agents.
|
13 |
+
|
14 |
+
### Implementing and Evaluating an Agent
|
15 |
+
|
16 |
+
To add an agent to OpenHands, you will need to implement it in the [agenthub directory](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub). There is a README there with more information.
|
17 |
+
|
18 |
+
To evaluate an agent, you can provide the agent's name to the `run_infer.py` program.
|
19 |
+
|
20 |
+
### Evaluating Different LLMs
|
21 |
+
|
22 |
+
OpenHands in development mode uses `config.toml` to keep track of most configuration.
|
23 |
+
Here's an example configuration file you can use to define and use multiple LLMs:
|
24 |
+
|
25 |
+
```toml
|
26 |
+
[llm]
|
27 |
+
# IMPORTANT: add your API key here, and set the model to the one you want to evaluate
|
28 |
+
model = "gpt-4o-2024-05-13"
|
29 |
+
api_key = "sk-XXX"
|
30 |
+
|
31 |
+
[llm.eval_gpt4_1106_preview_llm]
|
32 |
+
model = "gpt-4-1106-preview"
|
33 |
+
api_key = "XXX"
|
34 |
+
temperature = 0.0
|
35 |
+
|
36 |
+
[llm.eval_some_openai_compatible_model_llm]
|
37 |
+
model = "openai/MODEL_NAME"
|
38 |
+
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
|
39 |
+
api_key = "XXX"
|
40 |
+
temperature = 0.0
|
41 |
+
```
|
42 |
+
|
43 |
+
## Supported Benchmarks
|
44 |
+
|
45 |
+
The OpenHands evaluation harness supports a wide variety of benchmarks across [software engineering](#software-engineering), [web browsing](#web-browsing), [miscellaneous assistance](#misc-assistance), and [real-world](#real-world) tasks.
|
46 |
+
|
47 |
+
### Software Engineering
|
48 |
+
|
49 |
+
- SWE-Bench: [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench)
|
50 |
+
- HumanEvalFix: [`evaluation/benchmarks/humanevalfix`](./benchmarks/humanevalfix)
|
51 |
+
- BIRD: [`evaluation/benchmarks/bird`](./benchmarks/bird)
|
52 |
+
- BioCoder: [`evaluation/benchmarks/ml_bench`](./benchmarks/ml_bench)
|
53 |
+
- ML-Bench: [`evaluation/benchmarks/ml_bench`](./benchmarks/ml_bench)
|
54 |
+
- APIBench: [`evaluation/benchmarks/gorilla`](./benchmarks/gorilla/)
|
55 |
+
- ToolQA: [`evaluation/benchmarks/toolqa`](./benchmarks/toolqa/)
|
56 |
+
- AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/)
|
57 |
+
- Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/)
|
58 |
+
- DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/)
|
59 |
+
|
60 |
+
### Web Browsing
|
61 |
+
|
62 |
+
- WebArena: [`evaluation/benchmarks/webarena`](./benchmarks/webarena/)
|
63 |
+
- MiniWob++: [`evaluation/benchmarks/miniwob`](./benchmarks/miniwob/)
|
64 |
+
- Browsing Delegation: [`evaluation/benchmarks/browsing_delegation`](./benchmarks/browsing_delegation/)
|
65 |
+
|
66 |
+
### Misc. Assistance
|
67 |
+
|
68 |
+
- GAIA: [`evaluation/benchmarks/gaia`](./benchmarks/gaia)
|
69 |
+
- GPQA: [`evaluation/benchmarks/gpqa`](./benchmarks/gpqa)
|
70 |
+
- AgentBench: [`evaluation/benchmarks/agent_bench`](./benchmarks/agent_bench)
|
71 |
+
- MINT: [`evaluation/benchmarks/mint`](./benchmarks/mint)
|
72 |
+
- Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA)
|
73 |
+
- ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning)
|
74 |
+
- ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench)
|
75 |
+
|
76 |
+
### Real World
|
77 |
+
|
78 |
+
- TheAgentCompany: [`evaluation/benchmarks/the_agent_company`](./benchmarks/the_agent_company)
|
79 |
+
|
80 |
+
## Result Visualization
|
81 |
+
|
82 |
+
Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.
|
83 |
+
|
84 |
+
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
|
85 |
+
|
86 |
+
## For Benchmark Developers
|
87 |
+
|
88 |
+
To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly,
|
89 |
+
|
90 |
+
- Each subfolder contains a specific benchmark or experiment. For example, [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench) should contain
|
91 |
+
all the preprocessing/evaluation/analysis scripts.
|
92 |
+
- Raw data and experimental records should not be stored within this repo.
|
93 |
+
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
|
94 |
+
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
|
evaluation/__init__.py
ADDED
File without changes
|
evaluation/benchmarks/EDA/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# EDA Evaluation
|
2 |
+
|
3 |
+
This folder contains evaluation harness for evaluating agents on the Entity-deduction-Arena Benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented in ACL 2024 main conference.
|
4 |
+
|
5 |
+
## Setup Environment and LLM Configuration
|
6 |
+
|
7 |
+
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
8 |
+
|
9 |
+
## Start the evaluation
|
10 |
+
|
11 |
+
```bash
|
12 |
+
export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
|
13 |
+
./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
|
14 |
+
```
|
15 |
+
|
16 |
+
where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional.
|
17 |
+
|
18 |
+
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
|
19 |
+
LLM settings, as defined in your `config.toml`.
|
20 |
+
|
21 |
+
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
|
22 |
+
like to evaluate. It could also be a release tag like `0.6.2`.
|
23 |
+
|
24 |
+
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
|
25 |
+
to `CodeActAgent`.
|
26 |
+
|
27 |
+
- `dataset`: There are two tasks in this evaluation. Specify `dataset` to test on either `things` or `celebs` task.
|
28 |
+
|
29 |
+
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
|
30 |
+
|
31 |
+
For example,
|
32 |
+
|
33 |
+
```bash
|
34 |
+
./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things
|
35 |
+
```
|
36 |
+
|
37 |
+
## Reference
|
38 |
+
|
39 |
+
```bibtex
|
40 |
+
@inproceedings{zhang2023entity,
|
41 |
+
title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
|
42 |
+
author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
|
43 |
+
journal={ACL},
|
44 |
+
year={2024}
|
45 |
+
}
|
46 |
+
```
|
evaluation/benchmarks/EDA/game.py
ADDED
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
|
4 |
+
import openai
|
5 |
+
import requests.exceptions
|
6 |
+
from openai import OpenAI
|
7 |
+
from retry import retry
|
8 |
+
|
9 |
+
LOGGER = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
|
12 |
+
class Q20Game:
    """Driver for a 20-questions ("Entity Deduction Arena") game.

    The *guesser* (the agent under evaluation) asks yes/no/maybe questions;
    an *answerer* LLM, queried through the OpenAI chat-completions API,
    replies based on the hidden ``item``. The game tracks the transcript in
    ``guesser_messages`` and declares a win when the answerer says "Bingo!".
    """

    def __init__(
        self,
        item: str,
        answerer_model: str = 'gpt-3.5-turbo-0613',
        guesser_model: str = 'gpt-3.5-turbo-0613',
        num_turns: int = 20,
        temperature: float = 0.8,
        openai_api: bool = True,
        openai_api_key: str | None = None,
        guesser_kargs=None,
    ) -> None:
        # Avoid a mutable default argument for guesser_kargs.
        if guesser_kargs is None:
            guesser_kargs = {}
        self.item = item  # hidden entity; may contain '|'-separated aliases (see answerer)
        self.answerer_model = answerer_model
        self.guesser_model = guesser_model
        self.num_turns = num_turns  # maximum number of question/answer turns
        self.temperature = temperature
        self.openai_api = openai_api
        self.guesser_kargs = guesser_kargs
        # Generic system-style preamble used for vicuna-family guesser models.
        self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
        # Opening instruction shown to the guesser at the start of the game.
        self.first_user_utterance = (
            'Your task is to ask a series of questions to deduce the entity '
            "that I'm thinking of with as few queries as possible. "
            "Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
            'Do not ask for hint. Make your question brief with no linebreaker. '
            'Now start asking a question.'
        )
        self.guesser_win = False  # set True once the answerer replies "Bingo!"
        self.curr_turn = 0  # advanced externally by the evaluation harness
        if openai_api_key is not None:
            # Stored globally on the openai module; answerer() reads it back.
            openai.api_key = openai_api_key

        # Non-'gpt' model names are assumed to be served by a local
        # OpenAI-compatible endpoint; otherwise use the official API.
        if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
            self.user_api_base = 'http://0.0.0.0:8000/v1'
        else:
            self.user_api_base = 'https://api.openai.com/v1'

        if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
            self.guesser_api_base = 'http://0.0.0.0:8000/v1'
        else:
            self.guesser_api_base = 'https://api.openai.com/v1'

        # Alternating assistant(question)/user(answer) transcript; its length
        # is used by reward() to count turns.
        self.guesser_messages = []

    def preprocess_response(self, response):
        """Shorten verbose guesser phrasing so the answerer prompt stays terse.

        Rewrites "the entity you('re| are) thinking of" to "it" and drops the
        trailing "you're/you are thinking of" qualifiers.
        """
        response = re.sub(r'the entity you are thinking of', 'it', response)
        response = re.sub(r"the entity you're thinking of", 'it', response)
        response = re.sub(r" you're thinking of", '', response)
        response = re.sub(r' you are thinking of', '', response)
        return response

    def judge_winner(self, response):
        """Record the guesser's question, query the answerer, and decide a win.

        Returns ``(True, '')`` when the answerer replies "Bingo!" (case
        insensitive), otherwise ``(False, <answerer reply>)``.
        """
        guesser_question = response.strip()

        # On the final turn, force the utterance into a guess confirmation.
        if self.curr_turn == self.num_turns - 1:
            guesser_question += ' Is it right?'

        self.guesser_messages.append({'role': 'assistant', 'content': guesser_question})
        # ask for answer
        usr_msg = self.answerer(guesser_question)

        self.guesser_messages.append(
            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
        )

        if 'bingo' in usr_msg['content'].lower():
            self.guesser_win = True
            return True, ''

        return False, usr_msg['content'].strip()

    def generate_user_response(self, response):
        """Produce the simulated user's reply to the guesser's message.

        Returns a win notification when the item was guessed, otherwise the
        answerer's reply — with a "guess now" nudge on the penultimate turn.
        """
        response = self.preprocess_response(response)
        # Run the answerer and check whether this question/guess hit the item.
        bingo, anwser_reply = self.judge_winner(response)
        if bingo:
            return 'You are bingo! Use the "finish" tool to finish the interaction.\n'
        if self.curr_turn == self.num_turns - 2:
            anwser_reply += " You must guess now, what's it?"
        return anwser_reply

    def reward(self):
        """Score the game: 1.0 for a win in <=5 turns, minus 0.02 per extra
        turn; 0 for a loss. Turn count is derived from the transcript length
        (one question + one answer per turn)."""
        if self.guesser_win:
            n_turns = (len(self.guesser_messages) + 1) // 2
            return 1 - max(n_turns - 5, 0) * 0.02
        return 0

    # NOTE(review): backoff=0.5 makes the retry delay *shrink* each attempt
    # (0.5s -> 0.25s -> ...); confirm a multiplier < 1 is intended.
    # NOTE(review): `openai.Timeout` is a legacy-SDK name; confirm it exists
    # in the installed openai version (v1 uses APITimeoutError).
    @retry(
        (
            openai.Timeout,
            requests.exceptions.ReadTimeout,
            openai.RateLimitError,
            openai.APIError,
            openai.APIConnectionError,
        ),
        tries=5,
        delay=0.5,
        backoff=0.5,
        max_delay=2,
        logger=LOGGER,
    )
    def answerer(self, question):
        """Ask the answerer model the guesser's question about ``self.item``.

        Returns the response message as a dict (``{'role': ..., 'content':
        ...}``). If the question literally mentions the item (or any of its
        '|'-separated aliases), the content is overridden with 'Bingo!'.
        """
        openai.api_base = self.user_api_base
        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'user',
                'content': f'Based on your knowledge about {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your respond to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
                f'Never say the answer {self.item} in your response. '
                f"If the question is to solicit the answer, respond 'No.'.",
            },
            {
                'role': 'user',
                'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
            },
        ]

        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
            n=1,
            stop=None,
            temperature=0.2,
        )
        # If the question names the item (word-boundary match on any alias),
        # the model's reply is replaced with the win marker.
        # NOTE(review): the alias is interpolated into the regex unescaped;
        # items containing regex metacharacters could misbehave — confirm.
        if any(
            [
                re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
                for i in self.item.lower().split('|')
            ]
        ):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()
|
149 |
+
|
150 |
+
|
151 |
+
class Q20GameCelebrity(Q20Game):
    """Celebrity variant of :class:`Q20Game`.

    Differences from the base game: the opening instruction asks for
    celebrity questions, and the answerer replies with
    'Yes.'/'No.'/'Dunno.' instead of 'Yes.'/'No.'/'Maybe.'.
    """

    def __init__(self, item: str, **kwargs) -> None:
        super().__init__(item, **kwargs)
        # Override the generic entity instruction with the celebrity one.
        self.first_user_utterance = (
            'Your task is to ask a series of questions to deduce the celebrity '
            "that I'm thinking of with as few queries as possible. "
            "Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hint. Make your question brief with no linebreaker. "
            'Now start asking a question.'
        )

    # NOTE(review): same retry caveats as the base class — backoff=0.5
    # shrinks the delay, and `openai.Timeout` is a legacy-SDK name.
    @retry(
        (
            openai.Timeout,
            requests.exceptions.ReadTimeout,
            openai.RateLimitError,
            openai.APIError,
            openai.APIConnectionError,
        ),
        tries=5,
        delay=0.5,
        backoff=0.5,
        max_delay=2,
        logger=LOGGER,
    )
    def answerer(self, question):
        """Ask the answerer model about the celebrity ``self.item``.

        Unlike the base class, the instruction is sent with role 'system'
        (not 'user'), and the literal-mention check matches the whole item
        only (no '|'-alias splitting).
        """
        openai.api_base = self.user_api_base
        client = OpenAI(api_key=openai.api_key)
        user_messages = [
            {
                'role': 'system',
                'content': f'Based on your knowledge about the celebrity: {self.item}, '
                f'respond to the following question or guess. '
                f"Limit your respond to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
                f"If the question is to solicit the answer, respond 'No.'.",
            },
            {
                'role': 'user',
                'content': f'For the celebrity {self.item}, {question}(Yes/No/Dunno)',
            },
        ]

        response = client.chat.completions.create(
            model=self.answerer_model,
            messages=user_messages,
            max_tokens=6,
            n=1,
            stop=None,
            temperature=0.2,
        )
        # NOTE(review): the celebrity name is interpolated into the regex
        # unescaped; names with regex metacharacters could misbehave.
        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
            response.choices[0].message.content = 'Bingo!'
        return response.choices[0].message.to_dict()
|
evaluation/benchmarks/EDA/run_infer.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from datasets import load_dataset
|
6 |
+
|
7 |
+
from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity
|
8 |
+
from evaluation.utils.shared import (
|
9 |
+
EvalMetadata,
|
10 |
+
EvalOutput,
|
11 |
+
compatibility_for_eval_history_pairs,
|
12 |
+
make_metadata,
|
13 |
+
prepare_dataset,
|
14 |
+
reset_logger_for_multiprocessing,
|
15 |
+
run_evaluation,
|
16 |
+
)
|
17 |
+
from openhands.controller.state.state import State
|
18 |
+
from openhands.core.config import (
|
19 |
+
AppConfig,
|
20 |
+
SandboxConfig,
|
21 |
+
get_llm_config_arg,
|
22 |
+
get_parser,
|
23 |
+
)
|
24 |
+
from openhands.core.logger import openhands_logger as logger
|
25 |
+
from openhands.core.main import create_runtime, run_controller
|
26 |
+
from openhands.events.action import MessageAction
|
27 |
+
from openhands.utils.async_utils import call_async_from_sync
|
28 |
+
|
29 |
+
game = None
|
30 |
+
|
31 |
+
|
32 |
+
def codeact_user_response_eda(state: State) -> str:
    """Fake-user callback for CodeActAgent in the EDA benchmark.

    Feeds the agent's latest message to the active game's answerer and
    returns the simulated user's reply, or '/exit' once the item has been
    guessed (the reply contains 'bingo!').
    """
    global game

    # Pull the most recent agent utterance out of the event history, if any.
    guess = ''
    if state.history:
        latest = state.get_last_agent_message()
        if latest:
            guess = latest.content

    assert game is not None, 'Game is not initialized.'
    reply = game.generate_user_response(guess)
    game.curr_turn += 1

    logger.info(f'Model guess: {guess}')
    logger.info(f'Answer response: {reply}')

    # A winning reply asks the agent to finish; signal the controller to stop.
    return '/exit' if 'bingo!' in reply.lower() else reply
|
49 |
+
|
50 |
+
|
51 |
+
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
52 |
+
'CodeActAgent': codeact_user_response_eda,
|
53 |
+
}
|
54 |
+
|
55 |
+
AGENT_CLS_TO_INST_SUFFIX = {
|
56 |
+
'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
|
57 |
+
}
|
58 |
+
|
59 |
+
|
60 |
+
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the AppConfig used to run one EDA instance.

    Uses a plain python:3.12 sandbox with no mounted workspace and disables
    prompt extensions for the evaluated agent.
    """
    sandbox = SandboxConfig(
        base_container_image='python:3.12-bookworm',
        enable_auto_lint=False,
        use_host_network=False,
    )
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return config
|
81 |
+
|
82 |
+
|
83 |
+
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
    """Run the EDA game for one dataset row and return the evaluation record.

    The hidden item is ``instance['text']``; the agent plays the guesser via
    ``run_controller`` while the module-global ``game`` simulates the user.
    """
    config = get_config(metadata)
    # The item string itself doubles as the instance identifier.
    instance_id = instance['text'].strip()

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Select the game variant from the dataset name built in make_metadata.
    _game_class = {'eda-things': Q20Game, 'eda-celebs': Q20GameCelebrity}

    guesser_kargs = {
        'max_new_tokens': 64,
        'temperature': 0.8,
        'repetition_penalty': 1.0,
        'do_sample': True,
    }  # no penalty

    # Use codeactagent as guesser_model; the game object is a module global so
    # the fake-user callback (codeact_user_response_eda) can reach it.
    global game
    assert metadata.dataset is not None
    assert metadata.details is not None
    game = _game_class[metadata.dataset](
        item=instance['text'].strip(),
        answerer_model=metadata.details['answerer_model'],
        # NOTE(review): guesser_model is annotated as str in Q20Game but None
        # is passed here (the agent plays the guesser) — confirm intended.
        guesser_model=None,
        num_turns=metadata.max_iterations,
        openai_api_key=metadata.details['openai_api_key'],
        guesser_kargs=guesser_kargs,
    )

    instruction = f'{game.first_user_utterance}'
    logger.info(f'Instruction: {instruction}')
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    last_agent_message = state.get_last_agent_message()
    final_message = last_agent_message.content if last_agent_message else ''

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    # Score comes from the game itself (1 - penalty for wins, 0 otherwise).
    test_result = game.reward()
    metrics = state.metrics.get() if state.metrics else None

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
        instance_id=instance_id,
        instance=instance.to_dict(),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
            'success': test_result,
            'final_message': final_message,
            'ground_truth': instance['text'],
        },
    )
    return output
|
174 |
+
|
175 |
+
|
176 |
+
if __name__ == '__main__':
    # Extend the shared evaluation argument parser with EDA-specific options.
    parser = get_parser()
    parser.add_argument(
        '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
    )
    parser.add_argument(
        '--dataset',
        default='things',
        choices=['things', 'celebs'],
        type=str,
        help='dataset to be used',
    )
    parser.add_argument(
        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
    )
    parser.add_argument(
        '--data-split',
        default='test',
        type=str,
        help='data split, eg, test',
    )
    args, _ = parser.parse_known_args()

    eda_dataset = load_dataset(
        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
    )
    # NOTE(review): `load_dataset` returns a `datasets.Dataset`, which has no
    # pandas-style `rename(columns=..., inplace=...)` method — confirm this
    # call works; note process_instance still indexes instance['text'].
    eda_dataset.rename(columns={'text': 'instance_id'}, inplace=True)

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
        f'eda-{args.dataset}',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        data_split=args.data_split,
        # Passed through to each process_instance call.
        details={
            'answerer_model': str(args.answerer_model),
            'openai_api_key': str(args.OPENAI_API_KEY),
        },
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # prepare_dataset skips already-evaluated rows and applies --eval-n-limit.
    prepared_dataset = prepare_dataset(
        eda_dataset.to_pandas(), output_file, args.eval_n_limit
    )

    run_evaluation(
        prepared_dataset,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
    )
|
evaluation/benchmarks/EDA/scripts/run_infer.sh
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Launcher for the EDA benchmark: evaluation/benchmarks/EDA/scripts/run_infer.sh
# Usage: run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit] [num_workers]
set -eo pipefail

source "evaluation/utils/version_control.sh"

# Positional arguments (see the EDA README for details).
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
DATASET=$4
EVAL_LIMIT=$5
NUM_WORKERS=$6

# Default to a single worker when not specified.
if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
# Check out the requested OpenHands commit/tag (uses COMMIT_HASH).
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# Sets OPENHANDS_VERSION for the eval-note below.
get_openhands_version

if [ -z "$DATASET" ]; then
  echo "Dataset not specified, use default 'things'"
  DATASET="things"
fi

# check if OPENAI_API_KEY is set (needed by the answerer LLM)
if [ -z "$OPENAI_API_KEY" ]; then
  echo "OPENAI_API_KEY is not set, please set it to run the script"
  exit 1
fi


echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

# Build the inference command; optional flags are appended below.
COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
echo $COMMAND
eval $COMMAND
|
evaluation/benchmarks/agent_bench/README.md
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AgentBench Evaluation
|
2 |
+
|
3 |
+
This folder contains evaluation harness for evaluating agents on the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). We currently only support running on the `osbench` subset.
|
4 |
+
|
5 |
+
## Setup Environment and LLM Configuration
|
6 |
+
|
7 |
+
Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
|
8 |
+
|
9 |
+
## Start the evaluation
|
10 |
+
|
11 |
+
```bash
|
12 |
+
./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
|
13 |
+
```
|
14 |
+
|
15 |
+
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
|
16 |
+
LLM settings, as defined in your `config.toml`.
|
17 |
+
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
|
18 |
+
like to evaluate. It could also be a release tag like `0.6.2`.
|
19 |
+
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
|
20 |
+
to `CodeActAgent`.
|
21 |
+
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
|
22 |
+
default, the script evaluates the entire AgentBench `osbench` subset. Note:
|
23 |
+
in order to use `eval_limit`, you must also set `agent`.
|
24 |
+
|
25 |
+
|
26 |
+
Following is the basic command to start the evaluation.
|
27 |
+
|
28 |
+
You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.
|
29 |
+
|
30 |
+
- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
|
31 |
+
- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
|
32 |
+
- `--max-iterations`: the number of iterations to run the evaluation. For example, `30`.
|
33 |
+
- `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`.
|
34 |
+
- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
|
35 |
+
|
36 |
+
```bash
|
37 |
+
./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
|
38 |
+
```
|
39 |
+
|
40 |
+
## Run with Remote Runtime (experimental)
|
41 |
+
|
42 |
+
You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally.
|
43 |
+
|
44 |
+
To use the remote runtime, set the following environment variables:
|
45 |
+
|
46 |
+
```bash
|
47 |
+
# Required environment variables
|
48 |
+
export ALLHANDS_API_KEY="your-api-key" # Contact the team to get an API key
|
49 |
+
export RUNTIME=remote
|
50 |
+
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
|
51 |
+
|
52 |
+
# Run the evaluation
|
53 |
+
./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1
|
54 |
+
```
|
55 |
+
|
56 |
+
The remote runtime will build a container image and run the evaluation in a cloud environment. The results will be saved locally in the same way as when running with a local runtime.
|
evaluation/benchmarks/agent_bench/__init__.py
ADDED
File without changes
|
evaluation/benchmarks/agent_bench/helper.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from functools import partial
|
4 |
+
|
5 |
+
from evaluation.utils.shared import codeact_user_response
|
6 |
+
from openhands.events.action import CmdRunAction, MessageAction
|
7 |
+
|
8 |
+
|
9 |
+
def try_parse_answer(act) -> str | None:
    """Extract the text inside the first <solution>...</solution> tag of an
    agent-sourced action, or None when the action carries no parsable answer.

    Message actions are parsed from their content; command actions from
    their thought; anything else is ignored.
    """
    if isinstance(act, MessageAction) and act.source == 'agent':
        text = act.content
    elif isinstance(act, CmdRunAction) and act.source == 'agent':
        text = act.thought
    else:
        return None
    matches = re.findall(r'<solution>(.*?)</solution>', text, re.DOTALL)
    return matches[0].strip() if matches else None
|
21 |
+
|
22 |
+
|
23 |
+
FAKE_RESPONSES = {
|
24 |
+
'CodeActAgent': partial(
|
25 |
+
codeact_user_response, encapsulate_solution=True, try_parse=try_parse_answer
|
26 |
+
),
|
27 |
+
}
|
28 |
+
|
29 |
+
INST_SUFFIXES: dict[str, str] = {
|
30 |
+
'CodeActAgent': (
|
31 |
+
'When you think you have solved the question, '
|
32 |
+
'please first send your answer to user through message and then exit.\n'
|
33 |
+
)
|
34 |
+
}
|
35 |
+
|
36 |
+
|
37 |
+
def analysis_size(size_str):
    """Parse a human-readable size string into a number of bytes.

    Accepts an optional binary (1024-based) suffix: B/Byte, K/KB, M/MB,
    G/GB, T/TB, P/PB (e.g. '10KB' -> 10240). A bare number is returned
    as-is. Raises ValueError if the numeric part is not an integer.
    """
    size_str = size_str.strip()
    avails = {
        'B': 1,
        'Byte': 1,
        'K': 1024,
        'KB': 1024,
        'M': 1024 * 1024,
        'MB': 1024 * 1024,
        'G': 1024 * 1024 * 1024,
        'GB': 1024 * 1024 * 1024,
        'T': 1024 * 1024 * 1024 * 1024,
        'TB': 1024 * 1024 * 1024 * 1024,
        'P': 1024 * 1024 * 1024 * 1024 * 1024,
        'PB': 1024 * 1024 * 1024 * 1024 * 1024,
    }
    # Bug fix: try longer suffixes first. The original iterated in dict
    # insertion order, so '10KB' matched the bare 'B' suffix first and
    # int('10K') raised ValueError (every two-letter suffix ending in 'B'
    # was broken: KB/MB/GB/TB/PB).
    for size_unit in sorted(avails, key=len, reverse=True):
        if size_str.endswith(size_unit):
            return int(size_str[: -len(size_unit)]) * avails[size_unit]
    return int(size_str)
|
57 |
+
|
58 |
+
|
59 |
+
def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool:
    """Compare the model's answer against the reference answer.

    The comparison strategy is chosen by *check_method*: integer equality,
    byte-size equality, or (by default) exact string match after newline
    normalization. Any parsing failure counts as a mismatch.
    """

    def _norm(text: str) -> str:
        # Normalize CRLF/CR to LF and trim surrounding whitespace.
        return text.replace('\r\n', '\n').replace('\r', '\n').strip()

    try:
        if check_method == 'check/integer-match.py':
            return int(model_answer) == int(final_ans)
        if check_method == 'check/size-match.py':
            return analysis_size(model_answer) == analysis_size(final_ans)
        return _norm(model_answer) == _norm(final_ans)
    except Exception:
        return False
|
72 |
+
|
73 |
+
|
74 |
+
def create_sh_file(filename: str, cmds: str) -> None:
    """Write *cmds* to *filename* as an executable shell script.

    Windows line endings are converted to Unix ones so the script runs
    under sh inside the sandbox.
    """
    script_body = cmds.replace('\r\n', '\n')
    with open(filename, 'w', encoding='utf-8') as fh:
        fh.write(script_body)
    # rwxr-xr-x so the script can be executed directly.
    os.chmod(filename, 0o755)
|
evaluation/benchmarks/agent_bench/run_infer.py
ADDED
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import tempfile
|
5 |
+
from typing import Any
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
from datasets import load_dataset
|
9 |
+
|
10 |
+
from evaluation.benchmarks.agent_bench.helper import (
|
11 |
+
FAKE_RESPONSES,
|
12 |
+
INST_SUFFIXES,
|
13 |
+
compare_results,
|
14 |
+
create_sh_file,
|
15 |
+
)
|
16 |
+
from evaluation.utils.shared import (
|
17 |
+
EvalMetadata,
|
18 |
+
EvalOutput,
|
19 |
+
compatibility_for_eval_history_pairs,
|
20 |
+
make_metadata,
|
21 |
+
prepare_dataset,
|
22 |
+
reset_logger_for_multiprocessing,
|
23 |
+
run_evaluation,
|
24 |
+
)
|
25 |
+
from openhands.controller.state.state import State
|
26 |
+
from openhands.core.config import (
|
27 |
+
AppConfig,
|
28 |
+
SandboxConfig,
|
29 |
+
get_llm_config_arg,
|
30 |
+
parse_arguments,
|
31 |
+
)
|
32 |
+
from openhands.core.logger import openhands_logger as logger
|
33 |
+
from openhands.core.main import create_runtime, run_controller
|
34 |
+
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
|
35 |
+
from openhands.events.observation import CmdOutputObservation
|
36 |
+
from openhands.runtime.base import Runtime
|
37 |
+
from openhands.utils.async_utils import call_async_from_sync
|
38 |
+
|
39 |
+
|
40 |
+
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the app configuration for evaluating one AgentBench instance.

    The sandbox runs a plain ``python:3.12-slim`` container; no host workspace
    is mounted, so the agent operates entirely inside the sandbox.
    """
    sandbox = SandboxConfig(
        base_container_image='python:3.12-slim',
        enable_auto_lint=True,
        use_host_network=False,
        api_key=os.environ.get('ALLHANDS_API_KEY', None),
        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        keep_runtime_alive=False,
        remote_runtime_init_timeout=3600,
    )
    app_config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
        sandbox=sandbox,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    app_config.set_llm_config(metadata.llm_config)
    agent_config = app_config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return app_config
|
65 |
+
|
66 |
+
|
67 |
+
def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    It creates ``/workspace`` inside the sandbox and, when the instance
    defines an ``init`` command, copies it in as a shell script and runs it.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
    obs: CmdOutputObservation

    # Create the sandbox workspace directory.
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    # Switch the shell's working directory to the workspace so later
    # relative paths (e.g. ./<script>) resolve there.
    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    init_cmd = instance.init
    if init_cmd is not None:
        script_name = f'{instance.instance_id}_init.sh'

        # Materialize the init command as a script on the host, then copy it
        # into the sandbox workspace so it can be executed there.
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, init_cmd)
            runtime.copy_to(
                host_script_path,
                '/workspace',
            )

        logger.info(f'Running init script: {script_name}')
        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        # The init script must succeed or the instance setup is invalid.
        assert obs.exit_code == 0

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
|
109 |
+
|
110 |
+
|
111 |
+
def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Collect the agent's answer and the ground truth from the sandbox.

    This function is called after the agent has run. It executes the
    instance's ``get_agent_result`` script (when present) to read the agent's
    answer out of the sandbox, and resolves the ground truth either directly
    from the instance or by running its ``get_ground_truth`` script.

    Returns:
        A dict with keys ``'final_ans'`` (ground truth, possibly None) and
        ``'agent_answer'`` (script output, or None when no result script
        exists — the caller then falls back to the event history).
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
    obs: CmdOutputObservation

    agent_answer = None
    get_agent_result_cmd = instance.get_agent_result
    if get_agent_result_cmd is not None:
        script_name = 'get_agent_result.sh'

        # Write the result-extraction command to a host-side script and copy
        # it into the sandbox workspace.
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, get_agent_result_cmd)
            runtime.copy_to(
                host_script_path,
                '/workspace',
            )
        logger.info(f'Running get agent result cmd: {script_name}')

        action = CmdRunAction(
            command=f'chmod +x ./{script_name} && ./{script_name}',
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
        agent_answer = obs.content
    # IF the agent answer is not found, retrieve it from the history
    # We wait until the controller finishes

    final_ans = None
    # NOTE(review): for rows loaded via pandas a missing ground truth may be
    # NaN rather than None, which would take this branch — confirm upstream.
    if instance.ground_truth is not None:
        final_ans = instance.ground_truth
    else:
        get_ground_truth_cmd = instance.get_ground_truth
        if get_ground_truth_cmd is not None:
            script_name = 'get_ground_truth.sh'
            with tempfile.TemporaryDirectory() as tmpdir:
                host_script_path = os.path.join(tmpdir, script_name)
                create_sh_file(host_script_path, get_ground_truth_cmd)
                runtime.copy_to(
                    host_script_path,
                    '/workspace',
                )
            logger.info(f'Running get ground truth cmd: {script_name}')

            action = CmdRunAction(
                command=f'chmod +x ./{script_name} && ./{script_name}'
            )
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            # Note: unlike the agent-result path, the exit code is not asserted
            # here; the raw output is used as the ground truth as-is.
            final_ans = obs.content

    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return {
        'final_ans': final_ans,
        'agent_answer': agent_answer,
    }
|
178 |
+
|
179 |
+
|
180 |
+
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
    """Run the agent on one AgentBench-OS instance and score the result.

    Builds the task instruction, runs the agent in a fresh sandbox runtime,
    extracts the agent's answer (via the instance's result script, or from
    the agent's last message if no script exists), compares it against the
    ground truth, and returns an ``EvalOutput`` record.

    Args:
        instance: One row of the AgentBench-OS dataset.
        metadata: Evaluation metadata (agent class, LLM config, output dir, ...).
        reset_logger: When True, redirect logs to a per-instance file so that
            multi-process evaluation does not interleave log output.
    """
    config = get_config(metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # =============================================
    # build instruction
    # =============================================

    # Prepare instruction.
    # Fix: the "IMPORTANT: ... NEVER ASK FOR HUMAN HELP." sentence was
    # previously appended a second time after this block; the duplicate
    # has been removed so the prompt states it exactly once.
    instruction = (
        f'Please fix the following issue.\n'
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
        f'{instance.description}\n\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += INST_SUFFIXES[metadata.agent_class]

    # =============================================
    # create sandbox and run the agent
    # =============================================

    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    initialize_runtime(runtime, instance=instance)

    # Run the agent (similar to the `main` function) and get the final task state.
    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
    )
    if state is None:
        raise ValueError('State should not be None.')

    # =============================================
    # result evaluation
    # =============================================

    return_val = complete_runtime(runtime, instance)
    agent_answer = return_val['agent_answer']
    final_ans = return_val['final_ans']

    # If the agent answer is not found, retrieve it from the history
    if agent_answer is None:
        agent_answer = ''
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''

        # retrieve the last agent message or thought
        for event in reversed(state.history):
            if event.source == 'agent':
                if isinstance(event, AgentFinishAction):
                    raw_ans = event.thought
                    break
                elif isinstance(event, MessageAction):
                    raw_ans = event.content
                    break
                elif isinstance(event, CmdRunAction):
                    raw_ans = event.thought
                    break

        # parse the answer for a solution tag; fall back to the raw message
        # when the agent did not wrap its answer in <solution>...</solution>
        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
        if len(agent_answer) == 0:
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
    )
    test_result = compare_results(comparison_method, agent_answer, final_ans)

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = compatibility_for_eval_history_pairs(state.history)

    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = EvalOutput(
        instance_id=instance.instance_id,
        instance=instance.to_dict(),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
    )
    return output
|
300 |
+
|
301 |
+
|
302 |
+
if __name__ == '__main__':
    args = parse_arguments()
    # AgentBench OS-track instances, loaded from the Hugging Face hub.
    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)

    # Fix: validate llm_config BEFORE touching its attributes; previously
    # `modify_params` was set first, so a missing config raised an
    # AttributeError instead of this explicit error.
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
    # modify_params must be False for evaluation purposes, for reproducibility
    # and accuracy of results
    llm_config.modify_params = False

    metadata = make_metadata(
        llm_config,
        'AgentBench-OS',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)

    run_evaluation(
        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
|
evaluation/benchmarks/agent_bench/scripts/run_infer.sh
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
set -eo pipefail

# Run the AgentBench-OS inference harness.
#
# Usage:
#   ./run_infer.sh <MODEL_CONFIG> <COMMIT_HASH> [AGENT] [EVAL_LIMIT] [NUM_WORKERS]
#
#   MODEL_CONFIG  LLM config group name from config.toml (required)
#   COMMIT_HASH   OpenHands revision to evaluate (presumably consumed by
#                 checkout_eval_branch from version_control.sh -- confirm)
#   AGENT         agent class to run (default: CodeActAgent)
#   EVAL_LIMIT    if set, evaluate only the first EVAL_LIMIT instances
#   NUM_WORKERS   parallel evaluation workers (default: 1)

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_openhands_version

echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

# PYTHONPATH is extended so run_infer.py can resolve the benchmark-local
# helper module.
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
|
evaluation/benchmarks/agent_bench/scripts/summarise_results.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import sys
|
3 |
+
|
4 |
+
|
5 |
+
def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
    """Partition the instance ids in an output JSONL file by test outcome.

    Each line of the file is one JSON record; a record counts as passed when
    ``record['test_result']['result']`` is present and truthy.

    Returns:
        A ``(passed, failed)`` pair of instance-id lists, in file order.
    """
    passed: list[str] = []
    failed: list[str] = []
    with open(res_file_path, 'r') as results_file:
        for raw_line in results_file:
            record = json.loads(raw_line.strip())
            resolved = False
            if 'test_result' in record and 'result' in record['test_result']:
                resolved = record['test_result']['result']
            bucket = passed if resolved else failed
            bucket.append(record['instance_id'])
    return passed, failed
|
20 |
+
|
21 |
+
|
22 |
+
if __name__ == '__main__':
    # Summarise an AgentBench output.jsonl: print pass/fail instance lists
    # and the overall resolve rate.
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    total = len(passed_tests) + len(failed_tests)
    # Fix: guard against an empty results file, which previously raised
    # ZeroDivisionError.
    succ_rate = len(passed_tests) / total if total else 0.0
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
|
evaluation/benchmarks/aider_bench/README.md
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AiderBench Evaluation
|
2 |
+
|
3 |
+
This folder contains evaluation harness for evaluating agents on the
|
4 |
+
[Aider Editing Benchmark](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md).
|
5 |
+
This will allow us to develop better editing approach without running the full
|
6 |
+
SWE-bench. The benchmark uses the
|
7 |
+
[RajMaheshwari/Exercism-Python](https://huggingface.co/datasets/RajMaheshwari/Exercism-Python)
|
8 |
+
Hugging Face dataset based on the
|
9 |
+
[Exercism python coding exercises](https://github.com/exercism/python).
|
10 |
+
|
11 |
+
## Setup Environment and LLM Configuration
|
12 |
+
|
13 |
+
Please follow the instructions [here](../../README.md#setup) to set up your local
|
14 |
+
development environment and LLM.
|
15 |
+
|
16 |
+
## Start the evaluation
|
17 |
+
|
18 |
+
```bash
|
19 |
+
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
|
20 |
+
```
|
21 |
+
|
22 |
+
- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
|
23 |
+
your LLM settings, as defined in your `config.toml`.
|
24 |
+
- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
|
25 |
+
you would like to evaluate. It could also be a release tag like `0.9.0`.
|
26 |
+
- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
|
27 |
+
defaulting to `CodeActAgent`.
|
28 |
+
- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
|
29 |
+
instances. By default, the script evaluates the entire Exercism test set
|
30 |
+
(133 issues). Note: in order to use `eval_limit`, you must also set `agent`.
|
31 |
+
- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
|
32 |
+
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
|
33 |
+
given IDs (comma separated).
|
34 |
+
|
35 |
+
You can also set the following optional environment variables:
|
36 |
+
|
37 |
+
```bash
|
38 |
+
export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unit tests. Defaults to false.
|
39 |
+
export SKIP_NUM=12 # skip the first 12 instances from the dataset
|
40 |
+
```
|
41 |
+
|
42 |
+
Following is the basic command to start the evaluation.
|
43 |
+
|
44 |
+
You can update the arguments in the script
|
45 |
+
`evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`,
|
46 |
+
`--eval-num-workers` and so on:
|
47 |
+
|
48 |
+
- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
|
49 |
+
- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
|
50 |
+
- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`.
|
51 |
+
- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
|
52 |
+
- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
|
53 |
+
- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.
|
54 |
+
|
55 |
+
```bash
|
56 |
+
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
|
57 |
+
```
|
58 |
+
|
59 |
+
### Run Inference on `RemoteRuntime` (experimental)
|
60 |
+
|
61 |
+
This is in limited beta. Contact Xingyao over slack if you want to try this out!
|
62 |
+
|
63 |
+
```bash
|
64 |
+
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
|
65 |
+
|
66 |
+
# Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel
|
67 |
+
export ALLHANDS_API_KEY="YOUR-API-KEY"
|
68 |
+
export RUNTIME=remote
|
69 |
+
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
|
70 |
+
./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2
|
71 |
+
```
|
72 |
+
|
73 |
+
## Summarize Results
|
74 |
+
|
75 |
+
```bash
|
76 |
+
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
|
77 |
+
```
|
78 |
+
|
79 |
+
Full example:
|
80 |
+
|
81 |
+
```bash
|
82 |
+
poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
|
83 |
+
```
|
84 |
+
|
85 |
+
This will list the instances that passed and the instances that failed. For each
|
86 |
+
instance, the corresponding set of test cases (which can vary for each instance)
|
87 |
+
are run on the file edited by the agent. We consider an instance to be passed
|
88 |
+
only if ALL test cases are passed. Sometimes even a single failed test case will
|
89 |
+
cause the entire instance to be marked as failed.
|
90 |
+
|
91 |
+
You can inspect the `test_results` field in the `output.jsonl` file to find the exact
|
92 |
+
outcome of the tests. If there are no syntax or indentation errors, you can
|
93 |
+
expect to see something like "`..F...EF..`", where "`.`" means the test case
|
94 |
+
passed, "`E`" means there was an error while executing the test case and "`F`"
|
95 |
+
means some assertion failed and some returned output was not as expected.
|
evaluation/benchmarks/aider_bench/create_dataset.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This file was used to create the hugging face dataset from the exercism/python
# github repo.
# Refer to: https://github.com/exercism/python/tree/main/exercises/practice

import os
from pathlib import Path

from datasets import Dataset

# Exercise directories, sorted so instance ids are assigned deterministically.
tests = sorted(os.listdir('practice/'))
# Columnar layout expected by Dataset.from_dict: one parallel list per field.
dataset = {
    'instance_id': [],
    'instance_name': [],
    'instruction': [],
    'signature': [],
    'test': [],
}

for i, test in enumerate(tests):
    testdir = Path(f'practice/{test}/')

    dataset['instance_id'].append(i)
    # Exercise names use dashes on disk but underscores in the dataset.
    dataset['instance_name'].append(testdir.name.replace('-', '_'))

    # Sanity check kept for reference: each exercise is expected to contain
    # exactly two Python files (the signature stub and the test file).
    # if len(glob.glob(f'practice/{testdir.name}/*.py')) != 2:
    #     print(testdir.name)

    # The instruction is the concatenation of the optional introduction, the
    # mandatory instructions, and the optional instructions appendix.
    instructions = ''
    introduction = testdir / '.docs/introduction.md'
    if introduction.exists():
        instructions += introduction.read_text()
    instructions += (testdir / '.docs/instructions.md').read_text()
    instructions_append = testdir / '.docs/instructions.append.md'
    if instructions_append.exists():
        instructions += instructions_append.read_text()

    dataset['instruction'].append(instructions)

    # Solution stub, e.g. practice/two-fer/two_fer.py
    signature_file = testdir / (testdir.name + '.py').replace('-', '_')
    dataset['signature'].append(signature_file.read_text())

    # Unit-test file, e.g. practice/two-fer/two_fer_test.py
    test_file = testdir / (testdir.name + '_test.py').replace('-', '_')
    dataset['test'].append(test_file.read_text())

ds = Dataset.from_dict(dataset)

ds.push_to_hub('RajMaheshwari/Exercism-Python')
|