ar08 commited on
Commit
246d201
·
verified ·
1 Parent(s): b5267e0

Upload 1040 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +5 -0
  2. .gitattributes +1 -35
  3. .gitignore +235 -0
  4. .nvmrc +1 -0
  5. .openhands/microagents/repo.md +42 -0
  6. CODE_OF_CONDUCT.md +147 -0
  7. COMMUNITY.md +43 -0
  8. CONTRIBUTING.md +123 -0
  9. CREDITS.md +312 -0
  10. Development.md +128 -0
  11. Dockerfile +98 -0
  12. ISSUE_TRIAGE.md +25 -0
  13. LICENSE +25 -0
  14. MANIFEST.in +5 -0
  15. Makefile +332 -0
  16. build.sh +5 -0
  17. config.sh +4 -0
  18. config.template.toml +290 -0
  19. containers/README.md +12 -0
  20. containers/app/config.sh +4 -0
  21. containers/app/entrypoint.sh +0 -0
  22. containers/build.sh +156 -0
  23. containers/dev/Dockerfile +124 -0
  24. containers/dev/README.md +57 -0
  25. containers/dev/compose.yml +38 -0
  26. containers/dev/dev.sh +39 -0
  27. containers/e2b-sandbox/Dockerfile +19 -0
  28. containers/e2b-sandbox/README.md +15 -0
  29. containers/e2b-sandbox/e2b.toml +14 -0
  30. containers/runtime/README.md +12 -0
  31. containers/runtime/config.sh +7 -0
  32. dev_config/python/.pre-commit-config.yaml +43 -0
  33. dev_config/python/mypy.ini +9 -0
  34. dev_config/python/ruff.toml +26 -0
  35. docker-compose.yml +23 -0
  36. entrypoint.sh +69 -0
  37. evaluation/README.md +94 -0
  38. evaluation/__init__.py +0 -0
  39. evaluation/benchmarks/EDA/README.md +46 -0
  40. evaluation/benchmarks/EDA/game.py +203 -0
  41. evaluation/benchmarks/EDA/run_infer.py +238 -0
  42. evaluation/benchmarks/EDA/scripts/run_infer.sh +60 -0
  43. evaluation/benchmarks/agent_bench/README.md +56 -0
  44. evaluation/benchmarks/agent_bench/__init__.py +0 -0
  45. evaluation/benchmarks/agent_bench/helper.py +77 -0
  46. evaluation/benchmarks/agent_bench/run_infer.py +329 -0
  47. evaluation/benchmarks/agent_bench/scripts/run_infer.sh +42 -0
  48. evaluation/benchmarks/agent_bench/scripts/summarise_results.py +37 -0
  49. evaluation/benchmarks/aider_bench/README.md +95 -0
  50. evaluation/benchmarks/aider_bench/create_dataset.py +47 -0
.dockerignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ frontend/node_modules
2
+ config.toml
3
+ .envrc
4
+ .env
5
+ .git
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.ipynb linguist-vendored
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ ./lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+ requirements.txt
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ # poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ frontend/.env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ .env.bak
131
+ venv.bak/
132
+ *venv/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
164
+ .vscode/
165
+ .cursorignore
166
+
167
+ # evaluation
168
+ evaluation/evaluation_outputs
169
+ evaluation/outputs
170
+ evaluation/swe_bench/eval_workspace*
171
+ evaluation/SWE-bench/data
172
+ evaluation/webarena/scripts/webarena_env.sh
173
+ evaluation/bird/data
174
+ evaluation/gaia/data
175
+ evaluation/gorilla/data
176
+ evaluation/toolqa/data
177
+ evaluation/scienceagentbench/benchmark
178
+ evaluation/commit0_bench/repos
179
+
180
+ # openhands resolver
181
+ output/
182
+
183
+ # frontend
184
+
185
+ # dependencies
186
+ frontend/.pnp
187
+ frontend/bun.lockb
188
+ frontend/yarn.lock
189
+ .pnp.js
190
+
191
+ # testing
192
+ frontend/coverage
193
+ test_results*
194
+ /_test_files_tmp/
195
+
196
+ # production
197
+ frontend/build
198
+ frontend/dist
199
+
200
+ # misc
201
+ .DS_Store
202
+ .env.local
203
+ .env.development.local
204
+ .env.test.local
205
+ .env.production.local
206
+
207
+ npm-debug.log*
208
+ yarn-debug.log*
209
+ yarn-error.log*
210
+
211
+ logs
212
+
213
+ # agent
214
+ .envrc
215
+ /workspace
216
+ /_test_workspace
217
+ /debug
218
+ cache
219
+
220
+ # configuration
221
+ config.toml
222
+ config.toml_
223
+ config.toml.bak
224
+
225
+ # swe-bench-eval
226
+ image_build_logs
227
+ run_instance_logs
228
+
229
+ runtime_*.tar
230
+
231
+ # docker build
232
+ containers/runtime/Dockerfile
233
+ containers/runtime/project.tar.gz
234
+ containers/runtime/code
235
+ **/node_modules/
.nvmrc ADDED
@@ -0,0 +1 @@
 
 
1
+ 22
.openhands/microagents/repo.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: repo
3
+ type: repo
4
+ agent: CodeActAgent
5
+ ---
6
+ This repository contains the code for OpenHands, an automated AI software engineer. It has a Python backend
7
+ (in the `openhands` directory) and React frontend (in the `frontend` directory).
8
+
9
+ ## General Setup:
10
+ To set up the entire repo, including frontend and backend, run `make build`.
11
+ You don't need to do this unless the user asks you to, or if you're trying to run the entire application.
12
+
13
+ Before pushing any changes, you should ensure that any lint errors or simple test errors have been fixed.
14
+
15
+ * If you've made changes to the backend, you should run `pre-commit run --all-files --config ./dev_config/python/.pre-commit-config.yaml`
16
+ * If you've made changes to the frontend, you should run `cd frontend && npm run lint:fix && npm run build ; cd ..`
17
+
18
+ If either command fails, it may have automatically fixed some issues. You should fix any issues that weren't automatically fixed,
19
+ then re-run the command to ensure it passes.
20
+
21
+ ## Repository Structure
22
+ Backend:
23
+ - Located in the `openhands` directory
24
+ - Testing:
25
+ - All tests are in `tests/unit/test_*.py`
26
+ - To test new code, run `poetry run pytest tests/unit/test_xxx.py` where `xxx` is the appropriate file for the current functionality
27
+ - Write all tests with pytest
28
+
29
+ Frontend:
30
+ - Located in the `frontend` directory
31
+ - Prerequisites: A recent version of NodeJS / NPM
32
+ - Setup: Run `npm install` in the frontend directory
33
+ - Testing:
34
+ - Run tests: `npm run test`
35
+ - To run specific tests: `npm run test -- -t "TestName"`
36
+ - Building:
37
+ - Build for production: `npm run build`
38
+ - Environment Variables:
39
+ - Set in `frontend/.env` or as environment variables
40
+ - Available variables: VITE_BACKEND_HOST, VITE_USE_TLS, VITE_INSECURE_SKIP_VERIFY, VITE_FRONTEND_PORT
41
+ - Internationalization:
42
+ - Generate i18n declaration file: `npm run make-i18n`
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual
11
+ identity and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people.
22
+ * Being respectful of differing opinions, viewpoints, and experiences.
23
+ * Giving and gracefully accepting constructive feedback.
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience.
26
+ * Focusing on what is best not just for us as individuals, but for the overall
27
+ community.
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or advances of
32
+ any kind.
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks.
34
+ * Public or private harassment.
35
+ * Publishing others' private information, such as a physical or email address,
36
+ without their explicit permission.
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting.
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official email address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement at
64
65
+ All complaints will be reviewed and investigated promptly and fairly.
66
+
67
+ All community leaders are obligated to respect the privacy and security of the
68
+ reporter of any incident.
69
+
70
+ ## Enforcement Guidelines
71
+
72
+ Community leaders will follow these Community Impact Guidelines in determining
73
+ the consequences for any action they deem in violation of this Code of Conduct:
74
+
75
+ ### 1. Correction
76
+
77
+ **Community Impact**: Use of inappropriate language or other behavior deemed
78
+ unprofessional or unwelcome in the community.
79
+
80
+ **Consequence**: A private, written warning from community leaders, providing
81
+ clarity around the nature of the violation and an explanation of why the
82
+ behavior was inappropriate. A public apology may be requested.
83
+
84
+ ### 2. Warning
85
+
86
+ **Community Impact**: A violation through a single incident or series of
87
+ actions.
88
+
89
+ **Consequence**: A warning with consequences for continued behavior. No
90
+ interaction with the people involved, including unsolicited interaction with
91
+ those enforcing the Code of Conduct, for a specified period of time. This
92
+ includes avoiding interactions in community spaces as well as external channels
93
+ like social media. Violating these terms may lead to a temporary or permanent
94
+ ban.
95
+
96
+ ### 3. Temporary Ban
97
+
98
+ **Community Impact**: A serious violation of community standards, including
99
+ sustained inappropriate behavior.
100
+
101
+ **Consequence**: A temporary ban from any sort of interaction or public
102
+ communication with the community for a specified period of time. No public or
103
+ private interaction with the people involved, including unsolicited interaction
104
+ with those enforcing the Code of Conduct, is allowed during this period.
105
+ Violating these terms may lead to a permanent ban.
106
+
107
+ ### 4. Permanent Ban
108
+
109
+ **Community Impact**: Demonstrating a pattern of violation of community
110
+ standards, including sustained inappropriate behavior, harassment of an
111
+ individual, or aggression toward or disparagement of classes of individuals.
112
+
113
+ **Consequence**: A permanent ban from any sort of public interaction within the
114
+ community.
115
+
116
+ ### Slack and Discord Etiquette
117
+
118
+ These Slack and Discord etiquette guidelines are designed to foster an inclusive, respectful, and productive environment for all community members. By following these best practices, we ensure effective communication and collaboration while minimizing disruptions. Let’s work together to build a supportive and welcoming community!
119
+
120
+ - Communicate respectfully and professionally, avoiding sarcasm or harsh language, and remember that tone can be difficult to interpret in text.
121
+ - Use threads for specific discussions to keep channels organized and easier to follow.
122
+ - Tag others only when their input is critical or urgent, and use @here, @channel or @everyone sparingly to minimize disruptions.
123
+ - Be patient, as open-source contributors and maintainers often have other commitments and may need time to respond.
124
+ - Post questions or discussions in the most relevant channel (e.g., [slack - #general](https://app.slack.com/client/T06P212QSEA/C06P5NCGSFP) for general topics, [slack - #questions](https://openhands-ai.slack.com/archives/C06U8UTKSAD) for queries/questions, [discord - #general](https://discord.com/channels/1222935860639563850/1222935861386018885)).
125
+ - When asking for help or raising issues, include necessary details like links, screenshots, or clear explanations to provide context.
126
+ - Keep discussions in public channels whenever possible to allow others to benefit from the conversation, unless the matter is sensitive or private.
127
+ - Always adhere to [our standards](https://github.com/All-Hands-AI/OpenHands/blob/main/CODE_OF_CONDUCT.md#our-standards) to ensure a welcoming and collaborative environment.
128
+ - If you choose to mute a channel, consider setting up alerts for topics that still interest you to stay engaged. For Slack, Go to Settings → Notifications → My Keywords to add specific keywords that will notify you when mentioned. For example, if you're here for discussions about LLMs, mute the channel if it’s too busy, but set notifications to alert you only when “LLMs” appears in messages. Also for Discord, go to the channel notifications and choose the option that best describes your need.
129
+
130
+ ## Attribution
131
+
132
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
133
+ version 2.1, available at
134
+ [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
135
+
136
+ Community Impact Guidelines were inspired by
137
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
138
+
139
+ For answers to common questions about this code of conduct, see the FAQ at
140
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
141
+ [https://www.contributor-covenant.org/translations][translations].
142
+
143
+ [homepage]: https://www.contributor-covenant.org
144
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
145
+ [Mozilla CoC]: https://github.com/mozilla/diversity
146
+ [FAQ]: https://www.contributor-covenant.org/faq
147
+ [translations]: https://www.contributor-covenant.org/translations
COMMUNITY.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🙌 The OpenHands Community
2
+
3
+ The OpenHands community is built around the belief that (1) AI and AI agents are going to fundamentally change the way
4
+ we build software, and (2) if this is true, we should do everything we can to make sure that the benefits provided by
5
+ such powerful technology are accessible to everyone.
6
+
7
+ If this resonates with you, we'd love to have you join us in our quest!
8
+
9
+ ## 🤝 How to Join
10
+
11
+ Check out our [How to Join the Community section.](https://github.com/All-Hands-AI/OpenHands?tab=readme-ov-file#-how-to-join-the-community)
12
+
13
+ ## 💪 Becoming a Contributor
14
+
15
+ We welcome contributions from everyone! Whether you're a developer, a researcher, or simply enthusiastic about advancing
16
+ the field of software engineering with AI, there are many ways to get involved:
17
+
18
+ - **Code Contributions:** Help us develop new core functionality, improve our agents, improve the frontend and other
19
+ interfaces, or anything else that would help make OpenHands better.
20
+ - **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in
21
+ evaluating the models, or suggest improvements.
22
+ - **Feedback and Testing:** Use the OpenHands toolset, report bugs, suggest features, or provide feedback on usability.
23
+
24
+ For details, please check [CONTRIBUTING.md](./CONTRIBUTING.md).
25
+
26
+ ## Code of Conduct
27
+
28
+ We have a [Code of Conduct](./CODE_OF_CONDUCT.md) that we expect all contributors to adhere to.
29
+ Long story short, we are aiming for an open, welcoming, diverse, inclusive, and healthy community.
30
+ All contributors are expected to contribute to building this sort of community.
31
+
32
+ ## 🛠️ Becoming a Maintainer
33
+
34
+ For contributors who have made significant and sustained contributions to the project, there is a possibility of joining
35
+ the maintainer team. The process for this is as follows:
36
+
37
+ 1. Any contributor who has made sustained and high-quality contributions to the codebase can be nominated by any
38
+ maintainer. If you feel that you may qualify you can reach out to any of the maintainers that have reviewed your PRs and ask if you can be nominated.
39
+ 2. Once a maintainer nominates a new maintainer, there will be a discussion period among the maintainers for at least 3 days.
40
+ 3. If no concerns are raised the nomination will be accepted by acclamation, and if concerns are raised there will be a discussion and possible vote.
41
+
42
+ Note that just making many PRs does not immediately imply that you will become a maintainer. We will be looking
43
+ at sustained high-quality contributions over a period of time, as well as good teamwork and adherence to our [Code of Conduct](./CODE_OF_CONDUCT.md).
CONTRIBUTING.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing
2
+
3
+ Thanks for your interest in contributing to OpenHands! We welcome and appreciate contributions.
4
+
5
+ ## Understanding OpenHands's CodeBase
6
+
7
+ To understand the codebase, please refer to the README in each module:
8
+ - [frontend](./frontend/README.md)
9
+ - [evaluation](./evaluation/README.md)
10
+ - [openhands](./openhands/README.md)
11
+ - [agenthub](./openhands/agenthub/README.md)
12
+ - [server](./openhands/server/README.md)
13
+
14
+ ## Setting up Your Development Environment
15
+
16
+ We have a separate doc [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) that tells you how to set up a development workflow.
17
+
18
+ ## How Can I Contribute?
19
+
20
+ There are many ways that you can contribute:
21
+
22
+ 1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
23
+ 2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
24
+ 3. **Improve the Codebase** by sending [PRs](#sending-pull-requests-to-openhands) (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that may be ones to start on.
25
+
26
+ ## What Can I Build?
27
+ Here are a few ways you can help improve the codebase.
28
+
29
+ #### UI/UX
30
+ We're always looking to improve the look and feel of the application. If you've got a small fix
31
+ for something that's bugging you, feel free to open up a PR that changes the [`./frontend`](./frontend) directory.
32
+
33
+ If you're looking to make a bigger change, add a new UI element, or significantly alter the style
34
+ of the application, please open an issue first, or better, join the #frontend channel in our Slack
35
+ to gather consensus from our design team first.
36
+
37
+ #### Improving the agent
38
+ Our main agent is the CodeAct agent. You can [see its prompts here](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub/codeact_agent).
39
+
40
+ Changes to these prompts, and to the underlying behavior in Python, can have a huge impact on user experience.
41
+ You can try modifying the prompts to see how they change the behavior of the agent as you use the app
42
+ locally, but we will need to do an end-to-end evaluation of any changes here to ensure that the agent
43
+ is getting better over time.
44
+
45
+ We use the [SWE-bench](https://www.swebench.com/) benchmark to test our agent. You can join the #evaluation
46
+ channel in Slack to learn more.
47
+
48
+ #### Adding a new agent
49
+ You may want to experiment with building new types of agents. You can add an agent to [`openhands/agenthub`](./openhands/agenthub)
50
+ to help expand the capabilities of OpenHands.
51
+
52
+ #### Adding a new runtime
53
+ The agent needs a place to run code and commands. When you run OpenHands on your laptop, it uses a Docker container
54
+ to do this by default. But there are other ways of creating a sandbox for the agent.
55
+
56
+ If you work for a company that provides a cloud-based runtime, you could help us add support for that runtime
57
+ by implementing the [interface specified here](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/base.py).
58
+
59
+ #### Testing
60
+ When you write code, it is also good to write tests. Please navigate to the [`./tests`](./tests) folder to see existing test suites.
61
+ At the moment, we have two kinds of tests: [`unit`](./tests/unit) and [`integration`](./evaluation/integration_tests). Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.
62
+
63
+ ## Sending Pull Requests to OpenHands
64
+
65
+ You'll need to fork our repository to send us a Pull Request. You can learn more
66
+ about how to fork a GitHub repo and open a PR with your changes in [this article](https://medium.com/swlh/forks-and-pull-requests-how-to-contribute-to-github-repos-8843fac34ce8).
67
+
68
+ ### Pull Request title
69
+ As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:
70
+
71
+ - `feat`: A new feature
72
+ - `fix`: A bug fix
73
+ - `docs`: Documentation only changes
74
+ - `style`: Changes that do not affect the meaning of the code (white space, formatting, missing semicolons, etc.)
75
+ - `refactor`: A code change that neither fixes a bug nor adds a feature
76
+ - `perf`: A code change that improves performance
77
+ - `test`: Adding missing tests or correcting existing tests
78
+ - `build`: Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm)
79
+ - `ci`: Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs)
80
+ - `chore`: Other changes that don't modify src or test files
81
+ - `revert`: Reverts a previous commit
82
+
83
+ For example, a PR title could be:
84
+ - `refactor: modify package path`
85
+ - `feat(frontend): xxxx`, where `(frontend)` means that this PR mainly focuses on the frontend component.
86
+
87
+ You may also check out previous PRs in the [PR list](https://github.com/All-Hands-AI/OpenHands/pulls).
88
+
89
+ ### Pull Request description
90
+ - If your PR is small (such as a typo fix), you can go brief.
91
+ - If it contains a lot of changes, it's better to write more details.
92
+
93
+ If your changes are user-facing (e.g. a new feature in the UI, a change in behavior, or a bugfix)
94
+ please include a short message that we can add to our changelog.
95
+
96
+ ## How to Make Effective Contributions
97
+
98
+ ### Opening Issues
99
+
100
+ If you notice any bugs or have any feature requests please open them via the [issues page](https://github.com/All-Hands-AI/OpenHands/issues). We will triage based on how critical the bug is or how potentially useful the improvement is, discuss, and implement the ones that the community has interest/effort for.
101
+
102
+ Further, if you see an issue you like, please leave a "thumbs-up" or a comment, which will help us prioritize.
103
+
104
+ ### Making Pull Requests
105
+
106
+ We're generally happy to consider all pull requests with the evaluation process varying based on the type of change:
107
+
108
+ #### For Small Improvements
109
+
110
+ Small improvements with few downsides are typically reviewed and approved quickly.
111
+ One thing to check when making changes is to ensure that all continuous integration tests pass, which you can check before getting a review.
112
+
113
+ #### For Core Agent Changes
114
+
115
+ We need to be more careful with changes to the core agent, as it is imperative to maintain high quality. These PRs are evaluated based on three key metrics:
116
+
117
+ 1. **Accuracy**
118
+ 2. **Efficiency**
119
+ 3. **Code Complexity**
120
+
121
+ If it improves accuracy, efficiency, or both with only a minimal change to code quality, that's great — we're happy to merge it in!
122
+ If there are bigger tradeoffs (e.g. helping efficiency a lot and hurting accuracy a little) we might want to put it behind a feature flag.
123
+ Either way, please feel free to discuss on GitHub issues or Slack, and we will give guidance and preliminary feedback.
CREDITS.md ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Credits
2
+
3
+ ## Contributors
4
+
5
+ We would like to thank all the [contributors](https://github.com/All-Hands-AI/OpenHands/graphs/contributors) who have helped make OpenHands possible. We greatly appreciate your dedication and hard work.
6
+
7
+ ## Open Source Projects
8
+
9
+ OpenHands includes and adapts the following open source projects. We are grateful for their contributions to the open source community:
10
+
11
+ #### [SWE Agent](https://github.com/princeton-nlp/swe-agent)
12
+ - License: MIT License
13
+ - Description: Adapted for use in OpenHands's agent hub
14
+
15
+ #### [Aider](https://github.com/paul-gauthier/aider)
16
+ - License: Apache License 2.0
17
+ - Description: AI pair programming tool. OpenHands has adapted and integrated its linter module for code-related tasks in [`agentskills utilities`](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/runtime/plugins/agent_skills/utils/aider)
18
+
19
+ #### [BrowserGym](https://github.com/ServiceNow/BrowserGym)
20
+ - License: Apache License 2.0
21
+ - Description: Adapted in implementing the browsing agent
22
+
23
+
24
+ ### Reference Implementations for Evaluation Benchmarks
25
+ OpenHands integrates code of the reference implementations for the following agent evaluation benchmarks:
26
+
27
+ #### [HumanEval](https://github.com/openai/human-eval)
28
+ - License: MIT License
29
+
30
+ #### [DSP](https://github.com/microsoft/DataScienceProblems)
31
+ - License: MIT License
32
+
33
+ #### [HumanEvalPack](https://github.com/bigcode-project/bigcode-evaluation-harness)
34
+ - License: Apache License 2.0
35
+
36
+ #### [AgentBench](https://github.com/THUDM/AgentBench)
37
+ - License: Apache License 2.0
38
+
39
+ #### [SWE-Bench](https://github.com/princeton-nlp/SWE-bench)
40
+ - License: MIT License
41
+
42
+ #### [BIRD](https://bird-bench.github.io/)
43
+ - License: MIT License
44
+ - Dataset: CC-BY-SA 4.0
45
+
46
+ #### [Gorilla APIBench](https://github.com/ShishirPatil/gorilla)
47
+ - License: Apache License 2.0
48
+
49
+ #### [GPQA](https://github.com/idavidrein/gpqa)
50
+ - License: MIT License
51
+
52
+ #### [ProntoQA](https://github.com/asaparov/prontoqa)
53
+ - License: Apache License 2.0
54
+
55
+
56
+ ## Open Source licenses
57
+
58
+ ### MIT License
59
+
60
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
61
+
62
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
63
+
64
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
65
+
66
+ ### BSD 3-Clause License
67
+
68
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
69
+
70
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
71
+
72
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
73
+
74
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
75
+
76
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
77
+
78
+ ### Apache License 2.0
79
+
80
+
81
+ Apache License
82
+ Version 2.0, January 2004
83
+ http://www.apache.org/licenses/
84
+
85
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
86
+
87
+ 1. Definitions.
88
+
89
+ "License" shall mean the terms and conditions for use, reproduction,
90
+ and distribution as defined by Sections 1 through 9 of this document.
91
+
92
+ "Licensor" shall mean the copyright owner or entity authorized by
93
+ the copyright owner that is granting the License.
94
+
95
+ "Legal Entity" shall mean the union of the acting entity and all
96
+ other entities that control, are controlled by, or are under common
97
+ control with that entity. For the purposes of this definition,
98
+ "control" means (i) the power, direct or indirect, to cause the
99
+ direction or management of such entity, whether by contract or
100
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
101
+ outstanding shares, or (iii) beneficial ownership of such entity.
102
+
103
+ "You" (or "Your") shall mean an individual or Legal Entity
104
+ exercising permissions granted by this License.
105
+
106
+ "Source" form shall mean the preferred form for making modifications,
107
+ including but not limited to software source code, documentation
108
+ source, and configuration files.
109
+
110
+ "Object" form shall mean any form resulting from mechanical
111
+ transformation or translation of a Source form, including but
112
+ not limited to compiled object code, generated documentation,
113
+ and conversions to other media types.
114
+
115
+ "Work" shall mean the work of authorship, whether in Source or
116
+ Object form, made available under the License, as indicated by a
117
+ copyright notice that is included in or attached to the work
118
+ (an example is provided in the Appendix below).
119
+
120
+ "Derivative Works" shall mean any work, whether in Source or Object
121
+ form, that is based on (or derived from) the Work and for which the
122
+ editorial revisions, annotations, elaborations, or other modifications
123
+ represent, as a whole, an original work of authorship. For the purposes
124
+ of this License, Derivative Works shall not include works that remain
125
+ separable from, or merely link (or bind by name) to the interfaces of,
126
+ the Work and Derivative Works thereof.
127
+
128
+ "Contribution" shall mean any work of authorship, including
129
+ the original version of the Work and any modifications or additions
130
+ to that Work or Derivative Works thereof, that is intentionally
131
+ submitted to Licensor for inclusion in the Work by the copyright owner
132
+ or by an individual or Legal Entity authorized to submit on behalf of
133
+ the copyright owner. For the purposes of this definition, "submitted"
134
+ means any form of electronic, verbal, or written communication sent
135
+ to the Licensor or its representatives, including but not limited to
136
+ communication on electronic mailing lists, source code control systems,
137
+ and issue tracking systems that are managed by, or on behalf of, the
138
+ Licensor for the purpose of discussing and improving the Work, but
139
+ excluding communication that is conspicuously marked or otherwise
140
+ designated in writing by the copyright owner as "Not a Contribution."
141
+
142
+ "Contributor" shall mean Licensor and any individual or Legal Entity
143
+ on behalf of whom a Contribution has been received by Licensor and
144
+ subsequently incorporated within the Work.
145
+
146
+ 2. Grant of Copyright License. Subject to the terms and conditions of
147
+ this License, each Contributor hereby grants to You a perpetual,
148
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
149
+ copyright license to reproduce, prepare Derivative Works of,
150
+ publicly display, publicly perform, sublicense, and distribute the
151
+ Work and such Derivative Works in Source or Object form.
152
+
153
+ 3. Grant of Patent License. Subject to the terms and conditions of
154
+ this License, each Contributor hereby grants to You a perpetual,
155
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
156
+ (except as stated in this section) patent license to make, have made,
157
+ use, offer to sell, sell, import, and otherwise transfer the Work,
158
+ where such license applies only to those patent claims licensable
159
+ by such Contributor that are necessarily infringed by their
160
+ Contribution(s) alone or by combination of their Contribution(s)
161
+ with the Work to which such Contribution(s) was submitted. If You
162
+ institute patent litigation against any entity (including a
163
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
164
+ or a Contribution incorporated within the Work constitutes direct
165
+ or contributory patent infringement, then any patent licenses
166
+ granted to You under this License for that Work shall terminate
167
+ as of the date such litigation is filed.
168
+
169
+ 4. Redistribution. You may reproduce and distribute copies of the
170
+ Work or Derivative Works thereof in any medium, with or without
171
+ modifications, and in Source or Object form, provided that You
172
+ meet the following conditions:
173
+
174
+ (a) You must give any other recipients of the Work or
175
+ Derivative Works a copy of this License; and
176
+
177
+ (b) You must cause any modified files to carry prominent notices
178
+ stating that You changed the files; and
179
+
180
+ (c) You must retain, in the Source form of any Derivative Works
181
+ that You distribute, all copyright, patent, trademark, and
182
+ attribution notices from the Source form of the Work,
183
+ excluding those notices that do not pertain to any part of
184
+ the Derivative Works; and
185
+
186
+ (d) If the Work includes a "NOTICE" text file as part of its
187
+ distribution, then any Derivative Works that You distribute must
188
+ include a readable copy of the attribution notices contained
189
+ within such NOTICE file, excluding those notices that do not
190
+ pertain to any part of the Derivative Works, in at least one
191
+ of the following places: within a NOTICE text file distributed
192
+ as part of the Derivative Works; within the Source form or
193
+ documentation, if provided along with the Derivative Works; or,
194
+ within a display generated by the Derivative Works, if and
195
+ wherever such third-party notices normally appear. The contents
196
+ of the NOTICE file are for informational purposes only and
197
+ do not modify the License. You may add Your own attribution
198
+ notices within Derivative Works that You distribute, alongside
199
+ or as an addendum to the NOTICE text from the Work, provided
200
+ that such additional attribution notices cannot be construed
201
+ as modifying the License.
202
+
203
+ You may add Your own copyright statement to Your modifications and
204
+ may provide additional or different license terms and conditions
205
+ for use, reproduction, or distribution of Your modifications, or
206
+ for any such Derivative Works as a whole, provided Your use,
207
+ reproduction, and distribution of the Work otherwise complies with
208
+ the conditions stated in this License.
209
+
210
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
211
+ any Contribution intentionally submitted for inclusion in the Work
212
+ by You to the Licensor shall be under the terms and conditions of
213
+ this License, without any additional terms or conditions.
214
+ Notwithstanding the above, nothing herein shall supersede or modify
215
+ the terms of any separate license agreement you may have executed
216
+ with Licensor regarding such Contributions.
217
+
218
+ 6. Trademarks. This License does not grant permission to use the trade
219
+ names, trademarks, service marks, or product names of the Licensor,
220
+ except as required for reasonable and customary use in describing the
221
+ origin of the Work and reproducing the content of the NOTICE file.
222
+
223
+ 7. Disclaimer of Warranty. Unless required by applicable law or
224
+ agreed to in writing, Licensor provides the Work (and each
225
+ Contributor provides its Contributions) on an "AS IS" BASIS,
226
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
227
+ implied, including, without limitation, any warranties or conditions
228
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
229
+ PARTICULAR PURPOSE. You are solely responsible for determining the
230
+ appropriateness of using or redistributing the Work and assume any
231
+ risks associated with Your exercise of permissions under this License.
232
+
233
+ 8. Limitation of Liability. In no event and under no legal theory,
234
+ whether in tort (including negligence), contract, or otherwise,
235
+ unless required by applicable law (such as deliberate and grossly
236
+ negligent acts) or agreed to in writing, shall any Contributor be
237
+ liable to You for damages, including any direct, indirect, special,
238
+ incidental, or consequential damages of any character arising as a
239
+ result of this License or out of the use or inability to use the
240
+ Work (including but not limited to damages for loss of goodwill,
241
+ work stoppage, computer failure or malfunction, or any and all
242
+ other commercial damages or losses), even if such Contributor
243
+ has been advised of the possibility of such damages.
244
+
245
+ 9. Accepting Warranty or Additional Liability. While redistributing
246
+ the Work or Derivative Works thereof, You may choose to offer,
247
+ and charge a fee for, acceptance of support, warranty, indemnity,
248
+ or other liability obligations and/or rights consistent with this
249
+ License. However, in accepting such obligations, You may act only
250
+ on Your own behalf and on Your sole responsibility, not on behalf
251
+ of any other Contributor, and only if You agree to indemnify,
252
+ defend, and hold each Contributor harmless for any liability
253
+ incurred by, or claims asserted against, such Contributor by reason
254
+ of your accepting any such warranty or additional liability.
255
+
256
+ END OF TERMS AND CONDITIONS
257
+
258
+ APPENDIX: How to apply the Apache License to your work.
259
+
260
+ To apply the Apache License to your work, attach the following
261
+ boilerplate notice, with the fields enclosed by brackets "[]"
262
+ replaced with your own identifying information. (Don't include
263
+ the brackets!) The text should be enclosed in the appropriate
264
+ comment syntax for the file format. We also recommend that a
265
+ file or class name and description of purpose be included on the
266
+ same "printed page" as the copyright notice for easier
267
+ identification within third-party archives.
268
+
269
+ Copyright [yyyy] [name of copyright owner]
270
+
271
+
272
+
273
+ ### Non-Open Source Reference Implementations:
274
+
275
+ #### [MultiPL-E](https://github.com/nuprl/MultiPL-E)
276
+ - License: BSD 3-Clause License with Machine Learning Restriction
277
+
278
+ BSD 3-Clause License with Machine Learning Restriction
279
+
280
+ Copyright (c) 2022, Northeastern University, Oberlin College, Roblox Inc,
281
+ Stevens Institute of Technology, University of Massachusetts Amherst, and
282
+ Wellesley College.
283
+
284
+ All rights reserved.
285
+
286
+ Redistribution and use in source and binary forms, with or without
287
+ modification, are permitted provided that the following conditions are met:
288
+
289
+ 1. Redistributions of source code must retain the above copyright notice, this
290
+ list of conditions and the following disclaimer.
291
+
292
+ 2. Redistributions in binary form must reproduce the above copyright notice,
293
+ this list of conditions and the following disclaimer in the documentation
294
+ and/or other materials provided with the distribution.
295
+
296
+ 3. Neither the name of the copyright holder nor the names of its
297
+ contributors may be used to endorse or promote products derived from
298
+ this software without specific prior written permission.
299
+
300
+ 4. The contents of this repository may not be used as training data for any
301
+ machine learning model, including but not limited to neural networks.
302
+
303
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
304
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
305
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
306
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
307
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
308
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
309
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
310
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
311
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
312
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Development.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Development Guide
2
+ This guide is for people working on OpenHands and editing the source code.
3
+ If you wish to contribute your changes, check out the [CONTRIBUTING.md](https://github.com/All-Hands-AI/OpenHands/blob/main/CONTRIBUTING.md) on how to clone and set up the project initially before moving on.
4
+ Otherwise, you can clone the OpenHands project directly.
5
+
6
+ ## Start the Server for Development
7
+ ### 1. Requirements
8
+ * Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [Ubuntu >= 22.04]
9
+ * [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
10
+ * [Python](https://www.python.org/downloads/) = 3.12
11
+ * [NodeJS](https://nodejs.org/en/download/package-manager) >= 20.x
12
+ * [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
13
+ * OS-specific dependencies:
14
+ - Ubuntu: build-essential => `sudo apt-get install build-essential`
15
+ - WSL: netcat => `sudo apt-get install netcat`
16
+
17
+ Make sure you have all these dependencies installed before moving on to `make build`.
18
+
19
+ #### Develop without sudo access
20
+ If you want to develop without system admin/sudo access to upgrade/install `Python` and/or `NodeJs`, you can use `conda` or `mamba` to manage the packages for you:
21
+
22
+ ```bash
23
+ # Download and install Mamba (a faster version of conda)
24
+ curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
25
+ bash Miniforge3-$(uname)-$(uname -m).sh
26
+
27
+ # Install Python 3.12, nodejs, and poetry
28
+ mamba install python=3.12
29
+ mamba install conda-forge::nodejs
30
+ mamba install conda-forge::poetry
31
+ ```
32
+
33
+ ### 2. Build and Setup The Environment
34
+ Begin by building the project which includes setting up the environment and installing dependencies. This step ensures that OpenHands is ready to run on your system:
35
+
36
+ ```bash
37
+ make build
38
+ ```
39
+
40
+ ### 3. Configuring the Language Model
41
+ OpenHands supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library.
42
+ By default, we've chosen Claude 3.5 Sonnet as our go-to model, but the world is your oyster! You can unleash the
43
+ potential of any other LM that piques your interest.
44
+
45
+ To configure the LM of your choice, run:
46
+
47
+ ```bash
48
+ make setup-config
49
+ ```
50
+
51
+ This command will prompt you to enter the LLM API key, model name, and other variables ensuring that OpenHands is tailored to your specific needs. Note that the model name will apply only when you run headless. If you use the UI, please set the model in the UI.
52
+
53
+ Note: If you have previously run OpenHands using the docker command, you may have already set some environmental variables in your terminal. The final configurations are set from highest to lowest priority:
54
+ Environment variables > config.toml variables > default variables
55
+
56
+ **Note on Alternative Models:**
57
+ See [our documentation](https://docs.all-hands.dev/modules/usage/llms) for recommended models.
58
+
59
+ ### 4. Running the application
60
+ #### Option A: Run the Full Application
61
+ Once the setup is complete, this command starts both the backend and frontend servers, allowing you to interact with OpenHands:
62
+ ```bash
63
+ make run
64
+ ```
65
+
66
+ #### Option B: Individual Server Startup
67
+ - **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on backend-related tasks or configurations.
68
+ ```bash
69
+ make start-backend
70
+ ```
71
+
72
+ - **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related components or interface enhancements.
73
+ ```bash
74
+ make start-frontend
75
+ ```
76
+
77
+ ### 6. LLM Debugging
78
+ If you encounter any issues with the Language Model (LM) or you're simply curious, export `DEBUG=1` in the environment and restart the backend.
79
+ OpenHands will log the prompts and responses in the logs/llm/CURRENT_DATE directory, allowing you to identify the causes.
80
+
81
+ ### 7. Help
82
+ Need help or info on available targets and commands? Use the help command for all the guidance you need with OpenHands.
83
+ ```bash
84
+ make help
85
+ ```
86
+
87
+ ### 8. Testing
88
+ To run tests, refer to the following:
89
+ #### Unit tests
90
+
91
+ ```bash
92
+ poetry run pytest ./tests/unit/test_*.py
93
+ ```
94
+
95
+ ### 9. Add or update dependency
96
+ 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`.
97
+ 2. Update the poetry.lock file via `poetry lock --no-update`.
98
+
99
+ ### 10. Use existing Docker image
100
+ To reduce build time (e.g., if no changes were made to the client-runtime component), you can use an existing Docker container image by
101
+ setting the SANDBOX_RUNTIME_CONTAINER_IMAGE environment variable to the desired Docker image.
102
+
103
+ Example: `export SANDBOX_RUNTIME_CONTAINER_IMAGE=ghcr.io/all-hands-ai/runtime:0.20-nikolaik`
104
+
105
+ ## Develop inside Docker container
106
+
107
+ TL;DR
108
+
109
+ ```bash
110
+ make docker-dev
111
+ ```
112
+
113
+ See more details [here](./containers/dev/README.md).
114
+
115
+ If you are just interested in running `OpenHands` without installing all the required tools on your host, run:
116
+
117
+ ```bash
118
+ make docker-run
119
+ ```
120
+
121
+ If you do not have `make` on your host, run:
122
+
123
+ ```bash
124
+ cd ./containers/dev
125
+ ./dev.sh
126
+ ```
127
+
128
+ You do need [Docker](https://docs.docker.com/engine/install/) installed on your host though.
Dockerfile ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG OPENHANDS_BUILD_VERSION=dev
2
+ FROM node:21.7.2-bookworm-slim AS frontend-builder
3
+
4
+ WORKDIR /app
5
+
6
+ COPY ./frontend/package.json frontend/package-lock.json ./
7
+ RUN npm install -g [email protected]
8
+ RUN npm ci
9
+
10
+ COPY ./frontend ./
11
+ RUN npm run build
12
+
13
+ FROM python:3.12.3-slim AS backend-builder
14
+
15
+ WORKDIR /app
16
+ ENV PYTHONPATH='/app'
17
+
18
+ ENV POETRY_NO_INTERACTION=1 \
19
+ POETRY_VIRTUALENVS_IN_PROJECT=1 \
20
+ POETRY_VIRTUALENVS_CREATE=1 \
21
+ POETRY_CACHE_DIR=/tmp/poetry_cache
22
+
23
+ RUN apt-get update -y \
24
+ && apt-get install -y curl make git build-essential \
25
+ && python3 -m pip install poetry==1.8.2 --break-system-packages
26
+
27
+ COPY ./pyproject.toml ./poetry.lock ./
28
+ RUN touch README.md
29
+ RUN export POETRY_CACHE_DIR && poetry install --without evaluation,llama-index --no-root && rm -rf $POETRY_CACHE_DIR
30
+
31
+ FROM python:3.12.3-slim AS openhands-app
32
+
33
+ WORKDIR /app
34
+
35
+ ARG OPENHANDS_BUILD_VERSION #re-declare for this section
36
+
37
+ ENV RUN_AS_OPENHANDS=true
38
+ # A random number--we need this to be different from the user's UID on the host machine
39
+ ENV OPENHANDS_USER_ID=42420
40
+ ENV SANDBOX_LOCAL_RUNTIME_URL=http://host.docker.internal
41
+ ENV USE_HOST_NETWORK=false
42
+ ENV WORKSPACE_BASE=/opt/workspace_base
43
+ ENV OPENHANDS_BUILD_VERSION=$OPENHANDS_BUILD_VERSION
44
+ ENV SANDBOX_USER_ID=0
45
+ ENV FILE_STORE=local
46
+ ENV FILE_STORE_PATH=/.openhands-state
47
+ RUN mkdir -p $FILE_STORE_PATH
48
+ RUN mkdir -p $WORKSPACE_BASE
49
+
50
+ RUN apt-get update -y \
51
+ && apt-get install -y curl ssh sudo
52
+
53
+ # Default is 1000, but OSX is often 501
54
+ RUN sed -i 's/^UID_MIN.*/UID_MIN 499/' /etc/login.defs
55
+ # Default is 60000, but we've seen up to 200000
56
+ RUN sed -i 's/^UID_MAX.*/UID_MAX 1000000/' /etc/login.defs
57
+
58
+ RUN groupadd app
59
+ RUN useradd -l -m -u $OPENHANDS_USER_ID -s /bin/bash openhands && \
60
+ usermod -aG app openhands && \
61
+ usermod -aG sudo openhands && \
62
+ echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
63
+ RUN chown -R openhands:app /app && chmod -R 770 /app
64
+ RUN sudo chown -R openhands:app $WORKSPACE_BASE && sudo chmod -R 770 $WORKSPACE_BASE
65
+ USER openhands
66
+
67
+ ENV VIRTUAL_ENV=/app/.venv \
68
+ PATH="/app/.venv/bin:$PATH" \
69
+ PYTHONPATH='/app'
70
+
71
+ COPY --chown=openhands:app --chmod=770 --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
72
+ RUN playwright install --with-deps chromium
73
+
74
+ COPY --chown=openhands:app --chmod=770 ./microagents ./microagents
75
+ COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
76
+ COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
77
+ COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
78
+ COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
79
+ COPY --chown=openhands:app ./poetry.lock ./poetry.lock
80
+ COPY --chown=openhands:app ./README.md ./README.md
81
+ COPY --chown=openhands:app ./MANIFEST.in ./MANIFEST.in
82
+ COPY --chown=openhands:app ./LICENSE ./LICENSE
83
+
84
+ # This is run as "openhands" user, and will create __pycache__ with openhands:openhands ownership
85
+ RUN python openhands/core/download.py # No-op to download assets
86
+ # Add this line to set group ownership of all files/directories not already in "app" group
87
+ # openhands:openhands -> openhands:app
88
+ RUN find /app \! -group app -exec chgrp app {} +
89
+
90
+ COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build ./frontend/build
91
+ COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
92
+
93
+ USER root
94
+
95
+ WORKDIR /app
96
+
97
+ ENTRYPOINT ["/app/entrypoint.sh"]
98
+ CMD ["uvicorn", "openhands.server.listen:app", "--host", "0.0.0.0", "--port", "3000"]
ISSUE_TRIAGE.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Issue Triage
2
+ These are the procedures and guidelines on how issues are triaged in this repo by the maintainers.
3
+
4
+ ## General
5
+ * Most issues must be tagged with **enhancement** or **bug**.
6
+ * Issues may be tagged with what they relate to (**backend**, **frontend**, **agent quality**, etc.).
7
+
8
+ ## Severity
9
+ * **Low**: Minor issues, or issues affecting a single user.
10
+ * **Medium**: Affecting multiple users.
11
+ * **Critical**: Affecting all users or potential security issues.
12
+
13
+ ## Effort
14
+ * Issues may be estimated with effort required (**small effort**, **medium effort**, **large effort**).
15
+
16
+ ## Difficulty
17
+ * Issues with low implementation difficulty may be tagged with **good first issue**.
18
+
19
+ ## Not Enough Information
20
+ * User is asked to provide more information (logs, how to reproduce, etc.) when the issue is not clear.
21
+ * If an issue is unclear and the author does not provide more information or respond to a request, the issue may be closed as **not planned** (Usually after a week).
22
+
23
+ ## Multiple Requests/Fixes in One Issue
24
+ * These issues will be narrowed down to one request/fix so the issue is more easily tracked and fixed.
25
+ * Issues may be broken down into multiple issues if required.
LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+ =====================
3
+
4
+ Copyright © 2023
5
+
6
+ Permission is hereby granted, free of charge, to any person
7
+ obtaining a copy of this software and associated documentation
8
+ files (the “Software”), to deal in the Software without
9
+ restriction, including without limitation the rights to use,
10
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the
12
+ Software is furnished to do so, subject to the following
13
+ conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ OTHER DEALINGS IN THE SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Exclude all Python bytecode files
2
+ global-exclude *.pyc
3
+
4
+ # Exclude Python cache directories
5
+ global-exclude __pycache__
Makefile ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SHELL=/bin/bash
2
+ # Makefile for OpenHands project
3
+
4
+ # Variables
5
+ BACKEND_HOST ?= "127.0.0.1"
6
+ BACKEND_PORT = 3000
7
+ BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
8
+ FRONTEND_PORT = 3001
9
+ DEFAULT_WORKSPACE_DIR = "./workspace"
10
+ DEFAULT_MODEL = "gpt-4o"
11
+ CONFIG_FILE = config.toml
12
+ PRE_COMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
13
+ PYTHON_VERSION = 3.12
14
+
15
+ # ANSI color codes
16
+ GREEN=$(shell tput -Txterm setaf 2)
17
+ YELLOW=$(shell tput -Txterm setaf 3)
18
+ RED=$(shell tput -Txterm setaf 1)
19
+ BLUE=$(shell tput -Txterm setaf 6)
20
+ RESET=$(shell tput -Txterm sgr0)
21
+
22
+ # Build
23
+ build:
24
+ @echo "$(GREEN)Building project...$(RESET)"
25
+ @$(MAKE) -s check-dependencies
26
+ @$(MAKE) -s install-python-dependencies
27
+ @$(MAKE) -s install-frontend-dependencies
28
+ @$(MAKE) -s install-pre-commit-hooks
29
+ @$(MAKE) -s build-frontend
30
+ @echo "$(GREEN)Build completed successfully.$(RESET)"
31
+
32
+ check-dependencies:
33
+ @echo "$(YELLOW)Checking dependencies...$(RESET)"
34
+ @$(MAKE) -s check-system
35
+ @$(MAKE) -s check-python
36
+ @$(MAKE) -s check-npm
37
+ @$(MAKE) -s check-nodejs
38
+ ifeq ($(INSTALL_DOCKER),)
39
+ @$(MAKE) -s check-docker
40
+ endif
41
+ @$(MAKE) -s check-poetry
42
+ @echo "$(GREEN)Dependencies checked successfully.$(RESET)"
43
+
44
+ check-system:
45
+ @echo "$(YELLOW)Checking system...$(RESET)"
46
+ @if [ "$(shell uname)" = "Darwin" ]; then \
47
+ echo "$(BLUE)macOS detected.$(RESET)"; \
48
+ elif [ "$(shell uname)" = "Linux" ]; then \
49
+ if [ -f "/etc/manjaro-release" ]; then \
50
+ echo "$(BLUE)Manjaro Linux detected.$(RESET)"; \
51
+ else \
52
+ echo "$(BLUE)Linux detected.$(RESET)"; \
53
+ fi; \
54
+ elif [ "$$(uname -r | grep -i microsoft)" ]; then \
55
+ echo "$(BLUE)Windows Subsystem for Linux detected.$(RESET)"; \
56
+ else \
57
+ echo "$(RED)Unsupported system detected. Please use macOS, Linux, or Windows Subsystem for Linux (WSL).$(RESET)"; \
58
+ exit 1; \
59
+ fi
60
+
61
+ check-python:
62
+ @echo "$(YELLOW)Checking Python installation...$(RESET)"
63
+ @if command -v python$(PYTHON_VERSION) > /dev/null; then \
64
+ echo "$(BLUE)$(shell python$(PYTHON_VERSION) --version) is already installed.$(RESET)"; \
65
+ else \
66
+ echo "$(RED)Python $(PYTHON_VERSION) is not installed. Please install Python $(PYTHON_VERSION) to continue.$(RESET)"; \
67
+ exit 1; \
68
+ fi
69
+
70
+ check-npm:
71
+ @echo "$(YELLOW)Checking npm installation...$(RESET)"
72
+ @if command -v npm > /dev/null; then \
73
+ echo "$(BLUE)npm $(shell npm --version) is already installed.$(RESET)"; \
74
+ else \
75
+ echo "$(RED)npm is not installed. Please install Node.js to continue.$(RESET)"; \
76
+ exit 1; \
77
+ fi
78
+
79
+ check-nodejs:
80
+ @echo "$(YELLOW)Checking Node.js installation...$(RESET)"
81
+ @if command -v node > /dev/null; then \
82
+ NODE_VERSION=$(shell node --version | sed -E 's/v//g'); \
83
+ IFS='.' read -r -a NODE_VERSION_ARRAY <<< "$$NODE_VERSION"; \
84
+ if [ "$${NODE_VERSION_ARRAY[0]}" -ge 20 ]; then \
85
+ echo "$(BLUE)Node.js $$NODE_VERSION is already installed.$(RESET)"; \
86
+ else \
87
+ echo "$(RED)Node.js 20.x or later is required. Please install Node.js 20.x or later to continue.$(RESET)"; \
88
+ exit 1; \
89
+ fi; \
90
+ else \
91
+ echo "$(RED)Node.js is not installed. Please install Node.js to continue.$(RESET)"; \
92
+ exit 1; \
93
+ fi
94
+
95
+ check-docker:
96
+ @echo "$(YELLOW)Checking Docker installation...$(RESET)"
97
+ @if command -v docker > /dev/null; then \
98
+ echo "$(BLUE)$(shell docker --version) is already installed.$(RESET)"; \
99
+ else \
100
+ echo "$(RED)Docker is not installed. Please install Docker to continue.$(RESET)"; \
101
+ exit 1; \
102
+ fi
103
+
104
+ check-poetry:
105
+ @echo "$(YELLOW)Checking Poetry installation...$(RESET)"
106
+ @if command -v poetry > /dev/null; then \
107
+ POETRY_VERSION=$(shell poetry --version 2>&1 | sed -E 's/Poetry \(version ([0-9]+\.[0-9]+\.[0-9]+)\)/\1/'); \
108
+ IFS='.' read -r -a POETRY_VERSION_ARRAY <<< "$$POETRY_VERSION"; \
109
+ if [ $${POETRY_VERSION_ARRAY[0]} -gt 1 ] || ([ $${POETRY_VERSION_ARRAY[0]} -eq 1 ] && [ $${POETRY_VERSION_ARRAY[1]} -ge 8 ]); then \
110
+ echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"; \
111
+ else \
112
+ echo "$(RED)Poetry 1.8 or later is required. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
113
+ echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
114
+ echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
115
+ exit 1; \
116
+ fi; \
117
+ else \
118
+ echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
119
+ echo "$(RED) curl -sSL https://install.python-poetry.org | python$(PYTHON_VERSION) -$(RESET)"; \
120
+ echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
121
+ exit 1; \
122
+ fi
123
+
124
+ install-python-dependencies:
125
+ @echo "$(GREEN)Installing Python dependencies...$(RESET)"
126
+ @if [ -z "${TZ}" ]; then \
127
+ echo "Defaulting TZ (timezone) to UTC"; \
128
+ export TZ="UTC"; \
129
+ fi
130
+ poetry env use python$(PYTHON_VERSION)
131
+ @if [ "$(shell uname)" = "Darwin" ]; then \
132
+ echo "$(BLUE)Installing chroma-hnswlib...$(RESET)"; \
133
+ export HNSWLIB_NO_NATIVE=1; \
134
+ poetry run pip install chroma-hnswlib; \
135
+ fi
136
+ @poetry install --without llama-index
137
+ @if [ -f "/etc/manjaro-release" ]; then \
138
+ echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
139
+ poetry run pip install playwright; \
140
+ poetry run playwright install chromium; \
141
+ else \
142
+ if [ ! -f cache/playwright_chromium_is_installed.txt ]; then \
143
+ echo "Running playwright install --with-deps chromium..."; \
144
+ poetry run playwright install --with-deps chromium; \
145
+ mkdir -p cache; \
146
+ touch cache/playwright_chromium_is_installed.txt; \
147
+ else \
148
+ echo "Setup already done. Skipping playwright installation."; \
149
+ fi \
150
+ fi
151
+ @echo "$(GREEN)Python dependencies installed successfully.$(RESET)"
152
+
153
# Install frontend dependencies: verify the local Node.js version first,
# then install packages with npm inside the frontend/ directory.
install-frontend-dependencies:
	@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
	@echo "$(YELLOW)Detect Node.js version...$(RESET)"
	@cd frontend && node ./scripts/detect-node-version.js
	@echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"
	@cd frontend && npm install
	@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"
160
+
161
+ install-pre-commit-hooks:
162
+ @echo "$(YELLOW)Installing pre-commit hooks...$(RESET)"
163
+ @git config --unset-all core.hooksPath || true
164
+ @poetry run pre-commit install --config $(PRE_COMMIT_CONFIG_PATH)
165
+ @echo "$(GREEN)Pre-commit hooks installed successfully.$(RESET)"
166
+
167
+ lint-backend:
168
+ @echo "$(YELLOW)Running linters...$(RESET)"
169
+ @poetry run pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* --show-diff-on-failure --config $(PRE_COMMIT_CONFIG_PATH)
170
+
171
+ lint-frontend:
172
+ @echo "$(YELLOW)Running linters for frontend...$(RESET)"
173
+ @cd frontend && npm run lint
174
+
175
+ lint:
176
+ @$(MAKE) -s lint-frontend
177
+ @$(MAKE) -s lint-backend
178
+
179
+ test-frontend:
180
+ @echo "$(YELLOW)Running tests for frontend...$(RESET)"
181
+ @cd frontend && npm run test
182
+
183
+ test:
184
+ @$(MAKE) -s test-frontend
185
+
186
+ build-frontend:
187
+ @echo "$(YELLOW)Building frontend...$(RESET)"
188
+ @cd frontend && npm run build
189
+
190
+ # Start backend
191
+ start-backend:
192
+ @echo "$(YELLOW)Starting backend...$(RESET)"
193
+ @poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "./workspace"
194
+
195
+ # Start frontend
196
+ start-frontend:
197
+ @echo "$(YELLOW)Starting frontend...$(RESET)"
198
+ @cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run dev -- --port $(FRONTEND_PORT) --host $(BACKEND_HOST)
199
+
200
+ # Common setup for running the app (non-callable)
201
+ _run_setup:
202
+ @if [ "$(OS)" = "Windows_NT" ]; then \
203
+ echo "$(RED) Windows is not supported, use WSL instead!$(RESET)"; \
204
+ exit 1; \
205
+ fi
206
+ @mkdir -p logs
207
+ @echo "$(YELLOW)Starting backend server...$(RESET)"
208
+ @poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) &
209
+ @echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
210
+ @until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
211
+ @echo "$(GREEN)Backend started successfully.$(RESET)"
212
+
213
+ # Run the app (standard mode)
214
+ run:
215
+ @echo "$(YELLOW)Running the app...$(RESET)"
216
+ @$(MAKE) -s _run_setup
217
+ @$(MAKE) -s start-frontend
218
+ @echo "$(GREEN)Application started successfully.$(RESET)"
219
+
220
+ # Run the app (in docker)
221
+ docker-run: WORKSPACE_BASE ?= $(PWD)/workspace
222
+ docker-run:
223
+ @if [ -f /.dockerenv ]; then \
224
+ echo "Running inside a Docker container. Exiting..."; \
225
+ exit 0; \
226
+ else \
227
+ echo "$(YELLOW)Running the app in Docker $(OPTIONS)...$(RESET)"; \
228
+ export WORKSPACE_BASE=${WORKSPACE_BASE}; \
229
+ export SANDBOX_USER_ID=$(shell id -u); \
230
+ export DATE=$(shell date +%Y%m%d%H%M%S); \
231
+ docker compose up $(OPTIONS); \
232
+ fi
233
+
234
+ # Run the app (WSL mode)
235
+ run-wsl:
236
+ @echo "$(YELLOW)Running the app in WSL mode...$(RESET)"
237
+ @$(MAKE) -s _run_setup
238
+ @cd frontend && echo "$(BLUE)Starting frontend with npm (WSL mode)...$(RESET)" && npm run dev_wsl -- --port $(FRONTEND_PORT)
239
+ @echo "$(GREEN)Application started successfully in WSL mode.$(RESET)"
240
+
241
+ # Setup config.toml
242
+ setup-config:
243
+ @echo "$(YELLOW)Setting up config.toml...$(RESET)"
244
+ @$(MAKE) setup-config-prompts
245
+ @mv $(CONFIG_FILE).tmp $(CONFIG_FILE)
246
+ @echo "$(GREEN)Config.toml setup completed.$(RESET)"
247
+
248
+ setup-config-prompts:
249
+ @echo "[core]" > $(CONFIG_FILE).tmp
250
+
251
+ @read -p "Enter your workspace directory (as absolute path) [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
252
+ workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
253
+ echo "workspace_base=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp
254
+
255
+ @echo "" >> $(CONFIG_FILE).tmp
256
+
257
+ @echo "[llm]" >> $(CONFIG_FILE).tmp
258
+ @read -p "Enter your LLM model name, used for running without UI. Set the model in the UI after you start the app. (see https://docs.litellm.ai/docs/providers for full list) [default: $(DEFAULT_MODEL)]: " llm_model; \
259
+ llm_model=$${llm_model:-$(DEFAULT_MODEL)}; \
260
+ echo "model=\"$$llm_model\"" >> $(CONFIG_FILE).tmp
261
+
262
+ @read -p "Enter your LLM api key: " llm_api_key; \
263
+ echo "api_key=\"$$llm_api_key\"" >> $(CONFIG_FILE).tmp
264
+
265
+ @read -p "Enter your LLM base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
266
+ if [[ ! -z "$$llm_base_url" ]]; then echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi
267
+
268
+ @echo "Enter your LLM Embedding Model"; \
269
+ echo "Choices are:"; \
270
+ echo " - openai"; \
271
+ echo " - azureopenai"; \
272
+ echo " - Embeddings available only with OllamaEmbedding:"; \
273
+ echo " - llama2"; \
274
+ echo " - mxbai-embed-large"; \
275
+ echo " - nomic-embed-text"; \
276
+ echo " - all-minilm"; \
277
+ echo " - stable-code"; \
278
+ echo " - bge-m3"; \
279
+ echo " - bge-large"; \
280
+ echo " - paraphrase-multilingual"; \
281
+ echo " - snowflake-arctic-embed"; \
282
+ echo " - Leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
283
+ read -p "> " llm_embedding_model; \
284
+ echo "embedding_model=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
285
+ if [ "$$llm_embedding_model" = "llama2" ] || [ "$$llm_embedding_model" = "mxbai-embed-large" ] || [ "$$llm_embedding_model" = "nomic-embed-text" ] || [ "$$llm_embedding_model" = "all-minilm" ] || [ "$$llm_embedding_model" = "stable-code" ]; then \
286
+ read -p "Enter the local model URL for the embedding model (will set llm.embedding_base_url): " llm_embedding_base_url; \
287
+ echo "embedding_base_url=\"$$llm_embedding_base_url\"" >> $(CONFIG_FILE).tmp; \
288
+ elif [ "$$llm_embedding_model" = "azureopenai" ]; then \
289
+ read -p "Enter the Azure endpoint URL (will overwrite llm.base_url): " llm_base_url; \
290
+ echo "base_url=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
291
+ read -p "Enter the Azure LLM Embedding Deployment Name: " llm_embedding_deployment_name; \
292
+ echo "embedding_deployment_name=\"$$llm_embedding_deployment_name\"" >> $(CONFIG_FILE).tmp; \
293
+ read -p "Enter the Azure API Version: " llm_api_version; \
294
+ echo "api_version=\"$$llm_api_version\"" >> $(CONFIG_FILE).tmp; \
295
+ fi
296
+
297
+
298
+ # Develop in container
299
+ docker-dev:
300
+ @if [ -f /.dockerenv ]; then \
301
+ echo "Running inside a Docker container. Exiting..."; \
302
+ exit 0; \
303
+ else \
304
+ echo "$(YELLOW)Build and run in Docker $(OPTIONS)...$(RESET)"; \
305
+ ./containers/dev/dev.sh $(OPTIONS); \
306
+ fi
307
+
308
+ # Clean up all caches
309
+ clean:
310
+ @echo "$(YELLOW)Cleaning up caches...$(RESET)"
311
+ @rm -rf openhands/.cache
312
+ @echo "$(GREEN)Caches cleaned up successfully.$(RESET)"
313
+
314
+ # Help
315
+ help:
316
+ @echo "$(BLUE)Usage: make [target]$(RESET)"
317
+ @echo "Targets:"
318
+ @echo " $(GREEN)build$(RESET) - Build project, including environment setup and dependencies."
319
+ @echo " $(GREEN)lint$(RESET) - Run linters on the project."
320
+ @echo " $(GREEN)setup-config$(RESET) - Setup the configuration for OpenHands by providing LLM API key,"
321
+ @echo " LLM Model name, and workspace directory."
322
+ @echo " $(GREEN)start-backend$(RESET) - Start the backend server for the OpenHands project."
323
+ @echo " $(GREEN)start-frontend$(RESET) - Start the frontend server for the OpenHands project."
324
+ @echo " $(GREEN)run$(RESET) - Run the OpenHands application, starting both backend and frontend servers."
325
+ @echo " Backend Log file will be stored in the 'logs' directory."
326
+ @echo " $(GREEN)docker-dev$(RESET) - Build and run the OpenHands application in Docker."
327
+ @echo " $(GREEN)docker-run$(RESET) - Run the OpenHands application, starting both backend and frontend servers in Docker."
328
+ @echo " $(GREEN)help$(RESET) - Display this help message, providing information on available targets."
329
+
330
# Phony targets — none of these recipes produce a file of the same name.
# Every target is listed so that a stray file (e.g. a file named "test" or
# "clean" in the repo root) can never shadow a target and silently skip it.
.PHONY: build check-dependencies check-system check-python check-npm check-nodejs check-docker check-poetry
.PHONY: install-python-dependencies install-frontend-dependencies install-pre-commit-hooks
.PHONY: lint lint-backend lint-frontend test test-frontend build-frontend
.PHONY: start-backend start-frontend _run_setup run run-wsl setup-config setup-config-prompts clean help
.PHONY: docker-dev docker-run
build.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
#!/bin/bash
# Build the OpenHands Python distribution: stage the project metadata files
# next to the package sources, then build sdist/wheel with Poetry (verbose).
set -e

for meta_file in pyproject.toml poetry.lock; do
    cp "$meta_file" openhands
done

poetry build -v
config.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ DOCKER_REGISTRY=ghcr.io
2
+ DOCKER_ORG=all-hands-ai
3
+ DOCKER_IMAGE=openhands
4
+ DOCKER_BASE_DIR="."
config.template.toml ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###################### OpenHands Configuration Example ######################
2
+ #
3
+ # All settings have default values, so you only need to uncomment and
4
+ # modify what you want to change
5
+ # The fields within each section are sorted in alphabetical order.
6
+ #
7
+ ##############################################################################
8
+
9
+ #################################### Core ####################################
10
+ # General core configurations
11
+ ##############################################################################
12
+ [core]
13
+ # API key for E2B
14
+ #e2b_api_key = ""
15
+
16
+ # API key for Modal
17
+ #modal_api_token_id = ""
18
+ #modal_api_token_secret = ""
19
+
20
+ # Base path for the workspace
21
+ workspace_base = "./workspace"
22
+
23
+ # Cache directory path
24
+ #cache_dir = "/tmp/cache"
25
+
26
+ # Reasoning effort for o1 models (low, medium, high, or not set)
27
+ #reasoning_effort = "medium"
28
+
29
+ # Debugging enabled
30
+ #debug = false
31
+
32
+ # Disable color in terminal output
33
+ #disable_color = false
34
+
35
+ # Enable saving and restoring the session when run from CLI
36
+ #enable_cli_session = false
37
+
38
+ # Path to store trajectories, can be a folder or a file
39
+ # If it's a folder, the session id will be used as the file name
40
+ #save_trajectory_path="./trajectories"
41
+
42
+ # Path to replay a trajectory, must be a file path
43
+ # If provided, trajectory will be loaded and replayed before the
44
+ # agent responds to any user instruction
45
+ #replay_trajectory_path = ""
46
+
47
+ # File store path
48
+ #file_store_path = "/tmp/file_store"
49
+
50
+ # File store type
51
+ #file_store = "memory"
52
+
53
+ # List of allowed file extensions for uploads
54
+ #file_uploads_allowed_extensions = [".*"]
55
+
56
+ # Maximum file size for uploads, in megabytes
57
+ #file_uploads_max_file_size_mb = 0
58
+
59
+ # Maximum budget per task, 0.0 means no limit
60
+ #max_budget_per_task = 0.0
61
+
62
+ # Maximum number of iterations
63
+ #max_iterations = 100
64
+
65
+ # Path to mount the workspace in the sandbox
66
+ #workspace_mount_path_in_sandbox = "/workspace"
67
+
68
+ # Path to mount the workspace
69
+ #workspace_mount_path = ""
70
+
71
+ # Path to rewrite the workspace mount path to
72
+ #workspace_mount_rewrite = ""
73
+
74
+ # Run as openhands
75
+ #run_as_openhands = true
76
+
77
+ # Runtime environment
78
+ #runtime = "eventstream"
79
+
80
+ # Name of the default agent
81
+ #default_agent = "CodeActAgent"
82
+
83
+ # JWT secret for authentication
84
+ #jwt_secret = ""
85
+
86
+ # Restrict file types for file uploads
87
+ #file_uploads_restrict_file_types = false
88
+
89
+ # List of allowed file extensions for uploads
90
+ #file_uploads_allowed_extensions = [".*"]
91
+
92
+ #################################### LLM #####################################
93
+ # Configuration for LLM models (group name starts with 'llm')
94
+ # use 'llm' for the default LLM config
95
+ ##############################################################################
96
+ [llm]
97
+ # AWS access key ID
98
+ #aws_access_key_id = ""
99
+
100
+ # AWS region name
101
+ #aws_region_name = ""
102
+
103
+ # AWS secret access key
104
+ #aws_secret_access_key = ""
105
+
106
+ # API key to use (For Headless / CLI only - In Web this is overridden by Session Init)
107
+ api_key = "your-api-key"
108
+
109
+ # API base URL (For Headless / CLI only - In Web this is overridden by Session Init)
110
+ #base_url = ""
111
+
112
+ # API version
113
+ #api_version = ""
114
+
115
+ # Cost per input token
116
+ #input_cost_per_token = 0.0
117
+
118
+ # Cost per output token
119
+ #output_cost_per_token = 0.0
120
+
121
+ # Custom LLM provider
122
+ #custom_llm_provider = ""
123
+
124
+ # Embedding API base URL
125
+ #embedding_base_url = ""
126
+
127
+ # Embedding deployment name
128
+ #embedding_deployment_name = ""
129
+
130
+ # Embedding model to use
131
+ embedding_model = "local"
132
+
133
+ # Maximum number of characters in an observation's content
134
+ #max_message_chars = 10000
135
+
136
+ # Maximum number of input tokens
137
+ #max_input_tokens = 0
138
+
139
+ # Maximum number of output tokens
140
+ #max_output_tokens = 0
141
+
142
+ # Model to use. (For Headless / CLI only - In Web this is overridden by Session Init)
143
+ model = "gpt-4o"
144
+
145
+ # Number of retries to attempt when an operation fails with the LLM.
146
+ # Increase this value to allow more attempts before giving up
147
+ #num_retries = 8
148
+
149
+ # Maximum wait time (in seconds) between retry attempts
150
+ # This caps the exponential backoff to prevent excessively long wait times between retries
151
+ #retry_max_wait = 120
152
+
153
+ # Minimum wait time (in seconds) between retry attempts
154
+ # This sets the initial delay before the first retry
155
+ #retry_min_wait = 15
156
+
157
+ # Multiplier for exponential backoff calculation
158
+ # The wait time increases by this factor after each failed attempt
159
+ # A value of 2.0 means each retry waits twice as long as the previous one
160
+ #retry_multiplier = 2.0
161
+
162
+ # Drop any unmapped (unsupported) params without causing an exception
163
+ #drop_params = false
164
+
165
+ # Modify params for litellm to do transformations like adding a default message, when a message is empty.
166
+ # Note: this setting is global, unlike drop_params, it cannot be overridden in each call to litellm.
167
+ #modify_params = true
168
+
169
+ # Using the prompt caching feature if provided by the LLM and supported
170
+ #caching_prompt = true
171
+
172
+ # Base URL for the OLLAMA API
173
+ #ollama_base_url = ""
174
+
175
+ # Temperature for the API
176
+ #temperature = 0.0
177
+
178
+ # Timeout for the API
179
+ #timeout = 0
180
+
181
+ # Top p for the API
182
+ #top_p = 1.0
183
+
184
+ # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
185
+ #disable_vision = true
186
+
187
+ # Custom tokenizer to use for token counting
188
+ # https://docs.litellm.ai/docs/completion/token_usage
189
+ #custom_tokenizer = ""
190
+
191
+ # Whether to use native tool calling if supported by the model. Can be true, false, or None by default, which chooses the model's default behavior based on the evaluation.
192
+ # ATTENTION: Based on evaluation, enabling native function calling may lead to worse results
193
+ # in some scenarios. Use with caution and consider testing with your specific use case.
194
+ # https://github.com/All-Hands-AI/OpenHands/pull/4711
195
+ #native_tool_calling = None
196
+
197
+ [llm.gpt4o-mini]
+ api_key = "your-api-key"
+ model = "gpt-4o-mini"
200
+
201
+
202
+ #################################### Agent ###################################
203
+ # Configuration for agents (group name starts with 'agent')
204
+ # Use 'agent' for the default agent config
205
+ # otherwise, group name must be `agent.<agent_name>` (case-sensitive), e.g.
206
+ # agent.CodeActAgent
207
+ ##############################################################################
208
+ [agent]
209
+
210
+ # whether the browsing tool is enabled
211
+ codeact_enable_browsing = true
212
+
213
+ # whether the LLM draft editor is enabled
214
+ codeact_enable_llm_editor = false
215
+
216
+ # whether the IPython tool is enabled
217
+ codeact_enable_jupyter = true
218
+
219
+ # Name of the micro agent to use for this agent
220
+ #micro_agent_name = ""
221
+
222
+ # Memory enabled
223
+ #memory_enabled = false
224
+
225
+ # Memory maximum threads
226
+ #memory_max_threads = 3
227
+
228
+ # LLM config group to use
229
+ #llm_config = 'your-llm-config-group'
230
+
231
+ # Whether to use prompt extension (e.g., microagent, repo/runtime info) at all
232
+ #enable_prompt_extensions = true
233
+
234
+ # List of microagents to disable
235
+ #disabled_microagents = []
236
+
237
+ [agent.RepoExplorerAgent]
238
+ # Example: use a cheaper model for RepoExplorerAgent to reduce cost, especially
239
+ # useful when an agent doesn't demand high quality but uses a lot of tokens
240
+ llm_config = 'gpt3'
241
+
242
+ #################################### Sandbox ###################################
243
+ # Configuration for the sandbox
244
+ ##############################################################################
245
+ [sandbox]
246
+ # Sandbox timeout in seconds
247
+ #timeout = 120
248
+
249
+ # Sandbox user ID
250
+ #user_id = 1000
251
+
252
+ # Container image to use for the sandbox
253
+ #base_container_image = "nikolaik/python-nodejs:python3.12-nodejs22"
254
+
255
+ # Use host network
256
+ #use_host_network = false
257
+
258
+ # runtime extra build args
259
+ #runtime_extra_build_args = ["--network=host", "--add-host=host.docker.internal:host-gateway"]
260
+
261
+ # Enable auto linting after editing
262
+ #enable_auto_lint = false
263
+
264
+ # Whether to initialize plugins
265
+ #initialize_plugins = true
266
+
267
+ # Extra dependencies to install in the runtime image
268
+ #runtime_extra_deps = ""
269
+
270
+ # Environment variables to set at the launch of the runtime
271
+ #runtime_startup_env_vars = {}
272
+
273
+ # BrowserGym environment to use for evaluation
274
+ #browsergym_eval_env = ""
275
+
276
+ #################################### Security ###################################
277
+ # Configuration for security features
278
+ ##############################################################################
279
+ [security]
280
+
281
+ # Enable confirmation mode (For Headless / CLI only - In Web this is overridden by Session Init)
282
+ #confirmation_mode = false
283
+
284
+ # The security analyzer to use (For Headless / CLI only - In Web this is overridden by Session Init)
285
+ #security_analyzer = ""
286
+
287
+ #################################### Eval ####################################
288
+ # Configuration for the evaluation, please refer to the specific evaluation
289
+ # plugin for the available options
290
+ ##############################################################################
containers/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Docker Containers
2
+
3
+ Each folder here contains a Dockerfile, and a config.sh describing how to build
4
+ the images and where to push them. These images are built and pushed in GitHub Actions
5
+ by the `ghcr.yml` workflow.
6
+
7
+ ## Building Manually
8
+
9
+ ```bash
10
+ docker build -f containers/app/Dockerfile -t openhands .
11
+ docker build -f containers/sandbox/Dockerfile -t sandbox .
12
+ ```
containers/app/config.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ DOCKER_REGISTRY=ghcr.io
2
+ DOCKER_ORG=all-hands-ai
3
+ DOCKER_IMAGE=openhands
4
+ DOCKER_BASE_DIR="."
containers/app/entrypoint.sh ADDED
File without changes
containers/build.sh ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eo pipefail
3
+
4
+ # Initialize variables with default values
5
+ image_name=""
6
+ org_name=""
7
+ push=0
8
+ load=0
9
+ tag_suffix=""
10
+
11
+ # Function to display usage information
12
+ usage() {
13
+ echo "Usage: $0 -i <image_name> [-o <org_name>] [--push] [--load] [-t <tag_suffix>]"
14
+ echo " -i: Image name (required)"
15
+ echo " -o: Organization name"
16
+ echo " --push: Push the image"
17
+ echo " --load: Load the image"
18
+ echo " -t: Tag suffix"
19
+ exit 1
20
+ }
21
+
22
+ # Parse command-line options
23
+ while [[ $# -gt 0 ]]; do
24
+ case $1 in
25
+ -i) image_name="$2"; shift 2 ;;
26
+ -o) org_name="$2"; shift 2 ;;
27
+ --push) push=1; shift ;;
28
+ --load) load=1; shift ;;
29
+ -t) tag_suffix="$2"; shift 2 ;;
30
+ *) usage ;;
31
+ esac
32
+ done
33
+ # Check if required arguments are provided
34
+ if [[ -z "$image_name" ]]; then
35
+ echo "Error: Image name is required."
36
+ usage
37
+ fi
38
+
39
+ echo "Building: $image_name"
40
+ tags=()
41
+
42
+ OPENHANDS_BUILD_VERSION="dev"
43
+
44
+ cache_tag_base="buildcache"
45
+ cache_tag="$cache_tag_base"
46
+
47
+ if [[ -n $RELEVANT_SHA ]]; then
48
+ git_hash=$(git rev-parse --short "$RELEVANT_SHA")
49
+ tags+=("$git_hash")
50
+ tags+=("$RELEVANT_SHA")
51
+ fi
52
+
53
+ if [[ -n $GITHUB_REF_NAME ]]; then
54
+ # check if ref name is a version number
55
+ if [[ $GITHUB_REF_NAME =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
56
+ major_version=$(echo "$GITHUB_REF_NAME" | cut -d. -f1)
57
+ minor_version=$(echo "$GITHUB_REF_NAME" | cut -d. -f1,2)
58
+ tags+=("$major_version" "$minor_version")
59
+ tags+=("latest")
60
+ fi
61
+ sanitized_ref_name=$(echo "$GITHUB_REF_NAME" | sed 's/[^a-zA-Z0-9.-]\+/-/g')
62
+ OPENHANDS_BUILD_VERSION=$sanitized_ref_name
63
+ sanitized_ref_name=$(echo "$sanitized_ref_name" | tr '[:upper:]' '[:lower:]') # lower case is required in tagging
64
+ tags+=("$sanitized_ref_name")
65
+ cache_tag+="-${sanitized_ref_name}"
66
+ fi
67
+
68
+ if [[ -n $tag_suffix ]]; then
69
+ cache_tag+="-${tag_suffix}"
70
+ for i in "${!tags[@]}"; do
71
+ tags[$i]="${tags[$i]}-$tag_suffix"
72
+ done
73
+ fi
74
+
75
+ echo "Tags: ${tags[@]}"
76
+
77
+ if [[ "$image_name" == "openhands" ]]; then
78
+ dir="./containers/app"
79
+ elif [[ "$image_name" == "runtime" ]]; then
80
+ dir="./containers/runtime"
81
+ else
82
+ dir="./containers/$image_name"
83
+ fi
84
+
85
+ if [[ (! -f "$dir/Dockerfile") && "$image_name" != "runtime" ]]; then
86
+ # Allow runtime to be built without a Dockerfile
87
+ echo "No Dockerfile found"
88
+ exit 1
89
+ fi
90
+ if [[ ! -f "$dir/config.sh" ]]; then
91
+ echo "No config.sh found for Dockerfile"
92
+ exit 1
93
+ fi
94
+
95
+ source "$dir/config.sh"
96
+
97
+ if [[ -n "$org_name" ]]; then
98
+ DOCKER_ORG="$org_name"
99
+ fi
100
+
101
+ # If $DOCKER_IMAGE_SOURCE_TAG is set, add it to the tags
102
+ if [[ -n "$DOCKER_IMAGE_SOURCE_TAG" ]]; then
103
+ tags+=("$DOCKER_IMAGE_SOURCE_TAG")
104
+ fi
105
+ # If $DOCKER_IMAGE_TAG is set, add it to the tags
106
+ if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
107
+ tags+=("$DOCKER_IMAGE_TAG")
108
+ fi
109
+
110
+ DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
111
+ DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
112
+ echo "Repo: $DOCKER_REPOSITORY"
113
+ echo "Base dir: $DOCKER_BASE_DIR"
114
+
115
+ args=""
116
+ for tag in "${tags[@]}"; do
117
+ args+=" -t $DOCKER_REPOSITORY:$tag"
118
+ done
119
+
120
+ if [[ $push -eq 1 ]]; then
121
+ args+=" --push"
122
+ args+=" --cache-to=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag,mode=max"
123
+ fi
124
+
125
+ if [[ $load -eq 1 ]]; then
126
+ args+=" --load"
127
+ fi
128
+
129
+ echo "Args: $args"
130
+
131
+ # Modify the platform selection based on --load flag
132
+ if [[ $load -eq 1 ]]; then
133
+ # When loading, build only for the current platform
134
+ platform=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}')
135
+ else
136
+ # For push or without load, build for multiple platforms
137
+ platform="linux/amd64,linux/arm64"
138
+ fi
139
+
140
+ echo "Building for platform(s): $platform"
141
+
142
+ docker buildx build \
143
+ $args \
144
+ --build-arg OPENHANDS_BUILD_VERSION="$OPENHANDS_BUILD_VERSION" \
145
+ --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag \
146
+ --cache-from=type=registry,ref=$DOCKER_REPOSITORY:$cache_tag_base-main \
147
+ --platform $platform \
148
+ --provenance=false \
149
+ -f "$dir/Dockerfile" \
150
+ "$DOCKER_BASE_DIR"
151
+
152
+ # If load was requested, print the loaded images
153
+ if [[ $load -eq 1 ]]; then
154
+ echo "Local images built:"
155
+ docker images "$DOCKER_REPOSITORY" --format "{{.Repository}}:{{.Tag}}"
156
+ fi
containers/dev/Dockerfile ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # syntax=docker/dockerfile:1
2
+
3
+ ###
4
+ FROM ubuntu:22.04 AS dind
5
+
6
+ # https://docs.docker.com/engine/install/ubuntu/
7
+ RUN apt-get update && apt-get install -y \
8
+ ca-certificates \
9
+ curl \
10
+ && install -m 0755 -d /etc/apt/keyrings \
11
+ && curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
12
+ && chmod a+r /etc/apt/keyrings/docker.asc \
13
+ && echo \
14
+ "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
15
+ $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
16
+
17
+ RUN apt-get update && apt-get install -y \
18
+ docker-ce \
19
+ docker-ce-cli \
20
+ containerd.io \
21
+ docker-buildx-plugin \
22
+ docker-compose-plugin \
23
+ && rm -rf /var/lib/apt/lists/* \
24
+ && apt-get clean \
25
+ && apt-get autoremove -y
26
+
27
+ ###
28
+ FROM dind AS openhands
29
+
30
+ ENV DEBIAN_FRONTEND=noninteractive
31
+
32
+ #
33
+ RUN apt-get update && apt-get install -y \
34
+ bash \
35
+ build-essential \
36
+ curl \
37
+ git \
38
+ git-lfs \
39
+ software-properties-common \
40
+ make \
41
+ netcat \
42
+ sudo \
43
+ wget \
44
+ && rm -rf /var/lib/apt/lists/* \
45
+ && apt-get clean \
46
+ && apt-get autoremove -y
47
+
48
+ # https://github.com/cli/cli/blob/trunk/docs/install_linux.md
49
+ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
50
+ && chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
51
+ && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
52
+ && apt-get update && apt-get -y install \
53
+ gh \
54
+ && rm -rf /var/lib/apt/lists/* \
55
+ && apt-get clean \
56
+ && apt-get autoremove -y
57
+
58
+ # Python 3.12
59
+ RUN add-apt-repository ppa:deadsnakes/ppa \
60
+ && apt-get update \
61
+ && apt-get install -y python3.12 python3.12-venv python3.12-dev python3-pip \
62
+ && ln -s /usr/bin/python3.12 /usr/bin/python
63
+
64
+ # NodeJS >= 18.17.1
65
+ RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
66
+ && apt-get install -y nodejs
67
+
68
+ # Poetry >= 1.8
69
+ RUN curl -fsSL https://install.python-poetry.org | python3.12 - \
70
+ && ln -s ~/.local/bin/poetry /usr/local/bin/poetry
71
+
72
+ #
73
+ RUN <<EOF
74
+ #!/bin/bash
75
+ printf "#!/bin/bash
76
+ set +x
77
+ uname -a
78
+ docker --version
79
+ gh --version | head -n 1
80
+ git --version
81
+ #
82
+ python --version
83
+ echo node `node --version`
84
+ echo npm `npm --version`
85
+ poetry --version
86
+ netcat -h 2>&1 | head -n 1
87
+ " > /version.sh
88
+ chmod a+x /version.sh
89
+ EOF
90
+
91
+ ###
92
+ FROM openhands AS dev
93
+
94
+ RUN apt-get update && apt-get install -y \
95
+ dnsutils \
96
+ file \
97
+ iproute2 \
98
+ jq \
99
+ lsof \
100
+ ripgrep \
101
+ silversearcher-ag \
102
+ vim \
103
+ && rm -rf /var/lib/apt/lists/* \
104
+ && apt-get clean \
105
+ && apt-get autoremove -y
106
+
107
+ WORKDIR /app
108
+
109
+ # cache build dependencies
110
+ RUN \
111
+ --mount=type=bind,source=./,target=/app/ \
112
+ <<EOF
113
+ #!/bin/bash
114
+ make -s clean
115
+ make -s check-dependencies
116
+ make -s install-python-dependencies
117
+
118
+ # NOTE
119
+ # node_modules are .dockerignore-d therefore not mountable
120
+ # make -s install-frontend-dependencies
121
+ EOF
122
+
123
+ #
124
+ CMD ["bash"]
containers/dev/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Develop in Docker
2
+
3
+ > [!WARNING]
4
+ > This is not officially supported and may not work.
5
+
6
+ Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:
7
+
8
+ ```bash
9
+ make docker-dev
10
+ # same as:
11
+ cd ./containers/dev
12
+ ./dev.sh
13
+ ```
14
+
15
+ It could take some time if you are running for the first time as Docker will pull all the tools required for building OpenHands. The next time you run again, it should be instant.
16
+
17
+ ## Build and run
18
+
19
+ If everything goes well, you should be inside a container after Docker finishes building the `openhands:dev` image similar to the following:
20
+
21
+ ```bash
22
+ Build and run in Docker ...
23
+ root@93fc0005fcd2:/app#
24
+ ```
25
+
26
+ You may now proceed with the normal [build and run](../../Development.md) workflow as if you were on the host.
27
+
28
+ ## Make changes
29
+
30
+ The source code on the host is mounted as `/app` inside docker. You may edit the files as usual either inside the Docker container or on your host with your favorite IDE/editors.
31
+
32
+ The following are also mapped as readonly from your host:
33
+
34
+ ```yaml
35
+ # host credentials
36
+ - $HOME/.git-credentials:/root/.git-credentials:ro
37
+ - $HOME/.gitconfig:/root/.gitconfig:ro
38
+ - $HOME/.npmrc:/root/.npmrc:ro
39
+ ```
40
+
41
+ ## VSCode
42
+
43
+ Alternatively, if you use VSCode, you could also [attach to the running container](https://code.visualstudio.com/docs/devcontainers/attach-container).
44
+
45
+ See details for [developing in docker](https://code.visualstudio.com/docs/devcontainers/containers) or simply ask `OpenHands` ;-)
46
+
47
+ ## Rebuild dev image
48
+
49
+ You could optionally pass additional options to the build script.
50
+
51
+ ```bash
52
+ make docker-dev OPTIONS="--build"
53
+ # or
54
+ ./containers/dev/dev.sh --build
55
+ ```
56
+
57
+ See [docker compose run](https://docs.docker.com/reference/cli/docker/compose/run/) for more options.
containers/dev/compose.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ services:
3
+ dev:
4
+ privileged: true
5
+ build:
6
+ context: ${OPENHANDS_WORKSPACE:-../../}
7
+ dockerfile: ./containers/dev/Dockerfile
8
+ image: openhands:dev
9
+ container_name: openhands-dev
10
+ environment:
11
+ - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
12
+ - SANDBOX_API_HOSTNAME=host.docker.internal
13
+ #
14
+ - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.20-nikolaik}
15
+ - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
16
+ - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
17
+ ports:
18
+ - "3000:3000"
19
+ extra_hosts:
20
+ - "host.docker.internal:host-gateway"
21
+ volumes:
22
+ - /var/run/docker.sock:/var/run/docker.sock
23
+ - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
24
+ # source code
25
+ - ${OPENHANDS_WORKSPACE:-../../}:/app
26
+ # host credentials
27
+ - $HOME/.git-credentials:/root/.git-credentials:ro
28
+ - $HOME/.gitconfig:/root/.gitconfig:ro
29
+ - $HOME/.npmrc:/root/.npmrc:ro
30
+ # cache
31
+ - cache-data:/root/.cache
32
+ pull_policy: never
33
+ stdin_open: true
34
+ tty: true
35
+
36
+ ##
37
+ volumes:
38
+ cache-data:
containers/dev/dev.sh ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -o pipefail
3
+
4
+ function get_docker() {
5
+ echo "Docker is required to build and run OpenHands."
6
+ echo "https://docs.docker.com/get-started/get-docker/"
7
+ exit 1
8
+ }
9
+
10
+ function check_tools() {
11
+ command -v docker &>/dev/null || get_docker
12
+ }
13
+
14
+ function exit_if_indocker() {
15
+ if [ -f /.dockerenv ]; then
16
+ echo "Running inside a Docker container. Exiting..."
17
+ exit 1
18
+ fi
19
+ }
20
+
21
+ #
22
+ exit_if_indocker
23
+
24
+ check_tools
25
+
26
+ ##
27
+ OPENHANDS_WORKSPACE=$(git rev-parse --show-toplevel)
28
+
29
+ cd "$OPENHANDS_WORKSPACE/containers/dev/" || exit 1
30
+
31
+ ##
32
+ export BACKEND_HOST="0.0.0.0"
33
+ #
34
+ export SANDBOX_USER_ID=$(id -u)
35
+ export WORKSPACE_BASE=${WORKSPACE_BASE:-$OPENHANDS_WORKSPACE/workspace}
36
+
37
+ docker compose run --rm --service-ports "$@" dev
38
+
39
+ ##
containers/e2b-sandbox/Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM ubuntu:22.04
2
+
3
+ # install basic packages
4
+ RUN apt-get update && apt-get install -y \
5
+ curl \
6
+ wget \
7
+ git \
8
+ vim \
9
+ nano \
10
+ unzip \
11
+ zip \
12
+ python3 \
13
+ python3-pip \
14
+ python3-venv \
15
+ python3-dev \
16
+ build-essential \
17
+ openssh-server \
18
+ sudo \
19
+ && rm -rf /var/lib/apt/lists/*
containers/e2b-sandbox/README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to build custom E2B sandbox for OpenHands
2
+
3
+ [E2B](https://e2b.dev) is an [open-source](https://github.com/e2b-dev/e2b) secure cloud environment (sandbox) made for running AI-generated code and agents. E2B offers [Python](https://pypi.org/project/e2b/) and [JS/TS](https://www.npmjs.com/package/e2b) SDK to spawn and control these sandboxes.
4
+
5
+
6
+ 1. Install the CLI with NPM.
7
+ ```sh
8
+ npm install -g @e2b/cli@latest
9
+ ```
10
+ Full CLI API is [here](https://e2b.dev/docs/cli/installation).
11
+
12
+ 1. Build the sandbox
13
+ ```sh
14
+ e2b template build --dockerfile ./Dockerfile --name "openhands"
15
+ ```
containers/e2b-sandbox/e2b.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is a config for E2B sandbox template.
2
+ # You can use 'template_id' (785n69crgahmz0lkdw9h) or 'template_name (openhands) from this config to spawn a sandbox:
3
+
4
+ # Python SDK
5
+ # from e2b import Sandbox
6
+ # sandbox = Sandbox(template='openhands')
7
+
8
+ # JS SDK
9
+ # import { Sandbox } from 'e2b'
10
+ # const sandbox = await Sandbox.create({ template: 'openhands' })
11
+
12
+ dockerfile = "Dockerfile"
13
+ template_name = "openhands"
14
+ template_id = "785n69crgahmz0lkdw9h"
containers/runtime/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dynamically constructed Dockerfile
2
+
3
+ This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
4
+ that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.
5
+
6
+ The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.12-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
7
+
8
+ ```bash
9
+ poetry run python3 openhands/runtime/utils/runtime_build.py \
10
+ --base_image nikolaik/python-nodejs:python3.12-nodejs22 \
11
+ --build_folder containers/runtime
12
+ ```
containers/runtime/config.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ DOCKER_REGISTRY=ghcr.io
2
+ DOCKER_ORG=all-hands-ai
3
+ DOCKER_BASE_DIR="./containers/runtime"
4
+ DOCKER_IMAGE=runtime
5
+ # These variables will be appended by the runtime_build.py script
6
+ # DOCKER_IMAGE_TAG=
7
+ # DOCKER_IMAGE_SOURCE_TAG=
dev_config/python/.pre-commit-config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ exclude: docs/modules/python
7
+ - id: end-of-file-fixer
8
+ exclude: docs/modules/python
9
+ - id: check-yaml
10
+ - id: debug-statements
11
+
12
+ - repo: https://github.com/tox-dev/pyproject-fmt
13
+ rev: 1.7.0
14
+ hooks:
15
+ - id: pyproject-fmt
16
+ - repo: https://github.com/abravalheri/validate-pyproject
17
+ rev: v0.16
18
+ hooks:
19
+ - id: validate-pyproject
20
+
21
+ - repo: https://github.com/astral-sh/ruff-pre-commit
22
+ # Ruff version.
23
+ rev: v0.4.1
24
+ hooks:
25
+ # Run the linter.
26
+ - id: ruff
27
+ entry: ruff check --config dev_config/python/ruff.toml
28
+ types_or: [python, pyi, jupyter]
29
+ args: [--fix]
30
+ # Run the formatter.
31
+ - id: ruff-format
32
+ entry: ruff format --config dev_config/python/ruff.toml
33
+ types_or: [python, pyi, jupyter]
34
+
35
+ - repo: https://github.com/pre-commit/mirrors-mypy
36
+ rev: v1.9.0
37
+ hooks:
38
+ - id: mypy
39
+ additional_dependencies:
40
+ [types-requests, types-setuptools, types-pyyaml, types-toml]
41
+ entry: mypy --config-file dev_config/python/mypy.ini openhands/
42
+ always_run: true
43
+ pass_filenames: false
dev_config/python/mypy.ini ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [mypy]
2
+ warn_unused_configs = True
3
+ ignore_missing_imports = True
4
+ check_untyped_defs = True
5
+ explicit_package_bases = True
6
+ warn_unreachable = True
7
+ warn_redundant_casts = True
8
+ no_implicit_optional = True
9
+ strict_optional = True
dev_config/python/ruff.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [lint]
2
+ select = [
3
+ "E",
4
+ "W",
5
+ "F",
6
+ "I",
7
+ "Q",
8
+ "B",
9
+ ]
10
+
11
+ ignore = [
12
+ "E501",
13
+ "B003",
14
+ "B007",
15
+ "B009",
16
+ "B010",
17
+ "B904",
18
+ "B018",
19
+ ]
20
+
21
+ [lint.flake8-quotes]
22
+ docstring-quotes = "double"
23
+ inline-quotes = "single"
24
+
25
+ [format]
26
+ quote-style = "single"
docker-compose.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ services:
3
+ openhands:
4
+ build:
5
+ context: ./
6
+ dockerfile: ./containers/app/Dockerfile
7
+ image: openhands:latest
8
+ container_name: openhands-app-${DATE:-}
9
+ environment:
10
+ - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-docker.all-hands.dev/all-hands-ai/runtime:0.20-nikolaik}
11
+ #- SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234} # enable this only if you want a specific non-root sandbox user but you will have to manually adjust permissions of openhands-state for this user
12
+ - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
13
+ ports:
14
+ - "3000:3000"
15
+ extra_hosts:
16
+ - "host.docker.internal:host-gateway"
17
+ volumes:
18
+ - /var/run/docker.sock:/var/run/docker.sock
19
+ - ~/.openhands-state:/.openhands-state
20
+ - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
21
+ pull_policy: build
22
+ stdin_open: true
23
+ tty: true
entrypoint.sh ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -eo pipefail
3
+
4
+ echo "Starting OpenHands..."
5
+ if [[ $NO_SETUP == "true" ]]; then
6
+ echo "Skipping setup, running as $(whoami)"
7
+ "$@"
8
+ exit 0
9
+ fi
10
+
11
+ if [ "$(id -u)" -ne 0 ]; then
12
+ echo "The OpenHands entrypoint.sh must run as root"
13
+ exit 1
14
+ fi
15
+
16
+ if [ -z "$SANDBOX_USER_ID" ]; then
17
+ echo "SANDBOX_USER_ID is not set"
18
+ exit 1
19
+ fi
20
+
21
+ if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
22
+ # This is set to /opt/workspace in the Dockerfile. But if the user isn't mounting, we want to unset it so that OpenHands doesn't mount at all
23
+ unset WORKSPACE_BASE
24
+ fi
25
+
26
+ if [[ "$SANDBOX_USER_ID" -eq 0 ]]; then
27
+ echo "Running OpenHands as root"
28
+ export RUN_AS_OPENHANDS=false
29
+ mkdir -p /root/.cache/ms-playwright/
30
+ if [ -d "/home/openhands/.cache/ms-playwright/" ]; then
31
+ mv /home/openhands/.cache/ms-playwright/ /root/.cache/
32
+ fi
33
+ "$@"
34
+ else
35
+ echo "Setting up enduser with id $SANDBOX_USER_ID"
36
+ if id "enduser" &>/dev/null; then
37
+ echo "User enduser already exists. Skipping creation."
38
+ else
39
+ if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
40
+ echo "Failed to create user enduser with id $SANDBOX_USER_ID. Moving openhands user."
41
+ incremented_id=$(($SANDBOX_USER_ID + 1))
42
+ usermod -u $incremented_id openhands
43
+ if ! useradd -l -m -u $SANDBOX_USER_ID -s /bin/bash enduser; then
44
+ echo "Failed to create user enduser with id $SANDBOX_USER_ID for a second time. Exiting."
45
+ exit 1
46
+ fi
47
+ fi
48
+ fi
49
+ usermod -aG app enduser
50
+ # get the user group of /var/run/docker.sock and set openhands to that group
51
+ DOCKER_SOCKET_GID=$(stat -c '%g' /var/run/docker.sock)
52
+ echo "Docker socket group id: $DOCKER_SOCKET_GID"
53
+ if getent group $DOCKER_SOCKET_GID; then
54
+ echo "Group with id $DOCKER_SOCKET_GID already exists"
55
+ else
56
+ echo "Creating group with id $DOCKER_SOCKET_GID"
57
+ groupadd -g $DOCKER_SOCKET_GID docker
58
+ fi
59
+
60
+ mkdir -p /home/enduser/.cache/huggingface/hub/
61
+ mkdir -p /home/enduser/.cache/ms-playwright/
62
+ if [ -d "/home/openhands/.cache/ms-playwright/" ]; then
63
+ mv /home/openhands/.cache/ms-playwright/ /home/enduser/.cache/
64
+ fi
65
+
66
+ usermod -aG $DOCKER_SOCKET_GID enduser
67
+ echo "Running as enduser"
68
+ su enduser /bin/bash -c "${*@Q}" # This magically runs any arguments passed to the script as a command
69
+ fi
evaluation/README.md ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluation
2
+
3
+ This folder contains code and resources to run experiments and evaluations.
4
+
5
+ ## For Benchmark Users
6
+
7
+ ### Setup
8
+
9
+ Before starting evaluation, follow the instructions [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
10
+
11
+ Once you are done with setup, you can follow the benchmark-specific instructions in each subdirectory of the [evaluation directory](#supported-benchmarks).
12
+ Generally these will involve running `run_infer.py` to perform inference with the agents.
13
+
14
+ ### Implementing and Evaluating an Agent
15
+
16
+ To add an agent to OpenHands, you will need to implement it in the [agenthub directory](https://github.com/All-Hands-AI/OpenHands/tree/main/openhands/agenthub). There is a README there with more information.
17
+
18
+ To evaluate an agent, you can provide the agent's name to the `run_infer.py` program.
19
+
20
+ ### Evaluating Different LLMs
21
+
22
+ OpenHands in development mode uses `config.toml` to keep track of most configuration.
23
+ Here's an example configuration file you can use to define and use multiple LLMs:
24
+
25
+ ```toml
26
+ [llm]
27
+ # IMPORTANT: add your API key here, and set the model to the one you want to evaluate
28
+ model = "gpt-4o-2024-05-13"
29
+ api_key = "sk-XXX"
30
+
31
+ [llm.eval_gpt4_1106_preview_llm]
32
+ model = "gpt-4-1106-preview"
33
+ api_key = "XXX"
34
+ temperature = 0.0
35
+
36
+ [llm.eval_some_openai_compatible_model_llm]
37
+ model = "openai/MODEL_NAME"
38
+ base_url = "https://OPENAI_COMPATIBLE_URL/v1"
39
+ api_key = "XXX"
40
+ temperature = 0.0
41
+ ```
42
+
43
+ ## Supported Benchmarks
44
+
45
+ The OpenHands evaluation harness supports a wide variety of benchmarks across [software engineering](#software-engineering), [web browsing](#web-browsing), [miscellaneous assistance](#misc-assistance), and [real-world](#real-world) tasks.
46
+
47
+ ### Software Engineering
48
+
49
+ - SWE-Bench: [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench)
50
+ - HumanEvalFix: [`evaluation/benchmarks/humanevalfix`](./benchmarks/humanevalfix)
51
+ - BIRD: [`evaluation/benchmarks/bird`](./benchmarks/bird)
52
+ - BioCoder: [`evaluation/benchmarks/biocoder`](./benchmarks/biocoder)
53
+ - ML-Bench: [`evaluation/benchmarks/ml_bench`](./benchmarks/ml_bench)
54
+ - APIBench: [`evaluation/benchmarks/gorilla`](./benchmarks/gorilla/)
55
+ - ToolQA: [`evaluation/benchmarks/toolqa`](./benchmarks/toolqa/)
56
+ - AiderBench: [`evaluation/benchmarks/aider_bench`](./benchmarks/aider_bench/)
57
+ - Commit0: [`evaluation/benchmarks/commit0_bench`](./benchmarks/commit0_bench/)
58
+ - DiscoveryBench: [`evaluation/benchmarks/discoverybench`](./benchmarks/discoverybench/)
59
+
60
+ ### Web Browsing
61
+
62
+ - WebArena: [`evaluation/benchmarks/webarena`](./benchmarks/webarena/)
63
+ - MiniWob++: [`evaluation/benchmarks/miniwob`](./benchmarks/miniwob/)
64
+ - Browsing Delegation: [`evaluation/benchmarks/browsing_delegation`](./benchmarks/browsing_delegation/)
65
+
66
+ ### Misc. Assistance
67
+
68
+ - GAIA: [`evaluation/benchmarks/gaia`](./benchmarks/gaia)
69
+ - GPQA: [`evaluation/benchmarks/gpqa`](./benchmarks/gpqa)
70
+ - AgentBench: [`evaluation/benchmarks/agent_bench`](./benchmarks/agent_bench)
71
+ - MINT: [`evaluation/benchmarks/mint`](./benchmarks/mint)
72
+ - Entity deduction Arena (EDA): [`evaluation/benchmarks/EDA`](./benchmarks/EDA)
73
+ - ProofWriter: [`evaluation/benchmarks/logic_reasoning`](./benchmarks/logic_reasoning)
74
+ - ScienceAgentBench: [`evaluation/benchmarks/scienceagentbench`](./benchmarks/scienceagentbench)
75
+
76
+ ### Real World
77
+
78
+ - TheAgentCompany: [`evaluation/benchmarks/the_agent_company`](./benchmarks/the_agent_company)
79
+
80
+ ## Result Visualization
81
+
82
+ Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.
83
+
84
+ You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
85
+
86
+ ## For Benchmark Developers
87
+
88
+ To learn more about how to integrate your benchmark into OpenHands, check out [tutorial here](https://docs.all-hands.dev/modules/usage/how-to/evaluation-harness). Briefly,
89
+
90
+ - Each subfolder contains a specific benchmark or experiment. For example, [`evaluation/benchmarks/swe_bench`](./benchmarks/swe_bench) should contain
91
+ all the preprocessing/evaluation/analysis scripts.
92
+ - Raw data and experimental records should not be stored within this repo.
93
+ - For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
94
+ - Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
evaluation/__init__.py ADDED
File without changes
evaluation/benchmarks/EDA/README.md ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EDA Evaluation
2
+
3
+ This folder contains evaluation harness for evaluating agents on the Entity-deduction-Arena Benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented in ACL 2024 main conference.
4
+
5
+ ## Setup Environment and LLM Configuration
6
+
7
+ Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
8
+
9
+ ## Start the evaluation
10
+
11
+ ```bash
12
+ export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
13
+ ./evaluation/benchmarks/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
14
+ ```
15
+
16
+ where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional.
17
+
18
+ - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
19
+ LLM settings, as defined in your `config.toml`.
20
+
21
+ - `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
22
+ like to evaluate. It could also be a release tag like `0.6.2`.
23
+
24
+ - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
25
+ to `CodeActAgent`.
26
+
27
+ - `dataset`: There are two tasks in this evaluation. Specify `dataset` to test on either `things` or `celebs` task.
28
+
29
+ - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
30
+
31
+ For example,
32
+
33
+ ```bash
34
+ ./evaluation/benchmarks/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things
35
+ ```
36
+
37
+ ## Reference
38
+
39
+ ```bibtex
40
+ @inproceedings{zhang2023entity,
41
+ title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
42
+ author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
43
+ booktitle={ACL},
44
+ year={2024}
45
+ }
46
+ ```
evaluation/benchmarks/EDA/game.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+
4
+ import openai
5
+ import requests.exceptions
6
+ from openai import OpenAI
7
+ from retry import retry
8
+
9
+ LOGGER = logging.getLogger(__name__)
10
+
11
+
12
+ class Q20Game:
13
+ def __init__(
14
+ self,
15
+ item: str,
16
+ answerer_model: str = 'gpt-3.5-turbo-0613',
17
+ guesser_model: str = 'gpt-3.5-turbo-0613',
18
+ num_turns: int = 20,
19
+ temperature: float = 0.8,
20
+ openai_api: bool = True,
21
+ openai_api_key: str | None = None,
22
+ guesser_kargs=None,
23
+ ) -> None:
24
+ if guesser_kargs is None:
25
+ guesser_kargs = {}
26
+ self.item = item
27
+ self.answerer_model = answerer_model
28
+ self.guesser_model = guesser_model
29
+ self.num_turns = num_turns
30
+ self.temperature = temperature
31
+ self.openai_api = openai_api
32
+ self.guesser_kargs = guesser_kargs
33
+ self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
34
+ self.first_user_utterance = (
35
+ 'Your task is to ask a series of questions to deduce the entity '
36
+ "that I'm thinking of with as few queries as possible. "
37
+ "Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
38
+ 'Do not ask for hint. Make your question brief with no linebreaker. '
39
+ 'Now start asking a question.'
40
+ )
41
+ self.guesser_win = False
42
+ self.curr_turn = 0
43
+ if openai_api_key is not None:
44
+ openai.api_key = openai_api_key
45
+
46
+ if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
47
+ self.user_api_base = 'http://0.0.0.0:8000/v1'
48
+ else:
49
+ self.user_api_base = 'https://api.openai.com/v1'
50
+
51
+ if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
52
+ self.guesser_api_base = 'http://0.0.0.0:8000/v1'
53
+ else:
54
+ self.guesser_api_base = 'https://api.openai.com/v1'
55
+
56
+ self.guesser_messages = []
57
+
58
+ def preprocess_response(self, response):
59
+ response = re.sub(r'the entity you are thinking of', 'it', response)
60
+ response = re.sub(r"the entity you're thinking of", 'it', response)
61
+ response = re.sub(r" you're thinking of", '', response)
62
+ response = re.sub(r' you are thinking of', '', response)
63
+ return response
64
+
65
+ def judge_winner(self, response):
66
+ guesser_question = response.strip()
67
+
68
+ if self.curr_turn == self.num_turns - 1:
69
+ guesser_question += ' Is it right?'
70
+
71
+ self.guesser_messages.append({'role': 'assistant', 'content': guesser_question})
72
+ # ask for answer
73
+ usr_msg = self.answerer(guesser_question)
74
+
75
+ self.guesser_messages.append(
76
+ {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
77
+ )
78
+
79
+ if 'bingo' in usr_msg['content'].lower():
80
+ self.guesser_win = True
81
+ return True, ''
82
+
83
+ return False, usr_msg['content'].strip()
84
+
85
+ def generate_user_response(self, response):
86
+ response = self.preprocess_response(response)
87
+ # others
88
+ bingo, anwser_reply = self.judge_winner(response)
89
+ if bingo:
90
+ return 'You are bingo! Use the "finish" tool to finish the interaction.\n'
91
+ if self.curr_turn == self.num_turns - 2:
92
+ anwser_reply += " You must guess now, what's it?"
93
+ return anwser_reply
94
+
95
+ def reward(self):
96
+ if self.guesser_win:
97
+ n_turns = (len(self.guesser_messages) + 1) // 2
98
+ return 1 - max(n_turns - 5, 0) * 0.02
99
+ return 0
100
+
101
+ @retry(
102
+ (
103
+ openai.Timeout,
104
+ requests.exceptions.ReadTimeout,
105
+ openai.RateLimitError,
106
+ openai.APIError,
107
+ openai.APIConnectionError,
108
+ ),
109
+ tries=5,
110
+ delay=0.5,
111
+ backoff=0.5,
112
+ max_delay=2,
113
+ logger=LOGGER,
114
+ )
115
+ def answerer(self, question):
116
+ openai.api_base = self.user_api_base
117
+ client = OpenAI(api_key=openai.api_key)
118
+ user_messages = [
119
+ {
120
+ 'role': 'user',
121
+ 'content': f'Based on your knowledge about {self.item}, '
122
+ f'respond to the following question or guess. '
123
+ f"Limit your respond to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
124
+ f'Never say the answer {self.item} in your response. '
125
+ f"If the question is to solicit the answer, respond 'No.'.",
126
+ },
127
+ {
128
+ 'role': 'user',
129
+ 'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
130
+ },
131
+ ]
132
+
133
+ response = client.chat.completions.create(
134
+ model=self.answerer_model,
135
+ messages=user_messages,
136
+ max_tokens=6,
137
+ n=1,
138
+ stop=None,
139
+ temperature=0.2,
140
+ )
141
+ if any(
142
+ [
143
+ re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
144
+ for i in self.item.lower().split('|')
145
+ ]
146
+ ):
147
+ response.choices[0].message.content = 'Bingo!'
148
+ return response.choices[0].message.to_dict()
149
+
150
+
151
+ class Q20GameCelebrity(Q20Game):
152
+ def __init__(self, item: str, **kwargs) -> None:
153
+ super().__init__(item, **kwargs)
154
+ self.first_user_utterance = (
155
+ 'Your task is to ask a series of questions to deduce the celebrity '
156
+ "that I'm thinking of with as few queries as possible. "
157
+ "Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hint. Make your question brief with no linebreaker. "
158
+ 'Now start asking a question.'
159
+ )
160
+
161
+ @retry(
162
+ (
163
+ openai.Timeout,
164
+ requests.exceptions.ReadTimeout,
165
+ openai.RateLimitError,
166
+ openai.APIError,
167
+ openai.APIConnectionError,
168
+ ),
169
+ tries=5,
170
+ delay=0.5,
171
+ backoff=0.5,
172
+ max_delay=2,
173
+ logger=LOGGER,
174
+ )
175
+ def answerer(self, question):
176
+ openai.api_base = self.user_api_base
177
+ client = OpenAI(api_key=openai.api_key)
178
+ user_messages = [
179
+ {
180
+ 'role': 'system',
181
+ 'content': f'Based on your knowledge about the celebrity: {self.item}, '
182
+ f'respond to the following question or guess. '
183
+ f"Limit your respond to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
184
+ f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
185
+ f"If the question is to solicit the answer, respond 'No.'.",
186
+ },
187
+ {
188
+ 'role': 'user',
189
+ 'content': f'For the celebrity {self.item}, {question}(Yes/No/Dunno)',
190
+ },
191
+ ]
192
+
193
+ response = client.chat.completions.create(
194
+ model=self.answerer_model,
195
+ messages=user_messages,
196
+ max_tokens=6,
197
+ n=1,
198
+ stop=None,
199
+ temperature=0.2,
200
+ )
201
+ if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
202
+ response.choices[0].message.content = 'Bingo!'
203
+ return response.choices[0].message.to_dict()
evaluation/benchmarks/EDA/run_infer.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+
4
+ import pandas as pd
5
+ from datasets import load_dataset
6
+
7
+ from evaluation.benchmarks.EDA.game import Q20Game, Q20GameCelebrity
8
+ from evaluation.utils.shared import (
9
+ EvalMetadata,
10
+ EvalOutput,
11
+ compatibility_for_eval_history_pairs,
12
+ make_metadata,
13
+ prepare_dataset,
14
+ reset_logger_for_multiprocessing,
15
+ run_evaluation,
16
+ )
17
+ from openhands.controller.state.state import State
18
+ from openhands.core.config import (
19
+ AppConfig,
20
+ SandboxConfig,
21
+ get_llm_config_arg,
22
+ get_parser,
23
+ )
24
+ from openhands.core.logger import openhands_logger as logger
25
+ from openhands.core.main import create_runtime, run_controller
26
+ from openhands.events.action import MessageAction
27
+ from openhands.utils.async_utils import call_async_from_sync
28
+
29
+ game = None
30
+
31
+
32
def codeact_user_response_eda(state: State) -> str:
    """Fake-user callback: play the answerer side of the 20-questions game.

    Reads the agent's latest message as its guess/question, forwards it to the
    global game's answerer, and returns the reply (or '/exit' on a win).
    """
    global game

    # Pull the agent's most recent message out of the event history.
    model_guess = ''
    if state.history:
        last_agent_message = state.get_last_agent_message()
        if last_agent_message:
            model_guess = last_agent_message.content

    assert game is not None, 'Game is not initialized.'
    answer = game.generate_user_response(model_guess)
    game.curr_turn += 1
    logger.info(f'Model guess: {model_guess}')
    logger.info(f'Answer response: {answer}')

    # A correct guess ('Bingo!') terminates the episode.
    return '/exit' if 'bingo!' in answer.lower() else answer
49
+
50
+
51
# Map agent class name -> fake-user callback that plays the answerer side of
# the game between agent turns.
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response_eda,
}

# Extra instruction appended to the initial prompt, keyed by agent class.
AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
}
58
+
59
+
60
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the AppConfig used to run one EDA evaluation instance."""
    # Sandbox: plain Python image, no auto-lint, isolated network.
    sandbox_config = SandboxConfig(
        base_container_image='python:3.12-bookworm',
        enable_auto_lint=False,
        use_host_network=False,
    )
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='docker',
        max_iterations=metadata.max_iterations,
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    # Disable prompt extensions so the game prompt reaches the agent verbatim.
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return config
81
+
82
+
83
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
    """Run one EDA instance end-to-end and return its evaluation record.

    Args:
        instance: Dataset row; the hidden entity is in ``instance['text']``.
        metadata: Evaluation metadata; ``dataset`` selects the game class and
            ``details`` carries the answerer model name and OpenAI key.
        reset_logger: When True, redirect logs to a per-instance file so
            parallel workers do not interleave output.

    Returns:
        EvalOutput with the game reward, final agent message, and history.
    """
    config = get_config(metadata)
    # The stripped entity text doubles as the instance id.
    instance_id = instance['text'].strip()

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance_id}.')

    # Prepare instruction
    _game_class = {'eda-things': Q20Game, 'eda-celebs': Q20GameCelebrity}

    guesser_kargs = {
        'max_new_tokens': 64,
        'temperature': 0.8,
        'repetition_penalty': 1.0,
        'do_sample': True,
    }  # no penalty

    # Use codeactagent as guesser_model
    # NOTE(review): the game is stored in a module-level global read by the
    # fake-user callback — assumes one instance per process; confirm workers
    # are separate processes.
    global game
    assert metadata.dataset is not None
    assert metadata.details is not None
    game = _game_class[metadata.dataset](
        item=instance['text'].strip(),
        answerer_model=metadata.details['answerer_model'],
        guesser_model=None,
        num_turns=metadata.max_iterations,
        openai_api_key=metadata.details['openai_api_key'],
        guesser_kargs=guesser_kargs,
    )

    instruction = f'{game.first_user_utterance}'
    logger.info(f'Instruction: {instruction}')
    instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                metadata.agent_class
            ],
        )
    )
    # ======= Attempt to evaluate the agent's edits =======
    # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    last_agent_message = state.get_last_agent_message()
    final_message = last_agent_message.content if last_agent_message else ''

    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
    # The game object tracks turns/outcome itself; its reward is the result.
    test_result = game.reward()
    metrics = state.metrics.get() if state.metrics else None

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = compatibility_for_eval_history_pairs(state.history)

    # Save the output
    output = EvalOutput(
        instance_id=instance_id,
        instance=instance.to_dict(),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
            'success': test_result,
            'final_message': final_message,
            'ground_truth': instance['text'],
        },
    )
    return output
174
+
175
+
176
if __name__ == '__main__':
    # Extend the standard OpenHands eval parser with EDA-specific flags.
    parser = get_parser()
    parser.add_argument(
        '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
    )
    parser.add_argument(
        '--dataset',
        default='things',
        choices=['things', 'celebs'],
        type=str,
        help='dataset to be used',
    )
    parser.add_argument(
        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
    )
    parser.add_argument(
        '--data-split',
        default='test',
        type=str,
        help='data split, eg, test',
    )
    args, _ = parser.parse_known_args()

    eda_dataset = load_dataset(
        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
    )
    # NOTE(review): `datasets.Dataset` has no pandas-style
    # rename(columns=..., inplace=...) method — confirm this call succeeds;
    # it looks intended for the DataFrame produced by to_pandas() below.
    eda_dataset.rename(columns={'text': 'instance_id'}, inplace=True)

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
        f'eda-{args.dataset}',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        data_split=args.data_split,
        details={
            'answerer_model': str(args.answerer_model),
            'openai_api_key': str(args.OPENAI_API_KEY),
        },
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # Skip instances already present in output.jsonl and apply --eval-n-limit.
    prepared_dataset = prepare_dataset(
        eda_dataset.to_pandas(), output_file, args.eval_n_limit
    )

    run_evaluation(
        prepared_dataset,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
    )
evaluation/benchmarks/EDA/scripts/run_infer.sh ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run EDA (Entity Deduction Arena) benchmark inference.
#
# Usage: run_infer.sh <model_config> <commit_hash> [agent] [dataset] [eval_limit] [num_workers]
# Requires OPENAI_API_KEY in the environment (used by the answerer model).
set -eo pipefail

source "evaluation/utils/version_control.sh"

# Positional arguments; all but the first two fall back to defaults below.
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
DATASET=$4
EVAL_LIMIT=$5
NUM_WORKERS=$6

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
# Check out the requested OpenHands commit/tag for the evaluation run.
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_openhands_version

if [ -z "$DATASET" ]; then
  echo "Dataset not specified, use default 'things'"
  DATASET="things"
fi

# check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
  echo "OPENAI_API_KEY is not set, please set it to run the script"
  exit 1
fi


echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"

# Assemble the inference command; flags map 1:1 to run_infer.py arguments.
COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${OPENHANDS_VERSION}_${DATASET}"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
echo $COMMAND
eval $COMMAND
evaluation/benchmarks/agent_bench/README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AgentBench Evaluation
2
+
3
+ This folder contains evaluation harness for evaluating agents on the [AgentBench: Evaluating LLMs as Agents](https://arxiv.org/abs/2308.03688). We currently only support running on the `osbench` subset.
4
+
5
+ ## Setup Environment and LLM Configuration
6
+
7
+ Please follow instruction [here](../../README.md#setup) to setup your local development environment and LLM.
8
+
9
+ ## Start the evaluation
10
+
11
+ ```bash
12
+ ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
13
+ ```
14
+
15
+ - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
16
+ LLM settings, as defined in your `config.toml`.
17
+ - `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
18
+ like to evaluate. It could also be a release tag like `0.6.2`.
19
+ - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
20
+ to `CodeActAgent`.
21
+ - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
22
+ default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
23
+ in order to use `eval_limit`, you must also set `agent`.
24
+
25
+
26
+ Following is the basic command to start the evaluation.
27
+
28
+ You can update the arguments in the script `evaluation/benchmarks/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`, `--eval-num-workers` and so on.
29
+
30
+ - `--agent-cls`, the agent to use. For example, `CodeActAgent`.
31
+ - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
32
+ - `--max-iterations`: the number of iterations to run the evaluation. For example, `30`.
33
+ - `--eval-num-workers`: the number of workers to use for evaluation. For example, `5`.
34
+ - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
35
+
36
+ ```bash
37
+ ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
38
+ ```
39
+
40
+ ## Run with Remote Runtime (experimental)
41
+
42
+ You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally.
43
+
44
+ To use the remote runtime, set the following environment variables:
45
+
46
+ ```bash
47
+ # Required environment variables
48
+ export ALLHANDS_API_KEY="your-api-key" # Contact the team to get an API key
49
+ export RUNTIME=remote
50
+ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
51
+
52
+ # Run the evaluation
53
+ ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1
54
+ ```
55
+
56
+ The remote runtime will build a container image and run the evaluation in a cloud environment. The results will be saved locally in the same way as when running with a local runtime.
evaluation/benchmarks/agent_bench/__init__.py ADDED
File without changes
evaluation/benchmarks/agent_bench/helper.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from functools import partial
4
+
5
+ from evaluation.utils.shared import codeact_user_response
6
+ from openhands.events.action import CmdRunAction, MessageAction
7
+
8
+
9
def try_parse_answer(act) -> str | None:
    """Extract a <solution>...</solution> answer from an agent event, if any.

    Returns the stripped answer text, or None when the event is not an
    agent-authored message/command or carries no solution tag.
    """
    # Only agent-authored events can contain the final answer.
    if isinstance(act, MessageAction) and act.source == 'agent':
        raw_ans = act.content
    elif isinstance(act, CmdRunAction) and act.source == 'agent':
        raw_ans = act.thought
    else:
        return None
    matches = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
    return matches[0].strip() if matches else None
21
+
22
+
23
# Agent-specific fake-user responders: keep nudging CodeActAgent until it
# wraps its answer in <solution> tags (parsed by try_parse_answer).
FAKE_RESPONSES = {
    'CodeActAgent': partial(
        codeact_user_response, encapsulate_solution=True, try_parse=try_parse_answer
    ),
}

# Instruction suffix appended to the task prompt, keyed by agent class.
INST_SUFFIXES: dict[str, str] = {
    'CodeActAgent': (
        'When you think you have solved the question, '
        'please first send your answer to user through message and then exit.\n'
    )
}
35
+
36
+
37
def analysis_size(size_str):
    """Parse a human-readable size string (e.g. '512', '10K', '10KB') into bytes.

    Args:
        size_str: Number optionally followed by a unit suffix
            (B/Byte/K/KB/M/MB/G/GB/T/TB/P/PB).

    Returns:
        The size in bytes as an int.

    Raises:
        ValueError: If the numeric part is not a valid integer.
    """
    size_str = size_str.strip()
    avails = {
        'B': 1,
        'Byte': 1,
        'K': 1024,
        'KB': 1024,
        'M': 1024 * 1024,
        'MB': 1024 * 1024,
        'G': 1024 * 1024 * 1024,
        'GB': 1024 * 1024 * 1024,
        'T': 1024 * 1024 * 1024 * 1024,
        'TB': 1024 * 1024 * 1024 * 1024,
        'P': 1024 * 1024 * 1024 * 1024 * 1024,
        'PB': 1024 * 1024 * 1024 * 1024 * 1024,
    }
    # Try longer suffixes first: '10KB' must match 'KB', not the bare 'B'
    # (which would leave the non-numeric prefix '10K' and raise ValueError).
    for size_unit in sorted(avails, key=len, reverse=True):
        if size_str.endswith(size_unit):
            return int(size_str[: -len(size_unit)]) * avails[size_unit]
    # No recognized suffix: the whole string is a plain byte count.
    return int(size_str)
57
+
58
+
59
def compare_results(check_method: str, model_answer: str, final_ans: str) -> bool:
    """Compare the agent's answer with the ground truth using the given method.

    Any parsing failure is treated as a mismatch (False) rather than an
    evaluation crash.
    """

    def _norm(text: str) -> str:
        # Normalize Windows/old-Mac line endings and surrounding whitespace.
        return text.replace('\r\n', '\n').replace('\r', '\n').strip()

    try:
        if check_method == 'check/integer-match.py':
            return int(model_answer) == int(final_ans)
        if check_method == 'check/size-match.py':
            return analysis_size(model_answer) == analysis_size(final_ans)
        # Default: normalized exact string comparison.
        return _norm(model_answer) == _norm(final_ans)
    except Exception:
        return False
72
+
73
+
74
def create_sh_file(filename: str, cmds: str) -> None:
    """Write *cmds* to *filename* with Unix line endings and mark it executable."""
    normalized = cmds.replace('\r\n', '\n')
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(normalized)
    # rwxr-xr-x so the sandbox user can execute the script.
    os.chmod(filename, 0o755)
evaluation/benchmarks/agent_bench/run_infer.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import re
4
+ import tempfile
5
+ from typing import Any
6
+
7
+ import pandas as pd
8
+ from datasets import load_dataset
9
+
10
+ from evaluation.benchmarks.agent_bench.helper import (
11
+ FAKE_RESPONSES,
12
+ INST_SUFFIXES,
13
+ compare_results,
14
+ create_sh_file,
15
+ )
16
+ from evaluation.utils.shared import (
17
+ EvalMetadata,
18
+ EvalOutput,
19
+ compatibility_for_eval_history_pairs,
20
+ make_metadata,
21
+ prepare_dataset,
22
+ reset_logger_for_multiprocessing,
23
+ run_evaluation,
24
+ )
25
+ from openhands.controller.state.state import State
26
+ from openhands.core.config import (
27
+ AppConfig,
28
+ SandboxConfig,
29
+ get_llm_config_arg,
30
+ parse_arguments,
31
+ )
32
+ from openhands.core.logger import openhands_logger as logger
33
+ from openhands.core.main import create_runtime, run_controller
34
+ from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
35
+ from openhands.events.observation import CmdOutputObservation
36
+ from openhands.runtime.base import Runtime
37
+ from openhands.utils.async_utils import call_async_from_sync
38
+
39
+
40
def get_config(
    metadata: EvalMetadata,
) -> AppConfig:
    """Build the AppConfig used to run one AgentBench-OS instance.

    Runtime defaults to 'docker' but can be overridden via the RUNTIME env
    var (e.g. 'remote', which also reads ALLHANDS_API_KEY and
    SANDBOX_REMOTE_RUNTIME_API_URL).
    """
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'docker'),
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='python:3.12-slim',
            enable_auto_lint=True,
            use_host_network=False,
            # Remote-runtime settings; unused for the local docker runtime.
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
            remote_runtime_init_timeout=3600,
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    # Disable prompt extensions so the benchmark prompt is sent verbatim.
    agent_config = config.get_agent_config(metadata.agent_class)
    agent_config.enable_prompt_extensions = False
    return config
65
+
66
+
67
def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # provides the per-instance init script via instance.init
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent:
    it creates /workspace and, when the instance defines one, copies in and
    executes its setup script.
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
    obs: CmdOutputObservation

    # Set instance id
    action = CmdRunAction(command='mkdir -p /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    action = CmdRunAction(command='cd /workspace')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

    init_cmd = instance.init
    if init_cmd is not None:
        script_name = f'{instance.instance_id}_init.sh'

        # Write the init commands to a temp script on the host, then copy it
        # into the sandbox workspace.
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, init_cmd)
            runtime.copy_to(
                host_script_path,
                '/workspace',
            )

        logger.info(f'Running init script: {script_name}')
        action = CmdRunAction(command=f'chmod +x ./{script_name} && ./{script_name}')
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        # A failing init script invalidates the instance; stop early.
        assert obs.exit_code == 0

    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
109
+
110
+
111
def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # provides get_agent_result / ground_truth / get_ground_truth
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has run. It executes the
    instance's result-extraction script (if any) and resolves the ground
    truth, either from a literal value or by running a script in the sandbox.

    Returns:
        dict with 'agent_answer' and 'final_ans' (either may be None).
    """
    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
    obs: CmdOutputObservation

    agent_answer = None
    get_agent_result_cmd = instance.get_agent_result
    if get_agent_result_cmd is not None:
        script_name = 'get_agent_result.sh'

        # Stage the extraction script in the sandbox workspace.
        with tempfile.TemporaryDirectory() as tmpdir:
            host_script_path = os.path.join(tmpdir, script_name)
            create_sh_file(host_script_path, get_agent_result_cmd)
            runtime.copy_to(
                host_script_path,
                '/workspace',
            )
        logger.info(f'Running get agent result cmd: {script_name}')

        action = CmdRunAction(
            command=f'chmod +x ./{script_name} && ./{script_name}',
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0
        agent_answer = obs.content
    # IF the agent answer is not found, retrieve it from the history
    # We wait until the controller finishes

    final_ans = None
    if instance.ground_truth is not None:
        # Literal ground truth provided directly by the dataset.
        final_ans = instance.ground_truth
    else:
        # Otherwise compute it in the sandbox via the instance's script.
        get_ground_truth_cmd = instance.get_ground_truth
        if get_ground_truth_cmd is not None:
            script_name = 'get_ground_truth.sh'
            with tempfile.TemporaryDirectory() as tmpdir:
                host_script_path = os.path.join(tmpdir, script_name)
                create_sh_file(host_script_path, get_ground_truth_cmd)
                runtime.copy_to(
                    host_script_path,
                    '/workspace',
                )
            logger.info(f'Running get ground truth cmd: {script_name}')

            action = CmdRunAction(
                command=f'chmod +x ./{script_name} && ./{script_name}'
            )
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            final_ans = obs.content

    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
    return {
        'final_ans': final_ans,
        'agent_answer': agent_answer,
    }
178
+
179
+
180
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
    """Run one AgentBench-OS instance end-to-end and return its eval record.

    Args:
        instance: Dataset row (description, init/result scripts, ground truth).
        metadata: Evaluation metadata (agent class, LLM config, output dir).
        reset_logger: When True, redirect logs to a per-instance file so
            parallel workers do not interleave output.
    """
    config = get_config(metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # =============================================
    # build instruction
    # =============================================

    # Prepare instruction
    instruction = (
        f'Please fix the following issue.\n'
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
        'For example: The answer to the question is <solution> 42 </solution>.\n'
        '# Problem \n'
        f'{instance.description}\n\n'
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # NOTE: You can actually set slightly different instruction for different agents
    instruction += INST_SUFFIXES[metadata.agent_class]

    # =============================================
    # create sandbox and run the agent
    # =============================================

    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    # Run the per-instance setup script inside the sandbox before the agent starts.
    initialize_runtime(runtime, instance=instance)

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    state: State | None = asyncio.run(
        run_controller(
            config=config,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
        )
    )
    if state is None:
        raise ValueError('State should not be None.')

    # =============================================
    # result evaluation
    # =============================================

    return_val = complete_runtime(runtime, instance)
    agent_answer = return_val['agent_answer']
    final_ans = return_val['final_ans']

    # If the agent answer is not found, retrieve it from the history
    if agent_answer is None:
        agent_answer = ''
        logger.info('Retrieving agent answer from history.')
        raw_ans = ''

        # retrieve the last agent message or thought
        for event in reversed(state.history):
            if event.source == 'agent':
                if isinstance(event, AgentFinishAction):
                    raw_ans = event.thought
                    break
                elif isinstance(event, MessageAction):
                    raw_ans = event.content
                    break
                elif isinstance(event, CmdRunAction):
                    raw_ans = event.thought
                    break

        # parse the answer for a solution tag
        agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans, re.DOTALL)
        if len(agent_answer) == 0:
            # Fall back to the raw text so something comparable is recorded.
            logger.warning(f'Failed to parse model answer: {raw_ans}')
            agent_answer = raw_ans
        else:
            agent_answer = agent_answer[0]

    comparison_method = instance.comparison_method
    logger.info(
        f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
    )
    test_result = compare_results(comparison_method, agent_answer, final_ans)

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = compatibility_for_eval_history_pairs(state.history)

    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = EvalOutput(
        instance_id=instance.instance_id,
        instance=instance.to_dict(),
        instruction=instruction,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
        test_result={
            'agent_answer': agent_answer,
            'final_answer': final_ans,
            'check_method': comparison_method,
            'result': test_result,
        },
    )
    return output
300
+
301
+
302
if __name__ == '__main__':
    args = parse_arguments()
    # Only the OS subset of AgentBench is supported.
    dataset = load_dataset('iFurySt/AgentBench')
    agent_bench_tests = dataset['osbench'].to_pandas()

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        # modify_params must be False for evaluation purpose, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    metadata = make_metadata(
        llm_config,
        'AgentBench-OS',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # Skip instances already present in output.jsonl and apply --eval-n-limit.
    instances = prepare_dataset(agent_bench_tests, output_file, args.eval_n_limit)

    run_evaluation(
        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
evaluation/benchmarks/agent_bench/scripts/run_infer.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run AgentBench-OS benchmark inference.
#
# Usage: run_infer.sh <model_config> <commit_hash> [agent] [eval_limit] [num_workers]
set -eo pipefail

source "evaluation/utils/version_control.sh"

# Positional arguments; AGENT/EVAL_LIMIT/NUM_WORKERS are optional.
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
# Check out the requested OpenHands commit/tag for the evaluation run.
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_openhands_version

echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

# PYTHONPATH lets run_infer.py import the benchmark-local helper module.
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $OPENHANDS_VERSION"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND
evaluation/benchmarks/agent_bench/scripts/summarise_results.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
+
5
+ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
6
+ passed = []
7
+ failed = []
8
+ with open(res_file_path, 'r') as file:
9
+ for line in file:
10
+ data = json.loads(line.strip())
11
+ instance_id = data['instance_id']
12
+ resolved = False
13
+ if 'test_result' in data and 'result' in data['test_result']:
14
+ resolved = data['test_result']['result']
15
+ if resolved:
16
+ passed.append(instance_id)
17
+ else:
18
+ failed.append(instance_id)
19
+ return passed, failed
20
+
21
+
22
if __name__ == '__main__':
    # Expect exactly one argument: the output.jsonl produced by run_infer.
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests = extract_test_results(json_file_path)
    total = len(passed_tests) + len(failed_tests)
    if total == 0:
        # Guard against ZeroDivisionError on an empty results file.
        print('No test results found in the provided file.')
        sys.exit(1)
    succ_rate = len(passed_tests) / total
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
    )
    print('PASSED TESTS:')
    print(passed_tests)
    print('FAILED TESTS:')
    print(failed_tests)
evaluation/benchmarks/aider_bench/README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AiderBench Evaluation
2
+
3
+ This folder contains evaluation harness for evaluating agents on the
4
+ [Aider Editing Benchmark](https://github.com/paul-gauthier/aider/blob/main/benchmark/README.md).
5
+ This will allow us to develop a better editing approach without running the full
6
+ SWE-bench. The benchmark uses the
7
+ [RajMaheshwari/Exercism-Python](https://huggingface.co/datasets/RajMaheshwari/Exercism-Python)
8
+ Hugging Face dataset based on the
9
+ [Exercism python coding exercises](https://github.com/exercism/python).
10
+
11
+ ## Setup Environment and LLM Configuration
12
+
13
+ Please follow instruction [here](../../README.md#setup) to setup your local
14
+ development environment and LLM.
15
+
16
+ ## Start the evaluation
17
+
18
+ ```bash
19
+ ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
20
+ ```
21
+
22
+ - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
23
+ your LLM settings, as defined in your `config.toml`.
24
+ - `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
25
+ you would like to evaluate. It could also be a release tag like `0.9.0`.
26
+ - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
27
+ defaulting to `CodeActAgent`.
28
+ - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
29
+ instances. By default, the script evaluates the entire Exercism test set
30
+ (133 issues). Note: in order to use `eval_limit`, you must also set `agent`.
31
+ - `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
32
+ - `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
33
+ given IDs (comma separated).
34
+
35
+ There are also the following optional environment variables you can set:
36
+
37
+ ```bash
38
+ export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unit tests. Defaults to false.
39
+ export SKIP_NUM=12 # skip the first 12 instances from the dataset
40
+ ```
41
+
42
+ Following is the basic command to start the evaluation.
43
+
44
+ You can update the arguments in the script
45
+ `evaluation/benchmarks/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`,
46
+ `--eval-num-workers` and so on:
47
+
48
+ - `--agent-cls`, the agent to use. For example, `CodeActAgent`.
49
+ - `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
50
+ - `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`.
51
+ - `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
52
+ - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
53
+ - `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.
54
+
55
+ ```bash
56
+ ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
57
+ ```
58
+
59
+ ### Run Inference on `RemoteRuntime` (experimental)
60
+
61
+ This is in limited beta. Contact Xingyao over slack if you want to try this out!
62
+
63
+ ```bash
64
+ ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
65
+
66
+ # Example - This runs evaluation on CodeActAgent for 133 instances on aider_bench test set, with 2 workers running in parallel
67
+ export ALLHANDS_API_KEY="YOUR-API-KEY"
68
+ export RUNTIME=remote
69
+ export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
70
+ ./evaluation/benchmarks/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2
71
+ ```
72
+
73
+ ## Summarize Results
74
+
75
+ ```bash
76
+ poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
77
+ ```
78
+
79
+ Full example:
80
+
81
+ ```bash
82
+ poetry run python ./evaluation/benchmarks/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
83
+ ```
84
+
85
+ This will list the instances that passed and the instances that failed. For each
86
+ instance, the corresponding set of test cases (which can vary for each instance)
87
+ are run on the file edited by the agent. We consider an instance to be passed
88
+ only if ALL test cases are passed. Sometimes even a single failed test case will
89
+ cause the entire instance to be marked as failed.
90
+
91
+ You can inspect the `test_results` field in the `output.jsonl` file to find the exact
92
+ outcome of the tests. If there are no syntax or indentation errors, you can
93
+ expect to see something like "`..F...EF..`", where "`.`" means the test case
94
+ passed, "`E`" means there was an error while executing the test case and "`F`"
95
+ means some assertion failed and some returned output was not as expected.
evaluation/benchmarks/aider_bench/create_dataset.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# This file was used to create the hugging face dataset from the exercism/python
# github repo.
# Refer to: https://github.com/exercism/python/tree/main/exercises/practice

import os
from pathlib import Path

from datasets import Dataset

exercise_names = sorted(os.listdir('practice/'))

# Column-oriented accumulator matching the dataset schema.
dataset = {
    key: [] for key in ('instance_id', 'instance_name', 'instruction', 'signature', 'test')
}

for idx, exercise in enumerate(exercise_names):
    exercise_dir = Path(f'practice/{exercise}/')

    dataset['instance_id'].append(idx)
    dataset['instance_name'].append(exercise_dir.name.replace('-', '_'))

    # Instruction text = optional introduction + required instructions
    # + optional track-specific appendix.
    doc_parts = []
    intro = exercise_dir / '.docs/introduction.md'
    if intro.exists():
        doc_parts.append(intro.read_text())
    doc_parts.append((exercise_dir / '.docs/instructions.md').read_text())
    appendix = exercise_dir / '.docs/instructions.append.md'
    if appendix.exists():
        doc_parts.append(appendix.read_text())
    dataset['instruction'].append(''.join(doc_parts))

    # Solution stub and test file use the snake_case form of the slug.
    dataset['signature'].append(
        (exercise_dir / (exercise_dir.name + '.py').replace('-', '_')).read_text()
    )
    dataset['test'].append(
        (exercise_dir / (exercise_dir.name + '_test.py').replace('-', '_')).read_text()
    )

ds = Dataset.from_dict(dataset)

ds.push_to_hub('RajMaheshwari/Exercism-Python')