terminal-bench
v2.0
Version 2.0 of Terminal-Bench, a benchmark for testing agents in terminal environments. It has more tasks than 1.0, and they are harder and of higher quality.
harbor run -d terminal-bench@2.0
Tasks (89)
adaptive-rejection-sampler
harbor run -d terminal-bench@2.0 -t adaptive-rejection-sampler
69671fb
bn-fit-modify
harbor run -d terminal-bench@2.0 -t bn-fit-modify
69671fb
break-filter-js-from-html
harbor run -d terminal-bench@2.0 -t break-filter-js-from-html
69671fb
build-cython-ext
harbor run -d terminal-bench@2.0 -t build-cython-ext
69671fb
build-pmars
harbor run -d terminal-bench@2.0 -t build-pmars
69671fb
build-pov-ray
harbor run -d terminal-bench@2.0 -t build-pov-ray
69671fb
caffe-cifar-10
harbor run -d terminal-bench@2.0 -t caffe-cifar-10
69671fb
cancel-async-tasks
harbor run -d terminal-bench@2.0 -t cancel-async-tasks
69671fb
chess-best-move
harbor run -d terminal-bench@2.0 -t chess-best-move
69671fb
circuit-fibsqrt
harbor run -d terminal-bench@2.0 -t circuit-fibsqrt
69671fb
cobol-modernization
harbor run -d terminal-bench@2.0 -t cobol-modernization
69671fb
code-from-image
harbor run -d terminal-bench@2.0 -t code-from-image
69671fb
compile-compcert
harbor run -d terminal-bench@2.0 -t compile-compcert
69671fb
configure-git-webserver
harbor run -d terminal-bench@2.0 -t configure-git-webserver
69671fb
constraints-scheduling
harbor run -d terminal-bench@2.0 -t constraints-scheduling
69671fb
count-dataset-tokens
harbor run -d terminal-bench@2.0 -t count-dataset-tokens
69671fb
crack-7z-hash
harbor run -d terminal-bench@2.0 -t crack-7z-hash
69671fb
custom-memory-heap-crash
harbor run -d terminal-bench@2.0 -t custom-memory-heap-crash
69671fb
db-wal-recovery
harbor run -d terminal-bench@2.0 -t db-wal-recovery
69671fb
distribution-search
harbor run -d terminal-bench@2.0 -t distribution-search
69671fb
dna-assembly
harbor run -d terminal-bench@2.0 -t dna-assembly
69671fb
dna-insert
harbor run -d terminal-bench@2.0 -t dna-insert
69671fb
extract-elf
harbor run -d terminal-bench@2.0 -t extract-elf
69671fb
extract-moves-from-video
harbor run -d terminal-bench@2.0 -t extract-moves-from-video
69671fb
feal-differential-cryptanalysis
harbor run -d terminal-bench@2.0 -t feal-differential-cryptanalysis
69671fb
feal-linear-cryptanalysis
harbor run -d terminal-bench@2.0 -t feal-linear-cryptanalysis
69671fb
filter-js-from-html
harbor run -d terminal-bench@2.0 -t filter-js-from-html
69671fb
financial-document-processor
harbor run -d terminal-bench@2.0 -t financial-document-processor
69671fb
fix-code-vulnerability
harbor run -d terminal-bench@2.0 -t fix-code-vulnerability
69671fb
fix-git
harbor run -d terminal-bench@2.0 -t fix-git
69671fb
fix-ocaml-gc
harbor run -d terminal-bench@2.0 -t fix-ocaml-gc
69671fb
gcode-to-text
harbor run -d terminal-bench@2.0 -t gcode-to-text
69671fb
git-leak-recovery
harbor run -d terminal-bench@2.0 -t git-leak-recovery
69671fb
git-multibranch
harbor run -d terminal-bench@2.0 -t git-multibranch
69671fb
gpt2-codegolf
harbor run -d terminal-bench@2.0 -t gpt2-codegolf
69671fb
headless-terminal
harbor run -d terminal-bench@2.0 -t headless-terminal
69671fb
hf-model-inference
harbor run -d terminal-bench@2.0 -t hf-model-inference
69671fb
install-windows-3.11
harbor run -d terminal-bench@2.0 -t install-windows-3.11
69671fb
kv-store-grpc
harbor run -d terminal-bench@2.0 -t kv-store-grpc
69671fb
large-scale-text-editing
harbor run -d terminal-bench@2.0 -t large-scale-text-editing
69671fb
largest-eigenval
harbor run -d terminal-bench@2.0 -t largest-eigenval
69671fb
llm-inference-batching-scheduler
harbor run -d terminal-bench@2.0 -t llm-inference-batching-scheduler
69671fb
log-summary-date-ranges
harbor run -d terminal-bench@2.0 -t log-summary-date-ranges
69671fb
mailman
harbor run -d terminal-bench@2.0 -t mailman
69671fb
make-doom-for-mips
harbor run -d terminal-bench@2.0 -t make-doom-for-mips
69671fb
make-mips-interpreter
harbor run -d terminal-bench@2.0 -t make-mips-interpreter
69671fb
mcmc-sampling-stan
harbor run -d terminal-bench@2.0 -t mcmc-sampling-stan
69671fb
merge-diff-arc-agi-task
harbor run -d terminal-bench@2.0 -t merge-diff-arc-agi-task
69671fb
model-extraction-relu-logits
harbor run -d terminal-bench@2.0 -t model-extraction-relu-logits
69671fb
modernize-scientific-stack
harbor run -d terminal-bench@2.0 -t modernize-scientific-stack
69671fb
mteb-leaderboard
harbor run -d terminal-bench@2.0 -t mteb-leaderboard
69671fb
mteb-retrieve
harbor run -d terminal-bench@2.0 -t mteb-retrieve
69671fb
multi-source-data-merger
harbor run -d terminal-bench@2.0 -t multi-source-data-merger
69671fb
nginx-request-logging
harbor run -d terminal-bench@2.0 -t nginx-request-logging
69671fb
openssl-selfsigned-cert
harbor run -d terminal-bench@2.0 -t openssl-selfsigned-cert
69671fb
overfull-hbox
harbor run -d terminal-bench@2.0 -t overfull-hbox
69671fb
password-recovery
harbor run -d terminal-bench@2.0 -t password-recovery
69671fb
path-tracing
harbor run -d terminal-bench@2.0 -t path-tracing
69671fb
path-tracing-reverse
harbor run -d terminal-bench@2.0 -t path-tracing-reverse
69671fb
polyglot-c-py
harbor run -d terminal-bench@2.0 -t polyglot-c-py
69671fb
polyglot-rust-c
harbor run -d terminal-bench@2.0 -t polyglot-rust-c
69671fb
portfolio-optimization
harbor run -d terminal-bench@2.0 -t portfolio-optimization
69671fb
protein-assembly
harbor run -d terminal-bench@2.0 -t protein-assembly
69671fb
prove-plus-comm
harbor run -d terminal-bench@2.0 -t prove-plus-comm
69671fb
pypi-server
harbor run -d terminal-bench@2.0 -t pypi-server
69671fb
pytorch-model-cli
harbor run -d terminal-bench@2.0 -t pytorch-model-cli
69671fb
pytorch-model-recovery
harbor run -d terminal-bench@2.0 -t pytorch-model-recovery
69671fb
qemu-alpine-ssh
harbor run -d terminal-bench@2.0 -t qemu-alpine-ssh
69671fb
qemu-startup
harbor run -d terminal-bench@2.0 -t qemu-startup
69671fb
query-optimize
harbor run -d terminal-bench@2.0 -t query-optimize
69671fb
raman-fitting
harbor run -d terminal-bench@2.0 -t raman-fitting
69671fb
regex-chess
harbor run -d terminal-bench@2.0 -t regex-chess
69671fb
regex-log
harbor run -d terminal-bench@2.0 -t regex-log
69671fb
reshard-c4-data
harbor run -d terminal-bench@2.0 -t reshard-c4-data
69671fb
rstan-to-pystan
harbor run -d terminal-bench@2.0 -t rstan-to-pystan
69671fb
sam-cell-seg
harbor run -d terminal-bench@2.0 -t sam-cell-seg
69671fb
sanitize-git-repo
harbor run -d terminal-bench@2.0 -t sanitize-git-repo
69671fb
schemelike-metacircular-eval
harbor run -d terminal-bench@2.0 -t schemelike-metacircular-eval
69671fb
sparql-university
harbor run -d terminal-bench@2.0 -t sparql-university
69671fb
sqlite-db-truncate
harbor run -d terminal-bench@2.0 -t sqlite-db-truncate
69671fb
sqlite-with-gcov
harbor run -d terminal-bench@2.0 -t sqlite-with-gcov
69671fb
torch-pipeline-parallelism
harbor run -d terminal-bench@2.0 -t torch-pipeline-parallelism
69671fb
torch-tensor-parallelism
harbor run -d terminal-bench@2.0 -t torch-tensor-parallelism
69671fb
train-fasttext
harbor run -d terminal-bench@2.0 -t train-fasttext
69671fb
tune-mjcf
harbor run -d terminal-bench@2.0 -t tune-mjcf
69671fb
video-processing
harbor run -d terminal-bench@2.0 -t video-processing
69671fb
vulnerable-secret
harbor run -d terminal-bench@2.0 -t vulnerable-secret
69671fb
winning-avg-corewars
harbor run -d terminal-bench@2.0 -t winning-avg-corewars
69671fb
write-compressor
harbor run -d terminal-bench@2.0 -t write-compressor
69671fb