terminal-bench

v2.0

Version 2.0 of Terminal-Bench, a benchmark for testing agents in terminal environments. Compared with 1.0, it contains more tasks, and the tasks are harder and of higher quality.

harbor run -d terminal-bench@2.0

Tasks (89)

adaptive-rejection-sampler
harbor run -d terminal-bench@2.0 -t adaptive-rejection-sampler
69671fb
bn-fit-modify
harbor run -d terminal-bench@2.0 -t bn-fit-modify
69671fb
break-filter-js-from-html
harbor run -d terminal-bench@2.0 -t break-filter-js-from-html
69671fb
build-cython-ext
harbor run -d terminal-bench@2.0 -t build-cython-ext
69671fb
build-pmars
harbor run -d terminal-bench@2.0 -t build-pmars
69671fb
build-pov-ray
harbor run -d terminal-bench@2.0 -t build-pov-ray
69671fb
caffe-cifar-10
harbor run -d terminal-bench@2.0 -t caffe-cifar-10
69671fb
cancel-async-tasks
harbor run -d terminal-bench@2.0 -t cancel-async-tasks
69671fb
chess-best-move
harbor run -d terminal-bench@2.0 -t chess-best-move
69671fb
circuit-fibsqrt
harbor run -d terminal-bench@2.0 -t circuit-fibsqrt
69671fb
cobol-modernization
harbor run -d terminal-bench@2.0 -t cobol-modernization
69671fb
code-from-image
harbor run -d terminal-bench@2.0 -t code-from-image
69671fb
compile-compcert
harbor run -d terminal-bench@2.0 -t compile-compcert
69671fb
configure-git-webserver
harbor run -d terminal-bench@2.0 -t configure-git-webserver
69671fb
constraints-scheduling
harbor run -d terminal-bench@2.0 -t constraints-scheduling
69671fb
count-dataset-tokens
harbor run -d terminal-bench@2.0 -t count-dataset-tokens
69671fb
crack-7z-hash
harbor run -d terminal-bench@2.0 -t crack-7z-hash
69671fb
custom-memory-heap-crash
harbor run -d terminal-bench@2.0 -t custom-memory-heap-crash
69671fb
db-wal-recovery
harbor run -d terminal-bench@2.0 -t db-wal-recovery
69671fb
distribution-search
harbor run -d terminal-bench@2.0 -t distribution-search
69671fb
dna-assembly
harbor run -d terminal-bench@2.0 -t dna-assembly
69671fb
dna-insert
harbor run -d terminal-bench@2.0 -t dna-insert
69671fb
extract-elf
harbor run -d terminal-bench@2.0 -t extract-elf
69671fb
extract-moves-from-video
harbor run -d terminal-bench@2.0 -t extract-moves-from-video
69671fb
feal-differential-cryptanalysis
harbor run -d terminal-bench@2.0 -t feal-differential-cryptanalysis
69671fb
feal-linear-cryptanalysis
harbor run -d terminal-bench@2.0 -t feal-linear-cryptanalysis
69671fb
filter-js-from-html
harbor run -d terminal-bench@2.0 -t filter-js-from-html
69671fb
financial-document-processor
harbor run -d terminal-bench@2.0 -t financial-document-processor
69671fb
fix-code-vulnerability
harbor run -d terminal-bench@2.0 -t fix-code-vulnerability
69671fb
fix-git
harbor run -d terminal-bench@2.0 -t fix-git
69671fb
fix-ocaml-gc
harbor run -d terminal-bench@2.0 -t fix-ocaml-gc
69671fb
gcode-to-text
harbor run -d terminal-bench@2.0 -t gcode-to-text
69671fb
git-leak-recovery
harbor run -d terminal-bench@2.0 -t git-leak-recovery
69671fb
git-multibranch
harbor run -d terminal-bench@2.0 -t git-multibranch
69671fb
gpt2-codegolf
harbor run -d terminal-bench@2.0 -t gpt2-codegolf
69671fb
headless-terminal
harbor run -d terminal-bench@2.0 -t headless-terminal
69671fb
hf-model-inference
harbor run -d terminal-bench@2.0 -t hf-model-inference
69671fb
install-windows-3.11
harbor run -d terminal-bench@2.0 -t install-windows-3.11
69671fb
kv-store-grpc
harbor run -d terminal-bench@2.0 -t kv-store-grpc
69671fb
large-scale-text-editing
harbor run -d terminal-bench@2.0 -t large-scale-text-editing
69671fb
largest-eigenval
harbor run -d terminal-bench@2.0 -t largest-eigenval
69671fb
llm-inference-batching-scheduler
harbor run -d terminal-bench@2.0 -t llm-inference-batching-scheduler
69671fb
log-summary-date-ranges
harbor run -d terminal-bench@2.0 -t log-summary-date-ranges
69671fb
mailman
harbor run -d terminal-bench@2.0 -t mailman
69671fb
make-doom-for-mips
harbor run -d terminal-bench@2.0 -t make-doom-for-mips
69671fb
make-mips-interpreter
harbor run -d terminal-bench@2.0 -t make-mips-interpreter
69671fb
mcmc-sampling-stan
harbor run -d terminal-bench@2.0 -t mcmc-sampling-stan
69671fb
merge-diff-arc-agi-task
harbor run -d terminal-bench@2.0 -t merge-diff-arc-agi-task
69671fb
model-extraction-relu-logits
harbor run -d terminal-bench@2.0 -t model-extraction-relu-logits
69671fb
modernize-scientific-stack
harbor run -d terminal-bench@2.0 -t modernize-scientific-stack
69671fb
mteb-leaderboard
harbor run -d terminal-bench@2.0 -t mteb-leaderboard
69671fb
mteb-retrieve
harbor run -d terminal-bench@2.0 -t mteb-retrieve
69671fb
multi-source-data-merger
harbor run -d terminal-bench@2.0 -t multi-source-data-merger
69671fb
nginx-request-logging
harbor run -d terminal-bench@2.0 -t nginx-request-logging
69671fb
openssl-selfsigned-cert
harbor run -d terminal-bench@2.0 -t openssl-selfsigned-cert
69671fb
overfull-hbox
harbor run -d terminal-bench@2.0 -t overfull-hbox
69671fb
password-recovery
harbor run -d terminal-bench@2.0 -t password-recovery
69671fb
path-tracing
harbor run -d terminal-bench@2.0 -t path-tracing
69671fb
path-tracing-reverse
harbor run -d terminal-bench@2.0 -t path-tracing-reverse
69671fb
polyglot-c-py
harbor run -d terminal-bench@2.0 -t polyglot-c-py
69671fb
polyglot-rust-c
harbor run -d terminal-bench@2.0 -t polyglot-rust-c
69671fb
portfolio-optimization
harbor run -d terminal-bench@2.0 -t portfolio-optimization
69671fb
protein-assembly
harbor run -d terminal-bench@2.0 -t protein-assembly
69671fb
prove-plus-comm
harbor run -d terminal-bench@2.0 -t prove-plus-comm
69671fb
pypi-server
harbor run -d terminal-bench@2.0 -t pypi-server
69671fb
pytorch-model-cli
harbor run -d terminal-bench@2.0 -t pytorch-model-cli
69671fb
pytorch-model-recovery
harbor run -d terminal-bench@2.0 -t pytorch-model-recovery
69671fb
qemu-alpine-ssh
harbor run -d terminal-bench@2.0 -t qemu-alpine-ssh
69671fb
qemu-startup
harbor run -d terminal-bench@2.0 -t qemu-startup
69671fb
query-optimize
harbor run -d terminal-bench@2.0 -t query-optimize
69671fb
raman-fitting
harbor run -d terminal-bench@2.0 -t raman-fitting
69671fb
regex-chess
harbor run -d terminal-bench@2.0 -t regex-chess
69671fb
regex-log
harbor run -d terminal-bench@2.0 -t regex-log
69671fb
reshard-c4-data
harbor run -d terminal-bench@2.0 -t reshard-c4-data
69671fb
rstan-to-pystan
harbor run -d terminal-bench@2.0 -t rstan-to-pystan
69671fb
sam-cell-seg
harbor run -d terminal-bench@2.0 -t sam-cell-seg
69671fb
sanitize-git-repo
harbor run -d terminal-bench@2.0 -t sanitize-git-repo
69671fb
schemelike-metacircular-eval
harbor run -d terminal-bench@2.0 -t schemelike-metacircular-eval
69671fb
sparql-university
harbor run -d terminal-bench@2.0 -t sparql-university
69671fb
sqlite-db-truncate
harbor run -d terminal-bench@2.0 -t sqlite-db-truncate
69671fb
sqlite-with-gcov
harbor run -d terminal-bench@2.0 -t sqlite-with-gcov
69671fb
torch-pipeline-parallelism
harbor run -d terminal-bench@2.0 -t torch-pipeline-parallelism
69671fb
torch-tensor-parallelism
harbor run -d terminal-bench@2.0 -t torch-tensor-parallelism
69671fb
train-fasttext
harbor run -d terminal-bench@2.0 -t train-fasttext
69671fb
tune-mjcf
harbor run -d terminal-bench@2.0 -t tune-mjcf
69671fb
video-processing
harbor run -d terminal-bench@2.0 -t video-processing
69671fb
vulnerable-secret
harbor run -d terminal-bench@2.0 -t vulnerable-secret
69671fb
winning-avg-corewars
harbor run -d terminal-bench@2.0 -t winning-avg-corewars
69671fb
write-compressor
harbor run -d terminal-bench@2.0 -t write-compressor
69671fb