terminal-bench-pro
v1.0
Terminal-Bench Pro (Public Set) is an extended benchmark dataset for testing AI agents in real terminal environments. From compiling code to training models and setting up servers, Terminal-Bench Pro evaluates how well agents can handle real-world, end-to-end tasks autonomously.
harbor run -d terminal-bench-pro@1.0
Tasks (200)
implement-sales-processor-engine
harbor run -d terminal-bench-pro@1.0 -t implement-sales-processor-engine
82bc46d
implement-sprt-ab-test-analyzer
harbor run -d terminal-bench-pro@1.0 -t implement-sprt-ab-test-analyzer
82bc46d
implement-tensor-parallel-matmul
harbor run -d terminal-bench-pro@1.0 -t implement-tensor-parallel-matmul
82bc46d
improve-code-similarity-feature-extraction
harbor run -d terminal-bench-pro@1.0 -t improve-code-similarity-feature-extraction
82bc46d
improve-predictive-maintenance-model
harbor run -d terminal-bench-pro@1.0 -t improve-predictive-maintenance-model
82bc46d
jq-github-contributor-report
harbor run -d terminal-bench-pro@1.0 -t jq-github-contributor-report
82bc46d
learn-gaussian-mrf-and-infer
harbor run -d terminal-bench-pro@1.0 -t learn-gaussian-mrf-and-infer
82bc46d
linear-sem-causal-discovery-intervention
harbor run -d terminal-bench-pro@1.0 -t linear-sem-causal-discovery-intervention
82bc46d
lorentzian-fit-fluorescence-spectra
harbor run -d terminal-bench-pro@1.0 -t lorentzian-fit-fluorescence-spectra
82bc46d
make-ascii-fits-keywords-case-insensitive
harbor run -d terminal-bench-pro@1.0 -t make-ascii-fits-keywords-case-insensitive
82bc46d
maze-shortest-path-solver
harbor run -d terminal-bench-pro@1.0 -t maze-shortest-path-solver
82bc46d
mcts-solver-for-15-puzzle
harbor run -d terminal-bench-pro@1.0 -t mcts-solver-for-15-puzzle
82bc46d
mcts-tictactoe-ai-implementation
harbor run -d terminal-bench-pro@1.0 -t mcts-tictactoe-ai-implementation
82bc46d
merge-git-bundles-and-implement-transform
harbor run -d terminal-bench-pro@1.0 -t merge-git-bundles-and-implement-transform
82bc46d
merge-org-csv-to-schema-json
harbor run -d terminal-bench-pro@1.0 -t merge-org-csv-to-schema-json
82bc46d
merge-parser-branches
harbor run -d terminal-bench-pro@1.0 -t merge-parser-branches
82bc46d
migrate-fortran-mcsim-to-gfortran
harbor run -d terminal-bench-pro@1.0 -t migrate-fortran-mcsim-to-gfortran
82bc46d
migrate-make-to-cmake-build
harbor run -d terminal-bench-pro@1.0 -t migrate-make-to-cmake-build
82bc46d
minesweeper-safe-move-selector
harbor run -d terminal-bench-pro@1.0 -t minesweeper-safe-move-selector
82bc46d
mongodb-sales-aggregation-engine
harbor run -d terminal-bench-pro@1.0 -t mongodb-sales-aggregation-engine
82bc46d
mountaincar-rl-agent-implementation
harbor run -d terminal-bench-pro@1.0 -t mountaincar-rl-agent-implementation
82bc46d
ner-eval-conll2003
harbor run -d terminal-bench-pro@1.0 -t ner-eval-conll2003
82bc46d
normalize-invoice-pdfs-to-csv
harbor run -d terminal-bench-pro@1.0 -t normalize-invoice-pdfs-to-csv
82bc46d
optimal-2x2x2-cube-solver
harbor run -d terminal-bench-pro@1.0 -t optimal-2x2x2-cube-solver
82bc46d
optimize-portfolio-allocation
harbor run -d terminal-bench-pro@1.0 -t optimize-portfolio-allocation
82bc46d
optimize-postgresql-analytics-query
harbor run -d terminal-bench-pro@1.0 -t optimize-postgresql-analytics-query
82bc46d
optimize-product-search-query
harbor run -d terminal-bench-pro@1.0 -t optimize-product-search-query
82bc46d
optimize-triton-rope-kernel
harbor run -d terminal-bench-pro@1.0 -t optimize-triton-rope-kernel
82bc46d
optimize-urdf-robot-for-pybullet
harbor run -d terminal-bench-pro@1.0 -t optimize-urdf-robot-for-pybullet
82bc46d
parse-bitcoin-tx-to-json
harbor run -d terminal-bench-pro@1.0 -t parse-bitcoin-tx-to-json
82bc46d
polyglot-bash-python-config-parser
harbor run -d terminal-bench-pro@1.0 -t polyglot-bash-python-config-parser
82bc46d
polyglot-bash-python-config-validator
harbor run -d terminal-bench-pro@1.0 -t polyglot-bash-python-config-validator
82bc46d
polyglot-prime-generator
harbor run -d terminal-bench-pro@1.0 -t polyglot-prime-generator
82bc46d
polyglot-text-stats-script
harbor run -d terminal-bench-pro@1.0 -t polyglot-text-stats-script
82bc46d
polyglot-yaml-config-validator
harbor run -d terminal-bench-pro@1.0 -t polyglot-yaml-config-validator
82bc46d
port-fortran-climate-model-to-python
harbor run -d terminal-bench-pro@1.0 -t port-fortran-climate-model-to-python
82bc46d
port-fortran-rk4-lorenz-to-python
harbor run -d terminal-bench-pro@1.0 -t port-fortran-rk4-lorenz-to-python
82bc46d
portfolio-risk-analyzer-script
harbor run -d terminal-bench-pro@1.0 -t portfolio-risk-analyzer-script
82bc46d
present-known-plaintext-key-recovery
harbor run -d terminal-bench-pro@1.0 -t present-known-plaintext-key-recovery
82bc46d
prove-nat-mult-commutativity-in-coq
harbor run -d terminal-bench-pro@1.0 -t prove-nat-mult-commutativity-in-coq
82bc46d
python-jacobi-eigen-solver-conversion
harbor run -d terminal-bench-pro@1.0 -t python-jacobi-eigen-solver-conversion
82bc46d
python-pcap-anomaly-detector
harbor run -d terminal-bench-pro@1.0 -t python-pcap-anomaly-detector
82bc46d
python-prime-http-server
harbor run -d terminal-bench-pro@1.0 -t python-prime-http-server
82bc46d
python-sokoban-bfs-solver
harbor run -d terminal-bench-pro@1.0 -t python-sokoban-bfs-solver
82bc46d
python-sql-injection-sanitizer
harbor run -d terminal-bench-pro@1.0 -t python-sql-injection-sanitizer
82bc46d
python-sudoku-solver-backtracking
harbor run -d terminal-bench-pro@1.0 -t python-sudoku-solver-backtracking
82bc46d
rebuild-fastproc-for-python-3-13
harbor run -d terminal-bench-pro@1.0 -t rebuild-fastproc-for-python-3-13
82bc46d
recover-and-sanitize-git-db-credential
harbor run -d terminal-bench-pro@1.0 -t recover-and-sanitize-git-db-credential
82bc46d
recover-and-sanitize-postgres-wal-secret
harbor run -d terminal-bench-pro@1.0 -t recover-and-sanitize-postgres-wal-secret
82bc46d
recover-corrupted-sqlite-data
harbor run -d terminal-bench-pro@1.0 -t recover-corrupted-sqlite-data
82bc46d
recover-encrypted-db-credentials
harbor run -d terminal-bench-pro@1.0 -t recover-encrypted-db-credentials
82bc46d
recover-git-history-secrets
harbor run -d terminal-bench-pro@1.0 -t recover-git-history-secrets
82bc46d
recover-git-reflog-commits
harbor run -d terminal-bench-pro@1.0 -t recover-git-reflog-commits
82bc46d
recover-multilayer-openssl-backup
harbor run -d terminal-bench-pro@1.0 -t recover-multilayer-openssl-backup
82bc46d
recover-prod-db-password-from-git-history
harbor run -d terminal-bench-pro@1.0 -t recover-prod-db-password-from-git-history
82bc46d
recover-stream-cipher-key
harbor run -d terminal-bench-pro@1.0 -t recover-stream-cipher-key
82bc46d
regex-bitcoin-p2pkh-extraction
harbor run -d terminal-bench-pro@1.0 -t regex-bitcoin-p2pkh-extraction
82bc46d
repair-broken-shell-data-pipeline
harbor run -d terminal-bench-pro@1.0 -t repair-broken-shell-data-pipeline
82bc46d
reproducible-latex-pdf-build-script
harbor run -d terminal-bench-pro@1.0 -t reproducible-latex-pdf-build-script
82bc46d
resolve-react-peer-deps
harbor run -d terminal-bench-pro@1.0 -t resolve-react-peer-deps
82bc46d
restore-broken-pip-installation
harbor run -d terminal-bench-pro@1.0 -t restore-broken-pip-installation
82bc46d
retrieve-vault-root-token
harbor run -d terminal-bench-pro@1.0 -t retrieve-vault-root-token
82bc46d
reverse-engineer-kvstore-binary-protocol
harbor run -d terminal-bench-pro@1.0 -t reverse-engineer-kvstore-binary-protocol
82bc46d
reverse-engineer-physics-simulator
harbor run -d terminal-bench-pro@1.0 -t reverse-engineer-physics-simulator
82bc46d
reverse-go-stream-cipher-encryption
harbor run -d terminal-bench-pro@1.0 -t reverse-go-stream-cipher-encryption
82bc46d
rush-hour-bfs-solver
harbor run -d terminal-bench-pro@1.0 -t rush-hour-bfs-solver
82bc46d
sanitize-dclm-repo-secrets
harbor run -d terminal-bench-pro@1.0 -t sanitize-dclm-repo-secrets
82bc46d
sanitize-jinja2-ssti-templates
harbor run -d terminal-bench-pro@1.0 -t sanitize-jinja2-ssti-templates
82bc46d
schedule-multi-team-kickoff-meeting
harbor run -d terminal-bench-pro@1.0 -t schedule-multi-team-kickoff-meeting
82bc46d
secure-db-migration-and-wipe
harbor run -d terminal-bench-pro@1.0 -t secure-db-migration-and-wipe
82bc46d
secure-django-runserver-input-validation
harbor run -d terminal-bench-pro@1.0 -t secure-django-runserver-input-validation
82bc46d
secure-model-pipeline-deployment
harbor run -d terminal-bench-pro@1.0 -t secure-model-pipeline-deployment
82bc46d
secure-postgres-schema-migration
harbor run -d terminal-bench-pro@1.0 -t secure-postgres-schema-migration
82bc46d
select-best-english-embedding-model
harbor run -d terminal-bench-pro@1.0 -t select-best-english-embedding-model
82bc46d
setup-mlflow-sqlite-server
harbor run -d terminal-bench-pro@1.0 -t setup-mlflow-sqlite-server
82bc46d
setup-ubuntu-vm-ssh-key-auth
harbor run -d terminal-bench-pro@1.0 -t setup-ubuntu-vm-ssh-key-auth
82bc46d
simulate-2d-sampling-with-acceptance-stats
harbor run -d terminal-bench-pro@1.0 -t simulate-2d-sampling-with-acceptance-stats
82bc46d
sokoban-bfs-move-with-fallback
harbor run -d terminal-bench-pro@1.0 -t sokoban-bfs-move-with-fallback
82bc46d
solve-chemical-equilibrium-system
harbor run -d terminal-bench-pro@1.0 -t solve-chemical-equilibrium-system
82bc46d
solve-chess-mate-in-two
harbor run -d terminal-bench-pro@1.0 -t solve-chess-mate-in-two
82bc46d
solve-colossal-cave-350-score
harbor run -d terminal-bench-pro@1.0 -t solve-colossal-cave-350-score
82bc46d
solve-escape-room-puzzle-server
harbor run -d terminal-bench-pro@1.0 -t solve-escape-room-puzzle-server
82bc46d
solve-ode-with-sympy
harbor run -d terminal-bench-pro@1.0 -t solve-ode-with-sympy
82bc46d
solve-train-shunting-puzzle
harbor run -d terminal-bench-pro@1.0 -t solve-train-shunting-puzzle
82bc46d
sparql-asian-senior-researchers
harbor run -d terminal-bench-pro@1.0 -t sparql-asian-senior-researchers
82bc46d
stabilize-neural-network-training
harbor run -d terminal-bench-pro@1.0 -t stabilize-neural-network-training
82bc46d
summarize-api-log-status-metrics
harbor run -d terminal-bench-pro@1.0 -t summarize-api-log-status-metrics
82bc46d
symbolic-ode-solution-chemical-reaction
harbor run -d terminal-bench-pro@1.0 -t symbolic-ode-solution-chemical-reaction
82bc46d
synthesize-harmonic-wav-in-c
harbor run -d terminal-bench-pro@1.0 -t synthesize-harmonic-wav-in-c
82bc46d
tabular-q-learning-mountaincar-agent
harbor run -d terminal-bench-pro@1.0 -t tabular-q-learning-mountaincar-agent
82bc46d
text-image-ocr-pipeline
harbor run -d terminal-bench-pro@1.0 -t text-image-ocr-pipeline
82bc46d
train-disruption-model-with-hash-chain
harbor run -d terminal-bench-pro@1.0 -t train-disruption-model-with-hash-chain
82bc46d
train-fasttext-style-subword-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-fasttext-style-subword-embeddings
82bc46d
train-fraud-detection-model
harbor run -d terminal-bench-pro@1.0 -t train-fraud-detection-model
82bc46d
train-loan-default-logreg-model
harbor run -d terminal-bench-pro@1.0 -t train-loan-default-logreg-model
82bc46d
train-matrix-factorization-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-matrix-factorization-embeddings
82bc46d
train-python-code-skipgram-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-python-code-skipgram-embeddings
82bc46d
train-sarsa-taxi-agent
harbor run -d terminal-bench-pro@1.0 -t train-sarsa-taxi-agent
82bc46d
validate-and-solve-sudoku
harbor run -d terminal-bench-pro@1.0 -t validate-and-solve-sudoku
82bc46d
xrd-two-peak-fitting
harbor run -d terminal-bench-pro@1.0 -t xrd-two-peak-fitting
82bc46d