terminal-bench-pro

v1.0

Terminal-Bench Pro (Public Set) is an extended benchmark dataset for testing AI agents in real terminal environments. It evaluates how well agents can autonomously handle real-world, end-to-end tasks, from compiling code to training models and setting up servers.

harbor run -d terminal-bench-pro@1.0

Tasks (200)

implement-sales-processor-engine
harbor run -d terminal-bench-pro@1.0 -t implement-sales-processor-engine
82bc46d
implement-sprt-ab-test-analyzer
harbor run -d terminal-bench-pro@1.0 -t implement-sprt-ab-test-analyzer
82bc46d
implement-tensor-parallel-matmul
harbor run -d terminal-bench-pro@1.0 -t implement-tensor-parallel-matmul
82bc46d
improve-code-similarity-feature-extraction
harbor run -d terminal-bench-pro@1.0 -t improve-code-similarity-feature-extraction
82bc46d
improve-predictive-maintenance-model
harbor run -d terminal-bench-pro@1.0 -t improve-predictive-maintenance-model
82bc46d
jq-github-contributor-report
harbor run -d terminal-bench-pro@1.0 -t jq-github-contributor-report
82bc46d
learn-gaussian-mrf-and-infer
harbor run -d terminal-bench-pro@1.0 -t learn-gaussian-mrf-and-infer
82bc46d
linear-sem-causal-discovery-intervention
harbor run -d terminal-bench-pro@1.0 -t linear-sem-causal-discovery-intervention
82bc46d
lorentzian-fit-fluorescence-spectra
harbor run -d terminal-bench-pro@1.0 -t lorentzian-fit-fluorescence-spectra
82bc46d
make-ascii-fits-keywords-case-insensitive
harbor run -d terminal-bench-pro@1.0 -t make-ascii-fits-keywords-case-insensitive
82bc46d
maze-shortest-path-solver
harbor run -d terminal-bench-pro@1.0 -t maze-shortest-path-solver
82bc46d
mcts-solver-for-15-puzzle
harbor run -d terminal-bench-pro@1.0 -t mcts-solver-for-15-puzzle
82bc46d
mcts-tictactoe-ai-implementation
harbor run -d terminal-bench-pro@1.0 -t mcts-tictactoe-ai-implementation
82bc46d
merge-git-bundles-and-implement-transform
harbor run -d terminal-bench-pro@1.0 -t merge-git-bundles-and-implement-transform
82bc46d
merge-org-csv-to-schema-json
harbor run -d terminal-bench-pro@1.0 -t merge-org-csv-to-schema-json
82bc46d
merge-parser-branches
harbor run -d terminal-bench-pro@1.0 -t merge-parser-branches
82bc46d
migrate-fortran-mcsim-to-gfortran
harbor run -d terminal-bench-pro@1.0 -t migrate-fortran-mcsim-to-gfortran
82bc46d
migrate-make-to-cmake-build
harbor run -d terminal-bench-pro@1.0 -t migrate-make-to-cmake-build
82bc46d
minesweeper-safe-move-selector
harbor run -d terminal-bench-pro@1.0 -t minesweeper-safe-move-selector
82bc46d
mongodb-sales-aggregation-engine
harbor run -d terminal-bench-pro@1.0 -t mongodb-sales-aggregation-engine
82bc46d
mountaincar-rl-agent-implementation
harbor run -d terminal-bench-pro@1.0 -t mountaincar-rl-agent-implementation
82bc46d
ner-eval-conll2003
harbor run -d terminal-bench-pro@1.0 -t ner-eval-conll2003
82bc46d
normalize-invoice-pdfs-to-csv
harbor run -d terminal-bench-pro@1.0 -t normalize-invoice-pdfs-to-csv
82bc46d
optimal-2x2x2-cube-solver
harbor run -d terminal-bench-pro@1.0 -t optimal-2x2x2-cube-solver
82bc46d
optimize-portfolio-allocation
harbor run -d terminal-bench-pro@1.0 -t optimize-portfolio-allocation
82bc46d
optimize-postgresql-analytics-query
harbor run -d terminal-bench-pro@1.0 -t optimize-postgresql-analytics-query
82bc46d
optimize-product-search-query
harbor run -d terminal-bench-pro@1.0 -t optimize-product-search-query
82bc46d
optimize-triton-rope-kernel
harbor run -d terminal-bench-pro@1.0 -t optimize-triton-rope-kernel
82bc46d
optimize-urdf-robot-for-pybullet
harbor run -d terminal-bench-pro@1.0 -t optimize-urdf-robot-for-pybullet
82bc46d
parse-bitcoin-tx-to-json
harbor run -d terminal-bench-pro@1.0 -t parse-bitcoin-tx-to-json
82bc46d
polyglot-bash-python-config-parser
harbor run -d terminal-bench-pro@1.0 -t polyglot-bash-python-config-parser
82bc46d
polyglot-bash-python-config-validator
harbor run -d terminal-bench-pro@1.0 -t polyglot-bash-python-config-validator
82bc46d
polyglot-prime-generator
harbor run -d terminal-bench-pro@1.0 -t polyglot-prime-generator
82bc46d
polyglot-text-stats-script
harbor run -d terminal-bench-pro@1.0 -t polyglot-text-stats-script
82bc46d
polyglot-yaml-config-validator
harbor run -d terminal-bench-pro@1.0 -t polyglot-yaml-config-validator
82bc46d
port-fortran-climate-model-to-python
harbor run -d terminal-bench-pro@1.0 -t port-fortran-climate-model-to-python
82bc46d
port-fortran-rk4-lorenz-to-python
harbor run -d terminal-bench-pro@1.0 -t port-fortran-rk4-lorenz-to-python
82bc46d
portfolio-risk-analyzer-script
harbor run -d terminal-bench-pro@1.0 -t portfolio-risk-analyzer-script
82bc46d
present-known-plaintext-key-recovery
harbor run -d terminal-bench-pro@1.0 -t present-known-plaintext-key-recovery
82bc46d
prove-nat-mult-commutativity-in-coq
harbor run -d terminal-bench-pro@1.0 -t prove-nat-mult-commutativity-in-coq
82bc46d
python-jacobi-eigen-solver-conversion
harbor run -d terminal-bench-pro@1.0 -t python-jacobi-eigen-solver-conversion
82bc46d
python-pcap-anomaly-detector
harbor run -d terminal-bench-pro@1.0 -t python-pcap-anomaly-detector
82bc46d
python-prime-http-server
harbor run -d terminal-bench-pro@1.0 -t python-prime-http-server
82bc46d
python-sokoban-bfs-solver
harbor run -d terminal-bench-pro@1.0 -t python-sokoban-bfs-solver
82bc46d
python-sql-injection-sanitizer
harbor run -d terminal-bench-pro@1.0 -t python-sql-injection-sanitizer
82bc46d
python-sudoku-solver-backtracking
harbor run -d terminal-bench-pro@1.0 -t python-sudoku-solver-backtracking
82bc46d
rebuild-fastproc-for-python-3-13
harbor run -d terminal-bench-pro@1.0 -t rebuild-fastproc-for-python-3-13
82bc46d
recover-and-sanitize-git-db-credential
harbor run -d terminal-bench-pro@1.0 -t recover-and-sanitize-git-db-credential
82bc46d
recover-and-sanitize-postgres-wal-secret
harbor run -d terminal-bench-pro@1.0 -t recover-and-sanitize-postgres-wal-secret
82bc46d
recover-corrupted-sqlite-data
harbor run -d terminal-bench-pro@1.0 -t recover-corrupted-sqlite-data
82bc46d
recover-encrypted-db-credentials
harbor run -d terminal-bench-pro@1.0 -t recover-encrypted-db-credentials
82bc46d
recover-git-history-secrets
harbor run -d terminal-bench-pro@1.0 -t recover-git-history-secrets
82bc46d
recover-git-reflog-commits
harbor run -d terminal-bench-pro@1.0 -t recover-git-reflog-commits
82bc46d
recover-multilayer-openssl-backup
harbor run -d terminal-bench-pro@1.0 -t recover-multilayer-openssl-backup
82bc46d
recover-prod-db-password-from-git-history
harbor run -d terminal-bench-pro@1.0 -t recover-prod-db-password-from-git-history
82bc46d
recover-stream-cipher-key
harbor run -d terminal-bench-pro@1.0 -t recover-stream-cipher-key
82bc46d
regex-bitcoin-p2pkh-extraction
harbor run -d terminal-bench-pro@1.0 -t regex-bitcoin-p2pkh-extraction
82bc46d
repair-broken-shell-data-pipeline
harbor run -d terminal-bench-pro@1.0 -t repair-broken-shell-data-pipeline
82bc46d
reproducible-latex-pdf-build-script
harbor run -d terminal-bench-pro@1.0 -t reproducible-latex-pdf-build-script
82bc46d
resolve-react-peer-deps
harbor run -d terminal-bench-pro@1.0 -t resolve-react-peer-deps
82bc46d
restore-broken-pip-installation
harbor run -d terminal-bench-pro@1.0 -t restore-broken-pip-installation
82bc46d
retrieve-vault-root-token
harbor run -d terminal-bench-pro@1.0 -t retrieve-vault-root-token
82bc46d
reverse-engineer-kvstore-binary-protocol
harbor run -d terminal-bench-pro@1.0 -t reverse-engineer-kvstore-binary-protocol
82bc46d
reverse-engineer-physics-simulator
harbor run -d terminal-bench-pro@1.0 -t reverse-engineer-physics-simulator
82bc46d
reverse-go-stream-cipher-encryption
harbor run -d terminal-bench-pro@1.0 -t reverse-go-stream-cipher-encryption
82bc46d
rush-hour-bfs-solver
harbor run -d terminal-bench-pro@1.0 -t rush-hour-bfs-solver
82bc46d
sanitize-dclm-repo-secrets
harbor run -d terminal-bench-pro@1.0 -t sanitize-dclm-repo-secrets
82bc46d
sanitize-jinja2-ssti-templates
harbor run -d terminal-bench-pro@1.0 -t sanitize-jinja2-ssti-templates
82bc46d
schedule-multi-team-kickoff-meeting
harbor run -d terminal-bench-pro@1.0 -t schedule-multi-team-kickoff-meeting
82bc46d
secure-db-migration-and-wipe
harbor run -d terminal-bench-pro@1.0 -t secure-db-migration-and-wipe
82bc46d
secure-django-runserver-input-validation
harbor run -d terminal-bench-pro@1.0 -t secure-django-runserver-input-validation
82bc46d
secure-model-pipeline-deployment
harbor run -d terminal-bench-pro@1.0 -t secure-model-pipeline-deployment
82bc46d
secure-postgres-schema-migration
harbor run -d terminal-bench-pro@1.0 -t secure-postgres-schema-migration
82bc46d
select-best-english-embedding-model
harbor run -d terminal-bench-pro@1.0 -t select-best-english-embedding-model
82bc46d
setup-mlflow-sqlite-server
harbor run -d terminal-bench-pro@1.0 -t setup-mlflow-sqlite-server
82bc46d
setup-ubuntu-vm-ssh-key-auth
harbor run -d terminal-bench-pro@1.0 -t setup-ubuntu-vm-ssh-key-auth
82bc46d
simulate-2d-sampling-with-acceptance-stats
harbor run -d terminal-bench-pro@1.0 -t simulate-2d-sampling-with-acceptance-stats
82bc46d
sokoban-bfs-move-with-fallback
harbor run -d terminal-bench-pro@1.0 -t sokoban-bfs-move-with-fallback
82bc46d
solve-chemical-equilibrium-system
harbor run -d terminal-bench-pro@1.0 -t solve-chemical-equilibrium-system
82bc46d
solve-chess-mate-in-two
harbor run -d terminal-bench-pro@1.0 -t solve-chess-mate-in-two
82bc46d
solve-colossal-cave-350-score
harbor run -d terminal-bench-pro@1.0 -t solve-colossal-cave-350-score
82bc46d
solve-escape-room-puzzle-server
harbor run -d terminal-bench-pro@1.0 -t solve-escape-room-puzzle-server
82bc46d
solve-ode-with-sympy
harbor run -d terminal-bench-pro@1.0 -t solve-ode-with-sympy
82bc46d
solve-train-shunting-puzzle
harbor run -d terminal-bench-pro@1.0 -t solve-train-shunting-puzzle
82bc46d
sparql-asian-senior-researchers
harbor run -d terminal-bench-pro@1.0 -t sparql-asian-senior-researchers
82bc46d
stabilize-neural-network-training
harbor run -d terminal-bench-pro@1.0 -t stabilize-neural-network-training
82bc46d
summarize-api-log-status-metrics
harbor run -d terminal-bench-pro@1.0 -t summarize-api-log-status-metrics
82bc46d
symbolic-ode-solution-chemical-reaction
harbor run -d terminal-bench-pro@1.0 -t symbolic-ode-solution-chemical-reaction
82bc46d
synthesize-harmonic-wav-in-c
harbor run -d terminal-bench-pro@1.0 -t synthesize-harmonic-wav-in-c
82bc46d
tabular-q-learning-mountaincar-agent
harbor run -d terminal-bench-pro@1.0 -t tabular-q-learning-mountaincar-agent
82bc46d
text-image-ocr-pipeline
harbor run -d terminal-bench-pro@1.0 -t text-image-ocr-pipeline
82bc46d
train-disruption-model-with-hash-chain
harbor run -d terminal-bench-pro@1.0 -t train-disruption-model-with-hash-chain
82bc46d
train-fasttext-style-subword-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-fasttext-style-subword-embeddings
82bc46d
train-fraud-detection-model
harbor run -d terminal-bench-pro@1.0 -t train-fraud-detection-model
82bc46d
train-loan-default-logreg-model
harbor run -d terminal-bench-pro@1.0 -t train-loan-default-logreg-model
82bc46d
train-matrix-factorization-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-matrix-factorization-embeddings
82bc46d
train-python-code-skipgram-embeddings
harbor run -d terminal-bench-pro@1.0 -t train-python-code-skipgram-embeddings
82bc46d
train-sarsa-taxi-agent
harbor run -d terminal-bench-pro@1.0 -t train-sarsa-taxi-agent
82bc46d
validate-and-solve-sudoku
harbor run -d terminal-bench-pro@1.0 -t validate-and-solve-sudoku
82bc46d
xrd-two-peak-fitting
harbor run -d terminal-bench-pro@1.0 -t xrd-two-peak-fitting
82bc46d