terminal-bench-pro
v1.0
Terminal-Bench Pro (Public Set) is an extended benchmark dataset for testing AI agents in real terminal environments. From compiling code to training models and setting up servers, Terminal-Bench Pro evaluates how well agents can handle real-world, end-to-end tasks autonomously.
harbor run -d terminal-bench-pro@1.0
Tasks (200)
advanced-json-to-rfc4180-csv-converter
harbor run -d terminal-bench-pro@1.0 -t advanced-json-to-rfc4180-csv-converter  # 82bc46d
advanced-poker-hand-classifier
harbor run -d terminal-bench-pro@1.0 -t advanced-poker-hand-classifier  # 82bc46d
analyze-and-run-encoded-payload
harbor run -d terminal-bench-pro@1.0 -t analyze-and-run-encoded-payload  # 82bc46d
analyze-arm-shellcode-network-connections
harbor run -d terminal-bench-pro@1.0 -t analyze-arm-shellcode-network-connections  # 82bc46d
analyze-fen-with-stockfish
harbor run -d terminal-bench-pro@1.0 -t analyze-fen-with-stockfish  # 82bc46d
analyze-pgn-best-chess-move
harbor run -d terminal-bench-pro@1.0 -t analyze-pgn-best-chess-move  # 82bc46d
apache-log-security-analyzer
harbor run -d terminal-bench-pro@1.0 -t apache-log-security-analyzer  # 82bc46d
automate-blind-graph-mapping
harbor run -d terminal-bench-pro@1.0 -t automate-blind-graph-mapping  # 82bc46d
bash-ddos-traffic-analyzer
harbor run -d terminal-bench-pro@1.0 -t bash-ddos-traffic-analyzer  # 82bc46d
bash-tree-diff-sync
harbor run -d terminal-bench-pro@1.0 -t bash-tree-diff-sync  # 82bc46d
benchmark-gcc-opt-levels
harbor run -d terminal-bench-pro@1.0 -t benchmark-gcc-opt-levels  # 82bc46d
boot-debian-qemu-with-ssh-check
harbor run -d terminal-bench-pro@1.0 -t boot-debian-qemu-with-ssh-check  # 82bc46d
build-arm64-qemu-linux-with-custom-message
harbor run -d terminal-bench-pro@1.0 -t build-arm64-qemu-linux-with-custom-message  # 82bc46d
build-coq-from-source
harbor run -d terminal-bench-pro@1.0 -t build-coq-from-source  # 82bc46d
build-graphicsmagick-1-3-45
harbor run -d terminal-bench-pro@1.0 -t build-graphicsmagick-1-3-45  # 82bc46d
build-grpc-user-profile-service
harbor run -d terminal-bench-pro@1.0 -t build-grpc-user-profile-service  # 82bc46d
build-nginx-1-24-production-server
harbor run -d terminal-bench-pro@1.0 -t build-nginx-1-24-production-server  # 82bc46d
build-prime-factorization-http-api
harbor run -d terminal-bench-pro@1.0 -t build-prime-factorization-http-api  # 82bc46d
build-python-sokoban-solver
harbor run -d terminal-bench-pro@1.0 -t build-python-sokoban-solver  # 82bc46d
build-qemu-arm-user-emulator
harbor run -d terminal-bench-pro@1.0 -t build-qemu-arm-user-emulator  # 82bc46d
build-uci-chess-engine-to-beat-multiple-bots
harbor run -d terminal-bench-pro@1.0 -t build-uci-chess-engine-to-beat-multiple-bots  # 82bc46d
c-audio-waveform-reconstruction
harbor run -d terminal-bench-pro@1.0 -t c-audio-waveform-reconstruction  # 82bc46d
cache-bert-model-offline
harbor run -d terminal-bench-pro@1.0 -t cache-bert-model-offline  # 82bc46d
capture-iot-firmware-boot-message
harbor run -d terminal-bench-pro@1.0 -t capture-iot-firmware-boot-message  # 82bc46d
cmake-build-for-cpp-console-app
harbor run -d terminal-bench-pro@1.0 -t cmake-build-for-cpp-console-app  # 82bc46d
compare-cmake-build-types-performance
harbor run -d terminal-bench-pro@1.0 -t compare-cmake-build-types-performance  # 82bc46d
compare-lasso-ridge-elasticnet
harbor run -d terminal-bench-pro@1.0 -t compare-lasso-ridge-elasticnet  # 82bc46d
compare-lasso-ridge-gene-expression
harbor run -d terminal-bench-pro@1.0 -t compare-lasso-ridge-gene-expression  # 82bc46d
compile-postgresql-with-sanitizers
harbor run -d terminal-bench-pro@1.0 -t compile-postgresql-with-sanitizers  # 82bc46d
complete-sql-tutorial-game
harbor run -d terminal-bench-pro@1.0 -t complete-sql-tutorial-game  # 82bc46d
compute-best-chess-move-san
harbor run -d terminal-bench-pro@1.0 -t compute-best-chess-move-san  # 82bc46d
compute-image-hash-chain
harbor run -d terminal-bench-pro@1.0 -t compute-image-hash-chain  # 82bc46d
compute-symbolic-eigenpairs-3x3-matrix
harbor run -d terminal-bench-pro@1.0 -t compute-symbolic-eigenpairs-3x3-matrix  # 82bc46d
configure-apache-analytics-virtualhost
harbor run -d terminal-bench-pro@1.0 -t configure-apache-analytics-virtualhost  # 82bc46d
configure-apache-logging-and-rate-limit
harbor run -d terminal-bench-pro@1.0 -t configure-apache-logging-and-rate-limit  # 82bc46d
configure-localhost-ssh-key-login
harbor run -d terminal-bench-pro@1.0 -t configure-localhost-ssh-key-login  # 82bc46d
consolidate-valid-prod-credentials
harbor run -d terminal-bench-pro@1.0 -t consolidate-valid-prod-credentials  # 82bc46d
constrained-portfolio-variance-minimization
harbor run -d terminal-bench-pro@1.0 -t constrained-portfolio-variance-minimization  # 82bc46d
convert-jags-hierarchical-model-to-pymc
harbor run -d terminal-bench-pro@1.0 -t convert-jags-hierarchical-model-to-pymc  # 82bc46d
convert-matlab-tiff-loader
harbor run -d terminal-bench-pro@1.0 -t convert-matlab-tiff-loader  # 82bc46d
convert-r-model-script-to-python
harbor run -d terminal-bench-pro@1.0 -t convert-r-model-script-to-python  # 82bc46d
count-claude-tokens-medical-papers
harbor run -d terminal-bench-pro@1.0 -t count-claude-tokens-medical-papers  # 82bc46d
count-unique-person-names-conll2003
harbor run -d terminal-bench-pro@1.0 -t count-unique-person-names-conll2003  # 82bc46d
crack-pdf-password-and-extract-author
harbor run -d terminal-bench-pro@1.0 -t crack-pdf-password-and-extract-author  # 82bc46d
craft-binary-message-file
harbor run -d terminal-bench-pro@1.0 -t craft-binary-message-file  # 82bc46d
create-valid-message-enc-file
harbor run -d terminal-bench-pro@1.0 -t create-valid-message-enc-file  # 82bc46d
cube-grid-cross-expansion
harbor run -d terminal-bench-pro@1.0 -t cube-grid-cross-expansion  # 82bc46d
debug-bst-segfault-with-gdb
harbor run -d terminal-bench-pro@1.0 -t debug-bst-segfault-with-gdb  # 82bc46d
decode-go-ctf-credentials
harbor run -d terminal-bench-pro@1.0 -t decode-go-ctf-credentials  # 82bc46d
decode-multilayer-xor-payload
harbor run -d terminal-bench-pro@1.0 -t decode-multilayer-xor-payload  # 82bc46d
decode-stl-heightmap-message
harbor run -d terminal-bench-pro@1.0 -t decode-stl-heightmap-message  # 82bc46d
decrypt-and-restore-backup-fragments
harbor run -d terminal-bench-pro@1.0 -t decrypt-and-restore-backup-fragments  # 82bc46d
decrypt-mia-agents-rendezvous-point
harbor run -d terminal-bench-pro@1.0 -t decrypt-mia-agents-rendezvous-point  # 82bc46d
detect-and-mitigate-ssti
harbor run -d terminal-bench-pro@1.0 -t detect-and-mitigate-ssti  # 82bc46d
detect-c-feature-flags
harbor run -d terminal-bench-pro@1.0 -t detect-c-feature-flags  # 82bc46d
detect-corrupted-blockchain-transaction
harbor run -d terminal-bench-pro@1.0 -t detect-corrupted-blockchain-transaction  # 82bc46d
diagnose-and-repair-broken-pip-installation
harbor run -d terminal-bench-pro@1.0 -t diagnose-and-repair-broken-pip-installation  # 82bc46d
dual-bash-python-config-parser
harbor run -d terminal-bench-pro@1.0 -t dual-bash-python-config-parser  # 82bc46d
email-and-timestamp-regex
harbor run -d terminal-bench-pro@1.0 -t email-and-timestamp-regex  # 82bc46d
enumerate-polytope-extreme-points
harbor run -d terminal-bench-pro@1.0 -t enumerate-polytope-extreme-points  # 82bc46d
extract-binary-symbol-table
harbor run -d terminal-bench-pro@1.0 -t extract-binary-symbol-table  # 82bc46d
extract-paper-metadata-to-json
harbor run -d terminal-bench-pro@1.0 -t extract-paper-metadata-to-json  # 82bc46d
find-invalid-blockchain-transactions
harbor run -d terminal-bench-pro@1.0 -t find-invalid-blockchain-transactions  # 82bc46d
fix-color-lib-hash-and-pickle
harbor run -d terminal-bench-pro@1.0 -t fix-color-lib-hash-and-pickle  # 82bc46d
fix-django-command-injection
harbor run -d terminal-bench-pro@1.0 -t fix-django-command-injection  # 82bc46d
fix-docker-python-dependency-conflicts
harbor run -d terminal-bench-pro@1.0 -t fix-docker-python-dependency-conflicts  # 82bc46d
fix-fasttext-import-on-python3-13
harbor run -d terminal-bench-pro@1.0 -t fix-fasttext-import-on-python3-13  # 82bc46d
fix-game-server-turn-race-condition
harbor run -d terminal-bench-pro@1.0 -t fix-game-server-turn-race-condition  # 82bc46d
fix-gomoku-win-detection
harbor run -d terminal-bench-pro@1.0 -t fix-gomoku-win-detection  # 82bc46d
fix-jni-unsatisfiedlinkerror-path
harbor run -d terminal-bench-pro@1.0 -t fix-jni-unsatisfiedlinkerror-path  # 82bc46d
fix-multihead-attention-weights-shape
harbor run -d terminal-bench-pro@1.0 -t fix-multihead-attention-weights-shape  # 82bc46d
fix-nameerrors-using-aliases-mapping
harbor run -d terminal-bench-pro@1.0 -t fix-nameerrors-using-aliases-mapping  # 82bc46d
fix-nested-transform-independence-matrix
harbor run -d terminal-bench-pro@1.0 -t fix-nested-transform-independence-matrix  # 82bc46d
fix-neural-net-weight-init
harbor run -d terminal-bench-pro@1.0 -t fix-neural-net-weight-init  # 82bc46d
fix-nn-weight-initialization
harbor run -d terminal-bench-pro@1.0 -t fix-nn-weight-initialization  # 82bc46d
fix-numpy-einsum-optimize-compatibility
harbor run -d terminal-bench-pro@1.0 -t fix-numpy-einsum-optimize-compatibility  # 82bc46d
fix-release-build-memory-corruption
harbor run -d terminal-bench-pro@1.0 -t fix-release-build-memory-corruption  # 82bc46d
fix-sentiment-cli-text-processing
harbor run -d terminal-bench-pro@1.0 -t fix-sentiment-cli-text-processing  # 82bc46d
fix-web-app-security-vulnerability
harbor run -d terminal-bench-pro@1.0 -t fix-web-app-security-vulnerability  # 82bc46d
fix-xor-neural-network-instability
harbor run -d terminal-bench-pro@1.0 -t fix-xor-neural-network-instability  # 82bc46d
flood-escape-path-planner
harbor run -d terminal-bench-pro@1.0 -t flood-escape-path-planner  # 82bc46d
fluorescence-peak-fitting-pipeline
harbor run -d terminal-bench-pro@1.0 -t fluorescence-peak-fitting-pipeline  # 82bc46d
generate-llvm-ir-with-dwarf-debug
harbor run -d terminal-bench-pro@1.0 -t generate-llvm-ir-with-dwarf-debug  # 82bc46d
go-board-legal-move-capture-analysis
harbor run -d terminal-bench-pro@1.0 -t go-board-legal-move-capture-analysis  # 82bc46d
go-board-move-recommendation
harbor run -d terminal-bench-pro@1.0 -t go-board-move-recommendation  # 82bc46d
implement-2d-convolution
harbor run -d terminal-bench-pro@1.0 -t implement-2d-convolution  # 82bc46d
implement-bivariate-gibbs-sampler
harbor run -d terminal-bench-pro@1.0 -t implement-bivariate-gibbs-sampler  # 82bc46d
implement-chemical-equilibrium-solver
harbor run -d terminal-bench-pro@1.0 -t implement-chemical-equilibrium-solver  # 82bc46d
implement-connect-four-mcts
harbor run -d terminal-bench-pro@1.0 -t implement-connect-four-mcts  # 82bc46d
implement-crc32-with-logic-gates
harbor run -d terminal-bench-pro@1.0 -t implement-crc32-with-logic-gates  # 82bc46d
implement-depgraph-dependency-resolver
harbor run -d terminal-bench-pro@1.0 -t implement-depgraph-dependency-resolver  # 82bc46d
implement-gmm-em-cli
harbor run -d terminal-bench-pro@1.0 -t implement-gmm-em-cli  # 82bc46d
implement-go-board-analyzer
harbor run -d terminal-bench-pro@1.0 -t implement-go-board-analyzer  # 82bc46d
implement-hmc-sampler-with-jax
harbor run -d terminal-bench-pro@1.0 -t implement-hmc-sampler-with-jax  # 82bc46d
implement-idcard-flask-api
harbor run -d terminal-bench-pro@1.0 -t implement-idcard-flask-api  # 82bc46d
implement-lz77-file-compressor
harbor run -d terminal-bench-pro@1.0 -t implement-lz77-file-compressor  # 82bc46d
implement-mitm-attack-for-24bit-double-cipher
harbor run -d terminal-bench-pro@1.0 -t implement-mitm-attack-for-24bit-double-cipher  # 82bc46d
implement-nonogram-puzzle-solver
harbor run -d terminal-bench-pro@1.0 -t implement-nonogram-puzzle-solver  # 82bc46d
implement-nonogram-solver
harbor run -d terminal-bench-pro@1.0 -t implement-nonogram-solver  # 82bc46d
implement-portfolio-optimization-engine
harbor run -d terminal-bench-pro@1.0 -t implement-portfolio-optimization-engine  # 82bc46d