terminal-bench-pro

v1.0

Terminal-Bench Pro (Public Set) is an extended benchmark dataset for testing AI agents in real terminal environments. It evaluates how well agents can autonomously handle real-world, end-to-end tasks, from compiling code to training models and setting up servers.

harbor run -d terminal-bench-pro@1.0

Tasks (200)

advanced-json-to-rfc4180-csv-converter
harbor run -d terminal-bench-pro@1.0 -t advanced-json-to-rfc4180-csv-converter
82bc46d
advanced-poker-hand-classifier
harbor run -d terminal-bench-pro@1.0 -t advanced-poker-hand-classifier
82bc46d
analyze-and-run-encoded-payload
harbor run -d terminal-bench-pro@1.0 -t analyze-and-run-encoded-payload
82bc46d
analyze-arm-shellcode-network-connections
harbor run -d terminal-bench-pro@1.0 -t analyze-arm-shellcode-network-connections
82bc46d
analyze-fen-with-stockfish
harbor run -d terminal-bench-pro@1.0 -t analyze-fen-with-stockfish
82bc46d
analyze-pgn-best-chess-move
harbor run -d terminal-bench-pro@1.0 -t analyze-pgn-best-chess-move
82bc46d
apache-log-security-analyzer
harbor run -d terminal-bench-pro@1.0 -t apache-log-security-analyzer
82bc46d
automate-blind-graph-mapping
harbor run -d terminal-bench-pro@1.0 -t automate-blind-graph-mapping
82bc46d
bash-ddos-traffic-analyzer
harbor run -d terminal-bench-pro@1.0 -t bash-ddos-traffic-analyzer
82bc46d
bash-tree-diff-sync
harbor run -d terminal-bench-pro@1.0 -t bash-tree-diff-sync
82bc46d
benchmark-gcc-opt-levels
harbor run -d terminal-bench-pro@1.0 -t benchmark-gcc-opt-levels
82bc46d
boot-debian-qemu-with-ssh-check
harbor run -d terminal-bench-pro@1.0 -t boot-debian-qemu-with-ssh-check
82bc46d
build-arm64-qemu-linux-with-custom-message
harbor run -d terminal-bench-pro@1.0 -t build-arm64-qemu-linux-with-custom-message
82bc46d
build-coq-from-source
harbor run -d terminal-bench-pro@1.0 -t build-coq-from-source
82bc46d
build-graphicsmagick-1-3-45
harbor run -d terminal-bench-pro@1.0 -t build-graphicsmagick-1-3-45
82bc46d
build-grpc-user-profile-service
harbor run -d terminal-bench-pro@1.0 -t build-grpc-user-profile-service
82bc46d
build-nginx-1-24-production-server
harbor run -d terminal-bench-pro@1.0 -t build-nginx-1-24-production-server
82bc46d
build-prime-factorization-http-api
harbor run -d terminal-bench-pro@1.0 -t build-prime-factorization-http-api
82bc46d
build-python-sokoban-solver
harbor run -d terminal-bench-pro@1.0 -t build-python-sokoban-solver
82bc46d
build-qemu-arm-user-emulator
harbor run -d terminal-bench-pro@1.0 -t build-qemu-arm-user-emulator
82bc46d
build-uci-chess-engine-to-beat-multiple-bots
harbor run -d terminal-bench-pro@1.0 -t build-uci-chess-engine-to-beat-multiple-bots
82bc46d
c-audio-waveform-reconstruction
harbor run -d terminal-bench-pro@1.0 -t c-audio-waveform-reconstruction
82bc46d
cache-bert-model-offline
harbor run -d terminal-bench-pro@1.0 -t cache-bert-model-offline
82bc46d
capture-iot-firmware-boot-message
harbor run -d terminal-bench-pro@1.0 -t capture-iot-firmware-boot-message
82bc46d
cmake-build-for-cpp-console-app
harbor run -d terminal-bench-pro@1.0 -t cmake-build-for-cpp-console-app
82bc46d
compare-cmake-build-types-performance
harbor run -d terminal-bench-pro@1.0 -t compare-cmake-build-types-performance
82bc46d
compare-lasso-ridge-elasticnet
harbor run -d terminal-bench-pro@1.0 -t compare-lasso-ridge-elasticnet
82bc46d
compare-lasso-ridge-gene-expression
harbor run -d terminal-bench-pro@1.0 -t compare-lasso-ridge-gene-expression
82bc46d
compile-postgresql-with-sanitizers
harbor run -d terminal-bench-pro@1.0 -t compile-postgresql-with-sanitizers
82bc46d
complete-sql-tutorial-game
harbor run -d terminal-bench-pro@1.0 -t complete-sql-tutorial-game
82bc46d
compute-best-chess-move-san
harbor run -d terminal-bench-pro@1.0 -t compute-best-chess-move-san
82bc46d
compute-image-hash-chain
harbor run -d terminal-bench-pro@1.0 -t compute-image-hash-chain
82bc46d
compute-symbolic-eigenpairs-3x3-matrix
harbor run -d terminal-bench-pro@1.0 -t compute-symbolic-eigenpairs-3x3-matrix
82bc46d
configure-apache-analytics-virtualhost
harbor run -d terminal-bench-pro@1.0 -t configure-apache-analytics-virtualhost
82bc46d
configure-apache-logging-and-rate-limit
harbor run -d terminal-bench-pro@1.0 -t configure-apache-logging-and-rate-limit
82bc46d
configure-localhost-ssh-key-login
harbor run -d terminal-bench-pro@1.0 -t configure-localhost-ssh-key-login
82bc46d
consolidate-valid-prod-credentials
harbor run -d terminal-bench-pro@1.0 -t consolidate-valid-prod-credentials
82bc46d
constrained-portfolio-variance-minimization
harbor run -d terminal-bench-pro@1.0 -t constrained-portfolio-variance-minimization
82bc46d
convert-jags-hierarchical-model-to-pymc
harbor run -d terminal-bench-pro@1.0 -t convert-jags-hierarchical-model-to-pymc
82bc46d
convert-matlab-tiff-loader
harbor run -d terminal-bench-pro@1.0 -t convert-matlab-tiff-loader
82bc46d
convert-r-model-script-to-python
harbor run -d terminal-bench-pro@1.0 -t convert-r-model-script-to-python
82bc46d
count-claude-tokens-medical-papers
harbor run -d terminal-bench-pro@1.0 -t count-claude-tokens-medical-papers
82bc46d
count-unique-person-names-conll2003
harbor run -d terminal-bench-pro@1.0 -t count-unique-person-names-conll2003
82bc46d
crack-pdf-password-and-extract-author
harbor run -d terminal-bench-pro@1.0 -t crack-pdf-password-and-extract-author
82bc46d
craft-binary-message-file
harbor run -d terminal-bench-pro@1.0 -t craft-binary-message-file
82bc46d
create-valid-message-enc-file
harbor run -d terminal-bench-pro@1.0 -t create-valid-message-enc-file
82bc46d
cube-grid-cross-expansion
harbor run -d terminal-bench-pro@1.0 -t cube-grid-cross-expansion
82bc46d
debug-bst-segfault-with-gdb
harbor run -d terminal-bench-pro@1.0 -t debug-bst-segfault-with-gdb
82bc46d
decode-go-ctf-credentials
harbor run -d terminal-bench-pro@1.0 -t decode-go-ctf-credentials
82bc46d
decode-multilayer-xor-payload
harbor run -d terminal-bench-pro@1.0 -t decode-multilayer-xor-payload
82bc46d
decode-stl-heightmap-message
harbor run -d terminal-bench-pro@1.0 -t decode-stl-heightmap-message
82bc46d
decrypt-and-restore-backup-fragments
harbor run -d terminal-bench-pro@1.0 -t decrypt-and-restore-backup-fragments
82bc46d
decrypt-mia-agents-rendezvous-point
harbor run -d terminal-bench-pro@1.0 -t decrypt-mia-agents-rendezvous-point
82bc46d
detect-and-mitigate-ssti
harbor run -d terminal-bench-pro@1.0 -t detect-and-mitigate-ssti
82bc46d
detect-c-feature-flags
harbor run -d terminal-bench-pro@1.0 -t detect-c-feature-flags
82bc46d
detect-corrupted-blockchain-transaction
harbor run -d terminal-bench-pro@1.0 -t detect-corrupted-blockchain-transaction
82bc46d
diagnose-and-repair-broken-pip-installation
harbor run -d terminal-bench-pro@1.0 -t diagnose-and-repair-broken-pip-installation
82bc46d
dual-bash-python-config-parser
harbor run -d terminal-bench-pro@1.0 -t dual-bash-python-config-parser
82bc46d
email-and-timestamp-regex
harbor run -d terminal-bench-pro@1.0 -t email-and-timestamp-regex
82bc46d
enumerate-polytope-extreme-points
harbor run -d terminal-bench-pro@1.0 -t enumerate-polytope-extreme-points
82bc46d
extract-binary-symbol-table
harbor run -d terminal-bench-pro@1.0 -t extract-binary-symbol-table
82bc46d
extract-paper-metadata-to-json
harbor run -d terminal-bench-pro@1.0 -t extract-paper-metadata-to-json
82bc46d
find-invalid-blockchain-transactions
harbor run -d terminal-bench-pro@1.0 -t find-invalid-blockchain-transactions
82bc46d
fix-color-lib-hash-and-pickle
harbor run -d terminal-bench-pro@1.0 -t fix-color-lib-hash-and-pickle
82bc46d
fix-django-command-injection
harbor run -d terminal-bench-pro@1.0 -t fix-django-command-injection
82bc46d
fix-docker-python-dependency-conflicts
harbor run -d terminal-bench-pro@1.0 -t fix-docker-python-dependency-conflicts
82bc46d
fix-fasttext-import-on-python3-13
harbor run -d terminal-bench-pro@1.0 -t fix-fasttext-import-on-python3-13
82bc46d
fix-game-server-turn-race-condition
harbor run -d terminal-bench-pro@1.0 -t fix-game-server-turn-race-condition
82bc46d
fix-gomoku-win-detection
harbor run -d terminal-bench-pro@1.0 -t fix-gomoku-win-detection
82bc46d
fix-jni-unsatisfiedlinkerror-path
harbor run -d terminal-bench-pro@1.0 -t fix-jni-unsatisfiedlinkerror-path
82bc46d
fix-multihead-attention-weights-shape
harbor run -d terminal-bench-pro@1.0 -t fix-multihead-attention-weights-shape
82bc46d
fix-nameerrors-using-aliases-mapping
harbor run -d terminal-bench-pro@1.0 -t fix-nameerrors-using-aliases-mapping
82bc46d
fix-nested-transform-independence-matrix
harbor run -d terminal-bench-pro@1.0 -t fix-nested-transform-independence-matrix
82bc46d
fix-neural-net-weight-init
harbor run -d terminal-bench-pro@1.0 -t fix-neural-net-weight-init
82bc46d
fix-nn-weight-initialization
harbor run -d terminal-bench-pro@1.0 -t fix-nn-weight-initialization
82bc46d
fix-numpy-einsum-optimize-compatibility
harbor run -d terminal-bench-pro@1.0 -t fix-numpy-einsum-optimize-compatibility
82bc46d
fix-release-build-memory-corruption
harbor run -d terminal-bench-pro@1.0 -t fix-release-build-memory-corruption
82bc46d
fix-sentiment-cli-text-processing
harbor run -d terminal-bench-pro@1.0 -t fix-sentiment-cli-text-processing
82bc46d
fix-web-app-security-vulnerability
harbor run -d terminal-bench-pro@1.0 -t fix-web-app-security-vulnerability
82bc46d
fix-xor-neural-network-instability
harbor run -d terminal-bench-pro@1.0 -t fix-xor-neural-network-instability
82bc46d
flood-escape-path-planner
harbor run -d terminal-bench-pro@1.0 -t flood-escape-path-planner
82bc46d
fluorescence-peak-fitting-pipeline
harbor run -d terminal-bench-pro@1.0 -t fluorescence-peak-fitting-pipeline
82bc46d
generate-llvm-ir-with-dwarf-debug
harbor run -d terminal-bench-pro@1.0 -t generate-llvm-ir-with-dwarf-debug
82bc46d
go-board-legal-move-capture-analysis
harbor run -d terminal-bench-pro@1.0 -t go-board-legal-move-capture-analysis
82bc46d
go-board-move-recommendation
harbor run -d terminal-bench-pro@1.0 -t go-board-move-recommendation
82bc46d
implement-2d-convolution
harbor run -d terminal-bench-pro@1.0 -t implement-2d-convolution
82bc46d
implement-bivariate-gibbs-sampler
harbor run -d terminal-bench-pro@1.0 -t implement-bivariate-gibbs-sampler
82bc46d
implement-chemical-equilibrium-solver
harbor run -d terminal-bench-pro@1.0 -t implement-chemical-equilibrium-solver
82bc46d
implement-connect-four-mcts
harbor run -d terminal-bench-pro@1.0 -t implement-connect-four-mcts
82bc46d
implement-crc32-with-logic-gates
harbor run -d terminal-bench-pro@1.0 -t implement-crc32-with-logic-gates
82bc46d
implement-depgraph-dependency-resolver
harbor run -d terminal-bench-pro@1.0 -t implement-depgraph-dependency-resolver
82bc46d
implement-gmm-em-cli
harbor run -d terminal-bench-pro@1.0 -t implement-gmm-em-cli
82bc46d
implement-go-board-analyzer
harbor run -d terminal-bench-pro@1.0 -t implement-go-board-analyzer
82bc46d
implement-hmc-sampler-with-jax
harbor run -d terminal-bench-pro@1.0 -t implement-hmc-sampler-with-jax
82bc46d
implement-idcard-flask-api
harbor run -d terminal-bench-pro@1.0 -t implement-idcard-flask-api
82bc46d
implement-lz77-file-compressor
harbor run -d terminal-bench-pro@1.0 -t implement-lz77-file-compressor
82bc46d
implement-mitm-attack-for-24bit-double-cipher
harbor run -d terminal-bench-pro@1.0 -t implement-mitm-attack-for-24bit-double-cipher
82bc46d
implement-nonogram-puzzle-solver
harbor run -d terminal-bench-pro@1.0 -t implement-nonogram-puzzle-solver
82bc46d
implement-nonogram-solver
harbor run -d terminal-bench-pro@1.0 -t implement-nonogram-solver
82bc46d
implement-portfolio-optimization-engine
harbor run -d terminal-bench-pro@1.0 -t implement-portfolio-optimization-engine
82bc46d