# .github/workflows/update-benchmarks.yml (arimxyer/models, main branch)
# Periodically refreshes data/benchmarks.json and pushes it when it changed.
name: Update Benchmark Data

on:
  schedule:
    # Every 30 minutes
    - cron: "*/30 * * * *"
  # Manual runs from the Actions tab / API take no inputs.
  workflow_dispatch: {}

# Write access to repository contents is needed for the `git push` in the job.
permissions:
  contents: write

env:
  # NOTE(review): presumably the runner opt-in that makes JavaScript-based
  # actions run on Node 24 -- confirm against the runner release notes.
  # Quoted so the value is unambiguously the string "true".
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

jobs:
  # Single job: fetch the benchmark data, commit it if it changed, then purge
  # the CDN copy of the file.
  update-benchmarks:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so later steps can rewrite and commit
      # data/benchmarks.json.
      - uses: actions/checkout@v4

      # Downloads the model list from the Artificial Analysis API, flattens each
      # entry to the fields below, and writes the result to data/benchmarks.json.
      # The tracked file is only replaced after the download and the transform
      # both succeeded and produced a non-empty array.
      - name: Fetch benchmark data from Artificial Analysis API
        env:
          AA_API_KEY: ${{ secrets.AA_API_KEY }}
        run: |
          # Without an explicit `shell:` the step runs under `bash -e` with no
          # pipefail, so fail loudly on any error, unset variable or broken pipe
          # instead of silently producing an empty data file.
          set -euo pipefail

          # Work in a scratch directory so a failed run never truncates the
          # tracked file; clean it up on every exit path.
          work_dir="$(mktemp -d)"
          trap 'rm -rf "$work_dir"' EXIT
          raw="$work_dir/response.json"
          out="$work_dir/benchmarks.json"

          # -f turns HTTP errors into a non-zero exit; -S still prints the
          # reason even though -s hides the progress meter.
          curl -sSf -H "X-API-Key: $AA_API_KEY" \
            -o "$raw" \
            https://artificialanalysis.ai/api/v2/data/llms/models

          # Missing nested objects/values become null; a value of exactly 0 for
          # the three latency/throughput medians is also mapped to null.
          jq '[.data[] | {
            id: .id,
            name: .name,
            slug: .slug,
            creator: (.model_creator?.slug // null),
            creator_id: (.model_creator?.id // null),
            creator_name: (.model_creator?.name // null),
            release_date: .release_date,
            intelligence_index: (.evaluations?.artificial_analysis_intelligence_index // null),
            coding_index: (.evaluations?.artificial_analysis_coding_index // null),
            math_index: (.evaluations?.artificial_analysis_math_index // null),
            mmlu_pro: (.evaluations?.mmlu_pro // null),
            gpqa: (.evaluations?.gpqa // null),
            hle: (.evaluations?.hle // null),
            livecodebench: (.evaluations?.livecodebench // null),
            scicode: (.evaluations?.scicode // null),
            ifbench: (.evaluations?.ifbench // null),
            lcr: (.evaluations?.lcr // null),
            terminalbench_hard: (.evaluations?.terminalbench_hard // null),
            tau2: (.evaluations?.tau2 // null),
            math_500: (.evaluations?.math_500 // null),
            aime: (.evaluations?.aime // null),
            aime_25: (.evaluations?.aime_25 // null),
            output_tps: (.median_output_tokens_per_second | if . == 0 then null else . end),
            ttft: (.median_time_to_first_token_seconds | if . == 0 then null else . end),
            ttfat: (.median_time_to_first_answer_token | if . == 0 then null else . end),
            price_input: (.pricing?.price_1m_input_tokens // null),
            price_output: (.pricing?.price_1m_output_tokens // null),
            price_blended: (.pricing?.price_1m_blended_3_to_1 // null)
          }]' "$raw" > "$out"

          # jq exits 0 without output on an empty body, so verify the result
          # explicitly: it must be an array with at least one entry.
          if ! jq -e 'type == "array" and length > 0' "$out" > /dev/null; then
            echo "::error::Artificial Analysis API returned no usable entries; keeping existing data/benchmarks.json"
            exit 1
          fi

          mv "$out" data/benchmarks.json

          echo "Fetched $(jq length data/benchmarks.json) entries"

      # Commits and pushes data/benchmarks.json when its content differs from
      # HEAD. Sets the step output `changed` to 'true' or 'false' so later steps
      # can react to it.
      - name: Commit and push if changed
        id: commit
        run: |
          set -euo pipefail

          # Stage first and compare the index against HEAD: a plain
          # `git diff` ignores untracked files, so a newly created data file
          # would otherwise be reported as unchanged and never committed.
          git add data/benchmarks.json
          if git diff --cached --quiet -- data/benchmarks.json; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git commit -m "chore: update benchmark data from Artificial Analysis"
          git push
          echo "changed=true" >> "$GITHUB_OUTPUT"

      # Runs only when the commit step reported a pushed change. The purge is
      # best-effort: a failed request must not fail the workflow.
      - name: Purge jsDelivr cache
        if: ${{ steps.commit.outputs.changed == 'true' }}
        run: |
          purge_url="https://purge.jsdelivr.net/gh/arimxyer/models@main/data/benchmarks.json"
          curl -sf "$purge_url" || true