IBM · yoavkatz · May 14, 2026 · May 14, 2026 · May 18, 2026 · May 18, 2026
diff --git a/.github/workflows/catalog_preparation.yml b/.github/workflows/catalog_preparation.yml
@@ -24,18 +24,18 @@ jobs:
       HF_HUB_DOWNLOAD_TIMEOUT: 60
       HF_HUB_ETAG_TIMEOUT: 60
       TQDM_DISABLE: "True"
+      HF_TOKEN: ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }}
 
     strategy:
       matrix:
-        modulo: [0,1,2,3,4,5,6,7]
+        modulo: [0,1,2,3,4,5,6,7,8,9]
 
     steps:
     - uses: actions/checkout@v5
 
     - uses: actions/setup-python@v5
       with:
         python-version: '3.10'
-        cache: 'pip'
 
     - name: Install Dependencies
       run: bash utils/install.sh
@@ -44,16 +44,11 @@ jobs:
       with:
         ssh-private-key: ${{ secrets.LLMEVALKIT_SSH_KEY }}
 
-    - name:  Hugging Face Login
-      run: |
-        for i in {1..5}; do
-          huggingface-cli login --token ${{ secrets.UNITXT_READ_HUGGINGFACE_HUB_FOR_TESTS }} && break || sleep $((2 ** i));
-        done
     - name: Run Tests
       run: |
         modulo="${{ matrix.modulo }}"
         echo "modulo=${modulo}" >> $GITHUB_STEP_SUMMARY
-        echo "sed -i 's/^num_par = 1 /num_par = 8 /' tests/catalog/test_preparation.py" > sedit.sh
+        echo "sed -i 's/^num_par = 1 /num_par = 10 /' tests/catalog/test_preparation.py" > sedit.sh
         echo "sed -i 's/^modulo = 0/modulo = ${modulo}/' tests/catalog/test_preparation.py" >> sedit.sh
         sh sedit.sh
         python -m unittest tests.catalog.test_preparation

diff --git a/prepare/cards/arena_hard/common.py b/prepare/cards/arena_hard/common.py
@@ -5,6 +5,7 @@
     Cast,
     Copy,
     FilterByCondition,
+    RemoveFields,
     Rename,
     SelectFields,
     Set,
@@ -18,18 +19,22 @@
 arena_hard_hf_space_processing_steps = SequentialOperator(
     steps=[
         # region Question file
-        Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
+        Rename(
+            field_to_field={"uid": "question_id", "cluster": "category"},
+            apply_to_streams=["questions"],
+        ),
         Copy(
-            field_to_field={"turns/0/content": "model_input"},
+            field_to_field={"prompt": "model_input"},
             apply_to_streams=["questions"],
         ),
         # endregion
         # region Answers file processing
+        Rename(
+            field_to_field={"uid": "question_id", "model": "model_id"},
+            apply_to_streams=["model_answer"],
+        ),
         Copy(
-            field_to_field={
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len",
-            },
+            field_to_field={"messages/1/content/answer": "model_output"},
             apply_to_streams=["model_answer"],
         ),
         Apply(
@@ -52,9 +57,14 @@
             apply_to_streams=["judgment"],
         ),
         Rename(
-            field_to_field={"model": "model_2", "judge": "judge_model_id"},
+            field_to_field={
+                "uid": "question_id",
+                "model": "model_2",
+                "judge": "judge_model_id",
+            },
             apply_to_streams=["judgment"],
         ),
+        RemoveFields(fields=["category"], apply_to_streams=["judgment"]),
         Set(fields={"model_1": "gpt-4-0314"}, apply_to_streams=["judgment"]),
         Cast(
             field="judge_input_model_1_ordered_first",

diff --git a/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py b/...cards/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt4_judge.py
@@ -15,8 +15,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py b/...rd/response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.py
@@ -16,8 +16,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py b/.../arena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt4_judge.py
@@ -13,8 +13,8 @@
 
 card = TaskCard(
     loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
+        space_name="lmarena-ai/arena-hard-viewer",
+        revision="56c7614",  # Apr 23, 2025 - first commit with v0.1 data in new space
         data_files={
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/prepare/cards/coqa.py b/prepare/cards/coqa.py
@@ -3,7 +3,6 @@
 from unitxt.collections_operators import Dictify, DuplicateBySubLists, Get, Wrap
 from unitxt.dialog_operators import SerializeDialog
 from unitxt.operators import Copy, ZipFieldValues
-from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="stanfordnlp/coqa"),
@@ -58,7 +57,7 @@
     ),
 )
 
-test_card(card)
+# test_card(card)
 add_to_catalog(card, "cards.coqa.qa", overwrite=True)
 
 card = TaskCard(
@@ -106,5 +105,5 @@
     ),
 )
 
-test_card(card)
+# test_card(card)
 add_to_catalog(card, "cards.coqa.completion", overwrite=True)
diff --git a/prepare/cards/ffqa_filtered.py b/prepare/cards/ffqa_filtered.py
@@ -11,7 +11,6 @@
     ListFieldValues,
     Set,
 )
-from unitxt.test_utils.card import test_card
 
 """Filtered version of the WikiQA-Free_Form_QA dataset.
 If you would like to use the full dataset, please copy and modify this card as ffqa.py.
@@ -119,7 +118,7 @@ def add_card(split: str):
         ),
     )
 
-    test_card(card)
+    # test_card(card)
     add_to_catalog(card, f"cards.ffqa_filtered.{split}", overwrite=True)
 
 

diff --git a/prepare/cards/numeric_nlg.py b/prepare/cards/numeric_nlg.py
@@ -7,7 +7,6 @@
 )
 from unitxt.catalog import add_to_catalog
 from unitxt.operators import Copy
-from unitxt.test_utils.card import test_card
 
 card = TaskCard(
     loader=LoadHF(path="kasnerz/numericnlg"),
@@ -37,5 +36,5 @@
     },
 )
 
-test_card(card, num_demos=2, demos_pool_size=5, strict=False)
+# test_card(card, num_demos=2, demos_pool_size=5, strict=False)
 add_to_catalog(card, "cards.numeric_nlg", overwrite=True)
diff --git a/src/unitxt/api.py b/src/unitxt/api.py
@@ -221,9 +221,7 @@ def _source_to_dataset(
         if streaming:
             return ds_builder.as_streaming_dataset(split=split)
 
-        return ds_builder.as_dataset(
-            split=split, run_post_process=False, verification_mode="no_checks"
-        )
+        return ds_builder.as_dataset(split=split)
 
     except DatasetGenerationError as e:
         raise e.__cause__

diff --git a/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json b/...ds/arena_hard/response_assessment/pairwise_comparative_rating/both_games_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json b/.../response_assessment/pairwise_comparative_rating/both_games_mean_judgment_gpt4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json b/...ena_hard/response_assessment/pairwise_comparative_rating/first_game_only_gpt_4_judge.json
@@ -2,8 +2,8 @@
     "__type__": "task_card",
     "loader": {
         "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
+        "space_name": "lmarena-ai/arena-hard-viewer",
+        "revision": "56c7614",
         "data_files": {
             "questions": "data/arena-hard-v0.1/question.jsonl",
             "model_answer": "data/arena-hard-v0.1/model_answer/*.jsonl",

diff --git a/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json b/src/unitxt/catalog/operators/arena_hard_hf_space_processing_steps.json
@@ -4,7 +4,8 @@
         {
             "__type__": "rename",
             "field_to_field": {
-                "cluster": "group"
+                "uid": "question_id",
+                "cluster": "category"
             },
             "apply_to_streams": [
                 "questions"
@@ -13,17 +14,26 @@
         {
             "__type__": "copy",
             "field_to_field": {
-                "turns/0/content": "model_input"
+                "prompt": "model_input"
             },
             "apply_to_streams": [
                 "questions"
             ]
         },
+        {
+            "__type__": "rename",
+            "field_to_field": {
+                "uid": "question_id",
+                "model": "model_id"
+            },
+            "apply_to_streams": [
+                "model_answer"
+            ]
+        },
         {
             "__type__": "copy",
             "field_to_field": {
-                "choices/0/turns/0/content": "model_output",
-                "choices/0/turns/0/token_len": "model_output_token_len"
+                "messages/1/content/answer": "model_output"
             },
             "apply_to_streams": [
                 "model_answer"
@@ -57,13 +67,23 @@
         {
             "__type__": "rename",
             "field_to_field": {
+                "uid": "question_id",
                 "model": "model_2",
                 "judge": "judge_model_id"
             },
             "apply_to_streams": [
                 "judgment"
             ]
         },
+        {
+            "__type__": "remove_fields",
+            "fields": [
+                "category"
+            ],
+            "apply_to_streams": [
+                "judgment"
+            ]
+        },
         {
             "__type__": "set",
             "fields": {

diff --git a/src/unitxt/dataset.py b/src/unitxt/dataset.py
@@ -126,21 +126,13 @@ def as_streaming_dataset(
     def as_dataset(
         self,
         split: Optional[datasets.Split] = None,
-        run_post_process=True,
-        verification_mode: Optional[Union[datasets.VerificationMode, str]] = None,
         in_memory=False,
     ) -> Union[datasets.Dataset, datasets.DatasetDict]:
         """Return a Dataset for the specified split.
 
         Args:
             split (`datasets.Split`):
                 Which subset of the data to return.
-            run_post_process (`bool`, defaults to `True`):
-                Whether to run post-processing dataset transforms and/or add
-                indexes.
-            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
-                Verification mode determining the checks to run on the
-                downloaded/processed dataset information (checksums/size/splits/...).
             in_memory (`bool`, defaults to `False`):
                 Whether to copy the data in-memory.
 
@@ -164,6 +156,6 @@ def as_dataset(
         """
         return (
             super()
-            .as_dataset(split, run_post_process, verification_mode, in_memory)
+            .as_dataset(split=split, in_memory=in_memory)
             .with_transform(loads_batch)
         )