simula · stevenah · May 3, 2026 · Feb 1, 2026 · Feb 1, 2026 · May 2, 2026
diff --git a/.eslintrc.json b/.eslintrc.json
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
@@ -16,15 +16,17 @@ jobs:
                   known_hosts: 'just-a-placeholder-so-we-dont-get-errors'
             - name: Adding Known Hosts
               run: ssh-keyscan -H ${{ secrets.SSH_HOST }} >> ~/.ssh/known_hosts
-            - uses: actions/checkout@v2
-            - uses: actions/setup-node@v1
+            - uses: actions/checkout@v4
+            - uses: actions/setup-node@v4
               with:
-                  node-version: '12.x'
+                  node-version: '20.x'
             - name: Install NPM dependencies
-              run: npm install
+              run: npm ci  # lockfile-exact clean install, the same command test.yaml uses, so the deployed build gets the dependency versions CI tested
+            - name: Lint
+              run: npm run lint
+            - name: Run unit and integration tests
+              run: npm test
             - name: Build application
               run: npm run build
-            - name: Export application
-              run: npm run export
             - name: Deploy with rsync
               run: rsync -avz --no-perms --omit-dir-times --delete ./out/ ${{ secrets.SSH_USER }}@${{ secrets.SSH_HOST }}:${{ secrets.HOST_DEPLOY_PATH }} --exclude downloads --exclude keep --exclude ExE1Dlex4PyE78q9BXFv --exclude ExE1Dlex4PyE78q9BXkl --exclude sparcity --exclude toadstool2.0
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -0,0 +1,44 @@
+name: test  # CI checks: lint + unit/integration tests in one job, Playwright end-to-end tests in another
+
+on:
+    pull_request:
+    push:
+        branches-ignore: [main]  # pushes to main do not trigger this workflow; NOTE(review): presumably deploy.yaml covers main (it lints and runs npm test, but not the e2e suite) - confirm
+
+jobs:
+    unit:  # lint, then unit and integration tests
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-node@v4
+              with:
+                  node-version: '20.x'
+                  cache: npm  # let setup-node cache npm's downloads between runs
+            - name: Install dependencies
+              run: npm ci  # clean install from the lockfile
+            - name: Lint
+              run: npm run lint
+            - name: Run unit and integration tests
+              run: npm test
+
+    e2e:  # browser end-to-end tests; no `needs:`, so it does not wait for the unit job
+        runs-on: ubuntu-latest
+        steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-node@v4
+              with:
+                  node-version: '20.x'
+                  cache: npm
+            - name: Install dependencies
+              run: npm ci
+            - name: Install Playwright browsers
+              run: npx playwright install --with-deps chromium firefox webkit  # the three browsers plus their OS-level dependencies
+            - name: Run E2E tests
+              run: npm run test:e2e
+            - name: Upload Playwright report
+              if: failure()  # only runs when an earlier step in this job failed
+              uses: actions/upload-artifact@v4
+              with:
+                  name: playwright-report
+                  path: playwright-report/
+                  retention-days: 7  # artifact is kept for 7 days
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,9 @@
 /.build/
 /out/
 
+# build-time generated thumbnails (from scripts/optimize-thumbnails.mjs)
+/public/thumbnails/optimized/
+
 # production
 /build
 
@@ -33,3 +36,4 @@ yarn-error.log*
 
 # vercel
 .vercel
+test-results/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,25 @@
+# Contributing a dataset
+
+Datasets are added by pull request.
+
+1. **Fork** the repo and create a branch.
+2. **Create `datasets/<slug>.md`** with this frontmatter:
+
+    ```markdown
+    ---
+    title: '<dataset name>'
+    desc: '<one-sentence description>'
+    thumbnail: /thumbnails/<slug>.png
+    publication: <https://...>     # optional
+    github: <https://...>          # optional
+    tags:
+      - <tag from src/data/tags.js>
+    ---
+
+    <full markdown description here>
+    ```
+
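+3. **Add the thumbnail** to `public/thumbnails/` as `<slug>.png` or `<slug>.jpg` (16:9 — e.g. 640×360), and make sure the `thumbnail:` path in the frontmatter uses the same file name and extension.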
+4. **Pick tags from the curated list** in [`src/data/tags.js`](src/data/tags.js). The schema rejects anything not in that list. If your dataset truly needs a new tag, add an entry to `src/data/tags.js` in the same PR.
+5. **Run `npm test`** locally — the schema validator runs as part of the test suite, so you'll see immediately if anything is wrong.
+6. **Open a PR**. CI will re-run validation; once it's green, a maintainer will review and merge.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,32 @@
+# Thin wrappers around the project's npm scripts so common tasks are one short command.
+# Only targets that actually have a rule are declared phony; none of them produces a file.
+.PHONY: install dev build start test test-e2e lint format clean
+
+install:
+	npm install
+
+dev:
+	npm run dev
+
+build:
+	npm run build
+
+start:
+	npm run start
+
+test:
+	npm test
+
+# Same command the e2e job in .github/workflows/test.yaml runs.
+test-e2e:
+	npm run test:e2e
+
+lint:
+	npm run lint
+
+format:
+	npm run prettier
+
+# Remove build output, caches and test results; installed dependencies are left in place.
+clean:
+	rm -rf .next out node_modules/.cache test-results
diff --git a/README.md b/README.md
@@ -56,26 +56,8 @@ Currently, we have published the following datasets:
 
 
 ## How to contribute
-To add a new **dataset**, follow these steps:
 
-1. **Fork the Repository:** Fork this repository to your GitHub account.
-2. **Create a Markdown File:** In your forked repository, navigate to the `datasets` folder and create a new Markdown file (`.md`) for your dataset. The file name should be descriptive of the dataset.
-3. **Add Dataset Information:** Copy and paste the following template into your Markdown file:
-   ```markdown
-   ---
-   title: <dataset name>
-   desc: <dataset description>
-   thumbnail: <dataset thumbnail>
-   publication: <link to publication>
-   github: <link to github>
-   tags:
-     - <list of tags>
-   ---
-   ```
-   Fill in the template with the appropriate information about your dataset.
-4. **Add a Dataset Thumbnail:** Add a thumbnail to the dataset that will be displayed on the main page. The thumbnail should use a 16:9 aspect ratio, like `320 x 180` or `640 x 360` pixels, and be placed under `public/thumbnails`.
-5. **Update the README:** Update this README with the new dataset added under one of the categories above. Add links to the publication, code, or other things that may be useful.
-6. **Create a Pull Request:** Once you have added the Markdown file and filled in the dataset information, commit your changes. Push the changes to your forked repository. Create a pull request to merge your changes into the main repository.
+Datasets are added via pull request. See [CONTRIBUTING.md](CONTRIBUTING.md) for the full walkthrough — in short: fork the repo, add a `datasets/<slug>.md` file with the required frontmatter, drop a 16:9 thumbnail into `public/thumbnails/`, run `npm test`, and open a PR.
 
 ## Contact
 If you have any questions or need assistance, please open an issue in the repository or contact steven@simula.no.
diff --git a/datasets/alfheim.md b/datasets/alfheim.md
@@ -1,12 +1,13 @@
 ---
-title: 'Alfheim'
-desc: 'Soccer video and player position dataset.'
+title: Alfheim
+desc: Soccer video and player position dataset.
 thumbnail: /thumbnails/alfheim.png
-publication: https://dl.acm.org/doi/10.1145/2557642.2563677
+publication: 'https://dl.acm.org/doi/10.1145/2557642.2563677'
 github: ''
-tags:
-  - soccer
-  - video analysis
+domain:
+  - sports
+modality:
+  - video
 ---
 
 In this dataset paper, we present and make available a dataset of elite soccer player movements and corresponding videos. The dataset is captured at Alfheim Stadium -- the home arena for Tromsø IL (Norway). The player postions are measured at 20~Hz using the ZXY Sport Tracking system, and the video is captured from the middle of the field using two camera arrays. The player tracking system provides the player coordinates on the field, their speed, acceleration and force together with an ID and timestamp. The camera array covers the entire field, and each camera can be used individually or as a stitched panorama video. In addition to the obvious sport analytics scenario, the dataset can be used several ways. In the multimedia scenario, the combination of sensor data and video gives a researcher a video dataset with a ground truth object position from the sensor data, i.e., it can be used to test algorithms used for feature extraction, object tracking, background substraction, etc. The sensor data itself can also be used alone in simulations or experiments where for example the users with devices move on a limited area (the field) with a varying speed and direction.
@@ -116,4 +117,4 @@ Proceedings of the International Conference on Distributed Smart Cameras (ICDSC)
 [\[pdf\]](http://home.ifi.uio.no/paalh/publications/files/icdsc2012.pdf)
 
 ## Contact
-Email paalh (_at_) simula (_dot_) no if you have any questions about the dataset and our research activities. We always welcome collaboration and joint research!
+Email paalh (_at_) simula (_dot_) no if you have any questions about the dataset and our research activities. We always welcome collaboration and joint research!
diff --git a/datasets/ao.md b/datasets/ao.md
@@ -1,11 +1,14 @@
 ---
-title: 'Anarchy Online'
-desc: 'Server-side Network Traffic from Anarchy Online: Analysis, Statistics and Applications.'
+title: Anarchy Online
+desc: >-
+  Server-side Network Traffic from Anarchy Online: Analysis, Statistics and
+  Applications.
 thumbnail: /thumbnails/anarchy-online.png
-publication: https://datasets.simula.no/ao/mmsys2012-dataset.pdf
+publication: 'https://datasets.simula.no/ao/mmsys2012-dataset.pdf'
 github: ''
-tags:
-  - climate change
+domain:
+  - networks
+modality:
   - sensor
 ---
 

diff --git a/datasets/arx.md b/datasets/arx.md
@@ -1,12 +1,17 @@
 ---
-title: 'Arx'
-desc: 'A Text-Classification Dataset Consisting of Norwegian Soccer Articles from VG and TV2.'
+title: Arx
+desc: >-
+  A Text-Classification Dataset Consisting of Norwegian Soccer Articles from VG
+  and TV2.
 thumbnail: /thumbnails/arx.jpg
-publication: https://ieeexplore.ieee.org/abstract/document/8877417/
+publication: 'https://ieeexplore.ieee.org/abstract/document/8877417/'
 github: ''
-tags:
-  - soccer
+domain:
+  - sports
+modality:
   - text
+tasks:
+  - classification
 ---
 
 We are today overwhelmed with information, of which an important part is news. Sports news, in particular, has become very popular, where soccer makes up a big part of this coverage. For sports fans, it can be a time consuming and tedious to keep up with the news that they really
@@ -44,4 +49,4 @@ The dataset contains 5,526 labeled data samples and is freely available online.
 The use of the Arx dataset is restricted for research and educational purposes only. The use of the Arx dataset for other purposes including commercial purposes is forbidden without prior written permission. In all documents and papers that use or refer to the Arx dataset or report experimental results based on the Arx dataset, a reference to the dataset paper have to be included.
 
 ## Contact
-Email steven (_at_) simula (_dot_) no if you have any questions about the dataset and our research activities. We always welcome collaboration and joint research!
+Email steven (_at_) simula (_dot_) no if you have any questions about the dataset and our research activities. We always welcome collaboration and joint research!
diff --git a/datasets/cellular.md b/datasets/cellular.md
@@ -1,13 +1,13 @@
 ---
-title: 'Cellular'
-desc: 'A cell autophagy dataset.'
+title: Cellular
+desc: A cell autophagy dataset.
 thumbnail: /thumbnails/cellular.png
 publication: ''
-github: https://github.com/simula/cellular
+github: 'https://github.com/simula/cellular'
 hidden: false
-tags:
-  - cells
+tasks:
   - segmentation
+  - classification
 ---
 
 ## Dataset Details

diff --git a/datasets/depresjon.md b/datasets/depresjon.md
@@ -1,12 +1,16 @@
 ---
-title: 'Depresjon'
-desc: 'The Depresjon Dataset.'
+title: Depresjon
+desc: The Depresjon Dataset.
 thumbnail: /thumbnails/depresjon.png
-publication: https://dl.acm.org/doi/10.1145/3204949.3208125
+publication: 'https://dl.acm.org/doi/10.1145/3204949.3208125'
 github: ''
-tags:
-  - mental health
+domain:
+  - health
+modality:
   - sensor
+tasks:
+  - classification
+  - forecasting
 ---
 
 Wearable sensors measuring different parts of people's activity are a common technology nowadays. Data created using these devices holds a lot of potential besides measuring the quantity of daily steps or calories burned, since continuous recordings of heart rate and activity levels usually are collected. Furthermore, there is an increasing awareness in the field of psychiatry on how these activity data relates to various mental health issues such as changes in mood, personality, inability to cope with daily problems or stress and withdrawal from friends and activities. In this paper we present the analysis of a unique dataset containing sensor data collected from patients suffering from depression. The dataset contains motor activity recordings of 23 unipolar and bipolar depressed patients and 32 healthy controls. We apply machine learning to classify patients into depressed and nondepressed. For evaluation of the algorithms, leave one patient out validation is performed. The best results achieved are an F1 score of 0.73 and a MCC of 0.44. The overall findings show that sensor data contains information that can be used to determine the depression status of a person.
@@ -64,4 +68,4 @@ Looking at the list of related work in this area, there are a lot of different m
 In all documents and papers that report experimental results based on the Depresjon Dataset, a reference to this study should be included.
 
 ## Contact
-Email enriqug (_at_) ifi (_dot_) uio (_dot_) no or michael (_at_) simula (_dot_) no if you have any questions about to the dataset and our research activities. We always welcome collaboration and joint research!
+Email enriqug (_at_) ifi (_dot_) uio (_dot_) no or michael (_at_) simula (_dot_) no if you have any questions about the dataset and our research activities. We always welcome collaboration and joint research!
diff --git a/datasets/ecc-dataset.md b/datasets/ecc-dataset.md
@@ -1,12 +1,15 @@
 ---
-title: 'European Cloud Cover'
-desc: 'A dataset containing reanalysis data from ERA5 and satellite retrievals from METeosat Second Generation.'
+title: European Cloud Cover
+desc: >-
+  A dataset containing reanalysis data from ERA5 and satellite retrievals from
+  METeosat Second Generation.
 thumbnail: /thumbnails/european-cloud-cover.jpg
-publication: https://www.mdpi.com/2504-2289/5/4/62/pdf
-github: https://osf.io/kqdgx/
-tags:
-  - climate change
+publication: 'https://www.mdpi.com/2504-2289/5/4/62/pdf'
+github: 'https://osf.io/kqdgx/'
+modality:
   - sensor
+tasks:
+  - forecasting
 ---
 
 Climate change is stated as one of the biggest challenges of our time resulting in many unwanted effects. The response of cloud fractional cover (CFC), i.e. the portion of the sky covered by clouds, to future climate is associated with high uncertainties. CFC will affect the rate of global warming and different parts of the society such as agriculture and solar energy production.

diff --git a/datasets/exposure-engine.md b/datasets/exposure-engine.md
@@ -1,14 +1,13 @@
 ---
-title: 'ExposureEngine'
-desc: 'Oriented Logo Detection and Sponsor Visibility Analytics in Sports Broadcasts'
+title: ExposureEngine
+desc: Oriented Logo Detection and Sponsor Visibility Analytics in Sports Broadcasts
 thumbnail: /thumbnails/exposure-engine.jpg
 publication: ''
-github: https://huggingface.co/datasets/SimulaMet-HOST/ExposureEngine
+github: 'https://huggingface.co/datasets/SimulaMet-HOST/ExposureEngine'
 hidden: false
-tags:
-  - Logo Detection
-  - Oriented Object Detection
-  - Analytics
+tasks:
+  - detection
+  - tracking
 ---
 
 # ExposureEngine: Oriented Logo Detection & Sponsor Visibility Analytics (Dataset)

diff --git a/datasets/eye-tracker.md b/datasets/eye-tracker.md
@@ -1,11 +1,10 @@
 ---
-title: 'Eye Tracker'
-desc: 'A Serious Game Based Dataset.'
+title: Eye Tracker
+desc: A Serious Game Based Dataset.
 thumbnail: /thumbnails/eye-tracker.png
-publication: http://ceur-ws.org/Vol-1345/gamifir15_5.pdf
+publication: 'http://ceur-ws.org/Vol-1345/gamifir15_5.pdf'
 github: ''
-tags:
-  - climate change
+modality:
   - sensor
 ---
 

diff --git a/datasets/gastrovision.md b/datasets/gastrovision.md
@@ -1,13 +1,16 @@
 ---
-title: 'GastroVision'
-desc: 'A multicenter dataset.'
+title: GastroVision
+desc: A multicenter dataset.
 thumbnail: /thumbnails/gastrovision.jpg
-publication: https://arxiv.org/abs/2307.08140
-github: https://github.com/DebeshJha/GastroVision
+publication: 'https://arxiv.org/abs/2307.08140'
+github: 'https://github.com/DebeshJha/GastroVision'
 hidden: false
-tags:
-  - gastrointestinal
+domain:
+  - health
+modality:
   - images
+tasks:
+  - classification
 ---
 
 This repository provides related links and codes for the GastroVision dataset, a multi-class endoscopy image dataset comprising the largest number of anatomical landmarks, pathological abnormalities, and normal findings in the gastrointestinal (GI) tract. A total of 36 such classes, with 6,169 images, are acquired from the upper and lower GI tracts.
@@ -34,4 +37,4 @@ If you use this dataset in your research work, please cite the following paper:
 ```
 
 ## Contact
-Please contact debesh.jha@northwestern.edu  & Vanshalisharma@iit.ac.in if you have questions about the dataset and our research activities. We always welcome collaboration and joint research!
+Please contact debesh.jha@northwestern.edu & Vanshalisharma@iit.ac.in if you have questions about the dataset and our research activities. We always welcome collaboration and joint research!