mpds-io · MrEx3cut0r · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/datadump/data_loader.py b/datadump/data_loader.py
@@ -1,9 +1,13 @@
-import os.path
+import os
+import sys
 import time
 
 import ujson as json
 from mpds_client import MPDSDataRetrieval, APIError, MPDSDataTypes
 
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'kickoff'))
+from query_utils import normalize_query
+
 
 class DataExportMPDS:
     """
@@ -47,7 +51,7 @@ def get_structures(self):
         for year in range(1890, 2025):
             time.sleep(1.0)
             try:
-                for entry in self.client.get_data({"props": "atomic structure", "years": str(year)},
+                for entry in self.client.get_data(normalize_query({"props": "atomic structure", "years": str(year)}),
                 fields={}):
                     fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
             except APIError as error:
@@ -67,7 +71,7 @@ def get_phase_diagrams(self):
         for year in range(1890, 2025):
             time.sleep(1.0)
             try:
-                for entry in self.client.get_data({"props": "phase diagram", "years": str(year)},
+                for entry in self.client.get_data(normalize_query({"props": "phase diagram", "years": str(year)}),
                 fields={}):
                     fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
             except APIError as error:
@@ -87,7 +91,7 @@ def get_phys_properties(self):
         for year in range(1890, 2025):
             time.sleep(1.0)
             try:
-                for entry in self.client.get_data({"props": "physical properties", "years": str(year)},
+                for entry in self.client.get_data(normalize_query({"props": "physical properties", "years": str(year)}),
                 fields={}):
                     fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
             except APIError as error:
@@ -101,7 +105,7 @@ def get_phys_properties(self):
         self.client.dtype = MPDSDataTypes.MACHINE_LEARNING
         fp = open(os.path.join(DataExportMPDS.export_dir, "physical_properties_machine_learning.jsonl"), "w")
         for props in DataExportMPDS.ml_properties_supported:
-            for entry in self.client.get_data({"props": props}, fields={}):
+            for entry in self.client.get_data(normalize_query({"props": props}), fields={}):
                 fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
         fp.close()
 
@@ -111,7 +115,7 @@ def get_phys_properties(self):
         self.client.dtype = MPDSDataTypes.AB_INITIO
         fp = open(os.path.join(DataExportMPDS.export_dir, "physical_properties_ab_initio.jsonl"), "w")
         # TODO more data will require splitting
-        for entry in self.client.get_data({"props": "physical properties"}, fields={}):
+        for entry in self.client.get_data(normalize_query({"props": "physical properties"}), fields={}):
             fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
         fp.close()
 

diff --git a/kickoff/miner_ab_etransport.py b/kickoff/miner_ab_etransport.py
@@ -7,6 +7,7 @@
 from mpds_client import MPDSDataRetrieval, MPDSDataTypes
 
 from etransport_raw import analyze_raw # this is given in the supplied file "etransport_raw.py"
+from query_utils import normalize_query
 
 # the raw simulation data on the MPDS are in 7z format
 # so we need e.g. the latest dev version of the pylzma package
@@ -18,7 +19,7 @@
 
 mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.AB_INITIO)
 
-for entry in mpds_api.get_data({'props': 'electrical conductivity'}, fields={}):
+for entry in mpds_api.get_data(normalize_query({'props': 'electrical conductivity'}), fields={}):
 
     archive_url = entry['sample']['measurement'][0]['raw_data'] # this is the raw data archive field in the MPDS JSON P-entries
 

diff --git a/kickoff/miner_bgkmeans.py b/kickoff/miner_bgkmeans.py
@@ -12,12 +12,13 @@
 
 from kmeans import Point, kmeans, k_from_n
 from element_groups import get_element_group
+from query_utils import normalize_query
 
 
 client = MPDSDataRetrieval()
 
 dfrm = client.get_dataframe(
-    {"classes": "binary", "props": "band gap"},
+    normalize_query({"classes": "binary", "props": "band gap"}),
     fields={'P': [
         'sample.material.chemical_formula',
         'sample.material.chemical_elements',
@@ -27,7 +28,7 @@
     ]},
     columns=['Formula', 'Elements', 'SG', 'Units', 'Bandgap']
 )
-dfrm = dfrm.filter((dfrm['Units'] == 'eV') & (dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20))
+dfrm = dfrm.filter((dfrm['Units'] == 'eV') & (dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20) & dfrm['Elements'].is_not_null())
 
 # group by 'Formula' and calculate mean Bandgap
 avgbgfrm = dfrm.group_by('Formula').agg(

diff --git a/kickoff/miner_bondlength.py b/kickoff/miner_bondlength.py
@@ -9,24 +9,46 @@
 import polars as pl
 
 from mpds_client import MPDSDataRetrieval, MPDSExport
-
+from ase.neighborlist import NeighborList
+from query_utils import normalize_query
 
 def calculate_lengths(ase_obj, elA, elB, limit=4):
+    """
+    Collect the elA-elB interatomic distances below *limit* in a structure.
+
+    Every neighbor is measured at the position the neighbor list reports
+    for it (in-cell position plus the image offset), so the distances agree
+    with the neighbor search; the same atom pair may contribute several values.
+
+    :param ase_obj: structure accepted by ase.neighborlist.NeighborList
+    :param elA: chemical symbol of the central atoms
+    :param elB: chemical symbol of the neighbors, must differ from elA
+    :param limit: upper bound (exclusive) for the rounded distance
+    :return: list of distances rounded to 2 decimals
+    """
     assert elA != elB
+    # cutoff radii of limit / 2 per atom: a pair is listed when its separation is within about limit
+    nlist = NeighborList([limit / 2] * len(ase_obj), self_interaction=False, bothways=True)
+    nlist.update(ase_obj)
+    cell = ase_obj.get_cell()
     lengths = []
     for n, atom in enumerate(ase_obj):
         if atom.symbol == elA:
-            for m, neighbor in enumerate(ase_obj):
-                if neighbor.symbol == elB:
-                    dist = round(ase_obj.get_distance(n, m), 2) # NB occurrence <-> rounding
-                    if dist < limit:
-                        lengths.append(dist)
+            indices, offsets = nlist.get_neighbors(n)
+            for m, offset in zip(indices, offsets):
+                if ase_obj[m].symbol == elB:
+                    # offset counts the cell translations of the neighbor image
+                    shift = ase_obj.positions[m] + offset @ cell - ase_obj.positions[n]
+                    dist = round(float((shift @ shift) ** 0.5), 2) # NB occurrence <-> rounding
+                    # the list may hold pairs slightly beyond the cutoff (default skin), so filter explicitly
+                    if dist < limit:
+                        lengths.append(dist)
     return lengths
 
 client = MPDSDataRetrieval()
 
 answer = client.get_data(
-    {"elements": "U-O", "props": "atomic structure"},
+    normalize_query({"elements": "U-O", "props": "atomic structure"}),
     fields={'S':['phase_id', 'entry', 'chemical_formula', 'cell_abc', 'sg_n', 'basis_noneq', 'els_noneq']}
 )
 

diff --git a/kickoff/miner_cmp_ab_pr_data.py b/kickoff/miner_cmp_ab_pr_data.py
@@ -9,6 +9,7 @@
 import math
 
 from mpds_client import MPDSDataRetrieval, MPDSDataTypes
+from query_utils import normalize_query
 
 
 result_cache = 'mpds_cmp_ab_pr.pkl'
@@ -250,12 +251,14 @@ def get_ab_pr_values(
     print('#' * 50, 'downloading', ab_prop_name)
 
     mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.AB_INITIO)
-    for deck in mpds_api.get_data({'props': ab_prop_name}, fields={'P': ab_prop_conds or [
+    for deck in mpds_api.get_data(normalize_query({'props': ab_prop_name}), fields={'P': ab_prop_conds or [
         'sample.material.chemical_formula',
         'sample.material.condition[0].scalar[0].value',
         'sample.material.phase_id',
         'sample.measurement[0].property.scalar'
     ]}):
+        if not deck:
+            continue
         if ab_prop_massage:
             deck = ab_prop_massage(deck)
             if not deck:
@@ -271,7 +274,7 @@ def get_ab_pr_values(
     print('#' * 50, 'downloading', pr_prop_name)
 
     mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.PEER_REVIEWED)
-    for deck in mpds_api.get_data({'props': pr_prop_name}, fields={'P': pr_prop_conds or [
+    for deck in mpds_api.get_data(normalize_query({'props': pr_prop_name}), fields={'P': pr_prop_conds or [
         'sample.material.chemical_formula',
         'sample.material.condition[0].scalar[0].value',
         'sample.material.phase_id',
@@ -281,6 +284,8 @@ def get_ab_pr_values(
         'sample.measurement[0].condition[0].name',
         'sample.measurement[0].condition[0].scalar'
     ]}):
+        if not deck:
+            continue
         if pr_prop_massage:
             deck = pr_prop_massage(deck)
             if not deck:

diff --git a/kickoff/miner_liquidus.py b/kickoff/miner_liquidus.py
@@ -17,7 +17,8 @@
 plt.switch_backend('agg')
 from mpds_client import MPDSDataRetrieval
 
-from miner_nonformers import pd_svg_to_points
+from miner_nonformers import pl_svg_to_points
+from query_utils import normalize_query
 
 
 MARGIN_EDGES_COMP = 0.1
@@ -40,7 +41,7 @@
     plt.annotate(elb, xy=(1.05, -0.1), xycoords='axes fraction')
     ymin, ymax = 500, 700
 
-    for pd in api_client.get_data({"props": "phase diagram", "classes": "binary", "elements": "-".join(elements)}, fields={}): # fields={} means all fields
+    for pd in api_client.get_data(normalize_query({"props": "phase diagram", "classes": "binary", "elements": elements}), fields={}): # fields={} means all fields
         # Consider only full-composition diagrams
         if pd['comp_range'] != [0, 100]:
             continue
@@ -68,7 +69,7 @@
                 done_liquidus = True
                 liquidus_line = []
 
-                for point in pd_svg_to_points(area['svgpath']):
+                for point in pl_svg_to_points(area['svgpath']):
 
                     # NB the line out of polygon extraction algorithm must be improved;
                     # this is just a quick and dirty example based on

diff --git a/kickoff/miner_nonformers.py b/kickoff/miner_nonformers.py
@@ -16,6 +16,7 @@
 import time
 import json
 from mpds_client import MPDSDataRetrieval
+from query_utils import normalize_query
 
 
 # Within this composition tolerance (%), a phase near a pure element
@@ -50,7 +51,7 @@ def get_nonformers(api_client):
 
     true_nonformers, maybe_nonformers, formers = set(), set(), set()
 
-    for pl in api_client.get_data({"props": "phase diagram", "classes": "binary"}, fields={}):
+    for pl in api_client.get_data(normalize_query({"props": "phase diagram", "classes": "binary"}), fields={}):
 
         # Only full-composition diagrams
         if pl['comp_range'] != [0, 100]:

diff --git a/kickoff/miner_pb_ratio.py b/kickoff/miner_pb_ratio.py
@@ -6,6 +6,7 @@
 import numpy
 from numpy.linalg import det
 from mpds_client import MPDSDataRetrieval
+from query_utils import normalize_query
 
 
 supported_arities = {1: 'unary', 2: 'binary', 3: 'ternary', 4: 'quaternary', 5: 'quinary'}
@@ -23,7 +24,7 @@ def get_cell_v_for_t(elements, t0=250, t1=350):
     """
     phases_volumes = {}
 
-    for item in mpds_api.get_data(dict(elements='-'.join(elements), classes=supported_arities[len(elements)]), fields={
+    for item in mpds_api.get_data(normalize_query(dict(elements=elements, classes=supported_arities[len(elements)])), fields={
     'P': [
         lambda: 'P',
         'sample.material.phase_id',

diff --git a/kickoff/miner_propstruct.py b/kickoff/miner_propstruct.py
@@ -13,6 +13,7 @@
 import polars as pl
 from ase.data import covalent_radii, chemical_symbols
 from mpds_client import MPDSDataRetrieval
+from query_utils import normalize_query
 
 def get_APF(ase_obj):
     """
@@ -34,14 +35,14 @@ def get_Wiener(ase_obj):
 
 client = MPDSDataRetrieval()
 
-data = client.get_dataframe({"classes": "transitional, oxide", "props": "isothermal bulk modulus"})
+data = client.get_dataframe(normalize_query({"classes": "transitional, oxide", "props": "isothermal bulk modulus"}))
 data = data.filter(pl.col("Phase").is_not_null())
 data = data.filter(pl.col("Units") == "GPa")
 data = data.filter(pl.col("Value") > 0)
 
 phases = set(data.select("Phase").to_series())
 answer = client.get_data(
-    {"props": "atomic structure"},
+    normalize_query({"props": "atomic structure"}),
     phases=phases,
     fields={
         'S': ['phase_id', 'entry', 'chemical_formula', 'cell_abc', 'sg_n', 'basis_noneq', 'els_noneq']
@@ -51,6 +52,8 @@ def get_Wiener(ase_obj):
 descriptors = []
 
 for item in answer:
+    if not item:
+        continue
     crystal = MPDSDataRetrieval.compile_crystal(item, 'ase')
     if not crystal:
         continue

diff --git a/kickoff/miner_twofold_props.py b/kickoff/miner_twofold_props.py
@@ -10,26 +10,31 @@
 
 import numpy as np
 from mpds_client import MPDSDataRetrieval, MPDSDataTypes
+from query_utils import normalize_query
 
 mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.MACHINE_LEARNING) # NB MPDSDataTypes.ALL
 
 phase_for_formula = {}
 phase_for_val_a, phase_for_val_b = {}, {}
 
-for deck in mpds_api.get_data({'props': 'temperature for congruent melting', 'classes': 'oxide'}, fields={'P': [
+for deck in mpds_api.get_data(normalize_query({'props': 'temperature for congruent melting', 'classes': 'oxide'}), fields={'P': [
     'sample.material.phase_id',
     'sample.material.chemical_formula',
     'sample.measurement[0].property.scalar'
 ]}):
+    if not deck:
+        continue
     if deck[2] > (1800 + 273):
         phase_for_formula[deck[0]] = deck[1]
         phase_for_val_a.setdefault(deck[0], []).append(deck[2]) # why list? each phase might have > 1 value
 
-for deck in mpds_api.get_data({'props': 'linear thermal expansion coefficient'}, phases=phase_for_val_a.keys(), fields={'P': [
+for deck in mpds_api.get_data(normalize_query({'props': 'linear thermal expansion coefficient'}), phases=phase_for_val_a.keys(), fields={'P': [
     'sample.material.phase_id',
     # we don't need *chemical_formula* now, since we have phase_id's
     'sample.measurement[0].property.scalar'
 ]}):
+    if not deck:
+        continue
     phase_for_val_b.setdefault(deck[0], []).append(deck[1] * 1E5) # why list? each phase might have > 1 value
 
 # now we just re-group and show the results (but we can do much more!)

diff --git a/kickoff/query_utils.py b/kickoff/query_utils.py
@@ -0,0 +1,15 @@
+def normalize_query(query):
+    """Return a copy of `query` with list/tuple 'elements' and 'classes' joined.
+
+    The MPDS API expects:
+      - elements: dash-separated string  e.g. "Sr-Ti-O"
+      - classes:  comma-separated string  e.g. "perovskite, conductor"
+
+    Passing raw lists causes unsupported-symbol errors; strings pass through.
+    """
+    out = dict(query)  # shallow copy, the caller's dict is left untouched
+    # separators match the string forms shown in the docstring
+    for key, sep in (('elements', '-'), ('classes', ', ')):
+        if isinstance(out.get(key), (list, tuple)):
+            out[key] = sep.join(out[key])
+    return out
diff --git a/notebooks/2_mpds_basic.ipynb b/notebooks/2_mpds_basic.ipynb
@@ -79,7 +79,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from mpds_client import MPDSDataRetrieval, MPDSDataTypes, APIError"
+    "from mpds_client import MPDSDataRetrieval, MPDSDataTypes, APIError",
+    "\nfrom query_utils import normalize_query"
    ]
   },
   {
@@ -181,7 +182,7 @@
     "    print(\"Considering %s\" % prop)\n",
     "\n",
     "    try:\n",
-    "        for card in client.get_data({\n",
+    "        for card in client.get_data(normalize_query({\n",
     "            \"props\": prop,\n",
     "            # we defined our props above\n",
     "\n",
@@ -197,7 +198,7 @@
     "\n",
     "            \"years\": \"2010-2019\"\n",
     "            # only recent results (void for MACHINE_LEARNING, as all are 2018)\n",
-    "        }, fields=desired_fields):\n",
+    "        }), fields=desired_fields):\n",
     "\n",
     "            print(\"%s %s %s\" % (card[0], \"-\".join(card[2]), card[3]))\n",
     "\n",
@@ -218,7 +219,7 @@
    "source": [
     "client.dtype = MPDSDataTypes.PEER_REVIEWED\n",
     "\n",
-    "print(client.get_data({\"elements\": \"O\", \"classes\": \"binary\", \"sgs\": \"I4/mmm\"}))"
+    "print(client.get_data(normalize_query({\"elements\": \"O\", \"classes\": \"binary\", \"sgs\": \"I4/mmm\"})))\n"
    ]
   },
   {
@@ -230,7 +231,7 @@
     "import random\n",
     "prop = random.choice(example_props)\n",
     "\n",
-    "print(client.get_data({\"props\": prop, \"elements\": \"O\", \"classes\": \"binary, lanthanoid, non-disordered\"}))"
+    "print(client.get_data(normalize_query({\"props\": prop, \"elements\": \"O\", \"classes\": \"binary, lanthanoid, non-disordered\"})))\n"
    ]
   },
   {
@@ -253,4 +254,4 @@
  "metadata": {},
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}