Skip to content
Open
16 changes: 10 additions & 6 deletions datadump/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import os.path
import os
import sys
import time

import ujson as json
from mpds_client import MPDSDataRetrieval, APIError, MPDSDataTypes

sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'kickoff'))
from query_utils import normalize_query


class DataExportMPDS:
"""
Expand Down Expand Up @@ -47,7 +51,7 @@ def get_structures(self):
for year in range(1890, 2025):
time.sleep(1.0)
try:
for entry in self.client.get_data({"props": "atomic structure", "years": str(year)},
for entry in self.client.get_data(normalize_query({"props": "atomic structure", "years": str(year)}),
fields={}):
fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
except APIError as error:
Expand All @@ -67,7 +71,7 @@ def get_phase_diagrams(self):
for year in range(1890, 2025):
time.sleep(1.0)
try:
for entry in self.client.get_data({"props": "phase diagram", "years": str(year)},
for entry in self.client.get_data(normalize_query({"props": "phase diagram", "years": str(year)}),
fields={}):
fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
except APIError as error:
Expand All @@ -87,7 +91,7 @@ def get_phys_properties(self):
for year in range(1890, 2025):
time.sleep(1.0)
try:
for entry in self.client.get_data({"props": "physical properties", "years": str(year)},
for entry in self.client.get_data(normalize_query({"props": "physical properties", "years": str(year)}),
fields={}):
fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
except APIError as error:
Expand All @@ -101,7 +105,7 @@ def get_phys_properties(self):
self.client.dtype = MPDSDataTypes.MACHINE_LEARNING
fp = open(os.path.join(DataExportMPDS.export_dir, "physical_properties_machine_learning.jsonl"), "w")
for props in DataExportMPDS.ml_properties_supported:
for entry in self.client.get_data({"props": props}, fields={}):
for entry in self.client.get_data(normalize_query({"props": props}), fields={}):
fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
fp.close()

Expand All @@ -111,7 +115,7 @@ def get_phys_properties(self):
self.client.dtype = MPDSDataTypes.AB_INITIO
fp = open(os.path.join(DataExportMPDS.export_dir, "physical_properties_ab_initio.jsonl"), "w")
# TODO more data will require splitting
for entry in self.client.get_data({"props": "physical properties"}, fields={}):
for entry in self.client.get_data(normalize_query({"props": "physical properties"}), fields={}):
fp.write(json.dumps(entry, escape_forward_slashes=False) + "\n")
fp.close()

Expand Down
3 changes: 2 additions & 1 deletion kickoff/miner_ab_etransport.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mpds_client import MPDSDataRetrieval, MPDSDataTypes

from etransport_raw import analyze_raw # this is given in the supplied file "etransport_raw.py"
from query_utils import normalize_query

# the raw simulation data on the MPDS are in 7z format
# so we need e.g. the latest dev version of the pylzma package
Expand All @@ -18,7 +19,7 @@

mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.AB_INITIO)

for entry in mpds_api.get_data({'props': 'electrical conductivity'}, fields={}):
for entry in mpds_api.get_data(normalize_query({'props': 'electrical conductivity'}), fields={}):

archive_url = entry['sample']['measurement'][0]['raw_data'] # this is the raw data archive field in the MPDS JSON P-entries

Expand Down
5 changes: 3 additions & 2 deletions kickoff/miner_bgkmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@

from kmeans import Point, kmeans, k_from_n
from element_groups import get_element_group
from query_utils import normalize_query


client = MPDSDataRetrieval()

dfrm = client.get_dataframe(
{"classes": "binary", "props": "band gap"},
normalize_query({"classes": "binary", "props": "band gap"}),
fields={'P': [
'sample.material.chemical_formula',
'sample.material.chemical_elements',
Expand All @@ -27,7 +28,7 @@
]},
columns=['Formula', 'Elements', 'SG', 'Units', 'Bandgap']
)
dfrm = dfrm.filter((dfrm['Units'] == 'eV') & (dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20))
dfrm = dfrm.filter((dfrm['Units'] == 'eV') & (dfrm['Bandgap'] > 0) & (dfrm['Bandgap'] < 20) & dfrm['Elements'].is_not_null())

# group by 'Formula' and calculate mean Bandgap
avgbgfrm = dfrm.group_by('Formula').agg(
Expand Down
17 changes: 10 additions & 7 deletions kickoff/miner_bondlength.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,27 @@
import polars as pl

from mpds_client import MPDSDataRetrieval, MPDSExport

from ase.neighborlist import NeighborList
from query_utils import normalize_query

def calculate_lengths(ase_obj, elA, elB, limit=4):
    """Collect elA-elB interatomic distances shorter than *limit*.

    A neighbour list is used instead of an all-pairs scan, so only atoms
    within the cutoff are visited.

    :param ase_obj: ASE atoms object describing the crystal structure
    :param elA: chemical symbol of the central atoms
    :param elB: chemical symbol of the neighbouring atoms (must differ from elA)
    :param limit: upper bound for the reported distances (same length units
        as the atomic positions of ase_obj)
    :return: list of distances rounded to 2 decimals, one per elA-elB pair
    """
    assert elA != elB

    # Each atom gets a radius of limit / 2, so two atoms are neighbours when
    # they are closer than `limit`. skin=0.0 switches off the default 0.3
    # padding that would otherwise admit pairs up to limit + 0.6.
    nlist = NeighborList(
        [limit / 2] * len(ase_obj),
        skin=0.0,
        self_interaction=False,
        bothways=True,
    )
    nlist.update(ase_obj)

    symbols = ase_obj.get_chemical_symbols()
    positions = ase_obj.get_positions()
    cell = ase_obj.get_cell()[:]  # plain 3x3 array of lattice vectors

    lengths = []
    for n, symbol in enumerate(symbols):
        if symbol != elA:
            continue
        indices, offsets = nlist.get_neighbors(n)
        for m, offset in zip(indices, offsets):
            if symbols[m] != elB:
                continue
            # The neighbour may be a periodic image of atom m: shift it by the
            # cell offset reported by the neighbour list before measuring.
            delta = positions[m] + offset @ cell - positions[n]
            dist = round(float((delta @ delta) ** 0.5), 2)  # NB occurrence <-> rounding
            if dist < limit:
                lengths.append(dist)
    return lengths

client = MPDSDataRetrieval()

answer = client.get_data(
{"elements": "U-O", "props": "atomic structure"},
normalize_query({"elements": "U-O", "props": "atomic structure"}),
fields={'S':['phase_id', 'entry', 'chemical_formula', 'cell_abc', 'sg_n', 'basis_noneq', 'els_noneq']}
)

Expand Down
9 changes: 7 additions & 2 deletions kickoff/miner_cmp_ab_pr_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import math

from mpds_client import MPDSDataRetrieval, MPDSDataTypes
from query_utils import normalize_query


result_cache = 'mpds_cmp_ab_pr.pkl'
Expand Down Expand Up @@ -250,12 +251,14 @@ def get_ab_pr_values(
print('#' * 50, 'downloading', ab_prop_name)

mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.AB_INITIO)
for deck in mpds_api.get_data({'props': ab_prop_name}, fields={'P': ab_prop_conds or [
for deck in mpds_api.get_data(normalize_query({'props': ab_prop_name}), fields={'P': ab_prop_conds or [
'sample.material.chemical_formula',
'sample.material.condition[0].scalar[0].value',
'sample.material.phase_id',
'sample.measurement[0].property.scalar'
]}):
if not deck:
continue
if ab_prop_massage:
deck = ab_prop_massage(deck)
if not deck:
Expand All @@ -271,7 +274,7 @@ def get_ab_pr_values(
print('#' * 50, 'downloading', pr_prop_name)

mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.PEER_REVIEWED)
for deck in mpds_api.get_data({'props': pr_prop_name}, fields={'P': pr_prop_conds or [
for deck in mpds_api.get_data(normalize_query({'props': pr_prop_name}), fields={'P': pr_prop_conds or [
'sample.material.chemical_formula',
'sample.material.condition[0].scalar[0].value',
'sample.material.phase_id',
Expand All @@ -281,6 +284,8 @@ def get_ab_pr_values(
'sample.measurement[0].condition[0].name',
'sample.measurement[0].condition[0].scalar'
]}):
if not deck:
continue
if pr_prop_massage:
deck = pr_prop_massage(deck)
if not deck:
Expand Down
7 changes: 4 additions & 3 deletions kickoff/miner_liquidus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
plt.switch_backend('agg')
from mpds_client import MPDSDataRetrieval

from miner_nonformers import pd_svg_to_points
from miner_nonformers import pl_svg_to_points
from query_utils import normalize_query


MARGIN_EDGES_COMP = 0.1
Expand All @@ -40,7 +41,7 @@
plt.annotate(elb, xy=(1.05, -0.1), xycoords='axes fraction')
ymin, ymax = 500, 700

for pd in api_client.get_data({"props": "phase diagram", "classes": "binary", "elements": "-".join(elements)}, fields={}): # fields={} means all fields
for pd in api_client.get_data(normalize_query({"props": "phase diagram", "classes": "binary", "elements": elements}), fields={}): # fields={} means all fields
# Consider only full-composition diagrams
if pd['comp_range'] != [0, 100]:
continue
Expand Down Expand Up @@ -68,7 +69,7 @@
done_liquidus = True
liquidus_line = []

for point in pd_svg_to_points(area['svgpath']):
for point in pl_svg_to_points(area['svgpath']):

# NB the line out of polygon extraction algorithm must be improved;
# this is just a quick and dirty example based on
Expand Down
3 changes: 2 additions & 1 deletion kickoff/miner_nonformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import time
import json
from mpds_client import MPDSDataRetrieval
from query_utils import normalize_query


# Within this composition tolerance (%), a phase near a pure element
Expand Down Expand Up @@ -50,7 +51,7 @@ def get_nonformers(api_client):

true_nonformers, maybe_nonformers, formers = set(), set(), set()

for pl in api_client.get_data({"props": "phase diagram", "classes": "binary"}, fields={}):
for pl in api_client.get_data(normalize_query({"props": "phase diagram", "classes": "binary"}), fields={}):

# Only full-composition diagrams
if pl['comp_range'] != [0, 100]:
Expand Down
3 changes: 2 additions & 1 deletion kickoff/miner_pb_ratio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy
from numpy.linalg import det
from mpds_client import MPDSDataRetrieval
from query_utils import normalize_query


supported_arities = {1: 'unary', 2: 'binary', 3: 'ternary', 4: 'quaternary', 5: 'quinary'}
Expand All @@ -23,7 +24,7 @@ def get_cell_v_for_t(elements, t0=250, t1=350):
"""
phases_volumes = {}

for item in mpds_api.get_data(dict(elements='-'.join(elements), classes=supported_arities[len(elements)]), fields={
for item in mpds_api.get_data(normalize_query(dict(elements=elements, classes=supported_arities[len(elements)])), fields={
'P': [
lambda: 'P',
'sample.material.phase_id',
Expand Down
7 changes: 5 additions & 2 deletions kickoff/miner_propstruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import polars as pl
from ase.data import covalent_radii, chemical_symbols
from mpds_client import MPDSDataRetrieval
from query_utils import normalize_query

def get_APF(ase_obj):
"""
Expand All @@ -34,14 +35,14 @@ def get_Wiener(ase_obj):

client = MPDSDataRetrieval()

data = client.get_dataframe({"classes": "transitional, oxide", "props": "isothermal bulk modulus"})
data = client.get_dataframe(normalize_query({"classes": "transitional, oxide", "props": "isothermal bulk modulus"}))
data = data.filter(pl.col("Phase").is_not_null())
data = data.filter(pl.col("Units") == "GPa")
data = data.filter(pl.col("Value") > 0)

phases = set(data.select("Phase").to_series())
answer = client.get_data(
{"props": "atomic structure"},
normalize_query({"props": "atomic structure"}),
phases=phases,
fields={
'S': ['phase_id', 'entry', 'chemical_formula', 'cell_abc', 'sg_n', 'basis_noneq', 'els_noneq']
Expand All @@ -51,6 +52,8 @@ def get_Wiener(ase_obj):
descriptors = []

for item in answer:
if not item:
continue
crystal = MPDSDataRetrieval.compile_crystal(item, 'ase')
if not crystal:
continue
Expand Down
9 changes: 7 additions & 2 deletions kickoff/miner_twofold_props.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,31 @@

import numpy as np
from mpds_client import MPDSDataRetrieval, MPDSDataTypes
from query_utils import normalize_query

mpds_api = MPDSDataRetrieval(dtype=MPDSDataTypes.MACHINE_LEARNING) # NB MPDSDataTypes.ALL

phase_for_formula = {}
phase_for_val_a, phase_for_val_b = {}, {}

for deck in mpds_api.get_data({'props': 'temperature for congruent melting', 'classes': 'oxide'}, fields={'P': [
for deck in mpds_api.get_data(normalize_query({'props': 'temperature for congruent melting', 'classes': 'oxide'}), fields={'P': [
'sample.material.phase_id',
'sample.material.chemical_formula',
'sample.measurement[0].property.scalar'
]}):
if not deck:
continue
if deck[2] > (1800 + 273):
phase_for_formula[deck[0]] = deck[1]
phase_for_val_a.setdefault(deck[0], []).append(deck[2]) # why list? each phase might have > 1 value

for deck in mpds_api.get_data({'props': 'linear thermal expansion coefficient'}, phases=phase_for_val_a.keys(), fields={'P': [
for deck in mpds_api.get_data(normalize_query({'props': 'linear thermal expansion coefficient'}), phases=phase_for_val_a.keys(), fields={'P': [
'sample.material.phase_id',
# we don't need *chemical_formula* now, since we have phase_id's
'sample.measurement[0].property.scalar'
]}):
if not deck:
continue
phase_for_val_b.setdefault(deck[0], []).append(deck[1] * 1E5) # why list? each phase might have > 1 value

# now we just re-group and show the results (but we can do much more!)
Expand Down
15 changes: 15 additions & 0 deletions kickoff/query_utils.py
Comment thread
MrEx3cut0r marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
def normalize_query(query):
    """Return a copy of *query* with list-valued facets joined into strings.

    The MPDS API expects:
    - elements: dash-separated string e.g. "Sr-Ti-O"
    - classes: comma-separated string e.g. "perovskite, conductor"

    Passing raw lists causes unsupported-symbol errors.

    Only ``list`` and ``tuple`` values under the two keys above are joined;
    every other key and value is copied through untouched. The input mapping
    itself is never modified.

    :param query: mapping of MPDS search facets to their values
    :return: a new dict that is safe to hand to the API client
    """
    # Separator per facet; ", " matches the documented "perovskite, conductor" form.
    separators = {'elements': '-', 'classes': ', '}

    out = dict(query)
    for key, separator in separators.items():
        value = out.get(key)
        if isinstance(value, (list, tuple)):
            out[key] = separator.join(value)
    return out
13 changes: 7 additions & 6 deletions notebooks/2_mpds_basic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@
"metadata": {},
"outputs": [],
"source": [
"from mpds_client import MPDSDataRetrieval, MPDSDataTypes, APIError"
"from mpds_client import MPDSDataRetrieval, MPDSDataTypes, APIError",
"\nfrom query_utils import normalize_query"
]
},
{
Expand Down Expand Up @@ -181,7 +182,7 @@
" print(\"Considering %s\" % prop)\n",
"\n",
" try:\n",
" for card in client.get_data({\n",
" for card in client.get_data(normalize_query({\n",
" \"props\": prop,\n",
" # we defined our props above\n",
"\n",
Expand All @@ -197,7 +198,7 @@
"\n",
" \"years\": \"2010-2019\"\n",
" # only recent results (void for MACHINE_LEARNING, as all are 2018)\n",
" }, fields=desired_fields):\n",
" }), fields=desired_fields):\n",
"\n",
" print(\"%s %s %s\" % (card[0], \"-\".join(card[2]), card[3]))\n",
"\n",
Expand All @@ -218,7 +219,7 @@
"source": [
"client.dtype = MPDSDataTypes.PEER_REVIEWED\n",
"\n",
"print(client.get_data({\"elements\": \"O\", \"classes\": \"binary\", \"sgs\": \"I4/mmm\"}))"
"print(client.get_data(normalize_query({\"elements\": \"O\", \"classes\": \"binary\", \"sgs\": \"I4/mmm\"})))\n"
]
},
{
Expand All @@ -230,7 +231,7 @@
"import random\n",
"prop = random.choice(example_props)\n",
"\n",
"print(client.get_data({\"props\": prop, \"elements\": \"O\", \"classes\": \"binary, lanthanoid, non-disordered\"}))"
"print(client.get_data(normalize_query({\"props\": prop, \"elements\": \"O\", \"classes\": \"binary, lanthanoid, non-disordered\"})))\n"
]
},
{
Expand All @@ -253,4 +254,4 @@
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
}
Loading