# File: data-stack-config.yaml
# Data Stack Configuration
# Version marker for this file (presumably the config schema version — confirm
# with the consumer). Quoted so it loads as the string "1.0", not a float.
version: "1.0"
# Stack-wide configurations
# Identity of the stack: a short name, a human-readable description, and the
# environment label this configuration is written for.
stack:
  name: "covid-analysis"
  description: "COVID-19 data analysis pipeline"
  environment: "production"
# Ingestion: the raw data sources. Both entries are of type "duckdb" and each
# is defined by a SQL query under its own config mapping.
# NOTE(review): nesting was reconstructed from a flattened copy; confirm that
# incremental / timestamp_column belong under config.
ingestion:
  sources:
    # Non-incremental read of every Parquet file matching covid_*.parquet in
    # the coviddata S3 bucket.
    - name: "covid-raw-data"
      type: "duckdb"
      config:
        query: "SELECT * FROM 's3://coviddata/covid_*.parquet'"
        incremental: false
    # Incremental read of the public.vaccinations table.
    # NOTE(review): presumably updated_at is the change-tracking column used
    # to pick up new rows — confirm with the ingestion runner.
    - name: "vaccination-data"
      type: "duckdb"
      config:
        query: "SELECT * FROM public.vaccinations"
        incremental: true
        timestamp_column: "updated_at"
# Transformations: aggregation steps built on top of the ingestion sources.
# Each depends_on entry names a source declared under ingestion.sources.
# Although type is "sql", each step is described declaratively by groupby
# columns and a column -> aggregate-function mapping.
transformations:
  # Sums cases and deaths per country from the raw COVID data.
  - name: "covid-aggregation"
    depends_on: ["covid-raw-data"]
    type: "sql"
    config:
      groupby:
        - "country"
      aggregate:
        cases: "sum"
        deaths: "sum"
  # Sums the three vaccination counters per country and date.
  - name: "vaccination-metrics"
    depends_on: ["vaccination-data"]
    type: "sql"
    config:
      groupby:
        - "country"
        - "date"
      aggregate:
        total_vaccinations: "sum"
        people_vaccinated: "sum"
        people_fully_vaccinated: "sum"
# Serving configurations
# One served artifact: a Markdown dashboard rendered from a template and fed
# by both transformation outputs listed under data_sources.
serving:
  - name: "covid-dashboard"
    type: "markdown"
    template: "github://covid/covid_dashboard.md"
    data_sources:
      - "covid-aggregation"
      - "vaccination-metrics"
    config:
      refresh_interval: "24h"
      # NOTE(review): nesting was reconstructed from a flattened copy;
      # access_control may instead be a sibling of config — confirm against
      # the consumer's schema.
      # NOTE(review): public is true while roles is also restricted to
      # admin/analyst; confirm which of the two takes precedence.
      access_control:
        public: true
        roles:
          - admin
          - analyst
# Orchestration settings
# Scheduling, retry and timeout limits for the pipeline, plus who is notified
# on failure (email) and on success (Slack).
orchestration:
  # Cron expression. NOTE(review): the time zone is not stated here — confirm
  # what the scheduler assumes.
  schedule: "0 0 * * *"  # Daily at midnight
  retries: 3
  timeout: "1h"
  notifications:
    on_failure:
      - type: "email"
        recipients: ["data-team@example.com"]
    on_success:
      - type: "slack"
        # Quoted because a leading "#" would otherwise start a YAML comment.
        channel: "#data-pipeline"
# Monitoring and observability (example, not used yet)
# Declares three metric types and one alert. The alert condition refers to
# the error_rate metric listed under metrics.
monitoring:
  metrics:
    - type: "data_freshness"
    - type: "pipeline_duration"
    - type: "error_rate"
  alerts:
    # NOTE(review): 0.1 is presumably a 10% error fraction — confirm the
    # unit error_rate is reported in.
    - name: "pipeline_failure"
      condition: "error_rate > 0.1"
      notify:
        - "data-team@example.com"
# Resource management (example, not used yet)
# Compute ceilings and the storage backend for the stack.
resources:
  compute:
    max_memory: "8Gi"
    # Kept as a quoted string, matching the original value.
    max_cpu: "4"
  storage:
    type: "s3"
    # Same bucket name that the covid-raw-data ingestion query reads from.
    bucket: "coviddata"