llnl · grantjllnl · Sep 3, 2025 · Nov 12, 2024 · Nov 12, 2024 · Nov 27, 2024
diff --git a/.gitignore b/.gitignore
@@ -80,3 +80,5 @@ tramp
 
 # Local config file. A common practice is to have many, so ignore those also.
 wintappy_settings.toml*
+dbt/logs/dbt.log
+dbt/xdr/xdr-dev.duckdb
diff --git a/DependencyGraph.md b/DependencyGraph.md
@@ -0,0 +1,80 @@
+Below is a **dependency diagram** of the tables and views, organized hierarchically by batch names. The diagram shows how tables and views depend on each other and their relationships. The structure is rendered using **Mermaid syntax** for clarity.
+
+```mermaid
+graph TD
+    %% Batch: Raw to Standard View
+    subgraph "Batch: Raw to Standard View"
+        raw_host --> host
+        raw_macip --> host_ip
+        raw_process --> process
+        raw_process_conn_incr --> process_conn_incr
+        process_conn_incr --> process_net_conn
+        raw_process_file --> process_file
+        raw_process_registry --> process_registry
+        raw_imageload --> process_image_load
+        process_exe_file_summary --> files_tmp_v1
+        process_image_load --> files_tmp_v1
+        process_file --> files_tmp_v1
+        files_tmp_v1 --> files
+        files --> all_files
+    end
+
+    %% Batch: Process Summary
+    subgraph "Batch: Process Summary"
+        process_registry --> process_registry_summary
+        process_file --> process_file_summary
+        process_net_conn --> process_net_summary
+        process_image_load --> process_image_load_summary
+        process --> process_summary
+        process_registry_summary --> process_summary
+        process_file_summary --> process_summary
+        process_net_summary --> process_summary
+        process_image_load_summary --> process_summary
+        host --> process_summary
+    end
+
+    %% Batch: Label Summaries
+    subgraph "Batch: Label Summaries"
+        labels_networkx --> labels_graph_nodes
+        labels_graph_nodes --> labels_graph_process_summary
+        labels_graph_nodes --> labels_graph_net_conn
+    end
+
+    %% Batch: LOLBAS, MITRE, and SIGMA Summaries
+    subgraph "Batch: LOLBAS, MITRE, and SIGMA Summaries"
+        process --> process_lolbas_summary
+        mitre_car --> process_mitre_summary
+        mitre_labels --> process_mitre_summary
+        sigma_labels --> sigma_labels_summary
+        sigma --> sigma_labels_summary
+    end
+
+    %% Batch: Process Uber Summary
+    subgraph "Batch: Process Uber Summary"
+        process_summary --> process_uber_summary
+        sigma_labels_summary --> process_uber_summary
+        labels_graph_process_summary --> process_uber_summary
+        process_lolbas_summary --> process_uber_summary
+        process_mitre_summary --> process_uber_summary
+    end
+```
+
+### **Explanation of the Diagram**
+1. **Batch: Raw to Standard View**:
+   - Raw data tables (e.g., `raw_host`, `raw_macip`) are transformed into standardized tables (`host`, `host_ip`, `process`, etc.).
+   - Intermediate tables like `process_conn_incr` and `files_tmp_v1` are used to aggregate and summarize data before creating final tables like `process_net_conn` and `files`.
+
+2. **Batch: Process Summary**:
+   - Summarizes activity for processes, files, networks, and registry data into views like `process_summary`.
+   - Combines data from multiple tables (e.g., `process_registry_summary`, `process_file_summary`) to create a unified view.
+
+3. **Batch: Label Summaries**:
+   - Extracts and summarizes labels from external sources (`labels_networkx`) into views like `labels_graph_nodes` and `labels_graph_process_summary`.
+
+4. **Batch: LOLBAS, MITRE, and SIGMA Summaries**:
+   - Integrates threat intelligence data (LOLBAS, MITRE, SIGMA) into process-related summaries.
+
+5. **Batch: Process Uber Summary**:
+   - Combines all process-related summaries, external labels, and threat intelligence into a comprehensive view (`process_uber_summary`).
+
+This hierarchical structure shows the relationships and dependencies among the tables and views, grouped by their respective batches.
diff --git a/Labels.md b/Labels.md
@@ -0,0 +1,37 @@
+# Labels
+Datasets are labeled in 2 ways: manually by humans and programmatically.
+
+For manual labels, we work with the red team attackers and develop a graph of activity composed of processes, files, network, etc. These graphs are persisted in NetworkX JSON format and are in the "Sources" directory. From these graphs, we extract the unique identifiers and ultimately create a summarized table that can be joined to PROCESS using the PID_HASH.
+
+Programmatic labels are created several different ways:
+
+* Sigma/Mitre labels are created by running publicly available rules. The hits are quite large for both, producing many false positives.
+* LolBAS (Living off the Land) are created by simply joining the LolBAS list of programs, by name, to the PROCESS data. Again, this produces many false positives.
+* Caldera labels are created by extracting unique IDs from the Caldera report 
+
+## PROCESS_UBER_SUMMARY: Global process summary with labels
+This table is summarized to PID_HASH for every process instance in the time range. It has summarized information for process, file, network, registry, DLLs, sigma, mitre, lolBAS and manual labels. Generally, this table is the best place to start as it has so much varied information. Inevitably, researchers often later dive deeper into the various supporting tables.
+
+## Labels summarized to PID_HASH
+* labels_graph_process_summary.parquet
+* process_lolbas_summary.parquet
+* process_mitre_summary.parquet
+* sigma_labels_summary.parquet
+
+## The summaries are derived from these more detailed tables
+
+### Networkx Graphs
+These are essentially the flattened data extracted from the graphs.
+
+labels_graph_net_conn.parquet
+labels_graph_nodes.parquet
+labels_networkx.parquet
+
+### LolBAS
+lolbas.parquet
+
+### Mitre Detail
+mitre_labels.parquet
+
+### Sigma Detail
+sigma_labels.parquet
diff --git a/README-Labels.md b/README-Labels.md
@@ -0,0 +1,38 @@
+## START HERE: process_uber_summary.parquet
+ * This table is summarized to PID_HASH for every process instance in the time range.
+ * It has summarized information for process, file, network, registry, and DLLs activity.
+ * It also has sigma, mitre, lolBAS and manual labels.
+
+ Generally, this table is the best place to start as it has so much varied information. Researchers often later dive deeper into the various supporting tables.
+
+## LABELS
+Datasets are labeled in 2 ways: manually by humans (labels) and programmatically by scripts (lolbas, mitre, and sigma).
+
+Manual labels:
+For manual labels, we work with the red team attackers and develop a graph of activity composed of processes, files, network, etc. These graphs are persisted as NetworkX JSON format and are in the "Sources" directory. From these graphs, we extract the unique identifiers and ultimately create a summarized table that can be joined to PROCESS using the PID_HASH.
+
+With the addition of Caldera attacks, we can now parse reports generated from Caldera that detail the execution activity. That data is converted into a networkx graph and processed along with manual labels described above.
+
+Scripted labels are created several different ways:
+* Sigma/Mitre labels are created by running publicly available rules. The hits are quite large for both, producing many false positives.
+* LolBAS (Living off the Land) are created by simply joining the LolBAS list of programs, by name, to the PROCESS data. Again, this produces many false positives.
+    * (add info about lolc here)
+
+### Labels summarized to PID_HASH can be found in the following files (i.e., one row per PID_HASH):
+* labels_graph_process_summary.parquet (manual)
+* process_lolbas_summary.parquet (scripted)
+* process_mitre_summary.parquet (scripted)
+* sigma_labels_summary.parquet (scripted)
+
+### Summaries are derived from more detailed tables (where the same PID_HASH will be found on multiple rows if it got more than one hit):
+
+* labels_graph_process_summary.parquet (manual) <- These are essentially the flattened data extracted from the graphs:
+    * labels_graph_net_conn.parquet
+    * labels_graph_nodes.parquet
+    * labels_networkx.parquet
+
+* process_lolbas_summary.parquet (scripted) <- lolbas.parquet
+
+* process_mitre_summary.parquet (scripted) <- mitre_labels.parquet
+
+* sigma_labels_summary.parquet (scripted) <- sigma_labels.parquet 
diff --git a/dataflow.md b/dataflow.md
@@ -0,0 +1,67 @@
+This dependency diagram includes **batch names** for a hierarchical structure. Dependencies are grouped under the **Detail** and **Summary** batches to clearly show the flow of data and processing.
+
+### **Updated Dependency Diagram with Batch Names**
+
+```mermaid
+graph TD 
+    subgraph WintapOnly ["Wintap Only Data"]
+        subgraph Detail ["Bronze"]
+            raw_host --> host
+            raw_process --> process
+            raw_process_file --> process_file
+            raw_image_load --> process_image_load
+            raw_process_registry --> process_registry
+            raw_process_conn_incr --> process_conn_incr
+            process_conn_incr --> process_net_conn
+        end
+
+        subgraph Summary ["Silver"]
+            process_net_conn --> process_net_summary
+            process_registry --> process_registry_summary
+            process_file --> process_file_summary
+            process_image_load --> process_image_load_summary
+        end
+
+        subgraph Gold ["Gold"]
+            process --> process_summary
+            host --> process_summary
+            process_registry_summary --> process_summary
+            process_file_summary --> process_summary
+            process_net_summary --> process_summary
+            process_image_load_summary --> process_summary
+        end
+    end
+
+    subgraph Labels ["Labels"]
+        caldera --> process_uber_summary
+        manual --> process_uber_summary
+        mitre --> process_uber_summary
+        sigma --> process_uber_summary
+        process_summary --> process_uber_summary
+    end
+```
+
+---
+
+### **Annotations for the Diagram**
+
+#### **Detail Batch**
+- **`raw_process`**: The raw input data for processes.
+- **`process`**: Transformed and structured data from `raw_process`.
+- **`raw_process_conn_incr`**: The raw input data for process connections.
+- **`process_conn_incr`**: Transformed and structured data from `raw_process_conn_incr`.
+- **`process_net_conn`**: Aggregated connection-level data derived from `process_conn_incr`.
+
+#### **Summary Batch**
+- **`process_registry_summary`**: Summarized registry activity for processes (from `process_registry`).
+- **`process_file_summary`**: Summarized file activity for processes (from `process_file`).
+- **`process_net_summary`**: Summarized network activity for processes (from `process_net_conn`).
+- **`process_image_load_summary`**: Summarized image load activity for processes (from `process_image_load`).
+- **`process_summary`**: A comprehensive summary combining data from all summarized views (`process_registry_summary`, `process_file_summary`, `process_net_summary`, `process_image_load_summary`) and enriched with host-level details (`host`).
+
+---
+
+### **Key Highlights**
+- The **Detail Batch** focuses on creating foundational tables from raw data.
+- The **Summary Batch** builds higher-level summaries and combines them into a single, comprehensive `process_summary` view.
+- The hierarchical structure ensures clarity in how data flows from raw inputs to detailed tables and then to summarized views.
diff --git a/setup.py b/setup.py
@@ -21,6 +21,7 @@
     "pyarrow",
     "python-dotenv",
     "pyyaml",
+    "s3fs",
     "toml",
     "tqdm",
 ]

diff --git a/tests/etlutils/test_utils.py b/tests/etlutils/test_utils.py
@@ -18,22 +18,22 @@ def test_get_date_range_default(self, mock_datetime: mock.MagicMock, mock_listdi
         mock_datetime.utcnow.return_value = dt(2021, 12, 5, 11, 24)
         mock_datetime.return_value = dt(2021, 12, 5)
         mock_listdir.return_value = []
-        start, end = get_date_range("", "")
+        start, end = get_date_range("", "", "ACME")
         print(f"{start}  {end}")
         print(f"{dt(2021, 12, 5, 11, 24)}")
         assert start == None
         assert end == None
-        start, end = get_date_range(None, None)
+        start, end = get_date_range(None, None, "ACME")
         assert start == None
         assert end == None
 
     def test_get_date_range_specified(self) -> None:
-        start, end = get_date_range("20231101", "20231108")
+        start, end = get_date_range("20231101", "20231108", "ACME")
         assert start == dt(2023, 11, 1, 0, 0)
         assert end == dt(2023, 11, 8, 0, 0)
 
     def test_get_date_range_specified_datefmt(self) -> None:
-        start, end = get_date_range("20231101 05 05", "20231108 10 56", "%Y%m%d %H %M")
+        start, end = get_date_range("20231101 05 05", "20231108 10 56", "ACME", date_format = "%Y%m%d %H %M")
         assert start == dt(2023, 11, 1, 5, 5)
         assert end == dt(2023, 11, 8, 10, 56)
 

diff --git a/wintappy/database/wintap_duckdb.py b/wintappy/database/wintap_duckdb.py
@@ -114,7 +114,12 @@ def write_table(
                 pathspec = f"{path}"
                 filename = f"{table}.parquet"
             else:
-                pathspec = f"{path}/rolling/{table}/dayPK={partition_key}"
+                # TODO: Find a cleaner way to fix this!
+                # In some cases the "path" already includes rolling, so make sure we don't duplicate it here.
+                if path.endswith("rolling"):
+                    pathspec = f"{path}/{table}/dayPK={partition_key}"
+                else:
+                    pathspec = f"{path}/rolling/{table}/dayPK={partition_key}"
                 filename = f"{table}-{partition_key}.parquet"
             if not os.path.exists(pathspec):
                 os.makedirs(pathspec)

diff --git a/wintappy/datautils/ProcessPath.md b/wintappy/datautils/ProcessPath.md
@@ -0,0 +1,22 @@
+For larger datasets, processing by partitioning is required. 
+
+Pseudo code:
+
+setup
+   create table tmp_process where false (just need object, no data, at this point)
+   create view process_path_v1
+   create table process_path as from process_path_v1 where false 
+
+for host in hosts:
+   drop table tmp_process
+   create table tmp_process where hostname=host
+   insert into process_path select * from process_path_v1
+
+Single host tests (ACME-Redo: ACME-DC1)
+
+No index: Run Time (s): real 516.086 user 996.254755 sys 374.332785
+Pid_Hash: Run Time (s): real 523.609 user 1014.446506 sys 396.068710
+ParentPH: Run Time (s): real 524.081 user 1015.510835 sys 392.339936
+
+Remove p.hostname=pt.hostname qualifier
+ParentPH: Run Time (s): real 515.749 user 974.264660 sys 382.148094
diff --git a/wintappy/datautils/label_summary.sql b/wintappy/datautils/label_summary.sql
@@ -1,4 +1,5 @@
--- Create a summary view for all NetworkX (Everest) labels
+-- View for all nodes in the graph data
+-- id is based on the node_type
 create or replace view labels_graph_nodes
 as
 select
@@ -12,6 +13,7 @@ select
 		when node_type = 'File' then node->>'$.FileKey[0].Filename'
 		when node_type = 'FiveTupleConn' then concat_ws(':',node->>'$.FiveTupleKey[0].protocol',node->>'$.FiveTupleKey[0].RemoteIp', node->>'$.FiveTupleKey[0].RemotePort')
 		when node_type = 'IpConn' then node->>'$.IpConnKey[0].ID'
+		when node_type = 'IpV4Addr' then node->>'$.IpV4Addr[0].IP'
 		else node_type||' missing in view'
 	end as label
 from
@@ -25,10 +27,27 @@ order by
 	all
 ;
 
+-- View over the graph links: recursively unnests the links column of
+-- labels_networkx and keeps the source filename alongside each result.
+-- TODO: map link attribute features to dedicated SQL columns so they are easier to work with later.
+create or replace view labels_graph_links
+as
+select
+	unnest(links, recursive:=true) as link,
+	filename
+from labels_networkx
+;
+
+-- Macro for getting a simple name for the file: takes the last '/'-separated
+-- component of filename and removes the text '.json' from it
+-- (e.g. 'Sources/run1/graph.json' -> 'graph').
+-- "or replace" keeps this script re-runnable, matching the
+-- "create or replace view" statements in the rest of the file.
+create or replace macro tmp_basename(filename)
+as
+replace(string_split(filename,'/')[-1:][1],'.json','')
+;
+
 -- Process node graph labels summarized by PID_HASH
 create or replace view labels_graph_process_summary
 as
-select id pid_hash, 'networkx' as label_source, count(distinct filename) label_num_sources, count(distinct annotation) label_num_uniq_annotations, count(*) label_num_hits
+select id pid_hash, list_sort(list(distinct tmp_basename(filename))) as label_sources, count(distinct tmp_basename(filename)) label_num_sources, list_sort(list(distinct annotation)) label_annonations, count(distinct annotation) label_num_uniq_annotations, count(*) label_num_hits
 from labels_graph_nodes 
 where node_type ='Process'
 group by ALL 
@@ -38,7 +57,7 @@ group by ALL
 -- Note: This view is just created for convenience for users later and must be joined to base network data.
 create or replace view labels_graph_net_conn
 as
-select id conn_id, 'networkx' as label_source, count(distinct filename) label_num_sources, count(distinct annotation) label_num_uniq_annotations, count(*) label_num_hits
+select id conn_id, list_sort(list(distinct tmp_basename(filename))) as label_sources, count(distinct tmp_basename(filename)) label_num_sources, list_sort(list(distinct annotation)) label_annonations, count(distinct annotation) label_num_uniq_annotations, count(*) label_num_hits
 from labels_graph_nodes 
 where node_type ='FiveTupleConn'
 group by ALL 

diff --git a/wintappy/datautils/lolbas_summary.sql b/wintappy/datautils/lolbas_summary.sql
@@ -5,14 +5,18 @@
 -- Summarize LOLBAS hits to PID_HASH
 create table process_lolbas_summary
 as
-select pid_hash, lolbas_privs, lolbas_cats, lolbas_mitre, count(*) lolbas_num_rows
+select pls.pid_hash, pls.lolbas_privs, pls.lolbas_cats, pls.lolbas_mitre, lr.lolc_class, count(*) lolbas_num_rows,
 from (
 	select pid_hash,
+      p.process_name,
+      p.args,
 	  list_sort(list(distinct Command_Privileges)) lolbas_privs,
 	  list_sort(list(distinct Command_Category)) lolbas_cats,
-	  list_sort(list(distinct MITRE_ATTCK_technique)) lolbas_mitre,
+	  list_sort(list(distinct MITRE_ATTCK_technique)) lolbas_mitre
 	from main.process p
-	join lolbas l on lower(l.filename)=p.process_name 
+	join lolbas l on lower(l.filename)=p.process_name
 	group by all
-) group by all
+) pls
+left outer join lolc_labels lr on pls.process_name=lr.process_name and pls.args=lr.args
+group by all
 ;
-Original file line number
+Diff line change
@@ Expand Up / @@ -21,6 +21,7 @@ @@
         "pyarrow",
         "python-dotenv",
         "pyyaml",
+        "s3fs",
         "toml",
         "tqdm",
     ]
@@ Expand Down @@