sbms4d · domagoj-gusic · Oct 13, 2025
diff --git a/calculate_largest_expensors.sql b/calculate_largest_expensors.sql
@@ -0,0 +1,33 @@
+USE memory.default;
+
+/*
+First we need to find all expenses, so we select from the "EXPENSE" table. After that, we join
+it to the "EMPLOYEE" table, and then to "EMPLOYEE" again, to get the actual employees and their managers. After that, it's just
+a matter of a simple sum and filter.
+
+We use HAVING instead of WHERE because of SQL's order of operations: HAVING is evaluated after the actual aggregation, while
+WHERE is evaluated before aggregation.
+*/
+-- Employees whose total expensed amount exceeds 1000, largest total first.
+-- manager_id is taken from the employee row: the "manager" alias is the manager's own EMPLOYEE row,
+-- so manager.manager_id would report the manager's manager rather than the employee's manager.
+SELECT
+    employee.employee_id,
+    CONCAT(employee.first_name, ' ', employee.last_name) AS employee_name,
+    employee.manager_id,
+    CONCAT(manager.first_name, ' ', manager.last_name) AS manager_name,
+    SUM(expense.unit_price * expense.quantity) AS total_expensed_amount
+FROM EXPENSE AS expense
+LEFT JOIN EMPLOYEE AS employee
+    ON expense.employee_id = employee.employee_id
+LEFT JOIN EMPLOYEE AS manager
+    ON manager.employee_id = employee.manager_id
+GROUP BY
+    employee.employee_id,
+    CONCAT(employee.first_name, ' ', employee.last_name),
+    employee.manager_id,
+    CONCAT(manager.first_name, ' ', manager.last_name)
+HAVING
+    SUM(expense.unit_price * expense.quantity) > 1000
+ORDER BY
+    SUM(expense.unit_price * expense.quantity) DESC;
diff --git a/create_employees.sql b/create_employees.sql
@@ -0,0 +1,28 @@
+USE memory.default;
+
+/*
+In the real world, creating a table would not be so trivial (except perhaps for really small or manually created use cases).
+Another option for loading data (from .csv, .parquet, etc.) would be to connect to an external location such as S3 or GCS, or even
+the local file system, and load from there.
+
+As I have a time constraint on this task, I've decided to load the data manually.
+*/
+CREATE TABLE IF NOT EXISTS EMPLOYEE (  -- one row per employee; skipped if the table already exists
+    employee_id     TINYINT,           -- identifier that manager_id and EXPENSE.employee_id refer to
+    first_name      VARCHAR,
+    last_name       VARCHAR,
+    job_title       VARCHAR,
+    manager_id      TINYINT            -- employee_id of this employee's manager
+);
+
+-- Data as shown in hr/employee_index.csv. Columns are named explicitly so the load does not depend on column order.
+INSERT INTO EMPLOYEE (employee_id, first_name, last_name, job_title, manager_id) VALUES
+    (1, 'Ian', 'James', 'CEO', 4),
+    (2, 'Umberto', 'Torrielli', 'CSO', 1),
+    (3, 'Alex', 'Jacobson', 'MD EMEA', 2),
+    (4, 'Darren', 'Poynton', 'CFO', 2),
+    (5, 'Tim', 'Beard', 'MD APAC', 2),
+    (6, 'Gemma', 'Dodd', 'COS', 1),
+    (7, 'Lisa', 'Platten', 'CHR', 6),
+    (8, 'Stefano', 'Camisaca', 'GM Activation', 2),
+    (9, 'Andrea', 'Ghibaudi', 'MD NAM', 2);
diff --git a/create_expenses.sql b/create_expenses.sql
@@ -0,0 +1,20 @@
+USE memory.default;
+
+/*
+Same as for create_employees.sql...
+*/
+CREATE TABLE IF NOT EXISTS EXPENSE (  -- one row per expensed item; skipped if the table already exists
+    employee_id     TINYINT,          -- EMPLOYEE.employee_id of the person the expense belongs to
+    unit_price      DECIMAL(8, 2),    -- price of a single unit
+    quantity        TINYINT           -- number of units; multiplied by unit_price to get the expensed amount
+);
+
+-- Data as shown in finance/receipts_from_last_night/*.txt. Given the small sample, employee ids were looked up manually from the names.
+INSERT INTO EXPENSE (employee_id, unit_price, quantity) VALUES
+    (3, 6.50, 14),
+    (3, 11.00, 20),
+    (3, 22.00, 18),
+    (3, 13.00, 75),
+    (9, 300.00, 1),
+    (4, 40.00, 9),
+    (2, 17.50, 4);
diff --git a/create_invoices.sql b/create_invoices.sql
@@ -0,0 +1,35 @@
+USE memory.default;
+
+/*
+Same as for create_employees.sql...
+*/
+CREATE TABLE IF NOT EXISTS SUPPLIER (  -- one row per supplier; skipped if the table already exists
+    supplier_id     TINYINT,           -- identifier that INVOICE.supplier_id refers to
+    name            VARCHAR            -- supplier name
+);
+
+-- Data as shown in finance/invoices/*.txt. Columns are named explicitly so the load does not depend on column order.
+INSERT INTO SUPPLIER (supplier_id, name) VALUES
+    (1, 'Catering Plus'),
+    (2, 'Dave''s Discos'),
+    (3, 'Entertainment tonight'),
+    (4, 'Ice Ice Baby'),
+    (5, 'Party Animals');
+
+/*
+Same as for create_employees.sql...
+*/
+CREATE TABLE IF NOT EXISTS INVOICE (  -- one row per invoice; skipped if the table already exists
+    supplier_id     TINYINT,          -- SUPPLIER.supplier_id this invoice belongs to
+    invoice_amount  DECIMAL(8, 2),    -- amount of this invoice
+    due_date        DATE              -- date the invoice is due
+);
+
+-- Data as shown in finance/invoices/*.txt. Each due date is the last day of the month N months after the day this script runs.
+INSERT INTO INVOICE (supplier_id, invoice_amount, due_date) VALUES
+    (5, 6000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (1, 2000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 2, CURRENT_DATE))),
+    (1, 1500.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (2, 500.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 1, CURRENT_DATE))),
+    (3, 6000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (4, 4000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 6, CURRENT_DATE)));
diff --git a/find_manager_cycles.sql b/find_manager_cycles.sql
@@ -0,0 +1,44 @@
+USE memory.default;
+
+-- Walk every employee's chain of managers with a recursive CTE and report the chains that loop back to their start
+WITH RECURSIVE manager_loop (
+    start_employee,
+    current_employee,
+    next_manager,
+    path
+) AS (
+    -- Anchor: one single-element chain per employee
+    SELECT
+        emp.employee_id,
+        emp.employee_id,
+        emp.manager_id,
+        ARRAY[emp.employee_id]
+    FROM
+        EMPLOYEE AS emp
+
+    UNION ALL
+
+    -- Recursive step: extend each chain with the manager of its last employee,
+    -- skipping managers that are already on the chain
+    SELECT
+        trail.start_employee,
+        mgr.employee_id,
+        mgr.manager_id,
+        trail.path || mgr.employee_id
+    FROM
+        manager_loop AS trail
+    INNER JOIN
+        EMPLOYEE AS mgr
+            ON trail.next_manager = mgr.employee_id
+    WHERE
+        NOT CONTAINS(trail.path, mgr.employee_id)
+)
+-- A chain is a cycle when the next manager to visit is the employee the chain started from;
+-- that employee is appended to the path so the printed cycle is closed
+SELECT
+    trail.start_employee AS employee_id,
+    ARRAY_JOIN(trail.path || trail.next_manager, ', ') AS full_cycle_path
+FROM
+    manager_loop AS trail
+WHERE
+    trail.next_manager = trail.start_employee;
diff --git a/generate_supplier_payment_plans.sql b/generate_supplier_payment_plans.sql
@@ -0,0 +1,63 @@
+USE memory.default;
+
+-- Build a month-end payment plan per supplier: the supplier's total invoiced amount is spread over one payment
+-- per month, from the end of the current month up to the month of the supplier's latest due date.
+WITH supplier_invoice AS (
+    SELECT
+        -- Group on the invoice's own supplier_id so invoices without a SUPPLIER row stay separate per id
+        i.supplier_id,
+        s.name AS supplier_name,
+        CAST(SUM(i.invoice_amount) AS DECIMAL(10, 2)) AS sum_invoice_amt,
+        -- Computed once; GREATEST keeps it >= 1 so an overdue total is one payment, not a division by zero
+        GREATEST(DATE_DIFF('month', CURRENT_DATE, MAX(i.due_date)) + 1, 1) AS payment_num
+    FROM
+        INVOICE i
+    LEFT JOIN
+        SUPPLIER s
+            ON i.supplier_id = s.supplier_id
+    GROUP BY
+        i.supplier_id,
+        s.name
+),
+
+-- Whole-unit monthly instalment for each supplier.
+payment_details AS (
+    SELECT
+        supplier_id,
+        supplier_name,
+        sum_invoice_amt,
+        payment_num,
+        CAST(FLOOR(sum_invoice_amt / payment_num) AS DECIMAL(10, 2)) AS payment_amt_monthly
+    FROM
+        supplier_invoice
+),
+
+-- One row per supplier and month (seq = 0 is the current month), dated on the last day of that month.
+payment_schedule AS (
+    SELECT
+        d.supplier_id,
+        d.supplier_name,
+        d.sum_invoice_amt,
+        CASE
+            -- Last payment: whatever remains after the earlier instalments, so the plan sums to the total
+            WHEN seq = d.payment_num - 1 THEN CAST(d.sum_invoice_amt - d.payment_amt_monthly * (d.payment_num - 1) AS DECIMAL(10, 2))
+            ELSE d.payment_amt_monthly
+        END AS payment_amount,
+        LAST_DAY_OF_MONTH(DATE_ADD('month', seq, CURRENT_DATE)) AS payment_date
+    FROM
+        payment_details d
+    CROSS JOIN
+        UNNEST(SEQUENCE(0, d.payment_num - 1)) AS t(seq)
+)
+-- Final report: balance_outstanding is the supplier total minus the running sum of payments up to each date.
+SELECT
+    supplier_id,
+    supplier_name,
+    payment_amount,
+    sum_invoice_amt - SUM(payment_amount) OVER (PARTITION BY supplier_id ORDER BY payment_date) AS balance_outstanding,
+    payment_date
+FROM
+    payment_schedule
+ORDER BY
+    supplier_id,
+    payment_date;
diff --git a/tests/00_referential_integrity.sql b/tests/00_referential_integrity.sql
@@ -0,0 +1,49 @@
+USE memory.default;
+
+-- Test 1: Expenses logged by non-existent employees.
+-- Returns the employee_id of every EXPENSE row for which EMPLOYEE has no row with the same employee_id
+-- (one output row per such expense row).
+-- Expected result: an empty set, i.e. no orphaned expense records.
+SELECT
+    e.employee_id
+FROM
+    EXPENSE e
+WHERE
+    NOT EXISTS (
+        SELECT 1
+        FROM EMPLOYEE emp
+        WHERE emp.employee_id = e.employee_id
+    );
+
+-- Test 2: Employees with non-existent managers (self-referencing foreign key check).
+-- Returns every employee whose manager_id is set but matches no employee_id in EMPLOYEE.
+-- Rows with a NULL manager_id are skipped, because the CEO's manager_id might be NULL
+-- and that is not an integrity violation.
+-- Expected result: an empty set.
+SELECT
+    emp.employee_id,
+    emp.manager_id
+FROM
+    EMPLOYEE emp
+WHERE
+    emp.manager_id IS NOT NULL
+    AND NOT EXISTS (
+        SELECT 1
+        FROM EMPLOYEE mgr
+        WHERE mgr.employee_id = emp.manager_id
+    );
+
+-- Test 3: Invoices from non-existent suppliers.
+-- Returns the supplier_id of every INVOICE row for which SUPPLIER has no row with the same supplier_id
+-- (one output row per such invoice row).
+-- Expected result: an empty set, i.e. every invoice is linked to a valid supplier.
+SELECT
+    i.supplier_id
+FROM
+    INVOICE i
+WHERE
+    NOT EXISTS (
+        SELECT 1
+        FROM SUPPLIER s
+        WHERE s.supplier_id = i.supplier_id
+    );
diff --git a/tests/01_data_validation.sql b/tests/01_data_validation.sql
@@ -0,0 +1,104 @@
+USE memory.default;
+
+/*
+Test 1: Row Count Checks
+One output row per table, holding the table name and its total number of rows.
+Useful for tracking table growth and for verifying data loads.
+*/
+SELECT
+    'EMPLOYEE' AS table_name,
+    COUNT(*) AS row_count
+FROM
+    EMPLOYEE
+UNION ALL
+SELECT
+    'EXPENSE',
+    COUNT(*)
+FROM
+    EXPENSE
+UNION ALL
+SELECT
+    'SUPPLIER',
+    COUNT(*)
+FROM
+    SUPPLIER
+UNION ALL
+SELECT
+    'INVOICE',
+    COUNT(*)
+FROM
+    INVOICE;
+
+/*
+Test 2: Null Value Checks for Primary Keys
+Counts the rows whose key column (EMPLOYEE.employee_id, SUPPLIER.supplier_id) is NULL.
+These columns should never be NULL, so the ideal count is 0 for both tables.
+Each branch always returns exactly one row.
+
+Note: a UNION ALL result takes its column names from the first branch, so both counts are
+returned in the column named null_employee_ids.
+*/
+SELECT
+    'EMPLOYEE' AS table_name,
+    COUNT_IF(employee_id IS NULL) AS null_employee_ids
+FROM
+    EMPLOYEE
+
+UNION ALL
+
+SELECT
+    'SUPPLIER' AS table_name,
+    COUNT_IF(supplier_id IS NULL) AS null_supplier_ids
+FROM
+    SUPPLIER;
+
+/*
+Test 3: Uniqueness Checks for Primary Keys
+Each query lists the key values that occur on more than one row, together with their row count.
+An ideal result is an empty set for both queries.
+*/
+-- Duplicate EMPLOYEE.employee_id values
+SELECT
+    employee_id,
+    COUNT(*)
+FROM
+    EMPLOYEE
+GROUP BY employee_id
+HAVING
+    COUNT(*) >= 2;
+
+-- Duplicate SUPPLIER.supplier_id values
+SELECT
+    supplier_id,
+    COUNT(*)
+FROM
+    SUPPLIER
+GROUP BY supplier_id
+HAVING
+    COUNT(*) >= 2;
+
+/*
+Test 4: Data Constraint Checks
+Lists expense records that violate logical data constraints: a unit price or a quantity
+that is zero or negative.
+An ideal result is an empty set.
+*/
+SELECT
+    employee_id,
+    unit_price,
+    quantity
+FROM
+    EXPENSE
+WHERE
+    -- Parenthesised so the OR cannot be regrouped by AND precedence if more conditions are added
+    (unit_price <= 0 OR quantity <= 0);
+
+-- Check for invoice amounts that are zero or negative.
+-- An ideal result is an empty set.
+SELECT
+    inv.supplier_id,
+    inv.invoice_amount
+FROM
+    INVOICE AS inv
+WHERE
+    inv.invoice_amount <= 0;