-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData Exploration.sql
More file actions
238 lines (184 loc) · 6.11 KB
/
Data Exploration.sql
File metadata and controls
238 lines (184 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
Use DataWarehouseAnalytics;
-- Step 1: Database Exploration
-- exploring all the objects in the database
select
*
from INFORMATION_SCHEMA.TABLES
-- explore all the columns in the database
select
*
from INFORMATION_SCHEMA.COLUMNS
where TABLE_NAME = 'fact_Sales' and COLUMN_NAME = 'quantity';
-- Step 2: Dimensions Exploration
-- exploring the countries the customers come from.
select distinct
country
from gold.dim_customers;
-- result: We have customers from 6 different countries.
-- explore all product categories "The major divisions"
select distinct
category,
subcategory,
product_name
from gold.dim_products;
-- result: We have 4 different categories.
-- step 3: date exploration
-- identifying the latest and earliest orders
-- how many years of sales are available?
select
max(order_Date) as latest_order,
min(order_Date) as earliest_order,
datediff(year, min(order_Date), max(order_Date)) as orders_years
from gold.fact_sales;
-- result: 2014 was the latest and 2010 was the earliest order date. Also, we have data of 4 years.
-- find the youngest and the oldest customers and their age
select
min(birthdate) as oldest_birthdate,
max(birthdate) as youngest_birthdate,
datediff(year, min(birthdate), getdate()) as oldest_bd_age,
datediff(year, max(birthdate), getdate()) as youngest_bd_age
from gold.dim_customers;
-- result: 109 was the oldest age of the customer and 39 is the youngest one.
-- step 4: Measures Exploration
select
-- finding the total sales
sum(sales_amount) as total_Sales,
-- finding the number of items that were sold
sum(quantity) as total_sold_items,
-- finding the average selling price
avg(price) as average_selling_price,
-- finding the total number of orders
count(distinct order_number) as total_orders,
-- finding total number of products
count(distinct product_key) as total_products,
-- total number of customer
count(distinct customer_key) as total_customers
from gold.fact_sales;
-- generating a report that shows all key metrics of the business
select 'Total Sales' as measure_name, sum(sales_amount) as measure_values from gold.fact_sales
union all
select 'Total Quantity', sum(quantity) from gold.fact_sales
union all
select 'Average Price', avg(price) from gold.fact_sales
union all
select 'Totol Orders' ,count(distinct order_number) from gold.fact_sales
union all
select 'Total Products' ,count(distinct product_name) from gold.dim_products
union all
select 'Total customers' ,count(distinct customer_key) a from gold.dim_customers
/*
result: measure_name measure_values
Total Sales 29356250
Total Quantity 60423
Average Price 486
Totol Orders 27659
Total Products 295
Total customers 18484
*/
-- step 5: Magnitude Analysis
-- finding the total customers by countries
select
country,
count(customer_key) as total_customers
from gold.dim_customers
group by country
order by total_customers desc;
-- result: Most of the customers are from United States while canada has the lowest customers.
-- finding total customers by gender
select
gender,
count(customer_key) as total_Customers
from gold.dim_customers
group by gender
order by total_Customers desc;
-- result: The male population is greater.
-- finding the total products by category
select
category,
count(product_key) as total_products
from gold.dim_products
group by category
order by total_products desc;
-- result: Components has 127 product which is the highest while accessories has the least 29.
-- finding the average costs in each category
select
category,
avg(cost) as average_cost
from gold.dim_products
group by category
order by average_cost desc;
-- result: The bikes category has the highest average cost while the accessories remain at last.
-- finding the total revenue generated for each category
select
p.category,
sum(f.sales_amount) as total_Revenue
from gold.fact_Sales f
left join gold.dim_products p
on f.product_key = p.product_key
group by p.category
order by total_Revenue desc;
-- result: The bikes category generated the highest revenue while the clothing lowest.
-- finding the total revenue generated by each customer
select
c.customer_key,
sum(f.sales_Amount) as total_revenue
from gold.fact_sales f
left join gold.dim_customers c
on f.customer_key = c.customer_key
group by c.customer_key
order by total_revenue desc;
-- result: The customer with the customer_key 1133 and 1302 spended the most amount.
-- finding the distribution of items sold across countries.
select
c.country,
count(f.quantity) as items_sold
from gold.fact_sales f
left join gold.dim_customers c
on f.customer_key = c.customer_key
group by c.country
order by items_sold desc;
-- result: Max items were sold in United States whereas france has the lowest.
-- Step 6: Ranking Analysis
-- finding top 5 products that generated the highest revenue
select top 5
p.product_name,
sum(f.sales_amount) as total_revenue
from gold.fact_sales f
left join gold.dim_products p
on f.product_key = p.product_key
group by p.product_name
order by total_revenue desc;
-- Alternatively
select
*
from
(
select
p.product_name,
sum(f.sales_amount) as total_revenue,
ROW_NUMBER() over(order by sum(f.sales_amount) desc) as ranks
from gold.fact_sales f
left join gold.dim_products p
on f.product_key = p.product_key
group by p.product_name
) as sub_query
where ranks <=5;
-- finding the 5 worst- performing products in terms of sales
select top 5
p.product_name,
sum(f.sales_amount) as total_revenue
from gold.fact_sales f
left join gold.dim_products p
on f.product_key = p.product_key
group by p.product_name
order by total_revenue asc;
-- finding the 3 customers with the fewest orders placed
select top 3
c.customer_key,
count(distinct f.order_number) as no_of_orders
from gold.fact_Sales f
left join gold.dim_customers c
on f.customer_key = c.customer_key
group by c.customer_key
order by no_of_orders asc;
-- result: customer with the key 16, 17 and 21 placed the lowest of orders.