End-to-End Analytics Pipeline in SQL
You will build a complete analytics database for a retail company. The pipeline covers schema design, data loading, exploratory queries, revenue analysis, customer segmentation, and connecting SQL results to Python or R for visualization.
-- Enable foreign keys PRAGMA foreign_keys = ON; CREATE TABLE customers ( customer_id INTEGER PRIMARY KEY, first_name TEXT NOT NULL, last_name TEXT NOT NULL, email TEXT UNIQUE, city TEXT, signup_date TEXT NOT NULL ); CREATE TABLE categories ( category_id INTEGER PRIMARY KEY, category_name TEXT NOT NULL UNIQUE ); CREATE TABLE products ( product_id INTEGER PRIMARY KEY, product_name TEXT NOT NULL, category_id INTEGER, unit_price REAL CHECK(unit_price > 0), FOREIGN KEY (category_id) REFERENCES categories(category_id) ); CREATE TABLE orders ( order_id INTEGER PRIMARY KEY, customer_id INTEGER NOT NULL, order_date TEXT NOT NULL, FOREIGN KEY (customer_id) REFERENCES customers(customer_id) ); CREATE TABLE order_items ( order_id INTEGER, product_id INTEGER, quantity INTEGER NOT NULL CHECK(quantity > 0), unit_price REAL NOT NULL, PRIMARY KEY (order_id, product_id), FOREIGN KEY (order_id) REFERENCES orders(order_id), FOREIGN KEY (product_id) REFERENCES products(product_id) );
-- Categories INSERT INTO categories VALUES (1, 'Electronics'), (2, 'Clothing'), (3, 'Home & Kitchen'), (4, 'Books'); -- Products (sample) INSERT INTO products VALUES (101, 'Wireless Headphones', 1, 79.99), (102, 'USB-C Hub', 1, 34.99), (103, 'Running Shoes', 2, 119.99), (104, 'Cotton T-Shirt', 2, 24.99), (105, 'Coffee Maker', 3, 89.99), (106, 'Desk Lamp', 3, 45.99), (107, 'Data Science Handbook', 4, 39.99), (108, 'SQL Cookbook', 4, 29.99); -- Customers (sample) INSERT INTO customers VALUES (1, 'Alice', 'Chen', 'alice@example.com', 'New York', '2023-01-15'), (2, 'Bob', 'Smith', 'bob@example.com', 'Chicago', '2023-03-22'), (3, 'Carol', 'Jones', 'carol@example.com', 'Austin', '2023-06-10'), (4, 'David', 'Lee', 'david@example.com', 'Seattle', '2023-09-05'), (5, 'Eve', 'Park', 'eve@example.com', 'Boston', '2024-01-12'); -- Orders and order items INSERT INTO orders VALUES (1001, 1, '2024-01-10'), (1002, 2, '2024-01-15'), (1003, 1, '2024-02-20'), (1004, 3, '2024-03-05'), (1005, 4, '2024-03-18'), (1006, 2, '2024-04-02'), (1007, 1, '2024-05-14'), (1008, 5, '2024-06-01'), (1009, 3, '2024-06-20'), (1010, 1, '2024-07-08'); INSERT INTO order_items VALUES (1001, 101, 1, 79.99), (1001, 108, 2, 29.99), (1002, 103, 1, 119.99), (1003, 102, 1, 34.99), (1003, 107, 1, 39.99), (1004, 105, 1, 89.99), (1005, 104, 3, 24.99), (1005, 106, 1, 45.99), (1006, 101, 1, 79.99), (1006, 104, 2, 24.99), (1007, 105, 1, 89.99), (1008, 103, 1, 119.99), (1008, 107, 2, 39.99), (1009, 106, 2, 45.99), (1010, 101, 1, 79.99), (1010, 102, 1, 34.99);
-- Record counts SELECT 'customers' AS tbl, COUNT(*) AS n FROM customers UNION ALL SELECT 'orders', COUNT(*) FROM orders UNION ALL SELECT 'items', COUNT(*) FROM order_items UNION ALL SELECT 'products', COUNT(*) FROM products; -- Date range SELECT MIN(order_date) AS first_order, MAX(order_date) AS last_order FROM orders;
-- Monthly revenue with year-over-year comparison WITH monthly AS ( SELECT STRFTIME('%Y-%m', o.order_date) AS month, SUM(oi.quantity * oi.unit_price) AS revenue FROM orders o INNER JOIN order_items oi ON o.order_id = oi.order_id GROUP BY month ) SELECT month, revenue, LAG(revenue) OVER (ORDER BY month) AS prev_month, ROUND( (revenue - LAG(revenue) OVER (ORDER BY month)) * 100.0 / LAG(revenue) OVER (ORDER BY month), 1) AS mom_growth_pct, SUM(revenue) OVER (ORDER BY month) AS cumulative_revenue FROM monthly; -- Revenue by category SELECT cat.category_name, SUM(oi.quantity * oi.unit_price) AS revenue, ROUND( SUM(oi.quantity * oi.unit_price) * 100.0 / (SELECT SUM(quantity * unit_price) FROM order_items), 1) AS pct_of_total FROM order_items oi INNER JOIN products p ON oi.product_id = p.product_id INNER JOIN categories cat ON p.category_id = cat.category_id GROUP BY cat.category_name ORDER BY revenue DESC;
RFM analysis segments customers by Recency (days since last purchase), Frequency (number of orders), and Monetary value (total spend).
WITH rfm AS ( SELECT c.customer_id, c.first_name || ' ' || c.last_name AS customer_name, JULIANDAY('2024-07-31') - JULIANDAY(MAX(o.order_date)) AS recency_days, COUNT(DISTINCT o.order_id) AS frequency, SUM(oi.quantity * oi.unit_price) AS monetary FROM customers c INNER JOIN orders o ON c.customer_id = o.customer_id INNER JOIN order_items oi ON o.order_id = oi.order_id GROUP BY c.customer_id ) SELECT customer_name, recency_days, frequency, ROUND(monetary, 2) AS monetary, CASE WHEN recency_days <= 30 AND frequency >= 3 THEN 'Champion' WHEN recency_days <= 60 AND frequency >= 2 THEN 'Loyal' WHEN recency_days <= 90 THEN 'Recent' WHEN frequency >= 2 THEN 'At Risk' ELSE 'Lapsed' END AS segment FROM rfm ORDER BY monetary DESC;
-- Top products by revenue and quantity SELECT p.product_name, cat.category_name, SUM(oi.quantity) AS units_sold, ROUND(SUM(oi.quantity * oi.unit_price), 2) AS revenue, DENSE_RANK() OVER (ORDER BY SUM(oi.quantity * oi.unit_price) DESC) AS revenue_rank FROM order_items oi INNER JOIN products p ON oi.product_id = p.product_id INNER JOIN categories cat ON p.category_id = cat.category_id GROUP BY p.product_id ORDER BY revenue DESC; -- Products frequently bought together SELECT p1.product_name AS product_a, p2.product_name AS product_b, COUNT(*) AS times_bought_together FROM order_items oi1 INNER JOIN order_items oi2 ON oi1.order_id = oi2.order_id AND oi1.product_id < oi2.product_id INNER JOIN products p1 ON oi1.product_id = p1.product_id INNER JOIN products p2 ON oi2.product_id = p2.product_id GROUP BY oi1.product_id, oi2.product_id ORDER BY times_bought_together DESC;
-- Customer retention by signup quarter WITH cohort AS ( SELECT c.customer_id, STRFTIME('%Y-Q', c.signup_date) || ((CAST(STRFTIME('%m', c.signup_date) AS INTEGER) - 1) / 3 + 1) AS signup_quarter, MIN(o.order_date) AS first_order FROM customers c LEFT JOIN orders o ON c.customer_id = o.customer_id GROUP BY c.customer_id ) SELECT signup_quarter, COUNT(*) AS total_customers, COUNT(first_order) AS converted, ROUND(COUNT(first_order) * 100.0 / COUNT(*), 1) AS conversion_pct FROM cohort GROUP BY signup_quarter ORDER BY signup_quarter;
-- Create reusable views for dashboards CREATE VIEW v_monthly_revenue AS SELECT STRFTIME('%Y-%m', o.order_date) AS month, COUNT(DISTINCT o.order_id) AS orders, SUM(oi.quantity * oi.unit_price) AS revenue FROM orders o INNER JOIN order_items oi ON o.order_id = oi.order_id GROUP BY month; CREATE VIEW v_customer_lifetime AS SELECT c.customer_id, c.first_name || ' ' || c.last_name AS name, COUNT(DISTINCT o.order_id) AS total_orders, ROUND(SUM(oi.quantity * oi.unit_price), 2) AS lifetime_value FROM customers c LEFT JOIN orders o ON c.customer_id = o.customer_id LEFT JOIN order_items oi ON o.order_id = oi.order_id GROUP BY c.customer_id;
-- Python: sqlite3 (built-in)
import sqlite3 import pandas as pd conn = sqlite3.connect('analytics.db') df = pd.read_sql_query("SELECT * FROM v_monthly_revenue", conn) print(df) conn.close()
-- R: DBI + RSQLite
library(DBI) library(RSQLite) con <- dbConnect(SQLite(), "analytics.db") df <- dbGetQuery(con, "SELECT * FROM v_customer_lifetime") print(df) dbDisconnect(con)
Extend the schema with a returns table. Insert sample return data and write a query that calculates the return rate per product (returns / units sold) and per category.
Build a complete RFM segmentation for the sample data. Assign each customer a score from 1-3 on Recency, Frequency, and Monetary. Combine the scores into a single RFM code (e.g., "3-3-3" for the best customers) and count how many customers fall into each segment.
Write a Python script that connects to the analytics.db database, executes the monthly revenue query, and creates a line chart using matplotlib. Save the chart as a PNG file.
Design and build your own analytics database for a domain you are interested in (e.g., a gym membership system, a library, a restaurant). Create at least 4 tables, load sample data, and write 5 analytical queries that would be useful for a manager in that domain.