Why SQL Matters
SQL is the universal language of data. Every data engineer needs to master SQL for querying databases, building transformations, and analyzing data. Modern data warehouses and tools like dbt use SQL as the primary interface.
This guide covers advanced SQL techniques essential for data engineering work.
Window Functions
-- Running totals
SELECT
order_date,
amount,
SUM(amount) OVER (ORDER BY order_date) as running_total,
SUM(amount) OVER (
PARTITION BY customer_id
ORDER BY order_date
) as customer_running_total
FROM orders;
-- Ranking
SELECT
customer_id,
amount,
ROW_NUMBER() OVER (ORDER BY amount DESC) as row_num,
RANK() OVER (ORDER BY amount DESC) as rank,
DENSE_RANK() OVER (ORDER BY amount DESC) as dense_rank,
NTILE(4) OVER (ORDER BY amount DESC) as quartile
FROM orders;
-- Lead and Lag (previous/next values)
SELECT
customer_id,
order_date,
amount,
LAG(amount, 1) OVER (
PARTITION BY customer_id ORDER BY order_date
) as previous_amount,
LEAD(amount, 1) OVER (
PARTITION BY customer_id ORDER BY order_date
) as next_amount,
amount - LAG(amount, 1) OVER (
PARTITION BY customer_id ORDER BY order_date
) as amount_change
FROM orders;
-- First and Last values
SELECT
customer_id,
order_date,
amount,
FIRST_VALUE(amount) OVER (
PARTITION BY customer_id ORDER BY order_date
) as first_order_amount,
LAST_VALUE(amount) OVER (
PARTITION BY customer_id
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) as last_order_amount
FROM orders;
Common Table Expressions (CTEs)
-- Basic CTE
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', order_date) as month,
SUM(amount) as total_sales
FROM orders
GROUP BY 1
)
SELECT
month,
total_sales,
LAG(total_sales) OVER (ORDER BY month) as prev_month,
total_sales - LAG(total_sales) OVER (ORDER BY month) as mom_change
FROM monthly_sales;
-- Multiple CTEs
WITH
customers AS (
SELECT customer_id, customer_name, segment
FROM dim_customers
WHERE is_active = true
),
orders AS (
SELECT customer_id, SUM(amount) as total_spend
FROM fact_orders
WHERE order_date >= '2024-01-01'
GROUP BY customer_id
),
enriched AS (
SELECT
c.customer_id,
c.customer_name,
c.segment,
COALESCE(o.total_spend, 0) as total_spend
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
)
SELECT * FROM enriched WHERE total_spend > 1000;
-- Recursive CTE (hierarchical data)
WITH RECURSIVE org_hierarchy AS (
-- Base case: top-level managers
SELECT employee_id, name, manager_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive case: employees with managers
SELECT e.employee_id, e.name, e.manager_id, h.level + 1
FROM employees e
JOIN org_hierarchy h ON e.manager_id = h.employee_id
)
SELECT * FROM org_hierarchy ORDER BY level, name;
Advanced Joins
-- Self join (compare rows in same table)
SELECT
a.employee_id,
a.name as employee,
b.name as manager
FROM employees a
LEFT JOIN employees b ON a.manager_id = b.employee_id;
-- Cross join (cartesian product)
SELECT
d.date,
p.product_id,
COALESCE(s.quantity, 0) as quantity
FROM dim_dates d
CROSS JOIN dim_products p
LEFT JOIN fact_sales s
ON d.date = s.sale_date
AND p.product_id = s.product_id
WHERE d.date BETWEEN '2024-01-01' AND '2024-01-31';
-- Lateral join (correlated subquery as join)
SELECT
c.customer_id,
c.customer_name,
recent.order_date,
recent.amount
FROM customers c
CROSS JOIN LATERAL (
SELECT order_date, amount
FROM orders o
WHERE o.customer_id = c.customer_id
ORDER BY order_date DESC
LIMIT 3
) recent;
-- Anti-join (records not in other table)
SELECT c.*
FROM customers c
LEFT JOIN orders o ON c.customer_id = o.customer_id
WHERE o.customer_id IS NULL;
-- Or using NOT EXISTS
SELECT c.*
FROM customers c
WHERE NOT EXISTS (
SELECT 1 FROM orders o
WHERE o.customer_id = c.customer_id
);
Aggregation Techniques
-- GROUPING SETS (multiple groupings in one query)
SELECT
COALESCE(region, 'All Regions') as region,
COALESCE(product_category, 'All Categories') as category,
SUM(amount) as total_sales
FROM sales
GROUP BY GROUPING SETS (
(region, product_category),
(region),
(product_category),
()
);
-- ROLLUP (hierarchical grouping)
SELECT
year,
quarter,
month,
SUM(amount) as total_sales
FROM sales
GROUP BY ROLLUP (year, quarter, month);
-- CUBE (all combinations)
SELECT
region,
product_category,
SUM(amount) as total_sales
FROM sales
GROUP BY CUBE (region, product_category);
-- Conditional aggregation
SELECT
customer_id,
COUNT(*) as total_orders,
COUNT(*) FILTER (WHERE status = 'completed') as completed_orders,
SUM(amount) FILTER (WHERE order_date >= '2024-01-01') as ytd_spend,
AVG(amount) FILTER (WHERE product_category = 'Electronics') as avg_electronics
FROM orders
GROUP BY customer_id;
Date and Time Operations
-- Date truncation
SELECT
DATE_TRUNC('day', timestamp_col) as day,
DATE_TRUNC('week', timestamp_col) as week_start,
DATE_TRUNC('month', timestamp_col) as month_start,
DATE_TRUNC('quarter', timestamp_col) as quarter_start
FROM events;
-- Date arithmetic
SELECT
order_date,
order_date + INTERVAL '30 days' as due_date,
order_date - INTERVAL '1 year' as year_ago,
CURRENT_DATE - order_date as days_since_order,
DATE_PART('dow', order_date) as day_of_week,
EXTRACT(MONTH FROM order_date) as month
FROM orders;
-- Generate date series
SELECT generate_series(
'2024-01-01'::date,
'2024-12-31'::date,
'1 day'::interval
)::date as date;
-- Fill missing dates
WITH date_range AS (
SELECT generate_series(
MIN(order_date),
MAX(order_date),
'1 day'::interval
)::date as date
FROM orders
)
SELECT
d.date,
COALESCE(SUM(o.amount), 0) as daily_sales
FROM date_range d
LEFT JOIN orders o ON d.date = o.order_date
GROUP BY d.date
ORDER BY d.date;
Query Optimization
-- Use EXPLAIN ANALYZE to understand query performance
EXPLAIN ANALYZE
SELECT * FROM orders WHERE customer_id = 123;
-- Create indexes for common queries
CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_orders_date ON orders(order_date);
-- Composite index for multi-column filters
CREATE INDEX idx_orders_customer_date
ON orders(customer_id, order_date);
-- Partial index for filtered queries
CREATE INDEX idx_active_orders
ON orders(customer_id) WHERE status = 'active';
-- Avoid SELECT *
-- Bad:
SELECT * FROM orders WHERE customer_id = 123;
-- Good:
SELECT order_id, order_date, amount
FROM orders WHERE customer_id = 123;
-- Use EXISTS instead of IN for large subqueries
-- Slower:
SELECT * FROM customers
WHERE customer_id IN (SELECT customer_id FROM orders);
-- Faster:
SELECT * FROM customers c
WHERE EXISTS (
SELECT 1 FROM orders o WHERE o.customer_id = c.customer_id
);
Data Quality Queries
-- Find duplicates
SELECT
email,
COUNT(*) as count
FROM customers
GROUP BY email
HAVING COUNT(*) > 1;
-- Check for nulls
SELECT
COUNT(*) as total_rows,
COUNT(email) as non_null_email,
COUNT(*) - COUNT(email) as null_email,
ROUND(100.0 * COUNT(email) / COUNT(*), 2) as completeness_pct
FROM customers;
-- Validate referential integrity
SELECT o.*
FROM orders o
LEFT JOIN customers c ON o.customer_id = c.customer_id
WHERE c.customer_id IS NULL;
-- Check value distributions
SELECT
status,
COUNT(*) as count,
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) as percentage
FROM orders
GROUP BY status
ORDER BY count DESC;
-- Find outliers
WITH stats AS (
SELECT
AVG(amount) as avg_amount,
STDDEV(amount) as std_amount
FROM orders
)
SELECT o.*
FROM orders o, stats s
WHERE o.amount > s.avg_amount + 3 * s.std_amount
OR o.amount < s.avg_amount - 3 * s.std_amount;
Best Practices
- Use CTEs: Make queries readable and maintainable
- Index wisely: Create indexes based on query patterns
- Avoid SELECT *: Select only needed columns
- Filter early: Reduce data volume as soon as possible
- Use EXPLAIN: Understand query execution plans
- Test with data: Validate queries with realistic data volumes
Master SQL with Expert Mentorship
Our Data Engineering program covers advanced SQL techniques for building robust data pipelines. Learn from industry experts with hands-on practice.
Explore Data Engineering Program