Why NumPy & Pandas?

NumPy and Pandas are the foundation of data science in Python. You'll use them in every project, every analysis, and every ML pipeline.

  • NumPy: Fast numerical computing with arrays. Underpins nearly every Python ML library.
  • Pandas: Data manipulation with DataFrames. Makes data cleaning and analysis easy.
import numpy as np
import pandas as pd

# 90% of your data science code will start with these imports!

NumPy Basics

import numpy as np

# Constructing arrays from scratch
arr = np.asarray([1, 2, 3, 4, 5])
zeros = np.zeros((3, 4))          # all-zero 3x4 matrix
ones = np.ones((2, 3))            # all-one 2x3 matrix
range_arr = np.arange(0, 10, 2)   # like range(): [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5)   # 5 points evenly spaced over [0, 1]

# Random arrays (handy for ML weight initialization)
random_uniform = np.random.rand(3, 4)     # uniform draws from [0, 1)
random_normal = np.random.randn(3, 4)     # draws from the standard normal
random_int = np.random.randint(0, 10, 5)  # integers in [0, 10)

# Inspecting an array
print(f"Shape: {arr.shape}")    # (5,)
print(f"Dtype: {arr.dtype}")    # int64
print(f"Size: {arr.size}")      # 5
print(f"Ndim: {arr.ndim}")      # 1

NumPy Operations

# Vectorized operations (100x faster than loops!)
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

# Arithmetic applies element by element — no explicit Python loop needed
print(np.add(a, b))       # [6, 8, 10, 12]
print(np.multiply(a, b))  # [5, 12, 21, 32]
print(np.power(a, 2))     # [1, 4, 9, 16]
print(np.sqrt(a))         # [1, 1.41, 1.73, 2]

# Reductions collapse an array down to a single number
print(a.sum())     # 10
print(a.mean())    # 2.5
print(a.std())     # 1.118
print(a.max())     # 4
print(a.argmax())  # 3 (index of max)

# Linear algebra
X = np.array([[1, 2], [3, 4]])
Y = np.array([[5, 6], [7, 8]])

print(X @ Y)       # Matrix multiplication
print(X.T)         # Transpose
print(X.dot(Y))    # Equivalent to X @ Y

# Changing shape without changing the data
arr = np.arange(12)
reshaped = arr.reshape(3, 4)   # 12 elements -> 3x4 matrix
flattened = reshaped.flatten() # copy back to 1D

NumPy Indexing & Slicing

# Slicing works like Python lists but more powerful
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

print(arr[0, 1])      # 2  -> row 0, column 1
print(arr[0])         # [1, 2, 3]  -> whole first row
print(arr[:, 0])      # [1, 4, 7]  -> whole first column
print(arr[0:2, 1:3])  # [[2, 3], [5, 6]]  -> 2x2 submatrix

# Boolean indexing (super useful for filtering!)
data = np.array([1, 5, 3, 8, 2, 9, 4])
mask = np.greater(data, 4)   # elementwise, same as data > 4
print(data[mask])     # [5, 8, 9]

# Fancy indexing
indices = np.array([0, 2, 4])
print(data[indices])  # [1, 3, 2]

# Where (conditional selection): keep value where mask holds, else 0
result = np.where(mask, data, 0)
print(result)         # [0, 5, 0, 8, 0, 9, 0]

Pandas: DataFrames

import pandas as pd

# Creating DataFrames — one row per record, columns named explicitly
records = [
    ('Alice', 25, 50000, 'Engineering'),
    ('Bob', 30, 60000, 'Sales'),
    ('Charlie', 35, 75000, 'Engineering'),
    ('Diana', 28, 55000, 'Marketing'),
]
df = pd.DataFrame(records, columns=['name', 'age', 'salary', 'department'])

print(df)
#       name  age  salary   department
# 0    Alice   25   50000  Engineering
# 1      Bob   30   60000        Sales
# 2  Charlie   35   75000  Engineering
# 3    Diana   28   55000    Marketing

# From CSV (most common)
df = pd.read_csv('data.csv')

# DataFrame properties
print(df.shape)      # (4, 4)
print(df.columns)    # Index(['name', 'age', ...])
print(df.dtypes)     # Data types of each column
# info() prints its overview itself and returns None —
# wrapping it in print() would also emit a stray "None" line.
df.info()
print(df.describe()) # Statistical summary

Selecting Data

# Column selection
df['name']           # Single column (returns a Series)
df[['name', 'age']]  # List of columns (returns a DataFrame)

# Row selection with loc (label-based)
df.loc[0]            # Row whose index label is 0
df.loc[0:2]          # Rows 0, 1, 2 — loc slicing INCLUDES the end label
df.loc[0, 'name']    # Single cell: row label 0, column 'name'

# Row selection with iloc (integer position-based)
df.iloc[0]           # First row by position
df.iloc[0:2]         # First 2 rows — iloc slicing EXCLUDES the end, like Python lists
df.iloc[0, 1]        # Row 0, Column 1

# Filtering (most important!)
df[df['age'] > 28]                    # Boolean mask keeps only matching rows
df[df['department'] == 'Engineering'] # Engineers only
df[(df['age'] > 25) & (df['salary'] > 55000)]  # Combine with & / |; parenthesize each condition

# Query (more readable for complex filters; column names used directly in the string)
df.query('age > 28 and salary > 55000')

Data Cleaning

# Handling missing values
df.isna().sum()           # Count missing values per column
df.dropna()               # Drop rows with any missing value
df.dropna(subset=['age']) # Drop a row only if 'age' is missing
df.fillna(0)              # Fill every missing value with 0
# numeric_only=True is required: plain df.mean() raises TypeError
# on string columns in pandas >= 2.0.
df.fillna(df.mean(numeric_only=True))  # Fill with column means
# Assign back rather than using inplace=True on a selected column:
# chained inplace fillna is deprecated and silently ineffective
# under copy-on-write (the pandas 3.0 default).
df['age'] = df['age'].fillna(df['age'].median())

# Duplicates
df.duplicated().sum()     # Count fully duplicated rows
df.drop_duplicates()      # Remove duplicated rows
df.drop_duplicates(subset=['name'], keep='first')  # Dedupe on 'name' only

# Data types
df['age'] = df['age'].astype(int)
df['date'] = pd.to_datetime(df['date'])  # NOTE(review): assumes a 'date' column exists — the sample df above has none

# String operations (element-wise via the .str accessor)
df['name'] = df['name'].str.lower()
df['name'] = df['name'].str.strip()
df['name'].str.contains('ali')

# Renaming columns
df.rename(columns={'name': 'full_name'}, inplace=True)
df.columns = ['col1', 'col2', 'col3', 'col4']  # Replace all column names at once

Aggregation & Grouping

# Basic aggregations (each returns a single scalar)
df['salary'].mean()
df['age'].max()
df['salary'].sum()

# GroupBy (SQL-like aggregation)
# "Split-Apply-Combine": split rows by department, apply mean, combine the results
df.groupby('department')['salary'].mean()
# department
# Engineering    62500
# Marketing      55000
# Sales          60000

# Multiple aggregations: dict maps column -> function(s); result has a MultiIndex on columns
df.groupby('department').agg({
    'salary': ['mean', 'min', 'max'],
    'age': 'mean'
})

# Named aggregations: new_column=(source_column, function) gives flat, readable column names
df.groupby('department').agg(
    avg_salary=('salary', 'mean'),
    employee_count=('name', 'count'),
    oldest=('age', 'max')
)

# Pivot tables: spreadsheet-style summary (rows = index, cells = aggregated values)
pd.pivot_table(df,
    values='salary',
    index='department',
    aggfunc=['mean', 'count']
)

Creating New Columns

# Simple calculations (vectorized over the whole column at once)
df['salary_monthly'] = df['salary'] / 12
df['age_group'] = df['age'] // 10 * 10  # Floor to the decade: 25 -> 20, 38 -> 30

# Conditional columns
df['senior'] = df['age'] > 30  # Boolean column from an element-wise comparison

# Using apply (for complex logic)
def categorize_salary(salary):
    """Bucket a salary into 'Low' (< 55k), 'Medium' (< 70k), or 'High'."""
    for ceiling, label in ((55000, 'Low'), (70000, 'Medium')):
        if salary < ceiling:
            return label
    return 'High'

df['salary_category'] = df['salary'].apply(categorize_salary)  # Runs the Python function once per value (slower than vectorized ops)

# Using np.where (faster — vectorized if/else over the whole column)
df['is_engineer'] = np.where(
    df['department'] == 'Engineering',
    'Yes',
    'No'
)

# Using map (for simple dict lookups; unmapped values become NaN)
dept_codes = {'Engineering': 'ENG', 'Sales': 'SAL', 'Marketing': 'MKT'}
df['dept_code'] = df['department'].map(dept_codes)

Merging & Joining

# Merge (like SQL JOIN)
employees = pd.DataFrame(
    [(1, 'Alice', 101), (2, 'Bob', 102), (3, 'Charlie', 101), (4, 'Diana', 103)],
    columns=['emp_id', 'name', 'dept_id'],
)

departments = pd.DataFrame(
    [(101, 'Engineering'), (102, 'Sales'), (103, 'Marketing')],
    columns=['dept_id', 'dept_name'],
)

# Inner join (the default) — method form of pd.merge
merged = employees.merge(departments, on='dept_id')

# Left join: keep every employee row even without a matching department
merged = employees.merge(departments, on='dept_id', how='left')

# Concatenation (stacking DataFrames)
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

pd.concat([df1, df2])              # Stack vertically (rows on rows)
pd.concat([df1, df2], axis=1)      # Stack horizontally (side by side)

Time Series with Pandas

# Datetime handling: parse strings, then index by date
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Date components (available on the DatetimeIndex)
df['year'] = df.index.year
df['month'] = df.index.month
df['day_of_week'] = df.index.dayofweek  # Monday=0 ... Sunday=6

# Resampling (aggregation by time period)
daily_sales = df['sales'].resample('D').sum()     # Daily
weekly_sales = df['sales'].resample('W').mean()   # Weekly
# 'ME' = month-end; the bare 'M' alias is deprecated since pandas 2.2
monthly_sales = df['sales'].resample('ME').sum()  # Monthly

# Rolling windows (first window-1 rows are NaN until the window fills)
df['rolling_mean'] = df['sales'].rolling(window=7).mean()
df['rolling_std'] = df['sales'].rolling(window=7).std()

# Shift (for lag features)
df['previous_day'] = df['sales'].shift(1)   # value from the previous row
df['next_day'] = df['sales'].shift(-1)      # value from the next row

Real-World Example

# Complete data preparation pipeline
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('customer_data.csv')

# Quick exploration
print(df.shape)
# info() prints its overview itself; print(df.info()) would also emit a stray "None"
df.info()
print(df.describe())
print(df.isna().sum())

# Clean data
df = df.dropna(subset=['customer_id'])  # A row without an ID is unusable
df['age'] = df['age'].fillna(df['age'].median())      # median is robust to outliers
df['income'] = df['income'].fillna(df['income'].mean())

# Remove outliers (IQR method: keep values within 1.5*IQR of the quartiles)
Q1 = df['income'].quantile(0.25)
Q3 = df['income'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['income'] >= Q1 - 1.5*IQR) & (df['income'] <= Q3 + 1.5*IQR)]

# Feature engineering
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 40, 60, 100],
                         labels=['Young', 'Adult', 'Middle', 'Senior'])
df['income_per_age'] = df['income'] / df['age']

# Encode categorical variables (drop_first avoids a redundant dummy column)
df = pd.get_dummies(df, columns=['gender', 'region'], drop_first=True)

# Prepare for ML: features X, label y
X = df.drop(['target', 'customer_id'], axis=1)
y = df['target']

print(f"Features: {X.shape}")
print("Ready for model training!")  # plain string — no placeholders, so no f-string needed

Master Data Manipulation

Our Data Science program builds your NumPy and Pandas skills from the ground up.

Explore Data Science Program

Related Articles