Why Version ML Projects?
ML projects involve more than just code — you also need to track data, models, hyperparameters, and metrics. Without proper versioning, reproducing results becomes nearly impossible.
- MLflow: Experiment tracking, model registry, deployment
- DVC: Data and model versioning with Git-like workflow
MLflow: Getting Started
# Install MLflow from PyPI
pip install mlflow
# Start the local tracking server / web UI at http://localhost:5000
mlflow ui --port 5000
# In your training script
import mlflow
import mlflow.sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Group runs under a named experiment (created automatically if missing)
mlflow.set_experiment("classification-experiment")

# Start a run; the context manager closes the run even if training fails
with mlflow.start_run(run_name="random-forest-v1"):
    # Log hyperparameters
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 10)
    mlflow.log_param("random_state", 42)

    # Train model (X_train / y_train / X_test / y_test prepared elsewhere)
    model = RandomForestClassifier(n_estimators=100, max_depth=10)
    model.fit(X_train, y_train)

    # Log metrics
    accuracy = model.score(X_test, y_test)
    # fix: 'predictions' was referenced below but never computed
    predictions = model.predict(X_test)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1_score(y_test, predictions))

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(model, "model")

    # Log arbitrary artifacts (plots, data files)
    mlflow.log_artifact("confusion_matrix.png")
MLflow Autologging
import mlflow

# Enable autologging for the frameworks you use (each call is independent)
mlflow.sklearn.autolog()
mlflow.pytorch.autolog()
mlflow.tensorflow.autolog()
mlflow.xgboost.autolog()

# Now training automatically logs everything
from sklearn.ensemble import RandomForestClassifier

with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    # Parameters, metrics, and the model are automatically logged!
MLflow Model Registry
import mlflow
from mlflow.tracking import MlflowClient

# Register a model from an existing run (replace run_id with a real run ID)
mlflow.register_model(
    "runs:/run_id/model",
    "CustomerChurnModel",
)

# Or register while logging
with mlflow.start_run():
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="CustomerChurnModel",
    )

# Manage model versions through the client API
client = MlflowClient()

# Transition version 1 to staging
client.transition_model_version_stage(
    name="CustomerChurnModel",
    version=1,
    stage="Staging",
)

# Transition version 1 to production
client.transition_model_version_stage(
    name="CustomerChurnModel",
    version=1,
    stage="Production",
)

# Load whichever version is currently in production
model = mlflow.pyfunc.load_model(
    "models:/CustomerChurnModel/Production"
)
MLflow Projects
# MLproject file
name: ml-project
conda_env: conda.yaml

entry_points:
  main:
    parameters:
      n_estimators: {type: int, default: 100}
      max_depth: {type: int, default: 10}
    command: "python train.py --n_estimators {n_estimators} --max_depth {max_depth}"
  preprocess:
    command: "python preprocess.py"

# conda.yaml
name: ml-env
channels:
  - defaults
dependencies:
  - python=3.11
  - scikit-learn
  - pandas
  - pip:
      - mlflow

# Run the project locally, overriding parameters with -P
mlflow run . -P n_estimators=200 -P max_depth=15

# Run directly from a Git repository
mlflow run https://github.com/user/ml-project -P n_estimators=200
DVC: Getting Started
# Install DVC
pip install dvc
pip install dvc-s3 # For S3 remote storage support
# Initialize DVC inside a Git repository (DVC metadata lives in Git)
git init
dvc init
# Track a data file: DVC stores the data, Git tracks the small .dvc pointer
dvc add data/train.csv
git add data/train.csv.dvc data/.gitignore
# Commit the pointer file to Git
git commit -m "Add training data"
# Configure default (-d) remote storage for data
dvc remote add -d myremote s3://mybucket/dvcstore
# Push tracked data to the remote
dvc push
# Pull tracked data from the remote (e.g. on another machine)
dvc pull
DVC Pipelines
# dvc.yaml - Define pipeline stages
stages:
  preprocess:
    cmd: python src/preprocess.py
    deps:
      - src/preprocess.py
      - data/raw/
    outs:
      - data/processed/

  train:
    cmd: python src/train.py
    deps:
      - src/train.py
      - data/processed/
    params:
      - train.n_estimators
      - train.max_depth
    outs:
      - models/model.pkl
    metrics:
      - metrics/scores.json:
          cache: false

  evaluate:
    cmd: python src/evaluate.py
    deps:
      - src/evaluate.py
      - models/model.pkl
      - data/processed/test.csv
    metrics:
      - metrics/eval.json:
          cache: false
    plots:
      - metrics/confusion_matrix.csv

# params.yaml
train:
  n_estimators: 100
  max_depth: 10

# Run the pipeline (only stages with changed deps/params re-run)
dvc repro

# Show metrics
dvc metrics show

# Compare metrics against the last committed version
dvc metrics diff
DVC Experiments
# Run an experiment, overriding (-S) a value from params.yaml
dvc exp run -S train.n_estimators=200
# Queue multiple experiments without running them yet
dvc exp run --queue -S train.n_estimators=100
dvc exp run --queue -S train.n_estimators=200
dvc exp run --queue -S train.n_estimators=300
dvc exp run --run-all # Run all queued experiments
# Show a table of experiments with params and metrics
dvc exp show
# Compare two experiments by their auto-generated names
dvc exp diff exp-abc123 exp-def456
# Apply an experiment's results to the workspace
dvc exp apply exp-abc123
# Promote an experiment to its own Git branch
dvc exp branch exp-abc123 new-branch
Combining MLflow and DVC
# train.py - Use both tools together
import json  # fix: json.dump was used below but never imported
import mlflow
import yaml
import joblib

from sklearn.ensemble import RandomForestClassifier

# Load DVC-managed hyperparameters
with open("params.yaml") as f:
    params = yaml.safe_load(f)["train"]

# Set up MLflow
mlflow.set_experiment("my-experiment")

with mlflow.start_run():
    # Log parameters from DVC's params.yaml so both tools agree
    mlflow.log_params(params)

    # Train model (X_train / y_train / X_test / y_test prepared elsewhere)
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluate
    accuracy = model.score(X_test, y_test)
    mlflow.log_metric("accuracy", accuracy)

    # Save model to the path tracked by DVC
    joblib.dump(model, "models/model.pkl")

    # Log model to MLflow as well
    mlflow.sklearn.log_model(model, "model")

    # Save metrics where dvc.yaml expects them
    with open("metrics/scores.json", "w") as f:
        json.dump({"accuracy": accuracy}, f)
Best Practices
- Track everything: Parameters, metrics, artifacts, and data versions
- Use meaningful names: Name runs and experiments descriptively
- Automate: Use autologging and pipelines
- Remote storage: Store large files (data, models) remotely
- Code + Data: Version code with Git, data with DVC
- CI/CD: Integrate with your CI/CD pipeline
Master MLOps
Our Data Science program covers MLflow, DVC, and the complete MLOps lifecycle. Build reproducible ML systems.
Explore Data Science Program