Building AI applications in Python requires more than just understanding machine learning algorithms. Let's explore the practices that separate prototype code from production-ready AI systems.
Project Structure
A well-organized project makes collaboration easier and code more maintainable:
my-ai-project/
├── src/
│   ├── __init__.py
│   ├── models/          # ML model definitions
│   ├── data/            # Data processing
│   ├── training/        # Training scripts
│   └── utils/           # Helper functions
├── tests/               # Unit and integration tests
├── notebooks/           # Jupyter notebooks for exploration
├── configs/             # Configuration files
├── requirements.txt     # Dependencies
└── README.md
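Pinning versions in requirements.txt keeps environments reproducible across machines. A minimal illustration (the packages and versions below are placeholders, not recommendations):

# requirements.txt
numpy==1.26.4
torch==2.2.0
pyyaml==6.0.1
pytest==8.0.0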
Type Hints
Type hints make your code self-documenting and catch errors early:
from typing import Optional, Tuple

import numpy as np

def preprocess_data(
    data: np.ndarray,
    normalize: bool = True,
    fill_missing: Optional[float] = None,
) -> Tuple[np.ndarray, dict]:
    """
    Preprocess input data for model training.

    Args:
        data: Input data array
        normalize: Whether to normalize features
        fill_missing: Value to fill missing data (None = no filling)

    Returns:
        Processed data and metadata dictionary
    """
    metadata = {'shape': data.shape}

    if fill_missing is not None:
        data = np.nan_to_num(data, nan=fill_missing)

    if normalize:
        # Normalize per feature (column-wise), not over the whole array
        mean, std = data.mean(axis=0), data.std(axis=0)
        data = (data - mean) / std
        metadata['mean'] = mean
        metadata['std'] = std

    return data, metadata
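A quick usage sketch (assuming preprocess_data is importable): beyond documenting intent, the annotations let a checker such as mypy flag bad calls before runtime.

import numpy as np

raw = np.array([[1.0, np.nan], [3.0, 4.0]])
clean, meta = preprocess_data(raw, normalize=False, fill_missing=0.0)
print(meta['shape'])  # (2, 2)

# preprocess_data(raw.tolist())  # a type checker flags this: list is not ndarray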
Configuration Management
Use configuration files instead of hardcoding values:
# config.py
from dataclasses import dataclass

@dataclass
class ModelConfig:
    learning_rate: float = 0.001
    batch_size: int = 32
    epochs: int = 100
    hidden_size: int = 128
    dropout: float = 0.2

@dataclass
class DataConfig:
    train_path: str = "data/train.csv"
    val_path: str = "data/val.csv"
    test_size: float = 0.2
    random_seed: int = 42

# Usage: override only what differs from the defaults
config = ModelConfig(learning_rate=0.0001, epochs=50)
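To move these values into actual files, a thin loader can overlay file contents on the dataclass defaults. This is one possible sketch, assuming the PyYAML package and a configs/model.yaml file (both are assumptions, not requirements):

import yaml  # pip install pyyaml

def load_model_config(path: str) -> ModelConfig:
    """Overlay YAML values on the dataclass defaults."""
    with open(path) as f:
        overrides = yaml.safe_load(f) or {}
    return ModelConfig(**overrides)

# configs/model.yaml might contain:
#   learning_rate: 0.0005
#   epochs: 200
config = load_model_config("configs/model.yaml")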
Error Handling
AI applications face error scenarios that ordinary software rarely hits: missing or corrupt data files, arrays of the wrong shape, and NaN or infinite values. Wrapping them in a domain-specific exception gives callers one thing to catch:
import logging

import numpy as np

logger = logging.getLogger(__name__)

class DataProcessingError(Exception):
    """Raised when data processing fails"""
    pass

def load_and_validate_data(filepath: str) -> np.ndarray:
    """Load data with comprehensive error handling."""
    try:
        data = np.load(filepath)
    except FileNotFoundError:
        logger.error(f"Data file not found: {filepath}")
        raise DataProcessingError(f"Missing data file: {filepath}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise DataProcessingError(f"Failed to load data: {e}") from e

    # Validate data shape and type
    if data.ndim != 2:
        raise DataProcessingError(f"Expected 2D array, got {data.ndim}D")
    if not np.isfinite(data).all():
        logger.warning("Data contains NaN or inf values")

    return data
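Callers can then catch the single domain-specific exception instead of juggling numpy and IO errors. A usage sketch (the path is illustrative):

try:
    data = load_and_validate_data("data/train.npy")
except DataProcessingError as e:
    logger.error(f"Aborting pipeline: {e}")
    raise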
Testing AI Code
Testing ML code requires special considerations, such as comparing floating-point arrays within a tolerance rather than for exact equality:
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal

# Assumes preprocess_data is importable from your package

def test_preprocessing_normalizes_data():
    """Test that preprocessing normalizes data correctly"""
    # Arrange
    data = np.array([[1, 2], [3, 4], [5, 6]])

    # Act
    result, metadata = preprocess_data(data, normalize=True)

    # Assert: each feature column is zero-mean, unit-variance
    assert_array_almost_equal(result.mean(axis=0), [0, 0], decimal=10)
    assert_array_almost_equal(result.std(axis=0), [1, 1], decimal=10)
    assert 'mean' in metadata
    assert 'std' in metadata

def test_preprocessing_handles_missing_values():
    """Test missing value handling"""
    data = np.array([[1.0, np.nan], [3.0, 4.0]])
    result, _ = preprocess_data(data, normalize=False, fill_missing=0.0)
    assert not np.isnan(result).any()
    assert result[0, 1] == 0.0
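Failure paths deserve tests too. A sketch using pytest's built-in tmp_path fixture, assuming load_and_validate_data from the previous section is importable:

def test_load_rejects_missing_file(tmp_path):
    """Missing files should raise the domain-specific error"""
    with pytest.raises(DataProcessingError):
        load_and_validate_data(str(tmp_path / "does_not_exist.npy"))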
Logging for ML
Proper logging helps debug and monitor AI systems:
import logging
from pathlib import Path

def setup_logging(log_dir: str = "logs") -> None:
    """Configure logging for the ML pipeline"""
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f"{log_dir}/training.log"),
            logging.StreamHandler()
        ]
    )

# During training (model, loaders, and the train_epoch/validate helpers
# are assumed to be defined elsewhere)
logger = logging.getLogger(__name__)

for epoch in range(config.epochs):
    train_loss = train_epoch(model, train_loader)
    val_loss = validate(model, val_loader)
    logger.info(
        f"Epoch {epoch}: "
        f"train_loss={train_loss:.4f}, "
        f"val_loss={val_loss:.4f}"
    )
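It also pays to call setup_logging() once at startup and record the full configuration, so every log file states exactly which hyperparameters produced it:

setup_logging()
logger.info(f"Starting run with config: {config}")  # dataclasses have a readable repr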
Reproducibility
Make your experiments reproducible:
import random

import numpy as np
import torch

def set_seed(seed: int = 42) -> None:
    """Set random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For CUDA determinism (may reduce performance)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Use at the start of training (random_seed lives on DataConfig above)
set_seed(config.random_seed)
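A quick sanity check that seeding works as intended: identical seeds should produce identical draws.

set_seed(42)
a = torch.rand(3)
set_seed(42)
b = torch.rand(3)
assert torch.equal(a, b)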
Code Documentation
Document your AI code thoroughly:
import torch
from torch.utils.data import DataLoader

def train_model(
    model: torch.nn.Module,
    train_data: DataLoader,
    val_data: DataLoader,
    config: ModelConfig,
) -> dict:
    """
    Train a PyTorch model with the given configuration.

    This function handles the complete training loop, including:
    - Forward and backward passes
    - Gradient updates
    - Validation after each epoch
    - Early stopping based on validation loss

    Args:
        model: PyTorch model to train
        train_data: Training data loader
        val_data: Validation data loader
        config: Training configuration

    Returns:
        Dictionary containing training history:
        - 'train_losses': List of training losses
        - 'val_losses': List of validation losses
        - 'best_epoch': Epoch with lowest validation loss

    Example:
        >>> config = ModelConfig(epochs=10, learning_rate=0.001)
        >>> history = train_model(model, train_loader, val_loader, config)
        >>> print(f"Best validation loss: {min(history['val_losses'])}")
    """
    # Implementation here
    pass
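For completeness, here is one possible body consistent with that docstring. It is a sketch, not the canonical implementation: Adam and MSE loss are assumptions, and early stopping is simplified to tracking the best epoch rather than halting.

def train_model(
    model: torch.nn.Module,
    train_data: DataLoader,
    val_data: DataLoader,
    config: ModelConfig,
) -> dict:
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    loss_fn = torch.nn.MSELoss()  # assumption: a regression task
    history = {'train_losses': [], 'val_losses': [], 'best_epoch': 0}
    best_val = float('inf')

    for epoch in range(config.epochs):
        # Forward/backward passes and gradient updates
        model.train()
        train_total = 0.0
        for x, y in train_data:
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
            train_total += loss.item()
        history['train_losses'].append(train_total / max(len(train_data), 1))

        # Validation after each epoch
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for x, y in val_data:
                val_total += loss_fn(model(x), y).item()
        val_loss = val_total / max(len(val_data), 1)
        history['val_losses'].append(val_loss)

        if val_loss < best_val:
            best_val, history['best_epoch'] = val_loss, epoch

    return history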
Performance Optimization
Profile and optimize your code:
import logging
import time
from functools import wraps

import numpy as np

logger = logging.getLogger(__name__)

def timeit(func):
    """Decorator to measure function execution time"""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        logger.info(f"{func.__name__} took {end - start:.4f}s")
        return result
    return wrapper

@timeit
def process_batch(batch: np.ndarray) -> np.ndarray:
    # Processing logic goes here
    return batch
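When the timing decorator points at a hotspot, Python's built-in cProfile can break the time down per function call. A minimal sketch (the random input is illustrative; requires Python 3.8+ for the context-manager form):

import cProfile
import pstats

with cProfile.Profile() as profiler:
    process_batch(np.random.rand(1000, 100))

pstats.Stats(profiler).sort_stats('cumulative').print_stats(10)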
Learn More
Ready to put these practices into action? Join our Python Learning Lab for interactive lessons and projects that teach both Python and AI development.
Next up: Advanced patterns for building production ML pipelines