Building AI applications in Python requires more than just understanding machine learning algorithms. Let's explore the practices that separate prototype code from production-ready AI systems.
Project Structure
A well-organized project makes collaboration easier and code more maintainable:
my-ai-project/
├── src/
│   ├── __init__.py
│   ├── models/          # ML model definitions
│   ├── data/            # Data processing
│   ├── training/        # Training scripts
│   └── utils/           # Helper functions
├── tests/               # Unit and integration tests
├── notebooks/           # Jupyter notebooks for exploration
├── configs/             # Configuration files
├── requirements.txt     # Dependencies
└── README.md
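Pinning versions in requirements.txt keeps environments reproducible across machines. A minimal illustration (the packages and versions below are placeholders, not recommendations):

# requirements.txt
numpy==1.26.4
torch==2.2.0
pyyaml==6.0.1
pytest==8.0.0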
Type Hints
Type hints make your code self-documenting and catch errors early:
from typing import Optional, Tuple

import numpy as np

def preprocess_data(
    data: np.ndarray,
    normalize: bool = True,
    fill_missing: Optional[float] = None,
) -> Tuple[np.ndarray, dict]:
    """
    Preprocess input data for model training.

    Args:
        data: Input data array
        normalize: Whether to normalize features
        fill_missing: Value to fill missing data (None = no filling)

    Returns:
        Processed data and metadata dictionary
    """
    metadata = {'shape': data.shape}

    if fill_missing is not None:
        data = np.nan_to_num(data, nan=fill_missing)

    if normalize:
        # Normalize per feature (column-wise), not over the whole array
        mean, std = data.mean(axis=0), data.std(axis=0)
        data = (data - mean) / std
        metadata['mean'] = mean
        metadata['std'] = std

    return data, metadata
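A quick usage sketch (assuming preprocess_data is importable): beyond documenting intent, the annotations let a checker such as mypy flag bad calls before runtime.

import numpy as np

raw = np.array([[1.0, np.nan], [3.0, 4.0]])
clean, meta = preprocess_data(raw, normalize=False, fill_missing=0.0)
print(meta['shape'])  # (2, 2)

# preprocess_data(raw.tolist())  # a type checker flags this: list is not ndarray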
Configuration Management
Use configuration files instead of hardcoding values:
# config.py
from dataclasses import dataclass

@dataclass
class ModelConfig:
    learning_rate: float = 0.001
    batch_size: int = 32
    epochs: int = 100
    hidden_size: int = 128
    dropout: float = 0.2

@dataclass
class DataConfig:
    train_path: str = "data/train.csv"
    val_path: str = "data/val.csv"
    test_size: float = 0.2
    random_seed: int = 42

# Usage: override only what differs from the defaults
config = ModelConfig(learning_rate=0.0001, epochs=50)
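To move these values into actual files, a thin loader can overlay file contents on the dataclass defaults. This is one possible sketch, assuming the PyYAML package and a configs/model.yaml file (both are assumptions, not requirements):

import yaml  # pip install pyyaml

def load_model_config(path: str) -> ModelConfig:
    """Overlay YAML values on the dataclass defaults."""
    with open(path) as f:
        overrides = yaml.safe_load(f) or {}
    return ModelConfig(**overrides)

# configs/model.yaml might contain:
#   learning_rate: 0.0005
#   epochs: 200
config = load_model_config("configs/model.yaml")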
Error Handling
AI applications face error scenarios that ordinary software rarely hits: missing or corrupt data files, arrays of the wrong shape, and NaN or infinite values. Wrapping them in a domain-specific exception gives callers one thing to catch:
import logging

import numpy as np

logger = logging.getLogger(__name__)

class DataProcessingError(Exception):
    """Raised when data processing fails"""
    pass

def load_and_validate_data(filepath: str) -> np.ndarray:
    """Load data with comprehensive error handling."""
    try:
        data = np.load(filepath)
    except FileNotFoundError:
        logger.error(f"Data file not found: {filepath}")
        raise DataProcessingError(f"Missing data file: {filepath}")
    except Exception as e:
        logger.error(f"Error loading data: {e}")
        raise DataProcessingError(f"Failed to load data: {e}") from e

    # Validate data shape and type
    if data.ndim != 2:
        raise DataProcessingError(f"Expected 2D array, got {data.ndim}D")
    if not np.isfinite(data).all():
        logger.warning("Data contains NaN or inf values")

    return data
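Callers can then catch the single domain-specific exception instead of juggling numpy and IO errors. A usage sketch (the path is illustrative):

try:
    data = load_and_validate_data("data/train.npy")
except DataProcessingError as e:
    logger.error(f"Aborting pipeline: {e}")
    raise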
Testing AI Code
Testing ML code requires special considerations, such as comparing floating-point arrays within a tolerance rather than for exact equality:
import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal

# Assumes preprocess_data is importable from your package

def test_preprocessing_normalizes_data():
    """Test that preprocessing normalizes data correctly"""
    # Arrange
    data = np.array([[1, 2], [3, 4], [5, 6]])

    # Act
    result, metadata = preprocess_data(data, normalize=True)

    # Assert: each feature column is zero-mean, unit-variance
    assert_array_almost_equal(result.mean(axis=0), [0, 0], decimal=10)
    assert_array_almost_equal(result.std(axis=0), [1, 1], decimal=10)
    assert 'mean' in metadata
    assert 'std' in metadata

def test_preprocessing_handles_missing_values():
    """Test missing value handling"""
    data = np.array([[1.0, np.nan], [3.0, 4.0]])
    result, _ = preprocess_data(data, normalize=False, fill_missing=0.0)
    assert not np.isnan(result).any()
    assert result[0, 1] == 0.0
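Failure paths deserve tests too. A sketch using pytest's built-in tmp_path fixture, assuming load_and_validate_data from the previous section is importable:

def test_load_rejects_missing_file(tmp_path):
    """Missing files should raise the domain-specific error"""
    with pytest.raises(DataProcessingError):
        load_and_validate_data(str(tmp_path / "does_not_exist.npy"))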
Logging for ML
Proper logging helps debug and monitor AI systems:
import logging
from pathlib import Path

def setup_logging(log_dir: str = "logs") -> None:
    """Configure logging for the ML pipeline"""
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(f"{log_dir}/training.log"),
            logging.StreamHandler()
        ]
    )

# During training (model, loaders, and the train_epoch/validate helpers
# are assumed to be defined elsewhere)
logger = logging.getLogger(__name__)

for epoch in range(config.epochs):
    train_loss = train_epoch(model, train_loader)
    val_loss = validate(model, val_loader)
    logger.info(
        f"Epoch {epoch}: "
        f"train_loss={train_loss:.4f}, "
        f"val_loss={val_loss:.4f}"
    )
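It also pays to call setup_logging() once at startup and record the full configuration, so every log file states exactly which hyperparameters produced it:

setup_logging()
logger.info(f"Starting run with config: {config}")  # dataclasses have a readable repr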
Reproducibility
Make your experiments reproducible:
import random

import numpy as np
import torch

def set_seed(seed: int = 42) -> None:
    """Set random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For CUDA determinism (may reduce performance)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Use at the start of training (random_seed lives on DataConfig above)
set_seed(config.random_seed)
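A quick sanity check that seeding works as intended: identical seeds should produce identical draws.

set_seed(42)
a = torch.rand(3)
set_seed(42)
b = torch.rand(3)
assert torch.equal(a, b)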
Code Documentation
Document your AI code thoroughly:
import torch
from torch.utils.data import DataLoader

def train_model(
    model: torch.nn.Module,
    train_data: DataLoader,
    val_data: DataLoader,
    config: ModelConfig,
) -> dict:
    """
    Train a PyTorch model with the given configuration.

    This function handles the complete training loop, including:
    - Forward and backward passes
    - Gradient updates
    - Validation after each epoch
    - Early stopping based on validation loss

    Args:
        model: PyTorch model to train
        train_data: Training data loader
        val_data: Validation data loader
        config: Training configuration

    Returns:
        Dictionary containing training history:
        - 'train_losses': List of training losses
        - 'val_losses': List of validation losses
        - 'best_epoch': Epoch with lowest validation loss

    Example:
        >>> config = ModelConfig(epochs=10, learning_rate=0.001)
        >>> history = train_model(model, train_loader, val_loader, config)
        >>> print(f"Best validation loss: {min(history['val_losses'])}")
    """
    # Implementation here
    pass
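For completeness, here is one possible body consistent with that docstring. It is a sketch, not the canonical implementation: Adam and MSE loss are assumptions, and early stopping is simplified to tracking the best epoch rather than halting.

def train_model(
    model: torch.nn.Module,
    train_data: DataLoader,
    val_data: DataLoader,
    config: ModelConfig,
) -> dict:
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    loss_fn = torch.nn.MSELoss()  # assumption: a regression task
    history = {'train_losses': [], 'val_losses': [], 'best_epoch': 0}
    best_val = float('inf')

    for epoch in range(config.epochs):
        # Forward/backward passes and gradient updates
        model.train()
        train_total = 0.0
        for x, y in train_data:
            optimizer.zero_grad()
            loss = loss_fn(model(x), y)
            loss.backward()
            optimizer.step()
            train_total += loss.item()
        history['train_losses'].append(train_total / max(len(train_data), 1))

        # Validation after each epoch
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for x, y in val_data:
                val_total += loss_fn(model(x), y).item()
        val_loss = val_total / max(len(val_data), 1)
        history['val_losses'].append(val_loss)

        if val_loss < best_val:
            best_val, history['best_epoch'] = val_loss, epoch

    return history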
Performance Optimization
Profile and optimize your code:
import logging
import time
from functools import wraps

import numpy as np

logger = logging.getLogger(__name__)

def timeit(func):
    """Decorator to measure function execution time"""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        logger.info(f"{func.__name__} took {end - start:.4f}s")
        return result
    return wrapper

@timeit
def process_batch(batch: np.ndarray) -> np.ndarray:
    # Processing logic goes here
    return batch
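When the timing decorator points at a hotspot, Python's built-in cProfile can break the time down per function call. A minimal sketch (the random input is illustrative; requires Python 3.8+ for the context-manager form):

import cProfile
import pstats

with cProfile.Profile() as profiler:
    process_batch(np.random.rand(1000, 100))

pstats.Stats(profiler).sort_stats('cumulative').print_stats(10)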
Learn More
Ready to put these practices into action? Join our Python Learning Lab for interactive lessons and projects that teach both Python and AI development.
Next up: Advanced patterns for building production ML pipelines