Machine Learning with Python: From Basics to Production

December 28, 2025
20 min read
Python, Machine Learning, scikit-learn, Data Science, Tutorial
Python has become the go-to language for machine learning thanks to its rich ecosystem of libraries and tools. This comprehensive guide takes you from fundamentals to production-ready ML models.
Setting Up Your ML Environment
# Create virtual environment
python -m venv ml-env
source ml-env/bin/activate  # Windows: ml-env\Scripts\activate

# Install essential libraries
pip install numpy pandas scikit-learn matplotlib seaborn
pip install jupyter notebook
pip install tensorflow torch  # For deep learning
Essential Libraries
import numpy as np                # Numerical computing
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Visualization
import seaborn as sns             # Statistical visualization
from sklearn import linear_model, ensemble, metrics  # Machine learning algorithms (import submodules as needed)
The ML Workflow
1. Data Loading and Exploration
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('data.csv')

# Basic exploration
print(df.shape)
df.info()  # prints a summary of dtypes and non-null counts
print(df.describe())
print(df.isnull().sum())

# Visualize distributions of the first four numeric columns
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for idx, col in enumerate(df.select_dtypes(include='number').columns[:4]):
    ax = axes[idx // 2, idx % 2]
    df[col].hist(ax=ax, bins=30)
    ax.set_title(col)
plt.tight_layout()
plt.show()

# Correlation heatmap (numeric columns only)
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.show()
2. Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Handle missing values in numeric columns
numeric_cols = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Encode categorical variables (a fresh encoder per column)
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Separate features and target (assuming the label column is named 'target')
X = df.drop(columns=['target'])
y = df['target']

# Split first, then scale, so test-set statistics never leak into training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
3. Feature Engineering
import pandas as pd
import numpy as np

# Create new features
df['feature_ratio'] = df['feature_a'] / (df['feature_b'] + 1)
df['feature_product'] = df['feature_a'] * df['feature_b']

# Binning
df['age_group'] = pd.cut(df['age'],
                         bins=[0, 18, 35, 50, 65, 100],
                         labels=['child', 'young', 'middle', 'senior', 'elderly'])

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=['category'], drop_first=True)

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
Classification Algorithms
Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
Random Forest
from sklearn.ensemble import RandomForestClassifier

# Train with hyperparameters
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Feature importance (feature_names holds the columns used for training)
feature_names = X.columns
importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(importance.head(10))
Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Or use XGBoost/LightGBM for better performance
# pip install xgboost lightgbm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# LightGBM (faster for large datasets)
lgbm_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
lgbm_model.fit(X_train, y_train)
Regression Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Ridge Regression (L2 regularization)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Lasso Regression (L1 regularization)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# Evaluate regression
y_pred = lr_model.predict(X_test)
print("RMSE: {:.4f}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
print("MAE: {:.4f}".format(mean_absolute_error(y_test, y_pred)))
print("R2: {:.4f}".format(r2_score(y_test, y_pred)))
Hyperparameter Tuning
Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'subsample': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.3)
}

random_search = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
Cross-Validation
from sklearn.model_selection import cross_val_score, StratifiedKFold

# K-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

print("CV Scores: {}".format(scores))
print("Mean: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))
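A note on leakage: if scaling or imputation is fit on the entire dataset before cross-validation, information from the validation folds leaks into the training folds and inflates the scores. Wrapping preprocessing and the estimator in a Pipeline (covered in the pipeline section below) and passing that pipeline to cross_val_score re-fits the preprocessing inside each fold. A minimal sketch, assuming X and y are the features and labels defined earlier:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Scaling is re-fit on the training folds only, in every split
leak_free = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(leak_free, X, y, cv=cv, scoring='accuracy')
print("Leak-free CV accuracy: {:.4f}".format(scores.mean()))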
Model Persistence
import joblib
import pickle

# Save model
joblib.dump(model, 'model.joblib')

# Or with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load model
loaded_model = joblib.load('model.joblib')

# With pickle
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
Building a Complete Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Define preprocessing for different column types
numeric_features = ['age', 'income', 'score']
categorical_features = ['category', 'region']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)
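Because the pipeline bundles preprocessing and the model into one object, it can be persisted as a single artifact and used to score raw records later, for example in an API or a batch job. Below is a minimal sketch, not a full serving setup: it assumes the pipeline above has been fitted and that new data arrives as a pandas DataFrame with the same raw columns (age, income, score, category, region); the field values are made-up examples.

import joblib
import pandas as pd

# Persist the fitted pipeline (preprocessing + model) as one artifact
joblib.dump(pipeline, 'pipeline.joblib')

# Later, in a serving or batch-scoring process
serving_pipeline = joblib.load('pipeline.joblib')

new_record = pd.DataFrame([{
    'age': 42,            # example values for illustration only
    'income': 55000,
    'score': 0.73,
    'category': 'B',
    'region': 'north'
}])

prediction = serving_pipeline.predict(new_record)
probabilities = serving_pipeline.predict_proba(new_record)
print(prediction, probabilities)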
Conclusion
This guide covered the essential ML workflow with Python:
- Data exploration and visualization
- Preprocessing and feature engineering
- Classification and regression algorithms
- Hyperparameter tuning
- Model evaluation and cross-validation
- Model persistence and pipelines
Key takeaways:
- Always explore your data before modeling
- Feature engineering often matters more than algorithm choice
- Use cross-validation for reliable performance estimates
- Build reproducible pipelines for production
- Monitor models and retrain as needed (a minimal sketch follows this list)
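The last takeaway deserves a concrete illustration. Here is a minimal monitoring-and-retrain sketch, assuming you periodically receive a freshly labeled batch (new_X, new_y) and keep the fitted pipeline from the previous section; the 0.85 accuracy threshold and the file name are arbitrary example choices, and in practice you would usually retrain on the new batch combined with historical training data.

from sklearn.metrics import accuracy_score
import joblib

def monitor_and_retrain(pipeline, new_X, new_y, threshold=0.85):
    """Score a fresh labeled batch and retrain if performance has degraded."""
    batch_accuracy = accuracy_score(new_y, pipeline.predict(new_X))
    print("Accuracy on new batch: {:.4f}".format(batch_accuracy))

    if batch_accuracy < threshold:
        # Performance dropped below the threshold: refit and persist the updated model
        pipeline.fit(new_X, new_y)
        joblib.dump(pipeline, 'pipeline.joblib')
        print("Model retrained and saved.")

    return pipeline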
Keep practicing with real datasets on Kaggle and building end-to-end projects!

About Dimuthu Wayaman
Mobile Application Developer and UI Designer specializing in Flutter development. Passionate about creating beautiful, functional mobile applications and sharing knowledge with the developer community.