Machine Learning with Python: From Basics to Production

December 28, 2025
20 min read
Python, Machine Learning, scikit-learn, Data Science, Tutorial
Python has become the go-to language for machine learning thanks to its rich ecosystem of libraries and tools. This comprehensive guide takes you from fundamentals to production-ready ML models.
Setting Up Your ML Environment
# Create virtual environment
python -m venv ml-env
source ml-env/bin/activate  # Windows: ml-env\Scripts\activate

# Install essential libraries
pip install numpy pandas scikit-learn matplotlib seaborn
pip install jupyter notebook
pip install tensorflow torch  # For deep learning
Essential Libraries
import numpy as np                # Numerical computing
import pandas as pd               # Data manipulation
import matplotlib.pyplot as plt   # Visualization
import seaborn as sns             # Statistical visualization
from sklearn import linear_model, ensemble, metrics  # Machine learning algorithms (import submodules as needed)
The ML Workflow
1. Data Loading and Exploration
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('data.csv')

# Basic exploration
print(df.shape)
df.info()  # prints a summary of dtypes and non-null counts
print(df.describe())
print(df.isnull().sum())

# Visualize distributions of the first four numeric columns
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for idx, col in enumerate(df.select_dtypes(include='number').columns[:4]):
    ax = axes[idx // 2, idx % 2]
    df[col].hist(ax=ax, bins=30)
    ax.set_title(col)
plt.tight_layout()
plt.show()

# Correlation heatmap (numeric columns only)
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.show()
2. Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Handle missing values in numeric columns
numeric_cols = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Encode categorical variables (a fresh encoder per column)
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Separate features and target (assuming the label column is named 'target')
X = df.drop(columns=['target'])
y = df['target']

# Split first, then scale, so test-set statistics never leak into training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
3. Feature Engineering
import pandas as pd
import numpy as np

# Create new features
df['feature_ratio'] = df['feature_a'] / (df['feature_b'] + 1)
df['feature_product'] = df['feature_a'] * df['feature_b']

# Binning
df['age_group'] = pd.cut(df['age'],
                         bins=[0, 18, 35, 50, 65, 100],
                         labels=['child', 'young', 'middle', 'senior', 'elderly'])

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=['category'], drop_first=True)

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
Classification Algorithms
Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
Random Forest
from sklearn.ensemble import RandomForestClassifier

# Train with hyperparameters
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Feature importance (feature_names holds the columns used for training)
feature_names = X.columns
importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(importance.head(10))
Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Or use XGBoost/LightGBM for better performance
# pip install xgboost lightgbm
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# LightGBM (faster for large datasets)
lgbm_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
lgbm_model.fit(X_train, y_train)
Regression Algorithms
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Ridge Regression (L2 regularization)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Lasso Regression (L1 regularization)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# Evaluate regression
y_pred = lr_model.predict(X_test)
print("RMSE: {:.4f}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
print("MAE: {:.4f}".format(mean_absolute_error(y_test, y_pred)))
print("R2: {:.4f}".format(r2_score(y_test, y_pred)))
Hyperparameter Tuning
Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

param_distributions = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'subsample': uniform(0.6, 0.4),
    'learning_rate': uniform(0.01, 0.3)
}

random_search = RandomizedSearchCV(
    XGBClassifier(random_state=42),
    param_distributions,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
Cross-Validation
from sklearn.model_selection import cross_val_score, StratifiedKFold

# K-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

print("CV Scores: {}".format(scores))
print("Mean: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))
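A note on leakage: if scaling or imputation is fit on the entire dataset before cross-validation, information from the validation folds leaks into the training folds and inflates the scores. Wrapping preprocessing and the estimator in a Pipeline (covered in the pipeline section below) and passing that pipeline to cross_val_score re-fits the preprocessing inside each fold. A minimal sketch, assuming X and y are the features and labels defined earlier:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Scaling is re-fit on the training folds only, in every split
leak_free = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(leak_free, X, y, cv=cv, scoring='accuracy')
print("Leak-free CV accuracy: {:.4f}".format(scores.mean()))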
Model Persistence
import joblib
import pickle

# Save model
joblib.dump(model, 'model.joblib')

# Or with pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load model
loaded_model = joblib.load('model.joblib')

# With pickle
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
Building a Complete Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Define preprocessing for different column types
numeric_features = ['age', 'income', 'score']
categorical_features = ['category', 'region']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
predictions = pipeline.predict(X_test)
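Because the pipeline bundles preprocessing and the model into one object, it can be persisted as a single artifact and used to score raw records later, for example in an API or a batch job. Below is a minimal sketch, not a full serving setup: it assumes the pipeline above has been fitted and that new data arrives as a pandas DataFrame with the same raw columns (age, income, score, category, region); the field values are made-up examples.

import joblib
import pandas as pd

# Persist the fitted pipeline (preprocessing + model) as one artifact
joblib.dump(pipeline, 'pipeline.joblib')

# Later, in a serving or batch-scoring process
serving_pipeline = joblib.load('pipeline.joblib')

new_record = pd.DataFrame([{
    'age': 42,            # example values for illustration only
    'income': 55000,
    'score': 0.73,
    'category': 'B',
    'region': 'north'
}])

prediction = serving_pipeline.predict(new_record)
probabilities = serving_pipeline.predict_proba(new_record)
print(prediction, probabilities)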
Conclusion
This guide covered the essential ML workflow with Python:
- Data exploration and visualization
- Preprocessing and feature engineering
- Classification and regression algorithms
- Hyperparameter tuning
- Model evaluation and cross-validation
- Model persistence and pipelines
Key takeaways:
- Always explore your data before modeling
- Feature engineering often matters more than algorithm choice
- Use cross-validation for reliable performance estimates
- Build reproducible pipelines for production
- Monitor models and retrain as needed (a minimal sketch follows this list)
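The last takeaway deserves a concrete illustration. Here is a minimal monitoring-and-retrain sketch, assuming you periodically receive a freshly labeled batch (new_X, new_y) and keep the fitted pipeline from the previous section; the 0.85 accuracy threshold and the file name are arbitrary example choices, and in practice you would usually retrain on the new batch combined with historical training data.

from sklearn.metrics import accuracy_score
import joblib

def monitor_and_retrain(pipeline, new_X, new_y, threshold=0.85):
    """Score a fresh labeled batch and retrain if performance has degraded."""
    batch_accuracy = accuracy_score(new_y, pipeline.predict(new_X))
    print("Accuracy on new batch: {:.4f}".format(batch_accuracy))

    if batch_accuracy < threshold:
        # Performance dropped below the threshold: refit and persist the updated model
        pipeline.fit(new_X, new_y)
        joblib.dump(pipeline, 'pipeline.joblib')
        print("Model retrained and saved.")

    return pipeline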
Keep practicing with real datasets on Kaggle and building end-to-end projects!

About Dimuthu Wayaman
Mobile Application Developer and UI Designer specializing in Flutter development. Passionate about creating beautiful, functional mobile applications and sharing knowledge with the developer community.