Master machine learning interview questions with this comprehensive guide covering algorithms, deep learning, feature engineering, and practical implementations drawn from FAANG-style interviews. It covers essential ML interview questions for all experience levels.
Supervised learning trains a model on labeled examples, while unsupervised learning discovers structure in unlabeled data; the snippets below contrast the two.
# Supervised Learning - Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Split data with labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestClassifier()
model.fit(X_train, y_train) # Training with labels
predictions = model.predict(X_test)
# Unsupervised Learning - Clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X) # No labels needed
clusters = kmeans.predict(X)
Overfitting occurs when a model learns the training data too well, including noise and random fluctuations, leading to poor generalization on new data.
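As a quick illustration (a minimal sketch on a synthetic dataset, with the dataset and the unconstrained decision tree chosen purely for demonstration), an overfit model scores near-perfectly on training data but noticeably worse on held-out data:
# Overfitting demonstration on synthetic data (illustrative sketch)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=42)
# An unconstrained tree can memorize the training set, including its noise
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_tr, y_tr)
print(f"Train accuracy: {tree.score(X_tr, y_tr):.3f}")  # typically close to 1.000
print(f"Test accuracy:  {tree.score(X_te, y_te):.3f}")  # noticeably lower
Common remedies include regularization and early stopping, shown below.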
# L2 Regularization (Ridge)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
# Ridge regression with regularization
ridge = Ridge(alpha=1.0) # alpha controls regularization strength
scores = cross_val_score(ridge, X, y, cv=5)
print(f"Cross-validation scores: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# Early stopping in neural networks
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model.fit(X_train, y_train, validation_data=(X_val, y_val), callbacks=[early_stop])
The bias-variance tradeoff is a fundamental concept that describes the relationship between model complexity and generalization error.
Total Error = Bias² + Variance + Irreducible Error
The goal is to find the optimal model complexity that minimizes the sum of bias and variance.
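To see the tradeoff empirically, one option (a hedged sketch; the synthetic sine-wave data and polynomial models below are assumptions made for illustration) is to compare cross-validated error across models of increasing complexity:
# Bias-variance intuition: polynomial models of increasing complexity (illustrative sketch)
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
rng = np.random.RandomState(42)
X_demo = np.sort(rng.uniform(0, 1, 80)).reshape(-1, 1)
y_demo = np.sin(2 * np.pi * X_demo).ravel() + rng.normal(scale=0.2, size=80)  # noisy target
for degree in [1, 4, 15]:  # low degree -> high bias, very high degree -> high variance
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    cv_mse = -cross_val_score(model, X_demo, y_demo, cv=5, scoring='neg_mean_squared_error')
    print(f"Degree {degree}: CV MSE = {cv_mse.mean():.3f}")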
Feature engineering is the process of selecting, transforming, or creating features from raw data, and it can significantly boost a model's accuracy and predictive power.
# Feature scaling and encoding
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
# Scaling numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numerical)
# Encoding categorical features
le = LabelEncoder()
y_encoded = le.fit_transform(y_categorical)
# Creating polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_scaled)
# Feature selection
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_poly, y)
# Classification evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"AUC: {auc:.3f}")
# Detailed report
print(classification_report(y_test, y_pred))
Neural networks are computing systems inspired by biological neural networks. They consist of interconnected nodes (neurons) that process information through weighted connections.
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Build neural network
model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])
# Compile model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
# Train model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    verbose=1
)
Activation functions introduce non-linearity into neural networks, enabling them to learn complex patterns. Without activation functions, neural networks would be equivalent to linear regression.
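For example, the most common activation functions can be written directly in NumPy (a minimal sketch; the function definitions below are illustrative, not library code):
# Common activation functions (illustrative NumPy implementations)
import numpy as np
def relu(x):
    return np.maximum(0, x)  # ReLU: max(0, x), the usual default for hidden layers
def sigmoid(x):
    return 1 / (1 + np.exp(-x))  # Sigmoid: squashes values to (0, 1), used for binary outputs
def tanh(x):
    return np.tanh(x)  # Tanh: squashes values to (-1, 1), zero-centered
x = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print("ReLU:   ", relu(x))
print("Sigmoid:", sigmoid(x))
print("Tanh:   ", tanh(x))
Handling missing data is another frequent interview topic; the next snippet walks through several imputation strategies.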
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Check missing data
print(df.isnull().sum())
print(df.isnull().sum() / len(df) * 100) # Percentage
# Simple imputation
imputer = SimpleImputer(strategy='mean') # or 'median', 'most_frequent'
X_imputed = imputer.fit_transform(X)
# KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
X_knn_imputed = knn_imputer.fit_transform(X)
# Iterative imputation (MICE)
iterative_imputer = IterativeImputer(random_state=42)
X_iterative_imputed = iterative_imputer.fit_transform(X)
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB()
}
# Compare models using cross-validation
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    results[name] = {
        'mean': scores.mean(),
        'std': scores.std()
    }
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
Ridge regression is a regularized linear regression technique that adds an L2 penalty term to prevent overfitting. It is particularly useful for building predictive models when dealing with multicollinearity or high-dimensional data.
# Ridge regression with cross-validation
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
# Prepare data for predictive model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale features (important for ridge regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Ridge regression with cross-validation to find best alpha
ridge_cv = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0], cv=5)
ridge_cv.fit(X_train_scaled, y_train)
print(f"Best alpha: {ridge_cv.alpha_}")
# Make predictions on target variable
y_pred = ridge_cv.predict(X_test_scaled)
# Evaluate predictive model performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Ridge MSE: {mse:.3f}")
print(f"Ridge RΒ²: {r2:.3f}")
# Compare coefficients with regular linear regression
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(X_train_scaled, y_train)
print("\\nCoefficient comparison:")
print(f"Linear coefficients: {linear_reg.coef_[:5]}")
print(f"Ridge coefficients: {ridge_cv.coef_[:5]}")
# Compare multiple regression algorithms
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
# Define machine learning algorithms to compare
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf'),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}
# Evaluate each predictive model
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
results = {}
for name, model in models.items():
    # Cross-validation scores for target variable prediction
    cv_scores = cross_val_score(model, X_train_scaled, y_train,
                                cv=5, scoring=mse_scorer)
    results[name] = {
        'mean_mse': -cv_scores.mean(),
        'std_mse': cv_scores.std()
    }
    print(f"{name}: MSE = {-cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
# Select best performing machine learning algorithm
best_model = min(results.items(), key=lambda x: x[1]['mean_mse'])
print(f"\\nBest predictive model: {best_model[0]}")
Cross-validation is a technique to assess how well machine learning algorithms generalize to unseen data. This method helps evaluate predictive model performance more reliably than a single train-test split.
# Different cross-validation strategies
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, TimeSeriesSplit,
    LeaveOneOut, GroupKFold
)
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
# Standard k-fold cross-validation
model = Ridge(alpha=1.0)
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"5-Fold CV Score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
# Stratified cross-validation for classification
if is_classification_task:
    stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    stratified_scores = cross_val_score(model, X, y, cv=stratified_cv)
    print(f"Stratified CV: {stratified_scores.mean():.3f}")
# Time series cross-validation
if is_time_series:
    tscv = TimeSeriesSplit(n_splits=5)
    ts_scores = cross_val_score(model, X, y, cv=tscv)
    print(f"Time Series CV: {ts_scores.mean():.3f}")
# Validation curve for hyperparameter tuning
alpha_range = np.logspace(-3, 2, 10)
train_scores, val_scores = validation_curve(
    Ridge(), X, y, param_name='alpha', param_range=alpha_range,
    cv=5, scoring='r2'
)
# Plot validation curve
plt.figure(figsize=(10, 6))
plt.plot(alpha_range, train_scores.mean(axis=1), 'o-', label='Training Score')
plt.plot(alpha_range, val_scores.mean(axis=1), 'o-', label='Validation Score')
plt.xlabel('Alpha (Regularization Strength)')
plt.ylabel('R² Score')
plt.title('Ridge Regression Validation Curve')
plt.legend()
plt.xscale('log')
plt.grid(True)
plt.show()
Solving a classification problem requires understanding the business context, assessing input data quality, and selecting appropriate machine learning algorithms. Real-world applications also involve careful dataset preparation and feature engineering for optimal results.
# Complete classification problem workflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample
# 1. Load and explore data set
df = pd.read_csv('real_world_data.csv')
print(f"Data set shape: {df.shape}")
print(f"Class distribution: {df['target'].value_counts()}")
# 2. Handle input data quality issues
# Remove duplicates and handle missing values
df = df.drop_duplicates()
df = df.fillna(df.median(numeric_only=True))
# 3. Prepare features and target variable
X = df.drop('target', axis=1)
y = df['target']
# 4. Handle class imbalance (if needed)
if y.value_counts().min() / y.value_counts().max() < 0.5:
    # Oversample minority class
    df_minority = df[df.target == y.value_counts().idxmin()]
    df_majority = df[df.target == y.value_counts().idxmax()]
    df_minority_upsampled = resample(df_minority,
                                     replace=True,
                                     n_samples=len(df_majority),
                                     random_state=42)
    df_balanced = pd.concat([df_majority, df_minority_upsampled])
    X = df_balanced.drop('target', axis=1)
    y = df_balanced['target']
# 5. Split data for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# 6. Scale input data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 7. Train classification model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)
# 8. Evaluate on real world metrics
y_pred = clf.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))
# 9. Cross-validation for robust evaluation
cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
print(f"CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
Recommendation systems and reinforcement learning are advanced machine learning paradigms widely used in real-world applications. Interview questions on these topics typically focus on the underlying algorithms and their practical implementation.
# Content-based recommendation system example
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Sample data set for recommendation system
movies_data = {
    'movie_id': [1, 2, 3, 4, 5],
    'title': ['Action Movie A', 'Romance B', 'Action Movie C', 'Comedy D', 'Romance E'],
    'genre': ['Action Thriller', 'Romance Drama', 'Action Adventure', 'Comedy', 'Romance Comedy'],
    'description': [
        'Fast-paced action with explosions',
        'Romantic love story with drama',
        'Adventure action with heroes',
        'Funny comedy with jokes',
        'Light romantic comedy'
    ]
}
df_movies = pd.DataFrame(movies_data)
# Create feature vectors from input data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movies['description'] + ' ' + df_movies['genre'])
# Calculate similarity between each data point
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
def get_recommendations(movie_id, cosine_sim=cosine_sim, df=df_movies):
    """
    Get movie recommendations based on content similarity.
    This demonstrates how recommendation systems compare data points.
    """
    # Get movie index
    idx = df[df['movie_id'] == movie_id].index[0]
    # Get pairwise similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort by similarity (excluding the movie itself)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:]
    # Get top recommendations
    movie_indices = [i[0] for i in sim_scores[:3]]
    return df.iloc[movie_indices][['title', 'genre']]
# Example: Recommend movies similar to movie_id=1
print("Recommendations for Action Movie A:")
print(get_recommendations(1))
# Simple Q-Learning example for reinforcement learning
class SimpleQLearning:
    """
    Basic Q-learning agent for reinforcement learning demonstrations,
    a frequent topic in AI/ML interview questions.
    """
    def __init__(self, states, actions, learning_rate=0.1, discount_factor=0.9):
        self.states = states
        self.actions = actions
        self.lr = learning_rate
        self.gamma = discount_factor
        self.q_table = np.zeros((len(states), len(actions)))
    def choose_action(self, state, epsilon=0.1):
        """Choose an action using an epsilon-greedy policy"""
        if np.random.random() < epsilon:
            return np.random.choice(self.actions)
        else:
            return np.argmax(self.q_table[state])
    def update_q_table(self, state, action, reward, next_state):
        """Update Q-values based on experience"""
        current_q = self.q_table[state, action]
        max_next_q = np.max(self.q_table[next_state])
        new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state, action] = new_q
# This demonstrates reinforcement learning concepts for interview questions
agent = SimpleQLearning(states=range(5), actions=range(3))
print(f"Initialized Q-table shape: {agent.q_table.shape}")