Saturday, May 31, 2025

ML LAB

LAB 1

# Lab 1: feature distributions, box plots, and IQR outlier counts (California Housing)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing(as_frame=True)
df = data.frame
numeric_cols = df.select_dtypes(include=[np.number]).columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[col], bins=30, kde=True, color='skyblue')
    plt.title(f'{col} Distribution')
plt.tight_layout()
plt.show()
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=df[col], color='salmon')
    plt.title(f'{col} Box Plot')
plt.tight_layout()
plt.show()

print("Outlier Counts:")
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower) | (df[col] > upper)]
    print(f"{col}: {len(outliers)} outliers")

print("\nDataset Summary:")
print(df.describe())
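
If the lab is extended to act on the outliers it counts, one common option is to cap values at the same IQR fences rather than drop rows. A minimal sketch (not part of the original lab):

# Winsorise each numeric column at its IQR fences
df_capped = df.copy()
for col in numeric_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    df_capped[col] = df[col].clip(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
print(df_capped.describe())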

LAB 2

# Lab 2: correlation heatmap and pair plot (California Housing)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
california_data = fetch_california_housing(as_frame=True)
data = california_data.frame
correlation_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
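
Two follow-ups worth knowing: the pair plot over all ~20,000 rows is slow, and plotting data.sample(500) is a common shortcut; and the target's row of the correlation matrix can be read off directly. A sketch using the variables above:

# Features ranked by correlation with the target
print(correlation_matrix['MedHouseVal'].drop('MedHouseVal').sort_values(ascending=False))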

LAB 3

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
data = load_iris()
X = PCA(n_components=2).fit_transform(data.data)
for i, c in zip(range(3), 'rgb'):
    plt.scatter(*X[data.target == i].T, c=c, label=data.target_names[i])
plt.title('Iris PCA')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.grid(True)
plt.show()
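
It is worth checking how much variance the two components keep. A small sketch that refits the PCA to expose the fitted object:

pca = PCA(n_components=2).fit(data.data)
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total retained:", pca.explained_variance_ratio_.sum())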

LAB 4

# Lab 4: Find-S algorithm on a small categorical training set
import pandas as pd
def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    # Start from the most specific hypothesis: the first positive example.
    # (Initialising to all-'?' is a bug: an attribute already generalised
    # to '?' would be re-specialised by the next positive example.)
    hypothesis = None
    for _, row in data.iterrows():
        if row[class_label] == 'Yes':
            if hypothesis is None:
                hypothesis = list(row[attributes])
            else:
                # Generalise attributes that disagree with this positive
                # example; once an attribute is '?', it stays '?'.
                hypothesis = [h if h == v else '?'
                              for h, v in zip(hypothesis, row[attributes])]
    return hypothesis if hypothesis is not None else ['?' for _ in attributes]
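
The script expects a training_data.csv next to it, which the post does not include. A minimal sketch that writes one, using Mitchell's classic EnjoySport examples (an assumed stand-in; substitute your own data):

sample = pd.DataFrame([
    ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same',   'Yes'],
    ['Sunny', 'Warm', 'High',   'Strong', 'Warm', 'Same',   'Yes'],
    ['Rainy', 'Cold', 'High',   'Strong', 'Warm', 'Change', 'No'],
    ['Sunny', 'Warm', 'High',   'Strong', 'Cool', 'Change', 'Yes'],
], columns=['Sky', 'AirTemp', 'Humidity', 'Wind', 'Water', 'Forecast', 'EnjoySport'])
sample.to_csv('training_data.csv', index=False)

With this data the final hypothesis should come out as ['Sunny', 'Warm', '?', 'Strong', '?', '?'].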

file_path = 'training_data.csv'
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)

LAB 5

import numpy as np, matplotlib.pyplot as plt
from collections import Counter
data = np.random.rand(100)
train, labels = data[:50], ["Class1" if x <= 0.5 else "Class2" for x in data[:50]]
test = data[50:]
def knn(p, train, labels, k):
    # Distance-sort the training points and majority-vote the k nearest labels.
    nearest = sorted(zip(np.abs(train - p), labels))[:k]
    return Counter(lbl for _, lbl in nearest).most_common(1)[0][0]
for k in [1, 3, 5, 10]:
    preds = [knn(p, train, labels, k) for p in test]
    print(f"\nk = {k}")
    for i, (val, pred) in enumerate(zip(test, preds), 51):
        print(f"x{i}: {val:.2f} → {pred}")
    plt.scatter(train, [0]*50, c=["b" if l=="Class1" else "r" for l in labels], marker="o")
    plt.scatter(test, [1]*50, c=["b" if p=="Class1" else "r" for p in preds], marker="x")
    plt.yticks([0, 1], ["Train", "Test"])
    plt.title(f"k-NN (k={k})")
    plt.grid(); plt.show()
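
As a cross-check, scikit-learn's KNeighborsClassifier should largely agree with the hand-rolled version (tie-breaking can differ). A sketch reusing the arrays above:

from sklearn.neighbors import KNeighborsClassifier
for k in [1, 3, 5, 10]:
    clf = KNeighborsClassifier(n_neighbors=k).fit(train.reshape(-1, 1), labels)
    sk_preds = clf.predict(test.reshape(-1, 1))
    mine = [knn(p, train, labels, k) for p in test]
    print(f"k={k}: {sum(a == b for a, b in zip(sk_preds, mine))}/50 match")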


LAB 6

import numpy as np, matplotlib.pyplot as plt

x = np.linspace(0, 2*np.pi, 100)           # training inputs
y = np.sin(x) + 0.1*np.random.randn(100)   # noisy sine targets
t = np.linspace(0, 2*np.pi, 200)           # query points to predict at

def f(tx):
    # Gaussian kernel weights centred on the query point tx
    w = np.exp(-(x - tx)**2 / 0.25)
    X = np.c_[np.ones_like(x), x]          # design matrix with bias column
    # Weighted least squares solved via the pseudoinverse
    theta = np.linalg.pinv(X.T @ np.diag(w) @ X) @ X.T @ np.diag(w) @ y
    return np.array([1, tx]) @ theta

plt.scatter(x, y, c='r')
plt.plot(t, [f(i) for i in t], c='b')
plt.show()
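
For reference, f implements locally weighted linear regression: each query point x_0 gets its own weighted least-squares fit,

    \hat{\theta}(x_0) = (X^\top W X)^{+} X^\top W y, \qquad W_{ii} = \exp\!\left(-\tfrac{(x_i - x_0)^2}{0.25}\right),

where ^{+} is the pseudoinverse (np.linalg.pinv) and the 0.25 in the exponent sets the kernel bandwidth.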


LAB 7

import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error as mse, r2_score as r2


# The Boston housing file stores each record across two physical lines;
# re-interleave even/odd rows, then keep column 5 (RM, average rooms).
d = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston", sep=r"\s+", skiprows=22, header=None)
x = np.hstack([d.values[::2], d.values[1::2, :2]])[:, 5:6]  # RM
y = d.values[1::2, 2]                                       # MEDV (target)
x1, x2, y1, y2 = train_test_split(x, y)
m = LinearRegression().fit(x1, y1)
p = m.predict(x2)
plt.scatter(x2, y2); plt.plot(x2, p, c="r"); plt.title("Linear"); plt.show()
print("Linear Regression\nMSE:", mse(y2, p), "\nR²:", r2(y2, p))


u = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
cols = ["mpg", "cylinders", "displacement", "horsepower", "weight",
        "acceleration", "model_year", "origin", "car_name"]
df = pd.read_csv(u, sep=r"\s+", names=cols, na_values="?").dropna()
x = df[["displacement"]]; y = df["mpg"]
x1, x2, y1, y2 = train_test_split(x, y)
m = make_pipeline(PolynomialFeatures(2), LinearRegression()).fit(x1, y1)
p = m.predict(x2)
plt.scatter(x2, y2); plt.scatter(x2, p, c="r", s=10); plt.title("Poly"); plt.show()
print("Polynomial Regression\nMSE:", mse(y2, p), "\nR²:", r2(y2, p))


LAB 8

from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = DecisionTreeClassifier().fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, model.predict(X_test)) * 100, "%")
result = model.predict([X_test[0]])
print("Predicted Class:", "Benign" if result[0] == 1 else "Malignant")
plot_tree(model, filled=True)
plt.show()
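
The unpruned tree is usually too deep to read in the plot. A hedged variant (max_depth=3 is an arbitrary choice) that trades a little accuracy for a legible diagram:

small = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
print("Depth-3 accuracy:", accuracy_score(y_test, small.predict(X_test)) * 100, "%")
plot_tree(small, filled=True, feature_names=load_breast_cancer().feature_names)
plt.show()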

LAB 9

from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
X, y = fetch_olivetti_faces(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GaussianNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred)*100:.2f}%")
print(classification_report(y_test, pred, zero_division=1))
for i in range(15):
    plt.subplot(3, 5, i+1)
    plt.imshow(X_test[i].reshape(64, 64), cmap='gray')
    plt.title(f"T:{y_test[i]} P:{pred[i]}")
plt.tight_layout()
plt.show()
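
GaussianNB treats each of the 4,096 pixels as an independent feature, which is a poor fit for face images. One common improvement is to decorrelate and compress with PCA first; a sketch, with 100 components as an assumed (untuned) choice:

from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(PCA(n_components=100, whiten=True, random_state=42), GaussianNB())
pipe.fit(X_train, y_train)
print(f"PCA + GaussianNB accuracy: {accuracy_score(y_test, pipe.predict(X_test))*100:.2f}%")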

LAB 10

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt
X, y = load_breast_cancer(return_X_y=True)
X = StandardScaler().fit_transform(X)
# KMeans and PCA
k = KMeans(n_clusters=2, random_state=0).fit(X)
p = PCA(2).fit(X)
X2 = p.transform(X)
cent = p.transform(k.cluster_centers_)
# Results
print(confusion_matrix(y, k.labels_))
print(classification_report(y, k.labels_))
# Plot
d = pd.DataFrame(X2, columns=['x', 'y'])
d['k'] = k.labels_
d['t'] = y
f, a = plt.subplots(1, 3, figsize=(18, 5))
sns.scatterplot(data=d, x='x', y='y', hue='k', palette='Set1', s=100, edgecolor='k', ax=a[0]).set(title='KMeans')
sns.scatterplot(data=d, x='x', y='y', hue='t', palette='coolwarm', s=100, edgecolor='k', ax=a[1]).set(title='True Labels')
sns.scatterplot(data=d, x='x', y='y', hue='k', palette='Set1', s=100, edgecolor='k', ax=a[2])
a[2].scatter(*cent.T, c='red', s=200, marker='X'); a[2].set_title('With Centroids')
plt.tight_layout(); plt.show()
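
One caveat: KMeans numbers its clusters arbitrarily, so labels_ can come out as the inverse of y, making the confusion matrix and report above look far worse than the clustering really is. A sketch that flips the labels when raw agreement falls below chance:

labels = k.labels_ if (k.labels_ == y).mean() >= 0.5 else 1 - k.labels_
print(classification_report(y, labels))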
