Step-by-step implementation of the Principal Component Analysis (PCA) method.
Published on December 01, 2021
PCA Dimension Reduction Sklearn Python
The full code comes first; a step-by-step walkthrough follows below.

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
from numpy import linalg as LA
import numpy as np
# Dataset
iris = datasets.load_iris()
X_raw = iris.data.T
y = iris.target
X = X_raw - X_raw.mean(axis=1).reshape(4, 1)   # data is centered
C = (1 / X.shape[1]) * X @ X.T                 # covariance matrix
lmbd, U = LA.eigh(C)

k = 2                       # number of dimensions to keep (the target dimensionality)
mask = np.argsort(lmbd)     # sort the eigenvalues; the largest k are kept, the rest are discarded
mask = np.flip(mask)        # make the order descending
sorted_U = U[:, mask]       # reorder the columns of U using the mask
sorted_lmbd = lmbd[mask]    # reorder the eigenvalues
reduced_U = sorted_U[:, :k]
H = reduced_U.T @ X
ax = plt.gca()
H[0, :] = H[0, :] * -1   # flip the sign of the first component to match sklearn's output
ax.scatter(H.T[:, 0], H.T[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First two PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
plt.show()
ax = plt.gca()
X_reduced = PCA(n_components=2).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First two PCA directions (using sklearn PCA function)")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
plt.show()
X_raw = iris.data.T   # the raw data matrix Z (one column per sample z_i)
y_raw = iris.target
Data centering: $$\vec \mu = \frac{1}{N} \sum_i \vec z_i$$ where $N$ is the number of data points, and $$\vec x_i = \vec z_i - \vec \mu$$
X = X_raw - X_raw.mean(axis=1).reshape(4, 1)   # data is centered
X.shape
(4, 150)
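As an extra sanity check (my addition, not one of the original steps): after centering, the mean of each feature, i.e. each row of X, should be numerically zero.

# each of the 4 feature means should now be (numerically) zero
print(np.allclose(X.mean(axis=1), 0))   # expected: True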
# Manual implementation
C = (1/150) * X @ X.T
print("C:\n", C)

# Check using numpy; bias=True uses the 1/N normalization, matching the manual computation
cov = np.cov(X, bias=True)
print("\nnp.cov:\n", cov)
C:
 [[ 0.68112222 -0.04215111  1.26582     0.51282889]
 [-0.04215111  0.18871289 -0.32745867 -0.12082844]
 [ 1.26582    -0.32745867  3.09550267  1.286972  ]
 [ 0.51282889 -0.12082844  1.286972    0.57713289]]

np.cov:
 [[ 0.68112222 -0.04215111  1.26582     0.51282889]
 [-0.04215111  0.18871289 -0.32745867 -0.12082844]
 [ 1.26582    -0.32745867  3.09550267  1.286972  ]
 [ 0.51282889 -0.12082844  1.286972    0.57713289]]
# LA.eigh(C) can be used because C is a real symmetric matrix; otherwise, use LA.eig(C).
# Note: eigh is faster than eig.
lmbd, U = LA.eigh(C)
print("lmbd: ", lmbd)
print("\nU matrix:\n", U)
lmbd:  [0.02367619 0.0776881  0.24105294 4.20005343]

U matrix:
 [[ 0.31548719  0.58202985  0.65658877 -0.36138659]
 [-0.3197231  -0.59791083  0.73016143  0.08452251]
 [-0.47983899 -0.07623608 -0.17337266 -0.85667061]
 [ 0.75365743 -0.54583143 -0.07548102 -0.3582892 ]]
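As a side note (a small sketch of my own): the general solver LA.eig gives the same eigenvalues for this symmetric matrix, but it does not guarantee ascending order and may return them with a complex dtype whose imaginary parts are zero.

lmbd_g, U_g = LA.eig(C)                          # general (non-symmetric) eigensolver
print(np.allclose(np.sort(lmbd_g.real), lmbd))   # expected: True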
Checking the eigenvalue equation: $$C \vec u_j = \lambda_j \vec u_j$$
# For j = 0
left_eq = C @ U[:, 0]
print("C * u_j:\n", left_eq)

right_eq = lmbd[0] * U[:, 0]
print("\nlambda_j * u_j:\n", right_eq)
C * u_j:
 [ 0.00746954 -0.00756983 -0.01136076  0.01784374]

lambda_j * u_j:
 [ 0.00746954 -0.00756983 -0.01136076  0.01784374]
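The same check can be done for all eigenpairs at once (my addition): column j of C @ U is C u_j, and multiplying the columns of U by lmbd gives lambda_j u_j.

# check C u_j = lambda_j u_j for every j at once
print(np.allclose(C @ U, U * lmbd))   # expected: True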
Since the eigenvectors are orthonormal: $U^T U = I \iff U^T = U^{-1}$
# Check the left equation
print(np.round(U.T @ U, 3))
[[ 1. -0.  0. -0.]
 [-0.  1.  0. -0.]
 [ 0.  0.  1. -0.]
 [-0. -0. -0.  1.]]
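The right equation can be checked directly as well (my addition): the explicit inverse of U should coincide with its transpose.

# check the right equation: U^{-1} == U^T
print(np.allclose(LA.inv(U), U.T))   # expected: True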
$$C = U \Lambda U^T$$
print("C:\n", C) print("\n U @ Lambda @ U.T:\n", U @ (lmbd * np.eye(lmbd.shape[0])) @ U.T)
C:
 [[ 0.68112222 -0.04215111  1.26582     0.51282889]
 [-0.04215111  0.18871289 -0.32745867 -0.12082844]
 [ 1.26582    -0.32745867  3.09550267  1.286972  ]
 [ 0.51282889 -0.12082844  1.286972    0.57713289]]

 U @ Lambda @ U.T:
 [[ 0.68112222 -0.04215111  1.26582     0.51282889]
 [-0.04215111  0.18871289 -0.32745867 -0.12082844]
 [ 1.26582    -0.32745867  3.09550267  1.286972  ]
 [ 0.51282889 -0.12082844  1.286972    0.57713289]]
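Equivalently (a small sketch of my own), the diagonal matrix Lambda can be built with np.diag, and the reconstruction can be checked with np.allclose instead of by eye:

Lambda = np.diag(lmbd)                     # same matrix as lmbd * np.eye(4)
print(np.allclose(C, U @ Lambda @ U.T))    # expected: True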
k = 2                       # number of dimensions to keep (the target dimensionality)
mask = np.argsort(lmbd)     # sort the eigenvalues; the largest k are kept, the rest are discarded
mask = np.flip(mask)        # make the order descending
sorted_U = U[:, mask]       # reorder the columns of U using the mask
sorted_lmbd = lmbd[mask]    # reorder the eigenvalues
reduced_U = sorted_U[:, :k]

print("mask = ", mask, "\n")
print("U:\n", U, "\n")
print("sorted_U:\n", sorted_U, "\n")
print("sorted_lmbd:\n", sorted_lmbd, "\n")
print("reduced_U:\n", reduced_U, "\n")

H = reduced_U.T @ X
print("H.shape: ", H.shape)
print("H[:,0]: ", H[:, 0])
mask =  [3 2 1 0]

U:
 [[ 0.31548719  0.58202985  0.65658877 -0.36138659]
 [-0.3197231  -0.59791083  0.73016143  0.08452251]
 [-0.47983899 -0.07623608 -0.17337266 -0.85667061]
 [ 0.75365743 -0.54583143 -0.07548102 -0.3582892 ]]

sorted_U:
 [[-0.36138659  0.65658877  0.58202985  0.31548719]
 [ 0.08452251  0.73016143 -0.59791083 -0.3197231 ]
 [-0.85667061 -0.17337266 -0.07623608 -0.47983899]
 [-0.3582892  -0.07548102 -0.54583143  0.75365743]]

sorted_lmbd:
 [4.20005343 0.24105294 0.0776881  0.02367619]

reduced_U:
 [[-0.36138659  0.65658877]
 [ 0.08452251  0.73016143]
 [-0.85667061 -0.17337266]
 [-0.3582892  -0.07548102]]

H.shape:  (2, 150)
H[:,0]:  [2.68412563 0.31939725]
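Keeping only the two largest eigenvalues already captures most of the variance; a quick check of my own, using the eigenvalues printed above:

# fraction of the total variance captured by the first k components
print(sorted_lmbd[:k].sum() / sorted_lmbd.sum())   # roughly 0.978 for k = 2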
# To get a better understanding of the interaction of the dimensions,
# plot the first two PCA dimensions
ax = plt.gca()
H[0, :] = H[0, :] * -1   # the sign of each eigenvector is arbitrary; flip the first component so the plot matches sklearn's output
ax.scatter(H.T[:, 0], H.T[:, 1], c=y_raw, cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First two PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
plt.show()
ax = plt.gca()
X_reduced = PCA(n_components=2).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y_raw, cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First two PCA directions (using sklearn PCA function)")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
plt.show()
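The sign of each eigenvector is arbitrary (flipping an eigenvector still satisfies the eigenvalue equation), which is why the manual result and sklearn's can differ by a sign per component. Instead of hard-coding the flip, the signs can be aligned automatically; a sketch of my own, assuming H and X_reduced from the cells above (it works whether or not the manual flip has already been applied):

# flip each component of H so that it points in the same direction as sklearn's result
signs = np.sign(np.sum(H.T * X_reduced, axis=0))   # +1 or -1 per component
H_aligned = H * signs[:, None]
print(np.allclose(H_aligned.T, X_reduced))          # should print True, up to numerical precision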
$$\vec x_i \approx \sum_{j=1}^k \vec u_j h_{ji} = \vec y_i \iff X \approx U_k H_k$$ where $\vec y_i$ is the reconstruction of the reduced representation back in the original space.
There are two equivalent ways to compute the truncation (reconstruction) error: directly, as the mean squared reconstruction residual, or as the sum of the discarded eigenvalues.
H = reduced_U.T @ X
y = reduced_U @ H   # reconstruction of X from the reduced representation

# Two different calculations of the truncation error
trunc_err_1 = ((X - y)**2).sum() / X_raw.shape[1]   # mean squared reconstruction error
print("trunc_err_1 = ", trunc_err_1)

trunc_err_2 = sorted_lmbd[k:].sum()                 # sum of the discarded eigenvalues
print("trunc_err_2 = ", trunc_err_2)
trunc_err_1 =  0.10136429572959302
trunc_err_2 =  0.10136429572959307
print(sorted_lmbd)
print(sorted_lmbd[k:])
sorted_lmbd[k:].sum()
[4.20005343 0.24105294 0.0776881  0.02367619]
[0.0776881  0.02367619]
0.10136429572959307
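Finally, the eigenvalues found here can be compared with sklearn's. If I recall correctly, sklearn normalizes the covariance by N-1 rather than the 1/N used above, so its explained_variance_ attribute should match sorted_lmbd only up to that factor; a sketch to check this assumption:

pca = PCA(n_components=2).fit(iris.data)
N = iris.data.shape[0]
# rescale sklearn's eigenvalues from the 1/(N-1) convention to the 1/N convention used above
print(pca.explained_variance_ * (N - 1) / N)   # should be close to sorted_lmbd[:k]
print(sorted_lmbd[:k])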