PCA

Some experiments with PCA on a small toy data set that contains a duplicated column.

# Toy data set. C1 and C2 are built from the same list, so the two columns
# are identical (perfectly collinear).
C = [1, 4, 2, 2, 4, 2]
df = pd.DataFrame(
    {
        'A': list(range(1, 7)),
        'B': [1, 3, 5, 5, 3, 1],
        'C1': C,
        'C2': C,
    }
)

df

	A	B	C1	C2
0	1	1	1	1
1	2	3	4	4
2	3	5	2	2
3	4	5	2	2
4	5	3	4	4
5	6	1	2	2

# Subtract each column's mean so every feature is centred on zero, then take
# the eigen-decomposition of the sample covariance matrix of the centred data.
zero_mean = df.sub(df.mean(), axis='columns')
np.linalg.eig(zero_mean.cov())

(array([2.25783424, 4.13294994, 3.30921583, 0.        ]),
 array([[ 4.38595999e-01, -6.90751652e-01, -5.74887558e-01,
         -4.06759704e-18],
        [ 4.62601343e-01, -3.74906474e-01,  8.03396000e-01,
         -1.14581450e-16],
        [-5.44808933e-01, -4.37211213e-01,  1.09679448e-01,
         -7.07106781e-01],
        [-5.44808933e-01, -4.37211213e-01,  1.09679448e-01,
          7.07106781e-01]]))

# (not timed)
# Keep all four components. PCA.fit returns the fitted estimator, so the
# construction and the fit are chained into one statement.
pca = PCA(n_components=4).fit(zero_mean)

# Principal axes, one component per row.
pca.components_

2.61 ms ± 400 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

# Explained variance per component, then the singular values, one array per line.
print(pca.explained_variance_)
print(pca.singular_values_)

[4.13294994e+00 3.30921583e+00 2.25783424e+00 7.51030182e-33]
[4.54584972e+00 4.06768720e+00 3.35993619e+00 1.93782118e-16]
CPU times: total: 0 ns
Wall time: 0 ns

# Multiply the correlation matrix of the centred data by the principal axes
# (components transposed into columns), then divide column j by singular value j.
# NOTE(review): the intent looks like a loadings-style check — confirm.
np.matmul(zero_mean.corr(), pca.components_.T)/pca.singular_values_

	0	1	2	3
A	-0.193928	0.129562	-0.059770	-0.193283
B	-0.117592	-0.207353	-0.078473	-0.607407
C1	-0.240572	-0.059146	0.270675	-0.572923
C2	-0.240572	-0.059146	0.270675	-0.572923

# Show the principal axes as columns (one column per component).
pd.DataFrame(np.transpose(pca.components_))

	0	1	2	3
0	-0.690752	0.574888	-0.438596	0.000000e+00
1	-0.374906	-0.803396	-0.462601	-1.110223e-16
2	-0.437211	-0.109679	0.544809	-7.071068e-01
3	-0.437211	-0.109679	0.544809	7.071068e-01

# Component index used by the disabled experiment below.
i = 0
#np.dot(np.matmul(zero_mean.corr(), np.transpose(pca.components_[i][:,np.newaxis])), pca.components_[i])/pca.explained_variance_[i]

# Rank-one pieces of the centred data: for each principal axis, the scores
# (centred data projected onto the axis) times the axis itself.
# The centring does not change between iterations, so it is computed once,
# and the iteration runs over the fitted components instead of a hard-coded 4.
centered = (df - df.mean(axis=0)).to_numpy()
# np.outer flattens its arguments, so each axis is passed as a plain 1-D vector.
proj = [np.outer(np.dot(centered, axis), axis) for axis in pca.components_]

# Norm (np.linalg.norm with its default settings) of each rank-one piece.
list(map(np.linalg.norm, proj))

[4.5458497197998735,
 4.067687196947729,
 3.3599361887962864,
 6.544668208791957e-16]

# Stack eight copies of df on top of each other with a fresh 0..n-1 index.
trial_df = pd.concat([df] * 8, ignore_index=True)

# Count how often each value of column B occurs.
trial_df['B'].value_counts()

1    16
3    16
5    16
Name: B, dtype: int64

Trials on LearningCurveDisplay

# Predict column B from the remaining columns.
features = trial_df.drop(columns=['B'])
target = trial_df['B']

# 50/50 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.5,
    random_state=42,
    stratify=target,
)

tree = DecisionTreeClassifier(random_state=42)
# Learning curve at 16 evenly spaced training-set fractions from 0.1 to 1.0.
LearningCurveDisplay.from_estimator(
    tree,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 16),
)
plt.show()

CPU times: total: 938 ms
Wall time: 1.27 s