= [1,4,2,2,4,2]
C = pd.DataFrame({'A':[1,2,3,4,5,6], 'B':[1,3,5,5,3,1], 'C1':C, 'C2':C}) df
PCA
Some tests with PCA
df
A | B | C1 | C2 | |
---|---|---|---|---|
0 | 1 | 1 | 1 | 1 |
1 | 2 | 3 | 4 | 4 |
2 | 3 | 5 | 2 | 2 |
3 | 4 | 5 | 2 | 2 |
4 | 5 | 3 | 4 | 4 |
5 | 6 | 1 | 2 | 2 |
= df - df.mean()
zero_mean np.linalg.eig(zero_mean.cov())
(array([2.25783424, 4.13294994, 3.30921583, 0. ]),
array([[ 4.38595999e-01, -6.90751652e-01, -5.74887558e-01,
-4.06759704e-18],
[ 4.62601343e-01, -3.74906474e-01, 8.03396000e-01,
-1.14581450e-16],
[-5.44808933e-01, -4.37211213e-01, 1.09679448e-01,
-7.07106781e-01],
[-5.44808933e-01, -4.37211213e-01, 1.09679448e-01,
7.07106781e-01]]))
#not timed
= PCA(n_components=4) pca
pca.fit(zero_mean) pca.components_
2.61 ms ± 400 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
print(pca.explained_variance_, pca.singular_values_, sep='\n')
[4.13294994e+00 3.30921583e+00 2.25783424e+00 7.51030182e-33]
[4.54584972e+00 4.06768720e+00 3.35993619e+00 1.93782118e-16]
CPU times: total: 0 ns
Wall time: 0 ns
/pca.singular_values_ np.matmul(zero_mean.corr(), pca.components_.T)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
A | -0.193928 | 0.129562 | -0.059770 | -0.193283 |
B | -0.117592 | -0.207353 | -0.078473 | -0.607407 |
C1 | -0.240572 | -0.059146 | 0.270675 | -0.572923 |
C2 | -0.240572 | -0.059146 | 0.270675 | -0.572923 |
pd.DataFrame(pca.components_.T)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -0.690752 | 0.574888 | -0.438596 | 0.000000e+00 |
1 | -0.374906 | -0.803396 | -0.462601 | -1.110223e-16 |
2 | -0.437211 | -0.109679 | 0.544809 | -7.071068e-01 |
3 | -0.437211 | -0.109679 | 0.544809 | 7.071068e-01 |
=0
i#np.dot(np.matmul(zero_mean.corr(), np.transpose(pca.components_[i][:,np.newaxis])), pca.components_[i])/pca.explained_variance_[i]
= []
proj for i in range(4):
- np.mean(df,axis=0)).to_numpy(), pca.components_[i]),pca.components_[i][:,np.newaxis])) proj.append(np.outer(np.dot((df
for p in proj] [np.linalg.norm(p)
[4.5458497197998735,
4.067687196947729,
3.3599361887962864,
6.544668208791957e-16]
= pd.concat([df,df,df,df,df,df,df,df], ignore_index=True) trial_df
'B'].value_counts() trial_df[
1 16
3 16
5 16
Name: B, dtype: int64
Trials on LearningCurveDisplay
= train_test_split(trial_df.drop(columns=['B']), trial_df['B'], test_size=0.5, random_state=42, stratify= trial_df['B'])
X_train, X_test, y_train, y_test = DecisionTreeClassifier(random_state=42)
tree =np.linspace(0.1, 1.0, 16))
LearningCurveDisplay.from_estimator(tree, X_train, y_train, train_sizes plt.show()
CPU times: total: 938 ms
Wall time: 1.27 s