The Santander Customer Competition
Autor: Tim • April 5, 2018 • 1,468 Words (6 Pages) • 577 Views
...
(76020, 308)
In [2]: # clean and split data
from sklearn.cross_validation import train_test_split
from sklearn import linear_model, metrics
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
import numpy as np
# remove constant columns (std = 0)
remove = []
for col in train.columns:
if train[col].std() == 0:
remove.append(col)
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
4
# remove duplicated columns
remove = []
cols = train.columns
for i in range(len(cols)-1):
v = train[cols[i]].values
for j in range(i+1,len(cols)):
if np.array_equal(v,train[cols[j]].values):
remove.append(cols[j])
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
# split data into train and test
test_id = test.ID
test = test.drop(["ID"],axis=1)
X = train.drop(["TARGET","ID"],axis=1)
y = train.TARGET.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape,
Out[2]: ((60816, 306), (15204, 306))
5.2 Principal Component Analysis
In [6]: from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
pca = PCA(n_components=None)
pca.fit(X_train)
pc = pd.DataFrame(pca.explained_variance_ratio_,columns=[’Proportion’])
pc["Cumulative"] = [np.sum(pca.explained_variance_ratio_[0:i]) for i in range(0,len(pc))]
# principal components
pc.iloc[0:9,:]
Out[6]: Proportion Cumulative
0 0.668105 0.000000
1 0.149682 0.668105
2 0.077319 0.817786
3 0.037672 0.895106
4 0.034582 0.932777
5 0.014138 0.967359
6 0.004752 0.981497
7 0.003842 0.986249
8 0.002488 0.990091
In [7]: # https://www.kaggle.com/tuomastik/santander-customer-satisfaction/pca-visualization
%matplotlib inline
import matplotlib.pyplot as plt
5
# normalize each feature to unit norm (vector length)
X_train_normalized = normalize(X_train, axis=0)
# project principal components onto new axis
pca = PCA(n_components=2)
pca.fit(X_train)
X_train_projected = pca.fit_transform(X_train_normalized)
# visualize
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(1, 1, 1)
colors = [(0.0, 0.63, 0.69), ’black’]
markers = ["o", "D"]
classes = [0,1]
labels = ["Unsatisfied","Satisfied"]
for class_ix, marker, color, label in zip(
classes, markers, colors, labels):
ax.scatter(X_train_projected[np.where(y_train == class_ix), 0],
X_train_projected[np.where(y_train == class_ix), 1],
marker=marker, color=color, edgecolor=’whitesmoke’,
linewidth=’1’, alpha=0.9, label=label)
ax.legend(loc=’best’)
plt.title(
"Scatter plot of the training data examples projected on the "
"2 first principal components")
plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (
pca.explained_variance_ratio_[0] * 100.0))
plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (
pca.explained_variance_ratio_[1] * 100.0))
plt.show()
//anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise compariif self. edgecolors == str(’face’):
6
5.3 Feature selection
In [ ]: from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train, y_train)
fs = SelectFromModel(selector, prefit=True)
# Data sets with reduced dimensions (36)
X_train2 = fs.transform(X_train)
X_test2 = fs.transform(X_test)
...