# The Santander Customer Competition

Autor:   •  April 5, 2018  •  1,468 Words (6 Pages)  •  158 Views

Page 1 of 6

...

(76020, 308)

In [2]: # clean and split data

from sklearn.cross_validation import train_test_split

from sklearn import linear_model, metrics

from sklearn.feature_selection import RFE

from sklearn.metrics import roc_auc_score

import numpy as np

# remove constant columns (std = 0)

remove = []

for col in train.columns:

if train[col].std() == 0:

remove.append(col)

train.drop(remove, axis=1, inplace=True)

test.drop(remove, axis=1, inplace=True)

4

# remove duplicated columns

remove = []

cols = train.columns

for i in range(len(cols)-1):

v = train[cols[i]].values

for j in range(i+1,len(cols)):

if np.array_equal(v,train[cols[j]].values):

remove.append(cols[j])

train.drop(remove, axis=1, inplace=True)

test.drop(remove, axis=1, inplace=True)

# split data into train and test

test_id = test.ID

test = test.drop(["ID"],axis=1)

X = train.drop(["TARGET","ID"],axis=1)

y = train.TARGET.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape,

Out[2]: ((60816, 306), (15204, 306))

5.2 Principal Component Analysis

In [6]: from sklearn.decomposition import PCA

from sklearn.preprocessing import normalize

pca = PCA(n_components=None)

pca.fit(X_train)

pc = pd.DataFrame(pca.explained_variance_ratio_,columns=[’Proportion’])

pc["Cumulative"] = [np.sum(pca.explained_variance_ratio_[0:i]) for i in range(0,len(pc))]

# principal components

pc.iloc[0:9,:]

Out[6]: Proportion Cumulative

0 0.668105 0.000000

1 0.149682 0.668105

2 0.077319 0.817786

3 0.037672 0.895106

4 0.034582 0.932777

5 0.014138 0.967359

6 0.004752 0.981497

7 0.003842 0.986249

8 0.002488 0.990091

In [7]: # https://www.kaggle.com/tuomastik/santander-customer-satisfaction/pca-visualization

%matplotlib inline

import matplotlib.pyplot as plt

5

# normalize each feature to unit norm (vector length)

X_train_normalized = normalize(X_train, axis=0)

# project principal components onto new axis

pca = PCA(n_components=2)

pca.fit(X_train)

X_train_projected = pca.fit_transform(X_train_normalized)

# visualize

fig = plt.figure(figsize=(10, 7))

colors = [(0.0, 0.63, 0.69), ’black’]

markers = ["o", "D"]

classes = [0,1]

labels = ["Unsatisfied","Satisfied"]

for class_ix, marker, color, label in zip(

classes, markers, colors, labels):

ax.scatter(X_train_projected[np.where(y_train == class_ix), 0],

X_train_projected[np.where(y_train == class_ix), 1],

marker=marker, color=color, edgecolor=’whitesmoke’,

linewidth=’1’, alpha=0.9, label=label)

ax.legend(loc=’best’)

plt.title(

"Scatter plot of the training data examples projected on the "

"2 first principal components")

plt.xlabel("Principal axis 1 - Explains %.1f %% of the variance" % (

pca.explained_variance_ratio_[0] * 100.0))

plt.ylabel("Principal axis 2 - Explains %.1f %% of the variance" % (

pca.explained_variance_ratio_[1] * 100.0))

plt.show()

//anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise compariif self. edgecolors == str(’face’):

6

5.3 Feature selection

In [ ]: from sklearn.ensemble import ExtraTreesClassifier

from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier(random_state=1729)

selector = clf.fit(X_train, y_train)

fs = SelectFromModel(selector, prefit=True)

# Data sets with reduced dimensions (36)

X_train2 = fs.transform(X_train)

X_test2 = fs.transform(X_test)

...