Machine Learning with Python
machine learning process
build a data set
exploratory analysis: histograms/density plots for distributions; scatter plots (numeric-numeric), bar plots (categorical-numeric, or categorical-categorical proportions), and box plots (numeric-categorical) for relationships
preprocess: handle duplicates, outliers, and missing values; standardize features
choose a model and a metric
train-test split: 20% hold-out test set, 5-fold cross-validation
train and tune hyperparameters
evaluate on the held-out test set
prediction, interpretation, and result packaging (a minimal end-to-end sketch follows this list)
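A minimal end-to-end sketch of these steps, assuming a CSV of numeric features plus a categorical target column named 'outcome_category' (the file name, column name, and model choice are all placeholders):
# minimal sketch of the full process (file/column names are placeholders)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

dat = pd.read_csv('data.csv')             # build a data set
dat = dat.drop_duplicates().dropna()      # basic preprocessing
X = dat.drop(columns='outcome_category')  # assumes the remaining columns are numeric
y = dat['outcome_category']

# 20% hold-out test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# choose a model and a metric; 5-fold cross-validation on the training set
pipe = Pipeline([('std', StandardScaler()),
                 ('logit', LogisticRegression(max_iter=1000))])
print(cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean())

# final fit, evaluation on the test set, prediction
pipe.fit(X_train, y_train)
print(classification_report(y_test, pipe.predict(X_test)))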
Tips:
remember to set random_state (42, preferably) for reproducibility.
# check NA
dat.isnull().sum()
# pairwise relationships via pair plot (factorize object columns so they can be plotted)
import seaborn as sns
sns.set(rc={'figure.figsize': (11.7, 8.27)})  # figure size
dat_p = dat.copy()
for i in dat_p.select_dtypes(include='object').columns.values:
    dat_p[i] = dat_p[i].factorize()[0]  # [0] gives the integer codes
sns.pairplot(dat_p, vars=[], hue='outcome_category', height=3)  # fill vars with the columns to plot; 'size' in older seaborn
# Plot colored by continent for years 2000-2007
import matplotlib.pyplot as plt
sns.pairplot(df[df['year'] >= 2000],
             vars=['life_exp', 'log_pop', 'log_gdp_per_cap'],
             hue='continent', diag_kind='kde',
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
             height=4)  # 'size' in older seaborn
# Title
plt.suptitle('Pair Plot of Socioeconomic Data for 2000-2007', size=28)
ways to visualize text
plot top words
plot length by department (a sketch follows the top-words code below)
plot top words
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False)
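A sketch of the "plot length by department" idea, assuming the same df with 'Review Text' and 'Department Name' columns:
# review length (in words) by department
import seaborn as sns
import matplotlib.pyplot as plt
df['review_len'] = df['Review Text'].astype(str).str.split().str.len()
sns.boxplot(x='Department Name', y='review_len', data=df)
plt.xticks(rotation=45)
plt.show()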
# show top words that distinguish the reviews from a general English background corpus
import scattertext as st
import spacy
from pprint import pprint
nlp = spacy.load('en_core_web_sm')  # or en_core_web_md
corpus = st.CorpusFromPandas(df, category_col='Department Name', text_col='Review Text', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))
# show top words associated with a category
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Tops Score'] = corpus.get_scaled_f_scores('Tops')
pprint(list(term_freq_df.sort_values(by='Tops Score', ascending=False).index[:10]))
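scattertext can also render an interactive HTML scatter plot of one category against the rest; a minimal sketch for the corpus above (the output file name is arbitrary):
html = st.produce_scattertext_explorer(corpus,
                                       category='Tops',
                                       category_name='Tops',
                                       not_category_name='Other departments',
                                       width_in_pixels=1000)
open('department_terms.html', 'wb').write(html.encode('utf-8'))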
train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)
column transformer pipe:
## Standardize numeric features by removing the mean and scaling to unit variance
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
X = data.loc[:, 'X0':'X41']
num_cols = X.select_dtypes(exclude='object').columns.values
cat_cols = X.select_dtypes(include='object').columns.values
## Numeric: impute with the median, then standardize
num_pipe = Pipeline([('impute', SimpleImputer(strategy='median')),
                     ('std', StandardScaler())])
## Categorical: impute with a constant, then one-hot encode
cat_pipe = Pipeline([('impute', SimpleImputer(strategy='constant',
                                              fill_value='MISSING')),
                     ('ohe', OneHotEncoder(sparse=False,  # sparse_output=False on scikit-learn >= 1.2
                                           handle_unknown='ignore'))])
ct = ColumnTransformer([('num', num_pipe, num_cols),
                        ('cat', cat_pipe, cat_cols)])
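A quick sanity check of the transformer, assuming X_train from the split above holds these columns:
X_train_t = ct.fit_transform(X_train)
print(X_train_t.shape)  # rows x (numeric + one-hot encoded) columns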
nlp
drop na and empty:
blanks = []  # start with an empty list
for i, lb, rv in df.itertuples():  # iterate over the DataFrame
    if type(rv) == str:            # avoid NaN values
        if rv.isspace():           # test 'review' for whitespace
            blanks.append(i)       # add matching index numbers to the list
df.drop(blanks, inplace=True)
df.dropna(inplace=True)
tfidf vectorize
from sklearn.feature_extraction.text import TfidfVectorizer
add this to the pipe, then use multinomial naive bayes: ('tfidf', TfidfVectorizer(sublinear_tf = True, stop_words = 'english', ngram_range=(1, 2)))
from sklearn.naive_bayes import MultinomialNB
('mnb', MultinomialNB() )
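Putting those two steps together, a sketch of the full text-classification pipe, assuming X_train/y_train here hold raw review texts and their labels:
from sklearn.pipeline import Pipeline
text_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(sublinear_tf=True, stop_words='english', ngram_range=(1, 2))),
    ('mnb', MultinomialNB())])
text_pipe.fit(X_train, y_train)          # X_train: iterable of raw documents
print(text_pipe.score(X_test, y_test))   # mean accuracy on the held-out texts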
multiclass
add this, or just use random forest: ('clf', OneVsRestClassifier(LinearSVC()))
multilabel
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
X_train = np.array(["new york is a hell of a town",
                    "new york was originally dutch"])
y_train_text = [["new york"], ["new york"]]
X_test = np.array(['nice day in nyc',
                   'hello welcome to new york. enjoy it here and london too'])
target_names = ['New York', 'London']
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train_text)
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])
classifier.fit(X_train, Y)
predicted = classifier.predict(X_test)
all_labels = mlb.inverse_transform(predicted)
for item, labels in zip(X_test, all_labels):
    print('{0} => {1}'.format(item, ', '.join(labels)))
machine pipe:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold
ct = ColumnTransformer(transformers=[('cat', cat_pipe, cat_cols),
                                     ('num', num_pipe, num_cols)])
ml_pipe = Pipeline([('transform', ct), ('ridge', Ridge())])
kf = KFold(n_splits=5, shuffle=True, random_state=42)
param_grid = {
    'transform__num__impute__strategy': ['mean', 'median'],  ## notice the double underscores __ between step names
    'ridge__alpha': [.001, 0.1, 1.0, 5, 10, 50, 100, 1000],
}
gs = GridSearchCV(ml_pipe, param_grid, cv=kf, scoring='neg_mean_absolute_error')
gs.fit(X_train, y_train)
gs.best_params_
# e.g. {'ridge__alpha': 10, 'transform__num__impute__strategy': 'median'}
gs.best_score_
## inspect the fitted one-hot encoder
pl = gs.best_estimator_.named_steps['transform'].named_transformers_['cat']
ohe = pl.named_steps['ohe']
ohe.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
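To see every candidate the grid search tried rather than just the best one, cv_results_ can be put in a DataFrame:
import pandas as pd
cv_results = pd.DataFrame(gs.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False))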
Elastic Net
from sklearn.linear_model import ElasticNet
ml_pipe = Pipeline([('transform', ct), ('enet', ElasticNet())])
params = {"enet__max_iter": [1, 5, 10],
          "enet__alpha": [0.0001, 1, 10, 100],
          "enet__l1_ratio": np.arange(0.0, 1.0, 0.5)}
gs = GridSearchCV(ml_pipe, param_grid=params, scoring='r2', cv=5)
gs.fit(X_train, y_train)
Logistic reg:
from sklearn.linear_model import LogisticRegression
ml_pipe = Pipeline([('transform', ct), ('logit', LogisticRegression(solver='liblinear'))])  # liblinear supports both l1 and l2
params = {'logit__penalty': ['l1', 'l2'],
          'logit__C': np.logspace(0, 4, 10)}
gs = GridSearchCV(ml_pipe, param_grid=params, scoring='roc_auc', cv=5)
Random Forest:
max_features: the maximum number of features Random Forest is allowed to try in an individual tree
n_estimators: the number of trees you want to build
min_samples_split: the minimum number of samples required to split an internal node
min_samples_leaf: the minimum number of samples required to be at a leaf node
Regressor:
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor(n_estimators=100, criterion='mae')  # 'absolute_error' on scikit-learn >= 1.0
param_grid = {"max_depth": [3, 10],
              "max_features": [1, 3, 10],
              "min_samples_leaf": [25, 50]}
# run grid search (use a regression metric, not roc_auc)
grid_search = GridSearchCV(reg, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
Classifier:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
param_grid = {"max_depth": [3, 10],
              "max_features": [1, 3, 10],
              "min_samples_leaf": [25, 50]}
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='roc_auc', cv=5)
Test:
y_pred = gs.predict(X_test)                    # GridSearchCV predicts with the best estimator
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
enet = gs.best_estimator_.named_steps['enet']  # inspect the fitted elastic net coefficients
enet.coef_
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
draw cv results plot or draw final pred report
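The cv_df used below can be built by cross-validating a few candidate models; a sketch, assuming features/labels are the prepared (e.g. TF-IDF) design matrix and targets:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
models = [LogisticRegression(max_iter=1000),
          RandomForestClassifier(n_estimators=100),
          MultinomialNB()]  # MultinomialNB assumes non-negative (e.g. TF-IDF) features
entries = []
for model in models:
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=5)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model.__class__.__name__, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])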
import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.Product.values, yticklabels=category_id_df.Product.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
importance:
rf = grid_search.best_estimator_  # or grid_search.best_estimator_.named_steps['rf'] if the forest sits inside a pipeline
rf.feature_importances_
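To pair the importances with readable feature names, a sketch assuming the forest was trained on the output of the fitted column transformer ct defined above:
import pandas as pd
import matplotlib.pyplot as plt
feat_names = ct.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
(pd.Series(rf.feature_importances_, index=feat_names)
   .sort_values(ascending=False)
   .head(20)
   .plot(kind='barh'))
plt.xlabel('importance')
plt.show()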
stem and tfidf
import nltk
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
path = '/opt/datacourse/data/parts'
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path = subdir + os.path.sep + file
        with open(file_path, 'r') as shakes:
            text = shakes.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))  # Python 3 translate
        token_dict[file] = no_punctuation

# this can take some time
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
feature_names = tfidf.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
response = tfidf.transform(['this is a new document to score'])  # example query document
for col in response.nonzero()[1]:
    print(feature_names[col], ' - ', response[0, col])
gensim word2vec
import gensim
sentences = [['first', 'sentence'], ['second', 'sentence']]
# train word2vec on the two sentences
model = gensim.models.Word2Vec(sentences, min_count=1)
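Once trained, the learned vectors and nearest neighbours are available on model.wv:
vec = model.wv['sentence']                # 100-dimensional vector by default
print(model.wv.most_similar('sentence'))  # words closest in the embedding space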
get tfidf weighted word vector
import spacy
nlp = spacy.load('en_core_web_md')

# keep only tokens that are not space, stop words, punctuation or numbers
def keep_token(t):
    return (t.is_alpha and
            not (t.is_space or t.is_punct or
                 t.is_stop or t.like_num))

# keep lemmas
def lemmatize_doc(doc):
    return [t.lemma_ for t in doc if keep_token(t)]

# news_train is assumed to be a text dataset with .data, .target and .target_names,
# e.g. sklearn.datasets.fetch_20newsgroups(subset='train')
docs = [lemmatize_doc(nlp(doc)) for doc in news_train.data]
# get docs dictionary
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
docs_dict = Dictionary(docs)
docs_dict.filter_extremes(no_above=0.05)
docs_dict.compactify()
# get tfidf, a matrix with n rows (docs) and m columns (TF-IDF terms)
import numpy as np
docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf = model_tfidf[docs_corpus]
docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])
# get word vectors
tfidf_emb_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])
# get tfidf weighted sum of word vectors for each document
docs_emb = np.dot(docs_vecs, tfidf_emb_vecs)
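These TF-IDF-weighted document vectors can feed any ordinary classifier; a sketch using the same news_train targets:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
clf = LogisticRegression(max_iter=1000)
print(cross_val_score(clf, docs_emb, news_train.target, cv=5).mean())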
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
# tfidf_matrix is assumed to be a TF-IDF matrix of the movie descriptions, e.g.
# TfidfVectorizer(stop_words='english').fit_transform(metadata['overview'].fillna(''))
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

# Function that takes in a movie title as input and outputs the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

get_recommendations('The Dark Knight Rises')
# plot with TSNE
from sklearn.decomposition import PCA
docs_pca = PCA(n_components=8).fit_transform(docs_emb)
# and then use t-sne to project the vectors to 2D.
from sklearn import manifold
tsne = manifold.TSNE()
viz = tsne.fit_transform(docs_pca)
Plotting with matplotlib.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.margins(0.05)
zero_indices = np.where(news_train.target == 0)[0]
one_indices = np.where(news_train.target == 1)[0]
ax.plot(viz[zero_indices,0], viz[zero_indices,1], marker='o', linestyle='',
ms=8, alpha=0.3, label=news_train.target_names[0])
ax.plot(viz[one_indices,0], viz[one_indices,1], marker='o', linestyle='',
ms=8, alpha=0.3, label=news_train.target_names[1])
ax.legend()
plt.show()