Matching Documents with TFIDF Weighted Document Vectors

Recommending jobs based on resume

TFIDF, Document Vector, and TFIDF-Weighted Document Vector

A TFIDF vector represents a document by sparse term weights, a document vector averages the word vectors of its tokens, and a TFIDF-weighted document vector sums the word vectors weighted by their TFIDF scores, so distinctive words contribute more.

First, install necessary packages.

# install spaCy and download the medium English model (it includes word vectors)
!pip install -U spacy
!python -m spacy download en_core_web_md

Read in data.

!pip install docx2txt
!pip install PyPDF2
import glob
import docx2txt
import PyPDF2
# read in jobs
jobs = []
fnames = []
# docx documents
for fname in glob.iglob('data/jobs/*.docx', recursive=True):
    fnames.append(fname)
    jobs.append( docx2txt.process(fname) )
# DOCX documents
for fname in glob.iglob('data/jobs/*.DOCX', recursive=True):
    fnames.append(fname)
    jobs.append( docx2txt.process(fname) )
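
glob matching is case-sensitive on most filesystems, which is why the upper-case extension gets its own pass above. An alternative sketch (not the original approach) handles both in one pass:

for fname in glob.iglob('data/jobs/*'):
    if fname.lower().endswith('.docx'):
        fnames.append(fname)
        jobs.append(docx2txt.process(fname))
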
# pdf documents (PyPDF2 >= 3 API; PdfFileReader/getPage are removed)
for fname in glob.iglob('data/jobs/*.pdf', recursive=True):
    fnames.append(fname)
    with open(fname, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        tmp = ''
        for page in pdf_reader.pages:
            tmp += page.extract_text()
    jobs.append(tmp)
# read in resumes
resumes = []
# docx documents
for fname in glob.iglob('data/resumes/*.docx', recursive=True):
    fnames.append(fname)
    resumes.append( docx2txt.process(fname) )
# pdf documents
for fname in glob.iglob('data/resumes/*.pdf', recursive=True):
    fnames.append(fname)
    with open(fname, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        tmp = ''
        for page in pdf_reader.pages:
            tmp += page.extract_text()
    resumes.append(tmp)
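
A quick sanity check (not part of the original) confirms everything was read:

print(len(jobs), 'job descriptions and', len(resumes), 'resumes loaded')
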
# import and load library
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_md')  # the md model we downloaded; en_core_web_sm has no real word vectors
# keep only lemmas of tokens that are alphabetic, have a vector, and are
# not spaces, stop words, punctuation, or number-like
docs = []
# create a list of spaCy document vectors (the average of the token vectors)
docs_vec = []
# create a dictionary mapping each lemma to its word vector
word_vec = dict()
for doc in jobs + resumes:
    tmp = nlp(doc.lower())
    docs_vec.append(tmp.vector)
    doc_i = ''
    for t in tmp:
        if t.is_alpha and t.has_vector and not (t.is_space or t.is_punct or t.is_stop or t.like_num):
            doc_i += ' ' + t.lemma_
            if t.lemma_ not in word_vec:  # key on the lemma string, not the Token object
                word_vec[t.lemma_] = t.vector
    docs.append(doc_i)
# stack the word vectors into a numpy array, remembering the word order
words_vec = np.vstack(list(word_vec.values()))
word_order = list(word_vec.keys())
# get tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()  # the documents go to fit_transform, not the constructor
tfs = tfidf.fit_transform(docs)
doc_tfidf = tfs.toarray()
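
As an illustration (not in the original), the most distinctive terms of a document can be read off its tfidf row (get_feature_names_out requires scikit-learn >= 1.0):

top = doc_tfidf[0].argsort()[::-1][:10]  # ten highest-weighted terms of the first document
print(np.array(tfidf.get_feature_names_out())[top])
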
# the vectorizer's default token_pattern drops single-character tokens,
# so if the vocabularies differ, check which words are missing
if words_vec.shape[0] != tfs.shape[1]:
    print(set(word_order).difference(set(tfidf.get_feature_names_out())))
{'y', 'w', 'r', 'n', 'm', 'x', 'd', 't', 'e', 'l', 'o', 'c', 's', 'p', 'f'}
# slice words_vec down to the words the vectorizer kept, in the vectorizer's order
ind = [word_order.index(i) for i in tfidf.get_feature_names_out()]
words_vec = words_vec[ind, :]
# calculate tfidf-weighted document vectors
doc_wt_vec = np.dot(doc_tfidf, words_vec)
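
This single matrix product does the weighting and summing in one step: row j of doc_wt_vec is the sum over words i of tfidf[j, i] times the vector of word i. A toy check with made-up numbers:

toy_tfidf = np.array([[0.8, 0.0, 0.2]])            # one document, three vocabulary words
toy_wv = np.array([[1., 0.], [0., 1.], [1., 1.]])  # 2-d word vectors
np.dot(toy_tfidf, toy_wv)                          # -> [[1.0, 0.2]], the weighted sum
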
# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
tfidf_cos_sim = cosine_similarity(tfs, tfs)
wt_vec_cos_sim = cosine_similarity(doc_wt_vec, doc_wt_vec)
doc_vec_cos_sim = cosine_similarity(docs_vec, docs_vec)
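
Rows and columns follow the order jobs then resumes, so, for illustration (not in the original), the similarity between the first resume and the first job is:

print(tfidf_cos_sim[len(jobs), 0])  # first resume row vs first job column
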
import matplotlib.pyplot as plt
import seaborn as sns
# one heatmap per similarity matrix, each in its own figure
for mat in (tfidf_cos_sim, wt_vec_cos_sim, doc_vec_cos_sim):
    plt.figure()
    sns.heatmap(mat, cmap="YlGnBu", xticklabels=False, yticklabels=fnames)
plt.show()
def recommend_jobs(resume=9, n=6):
    # boolean mask over fnames marking the job postings (jobs were read in first)
    jobs_ind = np.array(['job' in i for i in fnames])
    job_rows = np.where(jobs_ind)[0]
    # similarity of the chosen resume to every job
    sims = tfidf_cos_sim[resume, jobs_ind]
    # top-n most similar jobs, mapped back to full-corpus row indices
    rec_list = job_rows[sims.argsort()[::-1][:n]]
    # drop the query document itself, keeping the ranking order
    rec_list = rec_list[rec_list != resume]
    rec_df = pd.DataFrame(dict(index=rec_list,
                               filename=np.array(fnames)[rec_list]))
    return rec_df
resume = 9
recommend_jobs(resume)
   index                                           filename
0      0  data/jobs/Asst Manager Trust Administration.docx
1      1        data/jobs/CDL - EVP Head of Asset Mgt.docx
2      2   data/jobs/Citco - Hedge Fund Accountant JD.DOCX
3      3              data/jobs/Asst Manager Trust PDF.pdf
4      4        data/jobs/Corp Sec Senior Executive JD.pdf
5      5              data/jobs/Asst Finance Mgr - JD.pdf

Supervised Learning

Now, if we have match/non-match labels between resumes and jobs, we can train a logistic regression model; a minimal sketch follows.
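
Below, assuming a hypothetical labels list of (resume_index, job_index, match) triples that is not part of the original data, the three similarity scores computed above serve as features:

from sklearn.linear_model import LogisticRegression
X, y = [], []
for r, j, match in labels:
    # three similarity features per (resume, job) pair
    X.append([tfidf_cos_sim[r, j], wt_vec_cos_sim[r, j], doc_vec_cos_sim[r, j]])
    y.append(match)  # 1 = match, 0 = non-match
clf = LogisticRegression().fit(X, y)
# predicted probability that resume r matches job j
clf.predict_proba([[tfidf_cos_sim[r, j], wt_vec_cos_sim[r, j], doc_vec_cos_sim[r, j]]])[:, 1]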

Keras

from keras.models import Model
from keras.layers import Input, Embedding, Dot, Add, Flatten
from keras.regularizers import l2
from keras.optimizers import SGD, Adam
Using TensorFlow backend.
N = len(resumes)  # number of resumes
M = len(jobs)     # number of job postings
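
Below is a minimal sketch of an embedding-based match model, assuming a hypothetical pairs DataFrame with columns resume_id, job_id, and match (1 = good fit, 0 = not): the dot product of learned resume and job embeddings plus bias terms is squashed through a sigmoid, i.e. logistic regression on a learned interaction feature. (Dense is imported in addition to the layers above.)

from keras.models import Model
from keras.layers import Input, Embedding, Dot, Add, Flatten, Dense
from keras.regularizers import l2
from keras.optimizers import Adam

K = 16  # embedding dimension (an assumption)
resume_in = Input(shape=(1,))  # resume index
job_in = Input(shape=(1,))     # job index
# K-dimensional embedding per resume and per job, with l2 shrinkage
r_emb = Embedding(N, K, embeddings_regularizer=l2(1e-5))(resume_in)  # (batch, 1, K)
j_emb = Embedding(M, K, embeddings_regularizer=l2(1e-5))(job_in)
# per-resume and per-job bias terms
r_bias = Embedding(N, 1)(resume_in)
j_bias = Embedding(M, 1)(job_in)
# dot product of the embeddings plus biases = raw match score
x = Dot(axes=2)([r_emb, j_emb])  # (batch, 1, 1)
x = Add()([x, r_bias, j_bias])
x = Flatten()(x)
out = Dense(1, activation='sigmoid')(x)  # match probability
model = Model(inputs=[resume_in, job_in], outputs=out)
model.compile(loss='binary_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
# model.fit([pairs.resume_id, pairs.job_id], pairs.match, epochs=10)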