import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Non-Negative matrix factorization (NMF)

  • NMF = Non-negative matrix factorization
    • Dimension reduction technique
    • NMF models are interpretable (unlike PCA)
    • Easy to interpret means easy to explain
    • However, all sample features must be non-negative ($\ge 0$)
  • NMF components
    • Just like PCA has principal components
    • Dimension of components = dimension of samples
    • Entries are non-negative
    • Can be used to reconstruct the samples
    • Combine feature values with components
  • Sample reconstruction
    • Multiply components by feature values, and add up
    • Can also be expressed as a product of matrices
    • This is the "Matrix Factorization" in "NMF"

NMF applied to Wikipedia articles

In the video, you saw NMF applied to transform a toy word-frequency array. Now it's your turn to apply NMF, this time using the tf-idf word-frequency array of Wikipedia articles, given as a csr matrix articles. Here, fit the model and transform the articles. In the next exercise, you'll explore the result.

Preprocess

from scipy.sparse import csr_matrix

# Read the word-frequency table; the first CSV column is used as the row index
documents = pd.read_csv('./dataset/wikipedia-vectors.csv', index_col=0)
# The CSV column labels are kept as the article titles
titles = documents.columns
# Transpose so that each row of `articles` corresponds to one CSV column
# (one title), stored as a sparse matrix
# NOTE(review): presumably the CSV rows are vocabulary words - confirm against the data file
articles = csr_matrix(documents.values).T
from sklearn.decomposition import NMF

# Create an NMF instance with 6 components: model
model = NMF(n_components=6)

# Fit the model to articles
model.fit(articles)

# Transform the articles with the fitted model: nmf_features
# (one row per row of `articles`, one column per NMF component)
nmf_features = model.transform(articles)

# Print the NMF features
print(nmf_features)
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.40447144e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 5.66581665e-01]
 [3.82052712e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.98630002e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 3.81723960e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.85497565e-01]
 [1.29288170e-02 1.37900639e-02 7.76326408e-03 3.34365996e-02
  0.00000000e+00 3.34508155e-01]
 [0.00000000e+00 0.00000000e+00 2.06741971e-02 0.00000000e+00
  6.04540794e-03 3.59046120e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 4.90956931e-01]
 [1.54271421e-02 1.42828947e-02 3.76635009e-03 2.37026001e-02
  2.62642981e-02 4.80754528e-01]
 [1.11736323e-02 3.13702678e-02 3.09484990e-02 6.56762061e-02
  1.96694618e-02 3.38274818e-01]
 [0.00000000e+00 0.00000000e+00 5.30717612e-01 0.00000000e+00
  2.83704029e-02 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.56508094e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.20125112e-02 6.50087569e-03 3.12244190e-01 6.09549744e-02
  1.13871286e-02 1.92593939e-02]
 [3.93478571e-03 6.24483457e-03 3.42372089e-01 1.10728765e-02
  0.00000000e+00 0.00000000e+00]
 [4.63812699e-03 0.00000000e+00 4.34913555e-01 0.00000000e+00
  3.84308261e-02 3.08119905e-03]
 [0.00000000e+00 0.00000000e+00 4.83287460e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.65006510e-03 1.83547516e-02 3.76531712e-01 3.25342948e-02
  0.00000000e+00 1.13329771e-02]
 [0.00000000e+00 0.00000000e+00 4.80912131e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.01923006e-03 5.51006051e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.65968041e-01 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.14088418e-02 2.08654946e-02 5.17579649e-01
  5.81458673e-02 1.37848139e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 5.10290254e-01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 5.60141699e-03 0.00000000e+00 4.22226760e-01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.36592958e-01
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 4.97911506e-01
  0.00000000e+00 0.00000000e+00]
 [9.88376115e-02 8.60100028e-02 3.91034522e-03 3.80879401e-01
  4.39283084e-04 5.22130114e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 5.71962504e-01
  0.00000000e+00 7.13513359e-03]
 [1.31466473e-02 1.04860275e-02 0.00000000e+00 4.68736079e-01
  0.00000000e+00 1.16305318e-02]
 [3.84543550e-03 0.00000000e+00 0.00000000e+00 5.75501882e-01
  0.00000000e+00 0.00000000e+00]
 [2.25241869e-03 1.38746694e-03 0.00000000e+00 5.27754407e-01
  1.20275139e-02 1.49477806e-02]
 [0.00000000e+00 4.07574382e-01 1.85713967e-03 0.00000000e+00
  2.96635743e-03 4.52315589e-04]
 [1.53418232e-03 6.08212140e-01 5.22275466e-04 6.24626335e-03
  1.18454877e-03 4.40049387e-04]
 [5.38809700e-03 2.65034105e-01 5.38508926e-04 1.86857967e-02
  6.38706684e-03 2.90092523e-03]
 [0.00000000e+00 6.44957364e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 6.08946122e-01 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.43707347e-01 0.00000000e+00 0.00000000e+00
  3.97828600e-03 0.00000000e+00]
 [6.10497459e-03 3.15333091e-01 1.54879481e-02 0.00000000e+00
  5.06288085e-03 4.74315077e-03]
 [6.47362189e-03 2.13342287e-01 9.49492529e-03 4.56815320e-02
  1.71929395e-02 9.52023189e-03]
 [7.99132601e-03 4.67625236e-01 0.00000000e+00 2.43337052e-02
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 6.42861446e-01 0.00000000e+00 2.35768628e-03
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.77121003e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.94295496e-01 0.00000000e+00]
 [0.00000000e+00 2.99081204e-04 2.14485182e-03 0.00000000e+00
  3.81809252e-01 5.83752705e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 5.64485513e-03
  5.42284829e-01 0.00000000e+00]
 [1.78055699e-03 7.84461186e-04 1.41627290e-02 4.59634651e-04
  4.24336362e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  5.11432598e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.28382958e-03 0.00000000e+00
  3.72916714e-01 0.00000000e+00]
 [0.00000000e+00 2.62099570e-04 3.61103149e-02 2.32246874e-04
  2.30529171e-01 0.00000000e+00]
 [1.12515562e-02 2.12341198e-03 1.60971826e-02 1.02447544e-02
  3.25487703e-01 3.75864568e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.18991783e-01 3.57664717e-04]
 [3.08367803e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.68174824e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [3.97945914e-01 2.81721215e-02 3.67011224e-03 1.70005030e-02
  1.95983506e-03 2.11635763e-02]
 [3.75795603e-01 2.07534002e-03 0.00000000e+00 3.72019376e-02
  0.00000000e+00 5.85903599e-03]
 [4.38029361e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.57882228e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.75477966e-01 4.46985638e-03 0.00000000e+00 5.29463349e-02
  0.00000000e+00 1.90989751e-02]
 [4.45195103e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00
  5.48742823e-03 0.00000000e+00]
 [2.92741164e-01 1.33673384e-02 1.14263020e-02 1.05161816e-02
  1.87711505e-01 9.23926402e-03]
 [3.78267498e-01 1.43979557e-02 0.00000000e+00 9.84882180e-02
  1.35911385e-02 0.00000000e+00]]

NMF features of the Wikipedia articles

Now you will explore the NMF features you created in the previous exercise. A solution to the previous exercise has been pre-loaded, so the array nmf_features is available. Also available is a list titles giving the title of each Wikipedia article.

When investigating the features, notice that for both actors, the NMF feature 3 has by far the highest value. This means that both articles are reconstructed using mainly the 3rd NMF component. In the next video, you'll see why: NMF components represent topics (for instance, acting!).

# Label each row of NMF features with its article title so that rows can be
# looked up by name
df = pd.DataFrame(data=nmf_features, index=titles)

# Print the NMF feature values of the two actor articles, one after the other
for actor in ('Anne Hathaway', 'Denzel Washington'):
    print(df.loc[actor])
0    0.003845
1    0.000000
2    0.000000
3    0.575502
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422227
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64

NMF reconstructs samples

In this exercise, you'll check your understanding of how NMF reconstructs samples from its components using the NMF feature values. On the right are the components of an NMF model. If the NMF feature values of a sample are [2, 1], then which of the following is most likely to represent the original sample? A pen and paper will help here! You have to apply the same technique Ben used in the video to reconstruct the sample [0.1203 0.1764 0.3195 0.141].

# NMF feature values of the sample: one weight per component
sample_feature = np.array([2, 1])

# NMF components, one per row
components = np.array([
    [1.0, 0.5, 0.0],
    [0.2, 0.1, 2.1],
])

# Reconstruct the sample as the weighted sum of the component rows
sample_feature @ components
array([2.2, 1.1, 2.1])

NMF learns interpretable parts

NMF learns topics of documents

In the video, you learned when NMF is applied to documents, the components correspond to topics of documents, and the NMF features reconstruct the documents from the topics. Verify this for yourself for the NMF model that you built earlier using the Wikipedia articles. Previously, you saw that the 3rd NMF feature value was high for the articles about actors Anne Hathaway and Denzel Washington. In this exercise, identify the topic of the corresponding NMF component.

The NMF model you built earlier is available as model, while words is a list of the words that label the columns of the word-frequency array.

After you are done, take a moment to recognise the topic that the articles about Anne Hathaway and Denzel Washington have in common!

Preprocess

# Load the vocabulary, one word per line. The file name marks it as UTF-8, so
# the encoding is passed explicitly instead of relying on the platform default
# (which is not UTF-8 everywhere and would garble or reject non-ASCII words).
with open('./dataset/wikipedia-vocabulary-utf8.txt', encoding='utf-8') as f:
    words = f.read().splitlines()

# One row per NMF component, one column per vocabulary word
components_df = pd.DataFrame(model.components_, columns=words)

# Print the shape of the DataFrame
print(components_df.shape)

# Select row 3: component
component = components_df.iloc[3]

# Print the words with the largest weights in this component
# (nlargest returns the top 5 entries by default)
print(component.nlargest())
(6, 13125)
film       0.628104
award      0.253223
starred    0.245373
role       0.211528
actress    0.186465
Name: 3, dtype: float64

Explore the LED digits dataset

In the following exercises, you'll use NMF to decompose grayscale images into their commonly occurring patterns. Firstly, explore the image dataset and see how it is encoded as an array. You are given 100 images as a 2D array samples, where each row represents a single 13x8 image. The images in your dataset are pictures of an LED digital display.

Preprocess

# Load the digit images; header=None makes pandas treat the first line as data
# rather than as column names
df = pd.read_csv('./dataset/lcd-digits.csv', header=None)
# Preview the first rows
df.head()
0 1 2 3 4 5 6 7 8 9 ... 94 95 96 97 98 99 100 101 102 103
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 104 columns

# Take the raw array out of the DataFrame and pick the first image
samples = df.values
digit = samples[0]
print(digit)

# View the flat pixel vector as 13 rows by 8 columns and print it
bitmap = np.reshape(digit, (13, 8))
print(bitmap)

# Draw the bitmap in grayscale without smoothing, with a colorbar as legend
plt.imshow(bitmap, interpolation='nearest', cmap='gray')
plt.colorbar()
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<matplotlib.colorbar.Colorbar at 0x7fe9d065dc50>

NMF learns the parts of images

Now use what you've learned about NMF to decompose the digits dataset. You are again given the digit images as a 2D array samples. This time, you are also provided with a function show_as_image() that displays the image encoded by any 1D array:

def show_as_image(sample):
    """Display a flat array as a 13x8 grayscale image with a colorbar.

    `sample` is reshaped to 13 rows by 8 columns, drawn on a new figure
    without interpolation smoothing, and shown immediately.
    """
    pixels = sample.reshape(13, 8)
    plt.figure()
    plt.imshow(pixels, interpolation='nearest', cmap='gray')
    plt.colorbar()
    plt.show()
def show_as_image(sample):
    """Display a flat array as a 13x8 grayscale image with a colorbar.

    `sample` is reshaped to 13 rows by 8 columns and drawn on a new figure
    using the gray colormap and nearest-neighbour interpolation.
    """
    bitmap = sample.reshape((13, 8))
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    # Render the figure right away so that every image is displayed even when
    # the code runs outside an inline notebook backend
    plt.show()
# Create an NMF model with 7 components: model
model = NMF(n_components=7)

# Apply fit_transform to samples: features
# (fits the model and returns the NMF feature values in one step)
features = model.fit_transform(samples)

# Call show_as_image on each component
# (each row of model.components_ is passed as one flat image)
for component in model.components_:
    show_as_image(component)
    
# Assign the 0th row of features: digit_features
# (the NMF feature values of the first row of samples)
digit_features = features[0]

# Print digit_features
print(digit_features)
[4.76823559e-01 0.00000000e+00 0.00000000e+00 5.90605054e-01
 4.81559442e-01 0.00000000e+00 7.37568241e-16]

PCA doesn't learn parts

Unlike NMF, PCA doesn't learn the parts of things. Its components do not correspond to topics (in the case of documents) or to parts of images, when trained on images. Verify this for yourself by inspecting the components of a PCA model fit to the dataset of LED digit images from the previous exercise. The images are available as a 2D array samples. Also available is a modified version of the show_as_image() function which colors a pixel red if the value is negative.

After submitting the answer, notice that the components of PCA do not represent meaningful parts of images of LED digits!

from sklearn.decomposition import PCA

# Create a PCA instance with 7 components: model
model = PCA(n_components=7)

# Apply fit_transform to samples: features
features = model.fit_transform(samples)

# Call show_as_image on each component
# (each row of model.components_ is passed as one flat image)
for component in model.components_:
    show_as_image(component)

Building recommender systems using NMF

  • Finding similar articles
    • Engineer at a large online newspaper
    • Task: recommend articles similar to the article being read by the customer
    • Similar articles should have similar topics
  • Strategy
    • Apply NMF to the word-frequency array
    • NMF feature values describe the topics, so similar documents have similar NMF feature values
    • Compare NMF feature values?
  • Versions of articles
    • Different versions of the same document have same topic proportions
    • Exact feature values may be different! E.g., because one version uses many meaningless words
    • But all versions lie on the same line through the origin
  • Cosine similarity
    • Uses the angle between the lines
    • Higher values mean more similar
    • Maximum value is 1, when angle is 0 degrees

Which articles are similar to 'Cristiano Ronaldo'?

In the video, you learned how to use NMF features and the cosine similarity to find similar articles. Apply this to your NMF model for popular Wikipedia articles, by finding the articles most similar to the article about the footballer Cristiano Ronaldo. The NMF features you obtained earlier are available as nmf_features, while titles is a list of the article titles.

from sklearn.preprocessing import normalize

# Normalize the NMF features so that dot products between rows can be read
# as cosine similarities: norm_features
norm_features = normalize(nmf_features)

# Index the normalized features by article title: df
df = pd.DataFrame(data=norm_features, index=titles)

# Feature vector of the reference article: article
article = df.loc['Cristiano Ronaldo']

# Dot product of every article with the reference article: similarities
similarities = df @ article

# Display the five articles with the largest cosine similarity
print(similarities.nlargest(n=5))
Cristiano Ronaldo                1.000000
Franck Ribéry                    0.999972
Radamel Falcao                   0.999942
Zlatan Ibrahimović               0.999942
France national football team    0.999923
dtype: float64

Recommend musical artists part I

In this exercise and the next, you'll use what you've learned about NMF to recommend popular music artists! You are given a sparse array artists whose rows correspond to artists and whose columns correspond to users. The entries give the number of times each artist was listened to by each user.

In this exercise, build a pipeline and transform the array into normalized NMF features. The first step in the pipeline, MaxAbsScaler, transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to. In the next exercise, you'll use the resulting normalized NMF features for recommendation!

Preprocess

from scipy.sparse import coo_matrix

# Listening records with artist_offset, user_offset and playcount columns
df = pd.read_csv('./dataset/scrobbler-small-sample.csv')

# Order the records by artist, then by user
artists1 = df.sort_values(by=['artist_offset', 'user_offset'], ascending=[True, True])

# Row indices (artists), column indices (users) and values (play counts)
row_ind, col_ind, data1 = (
    np.array(artists1[column])
    for column in ('artist_offset', 'user_offset', 'playcount')
)

# Sparse artist-by-user matrix of play counts, built from coordinate triplets
artists = coo_matrix((data1, (row_ind, col_ind)))
artists
<111x500 sparse matrix of type '<class 'numpy.int64'>'
	with 2894 stored elements in COOrdinate format>
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# Create a MaxAbsScaler: scaler
# (first pipeline step; rescales the input before factorization)
scaler = MaxAbsScaler()

# Create an NMF model with 20 components: nmf
nmf = NMF(n_components=20)

# Create a Normalizer: normalizer
# (last pipeline step; normalizes the NMF features)
normalizer = Normalizer()

# Create a pipeline that applies the three steps in order: pipeline
pipeline = make_pipeline(scaler, nmf, normalizer)

# Apply fit_transform to artists: norm_features
norm_features = pipeline.fit_transform(artists)

Recommend musical artists part II

Suppose you were a big fan of Bruce Springsteen - which other musical artists might you like? Use your NMF features from the previous exercise and the cosine similarity to find similar musical artists. A solution to the previous exercise has been run, so norm_features is an array containing the normalized NMF features as rows. The names of the musical artists are available as the list artist_names.

Preprocess

# Load the artist names; header=None makes pandas treat the first line as data
df = pd.read_csv('./dataset/artists.csv', header=None)

# Flatten the values into a plain list of names. reshape(-1) lets NumPy infer
# the length, so this works for any number of artists in the file rather than
# only for exactly 111.
artist_names = df.values.reshape(-1).tolist()
artist_names
['Massive Attack',
 'Sublime',
 'Beastie Boys',
 'Neil Young',
 'Dead Kennedys',
 'Orbital',
 'Miles Davis',
 'Leonard Cohen',
 'Van Morrison',
 'NOFX',
 'Rancid',
 'Lamb',
 'Korn',
 'Dropkick Murphys',
 'Bob Dylan',
 'Eminem',
 'Nirvana',
 'Van Halen',
 'Damien Rice',
 'Elvis Costello',
 'Everclear',
 'Jimi Hendrix',
 'PJ Harvey',
 'Red Hot Chili Peppers',
 'Ryan Adams',
 'Soundgarden',
 'The White Stripes',
 'Madonna',
 'Eric Clapton',
 'Bob Marley',
 'Dr. Dre',
 'The Flaming Lips',
 'Tom Waits',
 'Moby',
 'Cypress Hill',
 'Garbage',
 'Fear Factory',
 '50 Cent',
 'Ani DiFranco',
 'Matchbox Twenty',
 'The Police',
 'Eagles',
 'Phish',
 'Stone Temple Pilots',
 'Black Sabbath',
 'Britney Spears',
 'Fatboy Slim',
 'System of a Down',
 'Simon & Garfunkel',
 'Snoop Dogg',
 'Aimee Mann',
 'Less Than Jake',
 'Rammstein',
 'Reel Big Fish',
 'The Prodigy',
 'Pantera',
 'Foo Fighters',
 'The Beatles',
 'Incubus',
 'Audioslave',
 'Bright Eyes',
 'Machine Head',
 'AC/DC',
 'Dire Straits',
 'Motörhead',
 'Ramones',
 'Slipknot',
 'Me First and the Gimme Gimmes',
 'Bruce Springsteen',
 'Queens of the Stone Age',
 'The Chemical Brothers',
 'Bon Jovi',
 'Goo Goo Dolls',
 'Alice in Chains',
 'Howard Shore',
 'Barenaked Ladies',
 'Anti-Flag',
 'Nick Cave and the Bad Seeds',
 'Static-X',
 'Misfits',
 '2Pac',
 'Sparta',
 'Interpol',
 'The Crystal Method',
 'The Beach Boys',
 'Goldfrapp',
 'Bob Marley & the Wailers',
 'Kylie Minogue',
 'The Blood Brothers',
 'Mirah',
 'Ludacris',
 'Snow Patrol',
 'The Mars Volta',
 'Yeah Yeah Yeahs',
 'Iced Earth',
 'Fiona Apple',
 'Rilo Kiley',
 'Rufus Wainwright',
 'Flogging Molly',
 'Hot Hot Heat',
 'Dredg',
 'Switchfoot',
 'Tegan and Sara',
 'Rage Against the Machine',
 'Keane',
 'Jet',
 'Franz Ferdinand',
 'The Postal Service',
 'The Dresden Dolls',
 'The Killers',
 'Death From Above 1979']
# Index the normalized NMF features by artist name: df
df = pd.DataFrame(data=norm_features, index=artist_names)

# Feature vector of the reference artist: artist
artist = df.loc['Bruce Springsteen']

# Dot product of every artist with the reference artist: similarities
similarities = df @ artist

# Display the five artists with the highest cosine similarity
print(similarities.nlargest(n=5))
Bruce Springsteen    1.000000
Neil Young           0.956763
Van Morrison         0.874541
Leonard Cohen        0.866873
Bob Dylan            0.862504
dtype: float64