Sunday, September 29, 2019

K-Nearest Neighbor - supervised technique for classification

  • Used to classify new data points based on the distance from known data.
  • Find the K nearest neighboring points and let these points vote on the classification.
  • The idea is simple!



This example finds the movies most similar to a particular movie, and therefore worth recommending, by using the KNN idea. The code defines a distance metric between movies
based on genre information and rating information, then picks the K nearest movies.

import pandas as pd
import numpy as np
import operator
from scipy import spatial

# let's define a "distance" function that computes the distance between two movies
# based on the similarity of their genres and popularity
def ComputeDistance(a, b):
    """Return the dissimilarity between two movie records.

    Both records are indexed positionally: index 1 holds the genre flags
    (a sequence of 0/1 values, 1 meaning the movie belongs to that genre)
    and index 2 holds the popularity score.

    The result is the cosine distance between the two genre vectors plus
    the absolute difference of the two popularity scores, so a smaller
    value means more similar movies.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap


def getNeighbors(movieID, K):
    """Return the IDs of the (up to) K movies closest to ``movieID``.

    Every other entry of the module-level ``movieDict`` is scored against
    the reference movie with ``ComputeDistance``; the IDs are returned
    nearest first.

    Args:
        movieID: Key of the reference movie in ``movieDict``.
        K: Maximum number of neighbors to return.

    Returns:
        A list of movie IDs ordered by increasing distance. It holds fewer
        than K entries when ``movieDict`` contains fewer than K other
        movies, and is empty when K is zero or negative.

    Raises:
        KeyError: If ``movieID`` is not a key of ``movieDict``.
    """
    # The reference record does not change per candidate, so fetch it once.
    target = movieDict[movieID]
    scored = [
        (candidate, ComputeDistance(target, record))
        for candidate, record in movieDict.items()
        if candidate != movieID
    ]
    scored.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing over range(K)) cannot raise IndexError
    # when K is larger than the number of candidates; max() keeps a
    # negative K from being read as a "drop from the end" slice.
    return [candidate for candidate, _ in scored[:max(K, 0)]]


# Column names for the ratings file.
r_cols = ['user_id', 'movie_id', 'rating']
# u.data is a tab-delimited data set that contains every rating of a movie.
# Only the first 3 columns of the file are loaded into the data frame.
ratings = pd.read_csv('data/u.data', sep='\t', names=r_cols, usecols=range(3))
print(ratings.head())

# Group by movie_id and compute, per movie, the number of ratings ('size',
# i.e. how many people rated it) and the average rating ('mean').
# The aggregations are given by name, which is the form pandas recommends
# over passing np.size / np.mean; the resulting column labels are the same.
MovieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
print(MovieProperties.head())

# The raw number of ratings is not a comparable popularity measure, so it is
# min-max normalised into the range 0..1:
#   0 -> the movie with the fewest ratings in the data set
#   1 -> the movie with the most ratings in the data set
MovieNumRatings = pd.DataFrame(MovieProperties['rating']['size'])
# The call is wrapped across lines inside the parentheses; a bare line break
# right after '=' is a SyntaxError.
MovieNormalizeNumRatings = MovieNumRatings.apply(
    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
)
print(MovieNormalizeNumRatings)

# Build a dictionary called movieDict, keyed by movie ID.
# Each value is a tuple holding:
#   0. the movie name
#   1. the list of genre flags (1 - belongs, 0 - does not belong)
#   2. the normalised popularity score (0 to 1)
#   3. the average rating
movieDict = {}
# NOTE(review): the MovieLens u.item file is expected to be Latin-1 encoded
# (titles contain accented characters), so the default UTF-8 decoding would
# fail on it - confirm against your copy of the data.
with open('data/u.item', encoding='ISO-8859-1') as f:
    for line in f:
        # Fields are pipe-separated: ID, name, ..., then the genre flags
        # starting at index 5.
        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        genres = [int(x) for x in fields[5:25]]
        # The tuple is parenthesised so it can span several lines; a bare
        # line break right after '=' is a SyntaxError.
        movieDict[movieID] = (
            name,
            genres,
            MovieNormalizeNumRatings.loc[movieID].get('size'),
            MovieProperties.loc[movieID].rating.get('mean'),
        )

# Sanity checks: show the genre flags of two movies and the distance
# between two movies.
print(movieDict[1][1])
print(movieDict[2][1])
print(ComputeDistance(movieDict[2], movieDict[4]))


# List the K nearest neighbors of movie 1 together with their average
# rating, and compute the mean of those ratings.
K = 10
avgRating = 0
neighbors = getNeighbors(1, K)
for neighbor in neighbors:
    avgRating += movieDict[neighbor][3]
    print(movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))
# The loop only accumulates a sum; divide by the number of neighbors so that
# avgRating really holds the average (guarded against an empty result).
if neighbors:
    avgRating /= len(neighbors)

No comments:

Post a Comment