- Used to classify new data points based on the distance from known data.
- Find the K nearest neighboring points and let these points vote on the classification.
- the idea is simple!
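This example finds the movies most similar to a particular movie (and therefore the ones to recommend alongside it) by using the KNN idea. The code defines the distance between two movies from their genre information and
from their rating information (popularity), and then picks the K nearest movies.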
import operator

import numpy as np
import pandas as pd
from scipy import spatial

# movieID -> (name, genre flags, normalized popularity, average rating).
# Populated by main(); getNeighbors() looks movies up here.
movieDict = {}


def ComputeDistance(a, b):
    """Return the distance between two movies.

    The distance is the cosine distance between the genre vectors plus the
    absolute difference of the popularity scores.

    a, b: movieDict entries. Index 1 holds the genre flags, e.g.
        [0, 1, 1, 1, 0, ...] (1 - belongs, 0 - does not belong) and index 2
        holds the popularity score.
    """
    genre_distance = spatial.distance.cosine(a[1], b[1])
    popularity_distance = abs(a[2] - b[2])
    return genre_distance + popularity_distance


def getNeighbors(movieID, K):
    """Return the IDs of the K movies in movieDict closest to movieID.

    The result is ordered from nearest to farthest and never contains
    movieID itself. When fewer than K other movies are known, all of them
    are returned instead of failing with an IndexError.

    Raises KeyError when movieID is not in movieDict.
    """
    target = movieDict[movieID]
    distances = [
        (other_id, ComputeDistance(target, movieDict[other_id]))
        for other_id in movieDict
        if other_id != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # max() keeps a negative K from turning into a "drop the last items" slice.
    return [other_id for other_id, _ in distances[:max(K, 0)]]


def main():
    """Load the ratings and movie data, then print movie 1's nearest neighbors."""
    # define columns names
    r_cols = ['user_id', 'movie_id', 'rating']

    # u.data is a tab delimited data set that contains every rating of a movie.
    # Take the first 3 columns of the data file and import them into a data frame.
    ratings = pd.read_csv('data/u.data', sep='\t', names=r_cols, usecols=range(3))
    print(ratings.head())

    # Group by movie_id and compute the total number of ratings and the
    # average rating. "size" is how many people rated the movie.
    # String aggregation names give the same 'size' / 'mean' column labels as
    # the numpy callables without relying on pandas' deprecated mapping of them.
    MovieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
    print(MovieProperties.head())

    # The raw number of ratings is not a usable popularity measure, so
    # min-max normalize it: 0 is the least-rated movie in the data set and
    # 1 is the most-rated one.
    MovieNumRatings = pd.DataFrame(MovieProperties['rating']['size'])
    MovieNormalizeNumRatings = MovieNumRatings.apply(
        lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    )
    print(MovieNormalizeNumRatings)

    # Build the big dictionary movieDict. Each entry contains:
    #   1. the movie name
    #   2. the list of genre flags (1 - belongs, 0 - does not belong)
    #   3. the normalized popularity score, 0 to 1
    #   4. the average rating
    # NOTE(review): assumes this is the MovieLens 100k u.item file, which is
    # ISO-8859-1 encoded; the platform default encoding can fail on accented
    # titles. Confirm against the actual data file.
    with open('data/u.item', encoding='ISO-8859-1') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            fields = line.rstrip('\n').split('|')
            movieID = int(fields[0])
            name = fields[1]
            genres = [int(x) for x in fields[5:25]]
            movieDict[movieID] = (
                name,
                genres,
                MovieNormalizeNumRatings.loc[movieID].get('size'),
                MovieProperties.loc[movieID].rating.get('mean'),
            )

    print(movieDict[1][1])
    print(movieDict[2][1])
    print(ComputeDistance(movieDict[2], movieDict[4]))

    K = 10
    avgRating = 0
    neighbors = getNeighbors(1, K)
    for neighbor in neighbors:
        avgRating += movieDict[neighbor][3]
        print(movieDict[neighbor][0] + " " + str(movieDict[neighbor][3]))

    # Turn the accumulated sum into the neighbors' average rating; divide by
    # the number actually returned, which can be smaller than K.
    if neighbors:
        avgRating /= len(neighbors)
        print(avgRating)


if __name__ == "__main__":
    main()

No comments:
Post a Comment