協同過濾常常被用於分辨某位特定顧客可能感興趣的東西,這些結論來自於對其他相似顧客對哪些產品感興趣的分析。協同過濾以其出色的速度和健壯性,在全球互聯網領域炙手可熱。 ...
# coding:utf-8
"""User-based collaborative filtering with the Pearson correlation.

The recommender works either on an in-memory ``{user: {item: rating}}``
dict or on the Book-Crossing CSV dump, whose files have these columns:

BX-Users["User-ID";"Location";"Age"]
BX-Books["ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"]
BX-Book-Ratings["User-ID";"ISBN";"Book-Rating"]
"""
__author__ = 'similarface'
# datalink=http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip

# codecs is used for encoding conversion while reading the CSV files
import codecs
import os
import sys
from math import sqrt

# Small in-memory sample: user name -> {artist: rating}
users = {
    "Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                 "Norah Jones": 4.5, "Phoenix": 5.0,
                 "Slightly Stoopid": 1.5, "The Strokes": 2.5,
                 "Vampire Weekend": 2.0},
    "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0,
             "Phoenix": 2.0, "Slightly Stoopid": 3.5,
             "Vampire Weekend": 3.0},
    "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0,
             "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
    "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5,
            "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
            "Vampire Weekend": 2.0},
    "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0,
               "The Strokes": 4.0, "Vampire Weekend": 1.0},
    "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
               "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0,
               "Vampire Weekend": 4.0},
    "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0,
            "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
    "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0,
                 "Slightly Stoopid": 2.5, "The Strokes": 3.0},
}


class recommender(object):
    """Recommend items to a user from the ratings of the k most similar users.

    Attributes set by ``__init__``:
        k: number of nearest neighbours consulted by ``recommend``.
        n: maximum number of recommendations returned.
        metric: name of the similarity metric (only 'pearson' is supported).
        fn: the similarity function selected by ``metric``.
        data: ``{user: {item: rating}}``.
        username2id, userid2name, productid2name: lookup tables filled by
            ``loadBookDB``.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Create a recommender.

        :param data: ``{user: {item: rating}}`` dict; any other value leaves
            the rating table empty (call ``loadBookDB`` to fill it).
        :param k: number of nearest neighbours to use.
        :param metric: similarity metric name; only 'pearson' is implemented.
        :param n: maximum number of recommendations to return.
        :raises ValueError: if ``metric`` is not supported.
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        else:
            # Fail here instead of with an AttributeError on self.fn later.
            raise ValueError('unsupported metric: %r' % (metric,))
        # Always define self.data so later lookups fail with a KeyError for
        # the missing user rather than an AttributeError.
        self.data = data if isinstance(data, dict) else {}

    @staticmethod
    def _read_rows(path, filename):
        """Yield the ';'-separated fields of every line of one CSV file."""
        full_path = os.path.join(path, filename)
        with codecs.open(full_path, 'r', 'utf-8', errors='ignore') as handle:
            for line in handle:
                # NOTE: naive split; a value that itself contains ';' is
                # split into several fields.
                yield line.split(';')

    def loadBookDB(self, path=''):
        """Load the Book-Crossing CSV files found in directory ``path``.

        Replaces ``self.data`` with the ratings from BX-Book-Ratings.csv and
        fills ``productid2name`` (from BX-Books.csv) and ``userid2name`` /
        ``username2id`` (from BX-Users.csv). Lines with too few fields or a
        non-integer rating are skipped. Prints the total number of lines read
        across the three files.
        """
        self.data = {}
        lines_read = 0

        # Ratings given by users to books.
        for fields in self._read_rows(path, 'BX-Book-Ratings.csv'):
            lines_read += 1
            if len(fields) < 3:
                continue
            user = fields[0].strip('"')
            book = fields[1].strip('"')
            try:
                rating = int(fields[2].strip().strip('"'))
            except ValueError:
                # Also skips a header row, whose rating column is text.
                continue
            self.data.setdefault(user, {})[book] = rating

        # Book information.
        for fields in self._read_rows(path, 'BX-Books.csv'):
            lines_read += 1
            if len(fields) < 3:
                continue
            isbn = fields[0].strip('"')
            title = fields[1].strip('"')
            author = fields[2].strip('"')
            # NOTE: 'by' is joined without surrounding spaces; kept as-is
            # because the displayed product names depend on it.
            self.productid2name[isbn] = title + 'by' + author

        # User information.
        for fields in self._read_rows(path, 'BX-Users.csv'):
            lines_read += 1
            if len(fields) < 2:
                continue
            userid = fields[0].strip('"')
            location = fields[1].strip('"')
            # A user row has exactly three columns, so the age is present
            # whenever there are at least three fields.
            if len(fields) >= 3:
                age = fields[2].strip().strip('"')
            else:
                age = 'NULL'
            if age != 'NULL':
                value = location + ' (age: ' + age + ')'
            else:
                value = location
            self.userid2name[userid] = value
            self.username2id[location] = userid

        print(lines_read)

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two ``{item: rating}`` dicts.

        Only items present in both dicts are used. The Pearson
        product-moment correlation coefficient (often written r) measures the
        linear correlation between two variables and lies between -1 and 1.
        A common rule of thumb for its magnitude:

            0.8-1.0 very strong, 0.6-0.8 strong, 0.4-0.6 moderate,
            0.2-0.4 weak, 0.0-0.2 very weak or none.

        Returns 0 when there are no common items or when either side has no
        variance over the common items.
        """
        sum_xy = sum_x = sum_y = sum_x2 = sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += x ** 2
                sum_y2 += y ** 2
        if n == 0:
            return 0
        # Float count so integer ratings are not floor-divided on Python 2.
        count = float(n)
        # Clamp at 0: rounding can leave a tiny negative value, which would
        # make sqrt() raise a math domain error.
        spread_x = max(sum_x2 - (sum_x ** 2) / count, 0.0)
        spread_y = max(sum_y2 - (sum_y ** 2) / count, 0.0)
        fenmu = sqrt(spread_x) * sqrt(spread_y)  # denominator
        if fenmu == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / count) / fenmu

    def computeNearesNeighbor(self, username):
        """Return ``[(other_user, similarity)]`` sorted by similarity, highest first.

        :raises KeyError: if ``username`` has no ratings while other users
            exist in ``self.data``.
        """
        distinces = [
            (instance, self.fn(self.data[username], ratings))
            for instance, ratings in self.data.items()
            if instance != username
        ]
        distinces.sort(key=lambda pair: pair[1], reverse=True)
        return distinces

    def recommend(self, user):
        """Return up to ``self.n`` ``(item_name, score)`` recommendations.

        The ratings of the ``self.k`` most similar users are combined,
        weighted by each neighbour's share of the total similarity; items the
        user has already rated are left out. Returns an empty list when there
        are no neighbours or their similarities sum to zero.

        :raises KeyError: if ``user`` is not in ``self.data``.
        """
        nearest = self.computeNearesNeighbor(user)
        userRating = self.data[user]

        # Slice instead of indexing so fewer than k neighbours is not an
        # IndexError.
        neighbours = nearest[:self.k]
        totalDistance = sum((score for _, score in neighbours), 0.0)
        if totalDistance == 0:
            # No usable weights; avoids a ZeroDivisionError.
            return []

        recommendations = {}
        for name, score in neighbours:
            weight = score / totalDistance
            # Walk through the items rated by this similar user.
            for item, rating in self.data[name].items():
                # Only items the target user has not rated yet.
                if item not in userRating:
                    recommendations[item] = (
                        recommendations.get(item, 0.0) + rating * weight)

        ranked = [(self.convertProductID2name(item), score)
                  for item, score in recommendations.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:self.n]

    def convertProductID2name(self, id):
        """Return the product name for a product id, or the id if unknown."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the ``n`` highest ratings given by user ``id``.

        :param id: user id (key of ``self.data``).
        :param n: number of ratings to print.
        :raises KeyError: if ``id`` is not in ``self.data``.
        """
        # Fall back to the raw id for users without a display name.
        print("Ratings for %s" % (self.userid2name.get(id, id),))
        ratings = self.data[id]
        print(len(ratings))
        ranked = [(self.convertProductID2name(item), score)
                  for item, score in ratings.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for name, score in ranked[:n]:
            print("%s\t%i" % (name, score))


if __name__ == '__main__':
    r = recommender(users)
    print(r.recommend('Veronica'))
    r.loadBookDB(u'D:/360安全瀏覽器下載/BX-CSV-Dump')
    print(r.recommend('276737'))
#result:
[('Blues Traveler', 5.0)] 1700021 [(u"Devil's Waltz (Alex Delaware Novels (Paperback))byJonathan Kellerman", 9.0), (u'Silent Partner (Alex Delaware Novels (Paperback))byJonathan Kellerman', 8.0), (u'The Outsiders (Now in Speak!)byS. E. Hinton', 8.0), (u'Sein LanguagebyJERRY SEINFELD', 8.0), (u'The Girl Who Loved Tom GordonbyStephen King', 8.0)]