利用用户行为数据(代码分析)
Poblog 07月29日 2017
数据集
GroupLens 提供的 MovieLens 数据集
u.data 各个字段含义如下
user_id movie_id rating timestamp
166 346 1 886397596
298 474 4 884182806
115 265 2 881171488
253 465 5 891628467
...
数据读取:
def readData():
    """Load user/movie interactions from the MovieLens ``./u.data`` file.

    Every line of the file holds the whitespace-separated fields
    ``user_id movie_id rating timestamp``.  Only the two ids are kept; the
    rating is replaced by the constant ``1.0`` because this study only looks
    at *whether* a user rated a movie, not at the score that was given.

    Returns:
        A list of ``[user_id, movie_id, 1.0]`` rows with the ids kept as
        strings, e.g. ``[['196', '242', 1.0], ...]``.
    """
    data = []
    fileName = './u.data'
    # The context manager closes the handle even if parsing fails.
    with open(fileName, 'r') as fr:
        for line in fr:
            # One line of text, e.g. ['196', '242', '3', '881250949'].
            lineArr = line.strip().split()
            if len(lineArr) < 2:
                # Blank or truncated line: there is no user/movie pair to keep.
                continue
            data.append([lineArr[0], lineArr[1], 1.0])
    return data
拆分训练集和测试集
def SplitData(data, M, k, seed):
    """Split the interaction rows into a training set and a test set.

    Each row draws a random integer in ``[0, M - 1]``; rows that draw ``k``
    go to the test set and all other rows go to the training set.  Calling
    the function with the same ``seed`` and every ``k`` in ``range(M)``
    therefore yields M disjoint test sets that together cover ``data``.

    Args:
        data: iterable of ``[user_id, movie_id, rating]`` rows
            (100000 rows for the u.data file).
        M: number of folds, e.g. 5.
        k: index of the fold used as the test set, e.g. 0.
        seed: seed passed to the global ``random`` generator, e.g. 0.

    Returns:
        A ``(train, test)`` tuple of lists of ``[user, item, rating]`` rows.
    """
    # Local import: no module-level import of random is visible in this file.
    import random

    test = []   # held-out rows
    train = []  # rows used to build the model
    random.seed(seed)
    # user: '196' (user_id), item: '242' (movie_id), rating: 1.0 (was rated)
    for user, item, rating in data:
        if random.randint(0, M - 1) == k:
            test.append([user, item, rating])
        else:
            train.append([user, item, rating])
    return train, test
# 将列表形式数据转换为dict形式
def transform(oriData):
    """Convert a list of interaction rows into a nested dictionary.

    Args:
        oriData: iterable of ``(user, item, rating)`` triples.

    Returns:
        A dict mapping each user to a dict of ``{item: rating}``, i.e. the
        user's behaviour record, e.g. ``{'196': {'242': 1.0}}``.  If the same
        user/item pair occurs more than once the last rating wins.
    """
    ret = {}
    for user, item, rating in oriData:
        # Create the per-user dict on first sight, then record the item.
        ret.setdefault(user, {})[item] = rating
    return ret
构建物品-用户倒排表,并计算用户相似度矩阵
def UserSimilarity(train):
    """Compute the cosine similarity between every pair of co-rating users.

    The similarity of users u and v is
    ``|items(u) & items(v)| / sqrt(|items(u)| * |items(v)|)``.
    An item -> users inverted table is built first so that only pairs of
    users who share at least one item are ever visited.

    Args:
        train: training dict ``{user: {item: rating}}``, e.g.
            ``{'196': {'242': 1.0, '241': 1.0}, '198': {'243': 1.0, ...}}``.

    Returns:
        The similarity matrix ``W`` as ``{u: {v: similarity}}``, e.g.
        ``{'753': {'624': 0.13301622404223282, '724': 0.16598500055174645}}``
        meaning the similarity of '753' and '624' is 0.133...  Users with an
        empty item dict do not appear in the result.
    """
    # Local import: no module-level import of math is visible in this file.
    import math

    # Build the item -> users inverted table,
    # e.g. {'242': {'196', ...}, ...}.
    item_users = {}
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    # C[u][v]: number of items rated by both u and v,
    #          e.g. {'753': {'624': 9, ...}} -> 9 movies in common.
    # N[u]:    number of items rated by u,
    #          e.g. {'753': 42} -> '753' rated 42 movies.
    C = {}
    N = {}
    for users in item_users.values():
        for u in users:
            N[u] = N.get(u, 0) + 1
            co_counts = C.setdefault(u, {})
            for v in users:
                if u == v:
                    continue
                co_counts[v] = co_counts.get(v, 0) + 1

    # Build W as a fresh structure.  A shallow C.copy() would share the inner
    # dicts, so writing similarities would overwrite the counts held in C.
    W = {}
    for u, related_users in C.items():
        W[u] = {
            v: cuv / math.sqrt(N[u] * N[v])
            for v, cuv in related_users.items()
        }
    return W
对相似度最高的K个用户的物品(user没有浏览过的)进行推荐指数计算
def Recommend(user, train, W, K=3):
    """Score the items that the K most similar users have and ``user`` lacks.

    Args:
        user: id of the user to recommend for, e.g. ``'1'``.
        train: training dict ``{user: {item: rating}}``.
        W: similarity matrix ``{user1: {user2: similarity}}``.
        K: number of most similar users to take into account.

    Returns:
        A dict ``{item: interest}`` where interest is the sum of
        ``similarity * rating`` over the K neighbours owning the item, e.g.
        ``{'716': 0.436617742447984, '275': 0.436617742447984, ...}``.
        Items ``user`` already interacted with are excluded.  A user that is
        missing from ``train`` or ``W`` gets an empty dict.
    """
    # Local import: no module-level import of operator is visible here.
    import operator

    rank = {}
    # Movies the user already rated, e.g. {'61': 1.0, '33': 1.0, ...}.
    # .get() keeps a cold-start user from raising KeyError.
    interacted_items = train.get(user, {})

    # The K nearest neighbours, e.g.
    # [('297', 0.4366...), ('435', 0.4350...), ('933', 0.4229...)].
    neighbours = sorted(
        W.get(user, {}).items(),
        key=operator.itemgetter(1),
        reverse=True,
    )[:K]

    for v, wuv in neighbours:
        for i, rvi in train[v].items():
            # Items the user has already rated are not recommended again.
            if i in interacted_items:
                continue
            rank[i] = rank.get(i, 0) + wuv * rvi
    return rank
def Recommendation(users, train, W, K=3):
    """Build a ranked recommendation list for every user in ``users``.

    Args:
        users: iterable of user ids (the entry script passes all users of
            the test set), e.g. ``['22', '122', '291', ...]``.
        train: training dict ``{user: {item: rating}}``.
        W: similarity matrix ``{user1: {user2: similarity}}``.
        K: number of similar users consulted per recommendation.

    Returns:
        A dict mapping each user to a list of ``(item, interest)`` tuples
        sorted by descending interest, e.g.
        ``{'22': [('176', 1.3707690244093957), ('82', 1.3707690244093957)]}``.
    """
    # Local import: no module-level import of operator is visible here.
    import operator

    result = {}
    for user in users:
        # Items recommended to the user together with their interest score.
        rank = Recommend(user, train, W, K)
        result[user] = sorted(
            rank.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    return result
//推荐结果评价相关
def GetRecommendation(result, user, N=5000):
    """Return the top-N recommendations stored for ``user``.

    Args:
        result: dict mapping a user to a list of ``(item, interest)`` tuples
            ordered best first, e.g.
            ``{'22': [('176', 1.3707690244093957), ('82', 1.37...), ...]}``.
        user: id of the user whose list is requested.
        N: maximum number of recommendations to return.

    Returns:
        The first ``N`` ``(item, interest)`` tuples of the user's list, or
        the stored list itself when it holds no more than ``N`` entries.
    """
    rank = result[user]
    if len(rank) > N:
        # More items were recommended than requested: keep the N best.
        # Copying the whole list here would make N have no effect.
        return rank[:N]
    # N or fewer items were recommended: hand the list back unchanged.
    return rank
//准确度评价
def Precision(train, test, result, N=5000):
    """Compute the precision of the recommendations.

    Precision is the number of correct recommendations divided by the total
    number of recommendations made.

    Args:
        train: training dict ``{user: {item: rating}}`` (not used here; kept
            so that all metrics share one signature).
        test: test dict ``{user: {item: rating}}``.
        result: dict ``{user: [(item, interest), ...]}``, e.g.
            ``{'22': [('176', 1.3707690244093957), ...]}``.
        N: length of the recommendation list evaluated per user.

    Returns:
        ``hits / recommended`` as a float, or ``0.0`` when nothing was
        recommended at all.
    """
    hit = 0          # recommendations that appear in the user's test items
    recommended = 0  # total number of recommendations made
    for user, tu in test.items():
        # tu: movies the user rated later,
        # e.g. {'377': 1.0, '173': 1.0, '176': 1.0, ...}.
        rank = GetRecommendation(result, user, N)
        # A recommended item that shows up in the test set is a hit.
        hit += sum(1 for item, _ in rank if item in tu)
        recommended += len(rank)
    if recommended == 0:
        # Avoid ZeroDivisionError when no recommendation exists.
        return 0.0
    return hit / (recommended * 1.0)
//召回率评价(和准确率唯一不同--总数为测试数据物品项的个数)
def Recall(train, test, result, N=5000):
    """Compute the recall of the recommendations.

    Recall is counted like precision; the only difference is the
    denominator, which is the total number of items in the test data.

    Args:
        train: training dict ``{user: {item: rating}}`` (not used here; kept
            so that all metrics share one signature).
        test: test dict ``{user: {item: rating}}``.
        result: dict ``{user: [(item, interest), ...]}``.
        N: length of the recommendation list evaluated per user.

    Returns:
        ``hits / test_items`` as a float, or ``0.0`` when the test set holds
        no items.
    """
    hit = 0         # recommendations that appear in the user's test items
    test_items = 0  # total number of items in the test set
    for user, tu in test.items():
        rank = GetRecommendation(result, user, N)
        hit += sum(1 for item, _ in rank if item in tu)
        test_items += len(tu)
    if test_items == 0:
        # Avoid ZeroDivisionError on an empty test set.
        return 0.0
    return hit / (test_items * 1.0)
//覆盖率评价
def Coverage(train, test, result, N=5000):
    """Compute the coverage of the recommendations.

    Coverage is the share of the training-set items that is recommended to
    at least one test user.

    Args:
        train: training dict ``{user: {item: rating}}``.
        test: test dict ``{user: {item: rating}}``; only its users are read.
        result: dict ``{user: [(item, interest), ...]}``.
        N: length of the recommendation list evaluated per user.

    Returns:
        ``len(recommended items) / len(training items)`` as a float, or
        ``0.0`` when the training set holds no items.
    """
    # All items of the training set.
    all_items = set()
    for items in train.values():
        all_items.update(items)

    # All items that were recommended to some test user.
    recommend_items = set()
    for user in test:
        rank = GetRecommendation(result, user, N)
        recommend_items.update(item for item, _ in rank)

    if not all_items:
        # Avoid ZeroDivisionError on an empty training set.
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)
//新颖度评价
def Popularity(train, test, result, N=5000):
    """Compute the average popularity of the recommended items.

    The value is the mean of ``log(1 + count)`` over every recommendation
    made to a test user, where ``count`` is how many training users have the
    item.  A high value means the recommended items are all popular, i.e.
    novelty is low; a low value means the recommendations are more novel.

    Args:
        train: training dict ``{user: {item: rating}}``.
        test: test dict ``{user: {item: rating}}``; only its users are read.
        result: dict ``{user: [(item, interest), ...]}``.
        N: length of the recommendation list evaluated per user.

    Returns:
        The mean log-popularity as a float, or ``0.0`` when nothing was
        recommended.

    Raises:
        KeyError: if a recommended item does not occur in ``train``.
    """
    # Local import: no module-level import of math is visible in this file.
    import math

    # Number of training users that have each item.
    item_popularity = {}
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    ret = 0  # sum of log(1 + popularity) over all recommendations
    n = 0    # total number of recommendations made to test users
    for user in test:
        rank = GetRecommendation(result, user, N)
        for item, _ in rank:
            ret += math.log(1 + item_popularity[item])
            n += 1
    if n == 0:
        # Avoid ZeroDivisionError when no recommendation exists.
        return 0.0
    return ret / (n * 1.0)
入口函数
if __name__ == '__main__':
    # Entry point: M-fold evaluation of user-based collaborative filtering.
    # data holds the 100000 rows of u.data, e.g. [['196', '242', 1.0], ...].
    data = readData()

    # Number of evaluation rounds; the metrics are averaged over them.
    num_folds = 5
    precision = 0
    recall = 0
    coverage = 0
    popularity = 0  # reported as the novelty measure

    for i in range(num_folds):
        # Fold i is the test set (about 79902 training / 20098 test rows
        # for i == 0); seed 0 keeps the folds disjoint across rounds.
        oriTrain, oriTest = SplitData(data, num_folds, i, 0)
        # Both are {user: {item: rating}} dicts,
        # e.g. {'196': {'242': 1.0, '241': 1.0}, '198': {'243': 1.0, ...}}.
        train = transform(oriTrain)
        test = transform(oriTest)

        # The functions are defined in this file, so they are called
        # directly; UserCF / Evaluation are never imported here.
        W = UserSimilarity(train)
        result = Recommendation(test.keys(), train, W)

        N = 10  # length of the recommendation list that is evaluated
        precision += Precision(train, test, result, N)
        recall += Recall(train, test, result, N)
        coverage += Coverage(train, test, result, N)
        popularity += Popularity(train, test, result, N)

    precision /= num_folds
    recall /= num_folds
    coverage /= num_folds
    popularity /= num_folds

    # Print the averaged results.
    print('precision = %f' % precision)
    print('recall = %f' % recall)
    print('coverage = %f' % coverage)
    print('popularity = %f' % popularity)