from numpy import *
import jieba
import copy

with open('content', 'r', encoding='utf-8') as f:
    content = f.read()
with open('stopwords', 'r', encoding='utf-8') as f:
    sw = f.readlines()
sw = [x.replace('\n', '') for x in sw]  # stop word list, one word per line
word_list = list(jieba.cut(content))       # segment the raw text into words
word_list_vice = copy.deepcopy(word_list)  # working copy, so we never remove from the list being iterated
set_sw = set(sw)
for word in word_list:                     # drop stop words (list.remove makes this pass costly)
    if word in set_sw:
        word_list_vice.remove(word)
word_list = copy.deepcopy(word_list_vice)  # filtered word sequence
word_set_list = list(set(word_list_vice))  # vocabulary of unique words (the set conversion has its own cost)
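# Each entry of word_set_list is a node of the word graph; its position in the list is used
# as the node's row/column index in the adjacency matrix built below.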
aa = zeros((len(word_set_list), len(word_set_list)), dtype=float)  # co-occurrence (adjacency) matrix
word_list_vice = copy.deepcopy(list(word_list))  # backup copy of the filtered sequence
for i, word in enumerate(word_list):  # the repeated .index() lookups make this loop costly
    if i == len(word_list_vice) - 1:  # the last word has no right-hand neighbour
        break
    sindex = word_set_list.index(word)
    eindex = word_set_list.index(word_list_vice[i + 1])
    aa[sindex][eindex] = aa[sindex][eindex] + 1
    aa[eindex][sindex] = aa[eindex][sindex] + 1
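# aa[i][j] counts how often word i and word j occur next to each other in the text
# (a co-occurrence window of 2); both directions are incremented, so the graph is undirected.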

def graph_init(a):  # build the row-normalised transition matrix from the co-occurrence counts
    row, column = a.shape
    c = zeros((row, column), dtype=float)
    for i in range(row):
        row_sum = sum(a[i])
        if row_sum == 0:  # isolated word: leave its row at zero instead of dividing by zero
            continue
        for j in range(column):
            c[i][j] = a[i][j] / row_sum
    return c

def pr_init(c):  # initialise the PageRank vector uniformly
    row, column = c.shape
    pr = zeros(row, dtype=float)
    for i in range(row):
        pr[i] = 1.0 / row
    return pr

def pageRank(s, pr, p):  # iterate the PageRank vector until it converges
    while True:
        new_pr = p * dot(pr, s) + (1 - p) * pr
        if allclose(pr, new_pr):  # floats rarely become exactly equal, so test convergence with a tolerance
            return new_pr
        pr = new_pr
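# The update implemented above is pr_new = p * dot(pr, s) + (1 - p) * pr: with probability p the
# surfer follows an edge of the word graph, and with probability 1 - p it keeps its current
# distribution; the p = 0.8 set below plays the role of the damping factor.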

if __name__ == "__main__":
    s = graph_init(aa)
    pr = pr_init(s)
    p = 0.8  # p: probability of browsing the current page (damping factor)
    r = pageRank(s, pr, p)

    result_list = []
    for word, score in zip(word_set_list, r):
        result_list.append({'word': word, 'score': score})
    rl = sorted(result_list, key=lambda x: x['score'])  # ascending: highest-scoring words come last
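    # A minimal usage sketch, not part of the original script: since rl is sorted ascending by
    # score, the last entries are the highest-ranked candidate keywords. top_n is an assumed name.
    top_n = 10
    for item in rl[-top_n:][::-1]:
        print(item['word'], item['score'])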