python apriori算法代码怎么实现
发布网友
发布时间:2022-04-22 05:35
我来回答
共2个回答
懂视网
时间:2022-05-06 18:37
Apriori算法是数据挖掘中频繁模式挖掘的鼻祖,由Agrawal和Srikant于1994年提出,其算法思想也十分简单朴素:首先挖掘出长度为1的频繁模式,然后从k=2开始,将长度为k-1的频繁模式两两合并组成长度为k的候选模式,算出它们的出现次数,而且要保证其所有长度为k-1的子集也是频繁的,值得注意的……
Apriori算法是数据挖掘中频繁模式挖掘的鼻祖,由Agrawal和Srikant于1994年提出,其算法思想也十分简单朴素:首先挖掘出长度为1的频繁模式,然后从k=2开始,
将长度为k-1的频繁模式两两合并组成长度为k的候选模式,算出它们的出现次数,而且要保证其所有长度为k-1的子集也是频繁的。值得注意的是,为了避免重复,合并的时候,只合并那些前k-2个字符都相同、且最后一个(第k-1个)字符一边小于另一边的模式。
以下是算法的Python实现:
# Script setup: author tag, interactive parameters and the transaction store.
# NOTE(review): raw_input exists only on Python 2; this script targets Python 2.
__author__ = 'linfuyuan'
# Minimum absolute occurrence count; item and itemset counts are compared
# against it with ">=" further down in this script.
min_frequency = int(raw_input('please input min_frequency:'))
# Path of the transaction file, which is read line by line and split on ','.
file_name = raw_input('please input the transaction file:')
# Filled below with one set of item tokens per input line.
transactions = []
def has_infrequent_subset(candidate, Lk):
    """Check that every subset of ``candidate`` with one item removed is in ``Lk``.

    NOTE: the name is inverted relative to the result. The function returns
    True when *all* one-item-shorter subsets are present in ``Lk`` (i.e. the
    candidate has NO infrequent subset) and False as soon as one is missing.
    The name is kept because callers rely on it.

    Each subset is looked up under the key built by sorting its items and
    concatenating them with ``''.join``.

    Args:
        candidate: list of item strings forming the candidate itemset.
            It is not modified.
        Lk: container of itemset keys (concatenated sorted items).

    Returns:
        True if every subset key is in ``Lk`` (also for an empty candidate),
        otherwise False.
    """
    return all(
        ''.join(sorted(candidate[:skip] + candidate[skip + 1:])) in Lk
        for skip in range(len(candidate))
    )
def countFrequency(candidate, transactions):
    """Count the transactions that contain every item of ``candidate``.

    Args:
        candidate: iterable of items making up the itemset.
        transactions: iterable of sets, one per transaction.

    Returns:
        The number of transactions that are a superset of ``candidate``.
    """
    return sum(
        1 for transaction in transactions
        if transaction.issuperset(candidate)
    )
# Load the transactions: one comma-separated transaction per line.
with open(file_name) as f:
    for line in f:
        tokens = [token.strip() for token in line.split(',')]
        tokens = [token for token in tokens if token]
        # str.split never returns an empty list, so blank lines have to be
        # filtered explicitly or they would become a bogus {''} transaction.
        if tokens:
            transactions.append(set(tokens))

# Count how many transactions contain each single item.
currentFrequencySet = {}
for transaction in transactions:
    for item in transaction:
        currentFrequencySet[item] = currentFrequencySet.get(item, 0) + 1

# L1: the frequent single items.
Lk = set()
for (itemset, count) in currentFrequencySet.items():
    if count >= min_frequency:
        Lk.add(itemset)
# A single parenthesised argument prints identically on Python 2 and 3.
print(', '.join(Lk))

# Grow the frequent itemsets one item at a time until none survive.
# Itemsets are stored as strings and extended character by character, so
# every item is expected to be a single character.
while len(Lk) > 0:
    newLk = set()
    for itemset1 in Lk:
        for itemset2 in Lk:
            # Join step: identical prefix and a strictly smaller last
            # character on the left, so each candidate is built only once.
            cancombine = (itemset1[:-1] == itemset2[:-1]
                          and itemset1[-1] < itemset2[-1])
            if not cancombine:
                continue
            newitemset = list(itemset1)
            newitemset.append(itemset2[-1])
            # has_infrequent_subset returns True when every shorter subset
            # is frequent (its name is inverted); only then count support.
            if has_infrequent_subset(newitemset, Lk) and countFrequency(newitemset, transactions) >= min_frequency:
                newLk.add(''.join(newitemset))
    print(', '.join(newLk))
    Lk = newLk
热心网友
时间:2022-05-06 15:45
class Apriori(object):
    """Mine frequent itemsets and association rules from a 'T'-flag table.

    The input is a text file whose fields are separated by single spaces.
    The first line holds the column names; every following line is one
    transaction, and a column whose value is exactly ``'T'`` counts as
    "item present".  Items are identified by their 0-based position inside
    the selected column window.

    Creating an instance runs the whole pipeline: it counts the single
    items, iterates to larger itemsets (printing each level) and finally
    prints the association rules whose confidence exceeds
    ``min_confidence``.

    Args:
        filename: path of the data file.
        min_support: minimum support as a percentage of the data lines.
        item_start: 1-based position of the first item column (inclusive).
        item_end: 1-based position of the last item column (inclusive).
        min_confidence: minimum rule confidence in percent (default 50).
    """

    def __init__(self, filename, min_support, item_start, item_end,
                 min_confidence=50):
        self.filename = filename
        self.min_support = min_support  # minimum support, in percent
        self.min_confidence = min_confidence
        self.line_num = 0  # index of the last line read (header is line 0)
        self.item_start = item_start  # first item column (1-based)
        self.item_end = item_end  # last item column (1-based, inclusive)
        # Current level: itemsets (lists of item positions), their support
        # counts, and the sorted distinct item positions they use.
        self.location = [[i] for i in range(self.item_end - self.item_start + 1)]
        self.support = self.sut(self.location)
        self.num = sorted({j for i in self.location for j in i})
        # Previous level, kept for the confidence computation.
        self.pre_support = []
        self.pre_location = []
        self.pre_num = []
        self.item_name = []  # item names taken from the header line
        self.find_item_name()
        self.loop()
        self.confidence_sup()

    def deal_line(self, line):
        """Return the stripped fields of ``line`` inside the item window."""
        fields = [field.strip() for field in line.split(' ') if field]
        return fields[self.item_start - 1:self.item_end]

    def find_item_name(self):
        """Read the item names from the first line of the file."""
        with open(self.filename, 'r') as handle:
            for line in handle:
                self.item_name = self.deal_line(line)
                break

    def sut(self, location):
        """Count the support of every itemset in ``location``.

        Args:
            location: list of itemsets, each a list of item positions,
                e.g. ``[[1, 2, 3], [2, 3, 4]]``.

        Returns:
            A list with, for each itemset, the number of data lines in
            which all of its positions hold ``'T'``.

        Side effect: ``self.line_num`` is set to the index of the last line
        read (0 for an empty or header-only file).
        """
        support = [0] * len(location)
        last_index = 0  # stays 0 when the file has no lines at all
        with open(self.filename, 'r') as handle:
            for index, line in enumerate(handle):
                last_index = index
                if index == 0:
                    continue  # header line
                item_line = self.deal_line(line)
                for slot, itemset in enumerate(location):
                    if all(item_line[j] == 'T' for j in itemset):
                        support[slot] += 1
        self.line_num = last_index
        return support

    def select(self, c):
        """Build the sorted, de-duplicated candidate itemsets of size ``c``."""
        import itertools

        stack = []
        for itemset in self.location:
            for item in self.num:
                if item in itemset:
                    if len(itemset) == c:
                        stack.append(itemset)
                else:
                    # Extend the itemset by one item it does not contain.
                    stack.append([item] + itemset)
        # groupby only merges adjacent equal lists, hence the sort first.
        ordered = sorted(sorted(itemset) for itemset in stack)
        return [key for key, _ in itertools.groupby(ordered)]

    def del_location(self, support, location):
        """Drop candidates that fail the support or the subset test.

        A candidate is removed when its support is below ``min_support``
        percent of ``line_num`` or when one of its subsets with a single
        item removed is not in the current ``self.location``.  Entries of
        the passed ``support`` list are zeroed in place while filtering.

        Returns:
            ``(support, location)`` restricted to the surviving candidates.
        """
        for index, count in enumerate(support):
            # Integer-exact form of count < line_num * min_support / 100;
            # avoids the silent floor of "/" on Python 2 ints.
            if count * 100 < self.line_num * self.min_support:
                support[index] = 0
        for index, itemset in enumerate(location):
            subsets = [itemset[:k] + itemset[k + 1:] for k in range(len(itemset))]
            if any(subset not in self.location for subset in subsets):
                support[index] = 0
        location = [i for i, j in zip(location, support) if j != 0]
        support = [i for i in support if i != 0]
        return support, location

    def loop(self):
        """Iterate level by level until no larger frequent itemset is left."""
        s = 2
        while True:
            # Single-argument print calls emit the same text on Python 2 and 3.
            print('-' * 80)
            print('The %s loop' % (s - 1))
            print('location %s' % (self.location,))
            print('support %s' % (self.support,))
            print('num %s' % (self.num,))
            print('-' * 80)
            # Generate and filter the next level of candidates.
            location = self.select(s)
            support = self.sut(location)
            support, location = self.del_location(support, location)
            num = sorted({j for i in location for j in i})
            s += 1
            if not (location and support and num):
                break
            self.pre_num = self.num
            self.pre_location = self.location
            self.pre_support = self.support
            self.num = num
            self.location = location
            self.support = support

    def confidence_sup(self):
        """Print the rules of the last level that exceed ``min_confidence``.

        For every frequent itemset, each item in turn is taken as the
        consequent and the remaining items as the antecedent; the rule is
        printed when the antecedent is a previous-level itemset and
        support(itemset) / support(antecedent) exceeds ``min_confidence``
        percent.
        """
        if sum(self.pre_support) == 0:
            print('min_support error')  # the very first iteration failed
            return
        for index_location, each_location in enumerate(self.location):
            itemset_count = self.support[index_location]
            for position, consequent in enumerate(each_location):
                # Keep consequent and antecedent paired so the rule stays
                # correct even if some antecedents are skipped.
                antecedent = each_location[:position] + each_location[position + 1:]
                if antecedent not in self.pre_location:
                    continue
                antecedent_count = self.pre_support[self.pre_location.index(antecedent)]
                if not antecedent_count:
                    continue
                confidence = float(itemset_count) / antecedent_count * 100
                if confidence > self.min_confidence:
                    support = float(itemset_count) / self.line_num * 100
                    names = [name for index_item, name in enumerate(self.item_name)
                             if index_item in antecedent]
                    print(' '.join([
                        ','.join(names),
                        '->>',
                        self.item_name[consequent],
                        ' min_support: ',
                        str(support) + '%',
                        ' min_confidence:',
                        str(confidence) + '%',
                    ]))
def main():
    """Run the miner on the two sample data files.

    Each Apriori(...) call does all of its work, including printing the
    results, inside the constructor, so the instances are not kept.
    """
    Apriori('basket.txt', 14, 3, 13)
    Apriori('simple.txt', 50, 2, 6)


if __name__ == '__main__':
    main()
Apriori(filename, min_support, item_start, item_end)
参数说明
filename:(路径)文件名;文件各列以空格分隔,第一行为各列的项目名
min_support:最小支持度(百分比,例如14表示14%)
item_start:item起始列的位置(从1开始计数,包含该列)
item_end:item结束列的位置(从1开始计数,包含该列)
import apriori
c = apriori.Apriori('basket.txt', 11, 3, 13)
输出: