如何用 Python 实现英文短文的双词频统计？

发布网友发布时间：2022-05-10 23:49

共4个回答

热心网友时间：2022-05-11 01:19

import re
from itertools import imap as map
from collections import Counter

def parserwords(sentence):
    """Return the adjacent word pairs (bigrams) found in *sentence*.

    The text is lower-cased and split into runs of word characters; each
    word is then paired with the word that immediately follows it.

    Args:
        sentence: Text to analyse.  Pairs are formed across the whole
            string, so callers that do not want a pair to span punctuation
            must split the text on that punctuation beforehand.

    Returns:
        A list of ``(previous_word, word)`` tuples in order of appearance.
        The list is empty when the text holds fewer than two words.
    """
    # Raw string: a backslash-w inside a normal string literal is an invalid
    # escape sequence and triggers a warning on newer Python versions.
    words = re.findall(r'\w+', sentence.lower())
    # Pair every word with its successor; zip stops at the shorter sequence,
    # so the last word never starts a pair.
    return list(zip(words, words[1:]))

context = """
Do you hear the people sing, singing a song of angry men.
It is the music of a people, who will not be slaves again,
when the beating of your heart echoes the beating of the drums.
There is a life about to start when tomorrow comes.
"""

# Cut the text at commas and full stops first so that no word pair spans a
# punctuation mark, then collect the pairs of every fragment.
# (parserwords lower-cases its input, so no separate lower() is needed.)
words = []
for fragment in re.split(r'[,.]', context):
    words.extend(parserwords(fragment))

# How often each word starts a pair, and how often each pair occurs.
prefixcounter = Counter(pre for pre, _ in words)
counter = Counter(words)

# meter maps (pre, post) -> share of the pairs starting with `pre` that
# continue with `post`.  float() keeps the division exact on Python 2 too.
meter = {}
for pair, occurrences in counter.items():
    meter[pair] = float(occurrences) / prefixcounter[pair[0]]

# Highest ratio first; ties are broken by the pair itself in ascending order.
# A key function replaces the cmp= argument, which Python 3 no longer accepts.
result = sorted(meter.items(), key=lambda item: (-item[1], item[0]))

# Parenthesised form prints the same text on Python 2 and Python 3.
print(result[:5])

热心网友时间：2022-05-11 02:37

data="""Do you hear the people sing, singing a song of angry men. It is the music of a people, who will not be slaves again, when the beating of your heart echoes the beating of the drums. There is a life about to start when tomorrow comes."""
# Strip commas and full stops, then split on whitespace.
# NOTE: because the punctuation is simply removed, words on either side of a
# comma or full stop still form a pair (e.g. "sing singing").
data = data.replace(',', '').replace('.', '')
ws = data.split()

# ws2: every pair of neighbouring words, joined by a single space.
ws2 = [first + " " + second for first, second in zip(ws, ws[1:])]

# dic: two-word phrase -> number of occurrences.
dic = {}
for w2 in ws2:
    dic[w2] = dic.get(w2, 0) + 1

# dic_first: first word -> number of phrases that start with it.
dic_first = {}
for w2 in ws2:
    l, r = w2.split()
    dic_first[l] = dic_first.get(l, 0) + 1

# Output one line per phrase occurrence (repeated phrases are printed again):
# phrase, phrase count, first-word count, and their ratio.
for w2 in ws2:
    l, r = w2.split()
    print("%s %s %s %s" % (w2, dic[w2], dic_first[l],
                           dic[w2] / float(dic_first[l])))

追问：最后的输出还有些问题：只需输出 n 值和双词短语，并按 n 值从大到小排列，n 值相同时按双词短语的字母顺序（a-z，a 在最前）排列，怎样输出前五个呢？
另外，句号、逗号两边的词不能组成一个短语，应该把这样的短语删掉。

# 追答 (follow-up answer)
def count(ju, dic, dic_first):
    """Add the two-word phrases of one text fragment to the running tallies.

    Args:
        ju: A fragment of text without punctuation; it is split on
            whitespace and every pair of neighbouring words forms a phrase.
        dic: Dict updated in place, mapping "first second" -> occurrences.
        dic_first: Dict updated in place, mapping a first word -> number of
            phrases that start with it.

    Returns:
        None.  Both dicts are modified in place; a fragment with fewer than
        two words leaves them unchanged.
    """
    ws = ju.split()
    ws2 = [first + " " + second for first, second in zip(ws, ws[1:])]
    # The leftover debugging branch that printed the phrases and waited for
    # keyboard input once "a life" had been seen twice has been removed.
    for w2 in ws2:
        dic[w2] = dic.get(w2, 0) + 1
        l, r = w2.split()
        dic_first[l] = dic_first.get(l, 0) + 1
data="""Do you hear the people sing, singing a song of angry men. It is the music of a people, who will not be slaves again, when the beating of your heart echoes the beating of the drums. There is a life about to start when tomorrow comes."""
# Treat commas like full stops and cut the text there, so that words on
# either side of a punctuation mark never end up in the same phrase.
data = data.replace(',', '.')
jus = data.split('.')

dic = {}        # phrase -> number of occurrences
dic_first = {}  # first word -> number of phrases starting with it
for ju in jus:
    count(ju, dic, dic_first)

# n = occurrences of the phrase / number of phrases sharing its first word.
out = []
for k in dic:
    l, r = k.split()
    n = dic[k] / float(dic_first[l])
    out.append([n, k])

# As requested in the follow-up question: largest n first, equal n ordered by
# phrase in ascending string order, and only the first five lines are printed,
# each showing just n and the phrase.
out.sort(key=lambda item: (-item[0], item[1]))
for n, k in out[:5]:
    print("%s %s" % (n, k))

热心网友时间：2022-05-11 04:11

感觉不是很难啊，1.split切句子，2.split切词，3.遍历生成词组，扔到dict里统计就可以了啊