Python - Counting a set dictionary of words in a specific HTML tag
I am trying to parse a document and, if there is a name associated with a specific docno, count the total number of names. After the for loop ends for that docno, I want to store names[docno] = word count. Therefore, if namedict={'henry':'','joe':''}, and henry is in docno=doc 1 four times and joe six times, the dictionary would store ('doc 1': 10). So far, all I can figure out is counting the total number of names in the entire text file.
from xml.dom.minidom import *
import re
from string import punctuation
from operator import itemgetter


def parsetrec1(atext):
    """Count how often each tracked name occurs across the whole file.

    The file at *atext* is read as text, wrapped in a single ``<docs>`` root
    element and parsed with minidom.  Every ``<p>`` inside every ``<doc>`` is
    split into runs of letters, and each run that is a key of ``namelist`` is
    tallied.  Matching is case-sensitive because the words are compared
    exactly as the regex returns them.

    The tally is keyed by name, not by document: ``docno`` is read for each
    ``<doc>`` but never used, which is exactly the limitation the question
    asks about.

    Returns the ``{name: count}`` dict after printing it.
    """
    with open(atext, 'r') as handle:
        fc = handle.read()
    # The input is a sequence of sibling <doc> elements; XML needs one root.
    fc = '<docs>\n' + fc + '\n</docs>'
    dom = parseString(fc)
    w_re = re.compile('[a-z]+', re.IGNORECASE)
    doc_nodes = dom.getElementsByTagName('doc')
    namelist = {'matt': '', 'earl': '', 'james': ''}
    default = 0
    indexdict = {}
    n = 10
    names = {}
    words = {}
    for doc_node in doc_nodes:
        # Read but not used yet -- counts below are not grouped per document.
        docno = doc_node.getElementsByTagName('docno')[0].firstChild.data
        cnt = 1
        for p_node in doc_node.getElementsByTagName('p'):
            p = p_node.firstChild.data
            words = w_re.findall(p)
            # Built but never consumed; the loop below iterates `words`.
            words_gen = (word.strip(punctuation).lower()
                         for line in words for word in line.split())
            for aword in words:
                if aword in namelist:
                    names[aword] = names.get(aword, 0) + 1
    print(names)
    # top_words=sorted(names.iteritems(), key=lambda(word, count): (-count, word))[:n]
    # for word, frequency in top_words:
    #     print "%s: %d" % (word, frequency)
    #print words + top_words
    #print docno + "\t" + str(numbers)
    return names


if __name__ == '__main__':
    parsetrec1('la010189.txt')
I've cleaned up your code a bit to make it easier to follow. Here are a few comments and suggestions:
- To answer the key question: you should be storing the count in `names[docno] = names.get(docno, 0) + 1`.
- Use a `defaultdict(int)` instead of `names.get(aword, 0) + 1` to accumulate the count.
- Use a `set()` for `namelist`.
- Adding the `re.MULTILINE` option to your regular expression should remove the need for `line.split()`.
- You didn't use `words_gen`; was this an oversight?
This is the doc I used to test with, based on your code:
<doc> <docno>1</docno> <p>groucho harpo zeppo</p> <p>larry moe curly</p> </doc> <doc> <docno>2</docno> <p>zoe inara kaylie</p> <p>mal wash jayne</p> </doc>
Here is a cleaned-up version of the code that counts the names in each paragraph:
import re
from collections import defaultdict
from string import punctuation
from xml.dom.minidom import parseString

# Runs of letters; IGNORECASE lets the lowercase class match capitals too.
re_words = re.compile(r'[a-z]+', re.IGNORECASE | re.M)


def _paragraph_text(node):
    """Return the concatenated text of *node*'s direct text children."""
    return ''.join(
        child.data
        for child in node.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )


def parse(path, names):
    """Count, per document, how many words in its paragraphs are in *names*.

    The file at *path* is expected to hold a sequence of ``<doc>`` elements,
    each with a ``<docno>`` child and any number of ``<p>`` children (tag
    names are matched case-sensitively).  Every occurrence of a name is
    counted, so a name that appears four times adds four to its document's
    total.  Words are lowercased before the lookup, so *names* should be
    lowercase.

    Args:
        path: path of the file to read.
        names: iterable of lowercase names to look for.

    Returns:
        A ``defaultdict(int)`` mapping ``'doc <docno>'`` to the number of
        name occurrences found in that document.  Each ``(key, count)`` pair
        is also printed.
    """
    names = frozenset(names)

    with open(path, 'rb') as handle:
        # Sibling <doc> elements need a single root to form valid XML.
        data = b'<docs>' + handle.read() + b'</docs>'
    tree = parseString(data)

    hits = defaultdict(int)
    for doc in tree.getElementsByTagName('doc'):
        doc_no = 'doc ' + doc.getElementsByTagName('docno')[0].firstChild.data
        for node in doc.getElementsByTagName('p'):
            text = _paragraph_text(node)
            words = (w.strip(punctuation).lower()
                     for w in re_words.findall(text))
            # Count occurrences, not distinct names: a set intersection
            # would collapse repeated mentions within a paragraph to one.
            hits[doc_no] += sum(1 for w in words if w in names)

    for item in hits.items():
        print(item)
    return hits


if __name__ == '__main__':
    names = set(['zoe', 'wash', 'groucho', 'moe', 'curly'])
    parse('doc.xml', names)
output:
(u'doc 2', 2) (u'doc 1', 3)
Comments
Post a Comment