# Based on Fredrik Lundh's wf-6.py from http://effbot.org/zone/wide-finder.htm
# Modifications by Lenny Domnitser, for http://domnit.org/2007/10/procs
# This probably doesn't make sense unless you read those two blog posts.
#
# NOTE: the timing code relies on time.clock() and the pattern below is a
# native str applied to an mmap, so this script targets Python 2.

import mmap
import os
import re
import sys
import time
from collections import defaultdict
from functools import partial

# Cache of the currently mapped log file, kept in module globals; see process().
filemap = None
fileobj = None

# Captures the "YYYY/MM/DD/name" part of each matching GET request.
pat = re.compile(r"GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) ")

FILE = "o1000k.ap"


def process(file, chunk):
    """Count pattern matches inside one chunk of *file*.

    file  -- path of the log file to scan.
    chunk -- (start, size) byte range, as yielded by getchunks().

    Returns a defaultdict(int) mapping each captured path to the number of
    times it was found between ``start`` and ``start + size``.

    The file is memory-mapped on first use and the mapping is kept in the
    module globals ``filemap``/``fileobj``, so later calls for the same path
    reuse it instead of mapping the file again.
    """
    global filemap, fileobj
    if filemap is None or fileobj.name != file:
        # Release a mapping left over from a different path before replacing
        # it, so switching files does not leak the old handle and mapping.
        if filemap is not None:
            filemap.close()
            fileobj.close()
            filemap = fileobj = None
        fileobj = open(file, "rb")
        filemap = mmap.mmap(
            fileobj.fileno(), os.path.getsize(file), access=mmap.ACCESS_READ
        )
    start, size = chunk[0], chunk[1]
    hits = defaultdict(int)
    # The loop variable is deliberately not called ``file``: that would
    # shadow the parameter compared against ``fileobj.name`` above.
    for match in pat.findall(filemap, start, start + size):
        hits[match] += 1
    return hits


def getchunks(file, size=1024*1024):
    """Yield (start, length) chunk descriptors covering *file*.

    file -- path of the file to split.
    size -- number of bytes to skip ahead before looking for a line ending.

    Each chunk ends just after the first line ending found at or beyond
    ``size`` bytes from its start, so a line is never split across chunks.
    The last descriptor is produced when readline() returns nothing; because
    the seek is not clamped, its length may reach past the end of the file.
    """
    f = open(file, "rb")
    try:
        while True:
            start = f.tell()
            f.seek(size, 1)
            tail = f.readline()  # skip forward to next line ending
            yield start, f.tell() - start
            if not tail:
                break
    finally:
        # Runs when the generator is exhausted or closed early.
        f.close()


def main():
    """Run the benchmark on FILE and print the timings and the top ten keys."""
    if sys.platform == "win32":
        timer = time.clock
    else:
        timer = time.time
    t0, t1 = timer(), time.clock()

    # Imported here, after the timers start, so importing this module for its
    # helpers does not require procs and the import stays inside the timing.
    import procs

    result = procs.pmap(partial(process, FILE), getchunks(FILE, 50*1024*1024))

    # merge the incoming data
    count = defaultdict(int)
    for item in result:
        for key, value in item.items():
            count[key] += value

    # process result: the ranking is part of the timed work, the printing of
    # the ranked keys is not.
    ranked = sorted(count, key=count.get)

    # A single pre-formatted argument keeps the output identical whether
    # ``print`` is parsed as a statement or as a function call.
    print("%s %s" % (timer() - t0, time.clock() - t1))

    for key in ranked[-10:]:
        print("%40s = %s" % (key, count[key]))


if __name__ == "__main__":
    main()