"""
Read and extract ngrams from Google's LDC2006T13 ngram corpus without having
to extract and cat the files yourself.

Example:

    corpus = LDC2006T13()
    for ngram, count in corpus.ngrams(3):
        print(ngram, count)

On a modern machine you can expect to be able to read approx. 150,000
ngrams/second.

Copyright 2008 by Derek Anderson

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.
"""

from __future__ import division, print_function

import gzip, os, re, tarfile

DEFAULT_FN = 'LDC2006T13.tar'


def _to_text(raw):
    """Return *raw* as a native ``str``.

    On Python 2 the tar/gzip readers already produce ``str`` and the value is
    returned untouched.  On Python 3 they produce ``bytes``, which are decoded
    as UTF-8 (NOTE(review): UTF-8 is what the corpus documentation states;
    confirm against readme.txt).
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8')


def _find_default_archive():
    """Return a readable path to DEFAULT_FN, or None when there is none.

    The current folder is tried first, then every path printed by the
    ``locate`` command.
    """
    if os.access(DEFAULT_FN, os.R_OK):
        return DEFAULT_FN
    # DEFAULT_FN is a module constant, so nothing untrusted reaches the shell.
    pipe = os.popen('locate "%s"' % DEFAULT_FN)
    try:
        # One path per line; splitting on any whitespace would break paths
        # that contain spaces.
        candidates = pipe.read().splitlines()
    finally:
        pipe.close()
    for candidate in candidates:
        if candidate and os.access(candidate, os.R_OK):
            return candidate
    return None


class LDC2006T13(object):
    """Streaming reader over the LDC2006T13 tarball.

    Instances keep the tar file open; call close() (or use the instance in a
    ``with`` statement) to release it.
    """

    def __init__(self, fn=None, verbose=False):
        """Open the corpus tarball.

        fn      -- path of the tarball.  If not supplied, DEFAULT_FN is looked
                   for in the current folder and then via the locate command.
        verbose -- when true, progress messages are printed to stdout.

        Raises ValueError if no readable tarball can be found.
        """
        self.__verbose = verbose
        if not fn:
            fn = _find_default_archive()
        if not fn:
            raise ValueError('could not locate %s on your computer' % DEFAULT_FN)
        if self.__verbose:
            print('opening:', fn)
        self.__tf = tarfile.open(fn, 'r')

    def close(self):
        """Close the underlying tar file."""
        self.__tf.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __get_fns(self, n, start, end):
        """Return the sorted tar member names holding the ngrams of size n.

        start/end are optional space separated ngrams.  They are compared,
        token by token, with the first ngram of every data file as listed in
        the corpus index, so the selection is made per file: every file that
        may contain an ngram between start and end is returned.  For n == 1
        the single vocabulary file is returned and start/end are ignored.
        """
        if n == 1:
            return ['./LDC2006T13/data/1gms/vocab.gz']

        start = start.split() if start else None
        end = end.split() if end else None

        fns = []
        f = self.__tf.extractfile('./LDC2006T13/data/%igms/%igm.idx' % (n, n))
        try:
            for line in f:
                line = _to_text(line)
                if not line.strip():
                    continue
                fn, first_ngram = line.split('\t')
                first_ngram = first_ngram.split()
                if end and first_ngram > end:
                    break
                path = './LDC2006T13/data/%igms/%s' % (n, fn)
                if start and first_ngram <= start:
                    # Every file seen so far ends before this one begins, so
                    # only this one can still reach up to `start`.
                    fns = [path]
                else:
                    fns.append(path)
        finally:
            f.close()

        fns.sort()
        if self.__verbose:
            print(fns)
        return fns

    def ngrams(self, n, expr=None, start=None, end=None):
        """Generator that yields all (ngram, count) tuples of size n.

        ngrams are space separated.

        n     -- ngram size, 1 to 5.
        expr  -- optional regular expression; only ngrams it matches (with
                 re.match, i.e. anchored at the beginning) are yielded.
        start -- optional ngram; data files that end before it are skipped.
        end   -- optional ngram; data files that begin after it are skipped.

        start and end select whole files, so ngrams slightly outside the
        requested range can still be yielded.
        """
        assert 1 <= n <= 5
        fns = self.__get_fns(n, start, end)
        if expr:
            expr = re.compile(expr)
        for fn in fns:
            if self.__verbose:
                print('accessing:', fn)
            f = self.__tf.extractfile(fn)
            try:
                g = gzip.GzipFile(fileobj=f, mode='rb')
                try:
                    for line in g:
                        ngram, _, count = _to_text(line).partition('\t')
                        # int() ignores the line terminator, whether or not
                        # the last line of the file has one.
                        count = int(count)
                        if expr and not expr.match(ngram):
                            continue
                        yield ngram, count
                finally:
                    # Also reached when the consumer abandons the generator.
                    g.close()
            finally:
                f.close()

    def alpha_ngrams(self, n):
        """Like ngrams(n), restricted to ngrams made of ASCII letters and spaces."""
        return self.ngrams(n, expr='[a-zA-Z ]*$', start='A')

    def readme(self):
        """extracts and returns the contents of readme.txt in the corpus"""
        f = self.__tf.extractfile('./LDC2006T13/docs/readme.txt')
        try:
            return _to_text(f.read())
        finally:
            f.close()


def __test():
    """Print the corpus readme, then time reading 100,000 ngrams of each size."""
    import itertools
    import timeit

    clock = timeit.default_timer  # time.clock() no longer exists on Python 3.8+
    began = clock()
    corpus = LDC2006T13(verbose=True)
    try:
        print('\n=== corpus.readme() ===')
        print(corpus.readme())
        total = 0
        for n in range(1, 6):
            stream = corpus.ngrams(n)
            try:
                for _ in itertools.islice(stream, 10 ** 5):
                    total += 1
            finally:
                stream.close()
    finally:
        corpus.close()
    elapsed = clock() - began
    rate = total / elapsed if elapsed else float('inf')
    print()
    print('read %i ngrams in %f seconds' % (total, elapsed), '(%f/sec)' % rate)


if __name__ == '__main__':
    __test()