"""
Tries to find out how many pages on the internet end in a given extension.

http://paulisageek.com/webExtension/
Paul Tarjan
"""
import re

class Yahoo :
    """Hit-count source backed by search.yahoo.com result pages."""

    def getUrl(self, ext):
        """Return the search URL that queries pages whose URL ends in `ext`."""
        prefix = "http://search.yahoo.com/search?q=originurlextension:"
        suffix = "&norw=1"
        return prefix + ext + suffix

    def getNum(self, result):
        """Pull the total result count out of a fetched results page.

        Returns the count as a string of digits (thousands separators
        removed), or the integer -1 when the count marker is not present.
        """
        found = re.search(r'<span id="infotext">1 - 10 of ([0-9,]+) for ', result)
        if found is None:
            return -1
        count = found.group(1)
        return count.replace(",", "")

class Boss :
    """Hit-count source backed by the boss.yahooapis.com JSON endpoint."""

    def getUrl(self, ext):
        """Return the API URL that queries pages whose URL ends in `ext`."""
        # NOTE(review): the appid is hard-coded in the query string -- confirm
        # it is meant to be shared in source.
        endpoint = "http://boss.yahooapis.com/ysearch/web/v1/originurlextension:"
        query = "?appid=lBgSn1fIkY2xF36UBLRxefZkbkm8W74-&format=json&count=0"
        return endpoint + ext + query

    def getNum(self, result) :
        """Pull the "totalhits" value out of a fetched response body.

        Returns the quoted value as a string, or the integer -1 when the
        field is not present.
        """
        found = re.search('"totalhits":"(.*?)"', result)
        if found is None:
            return -1
        return found.group(1)

class GoogleAPI :
    """Hit-count source backed by the ajax.googleapis.com web search endpoint."""

    def getUrl(self, ext):
        """Return the API URL that queries pages with extension `ext`."""
        return "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=ext:" + ext

    def getNum(self, result) :
        """Pull the "estimatedResultCount" value out of a fetched response body.

        Returns the quoted value as a string, or the integer -1 when the
        field is not present.
        """
        # The receiver was previously misspelled "seklf"; it is now named
        # "self" like the other source classes.
        match = re.search('"estimatedResultCount":"(.*?)"', result)
        if not match :
            return -1
        return match.groups()[0]

class Google :
    """Hit-count source backed by www.google.com result pages."""

    def getUrl(self, ext):
        """Return the search URL that queries pages with extension `ext`."""
        base = "http://www.google.com/search?hl=en&q=ext%3A"
        return base + ext

    def getNum(self, result) :
        """Pull the "of about <b>N</b>" count out of a fetched results page.

        Returns the matched text with commas removed, or the integer -1
        when the marker is not present.
        """
        found = re.search('of about <b>(.*?)</b>', result)
        if found is None:
            return -1
        count = found.group(1)
        return count.replace(",", "")

# Search backend queried by the crawl loop (its getUrl/getNum are called for
# every extension). Replace with Yahoo(), Boss() or GoogleAPI() to use
# another source.
source = Google()


import itertools
def permutations(iterable, r=None):
    """Yield successive r-length permutations of the elements of `iterable`.

    Fallback for interpreters whose itertools has no permutations().

    iterable -- any iterable; it is consumed once into a tuple.
    r        -- length of each permutation; defaults to the pool size.

    Tuples are produced in lexicographic order of element positions.
    Nothing is yielded when r exceeds the pool size; ValueError is raised
    for a negative r.
    """
    # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
    # permutations(range(3)) --> 012 021 102 120 201 210
    pool = tuple(iterable)
    n = len(pool)
    # Default r to the pool size (the original assigned the other way round,
    # which turned n into None and broke the r=None case).
    if r is None :
        r = n
    if r < 0 :
        raise ValueError("r must be non-negative")
    if r > n :
        # Cannot pick more items than the pool holds.
        return
    # Explicit lists: both sequences are mutated in place below.
    indices = list(range(n))
    cycles = list(range(n, n-r, -1))
    yield tuple(pool[i] for i in indices[:r])
    while n:
        for i in reversed(range(r)):
            cycles[i] -= 1
            if cycles[i] == 0:
                # Position i has cycled through all choices: rotate its tail
                # back into order and reset its counter.
                indices[i:] = indices[i+1:] + indices[i:i+1]
                cycles[i] = n - i
            else:
                j = cycles[i]
                indices[i], indices[-j] = indices[-j], indices[i]
                yield tuple(pool[i] for i in indices[:r])
                break
        else:
            # Every position wrapped around: all permutations were emitted.
            return
# Install the pure-Python fallback only when this interpreter's itertools
# does not already provide permutations().
if not hasattr(itertools, "permutations") :
    itertools.permutations = permutations

def product(*args, **kwds):
    """Yield the Cartesian product of the input iterables as tuples.

    Fallback for interpreters whose itertools has no product().

    *args  -- the iterables to combine; each is consumed once into a tuple.
    repeat -- keyword-only; the argument sequence is repeated this many
              times (default 1).

    Raises TypeError for any keyword other than `repeat`, so a misspelled
    keyword is not silently ignored. All combinations are built in memory
    before the first tuple is yielded.
    """
    # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
    # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
    repeat = kwds.pop('repeat', 1)
    if kwds :
        raise TypeError("product() got an unexpected keyword argument %r"
                        % sorted(kwds)[0])
    # Build a real list so it can be repeated regardless of what map() returns.
    pools = [tuple(pool) for pool in args] * repeat
    result = [[]]
    for pool in pools:
        result = [x+[y] for x in result for y in pool]
    for prod in result:
        yield tuple(prod)
# Install the pure-Python fallback only when this interpreter's itertools
# does not already provide product().
if not hasattr(itertools, "product") :
    itertools.product = product

import urllib2
import socket
# Network timeout in seconds; installed below as the default for sockets
# created from here on.
timeout = 5
# Upper bound on fetch attempts per extension in the crawl loop.
maxtries = 10
socket.setdefaulttimeout(timeout)

import sys, os
import time

import string
# Alphabet the candidate extensions are built from: 'a' through 'z'.
# Kept as a real list because the crawl loop iterates it again for every
# extension length.
r = list(string.ascii_lowercase)

# Stream that p() writes to; starts as stdout and is rebound by the crawl loop.
curFile = sys.stdout
def p(i, n) :
    """Write one "<i> <n>" line to the current output stream and flush it.

    i -- label for the line (formatted with %s).
    n -- integer count for the line (formatted with %d).
    """
    line = "%s %d\n" % (i, n)
    out = curFile
    out.write(line)
    # Flush after every line so partial results reach the stream immediately.
    out.flush()
    
# Output directory for this run, named after today's date (YYYY-MM-DD).
d = time.strftime("%Y-%m-%d")
try :
    os.mkdir(d)
except OSError :
    # Only filesystem errors are tolerated here (e.g. the directory already
    # exists from an earlier run); KeyboardInterrupt and other unrelated
    # exceptions are no longer swallowed.
    print("Can't make dir %s" % d)

for l in xrange(1, 5) :
    curFile = file(d + os.sep + str(l) + ".txt", "w")
    for i in itertools.product(r, repeat = l) :
        i = "".join(i)
        url = source.getUrl(i)
        trys = 0
        result = ""
        while trys < maxtries :
            trys += 1
            try :
                req = urllib2.Request(url, headers={"User-Agent" : "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"})
                result = urllib2.urlopen(req).read()
                break
            except IOError :
                sys.stderr.write("retrying : %s\n" % i)
                pass
        if trys == maxtries : 
            sys.stderr.write("maxium trys : %s\n" % i)
            p(i, -1)
            continue    
        try :
            n = source.getNum(result)
            n = int(n)
            p(i, n)
        except Exception, e :
            sys.stderr.write("%s\nerror parsing : %s : %s\n" % (e, i, result))
            p(i, -1)
            continue

