from sgmllib import SGMLParser
import os, re, time, urllib, random, sys, math
    
from urllib import quote


class AppURLopener(urllib.FancyURLopener):
    """FancyURLopener that announces itself as an MSIE 5.5 browser.

    urllib builds its User-Agent header from the opener's ``version``
    attribute while the base class is being initialised.  Overriding it as a
    class attribute (the approach shown in the urllib documentation) makes a
    custom ``__init__`` unnecessary, so the inherited constructor is used and
    every positional *and* keyword argument FancyURLopener accepts
    (e.g. ``proxies=``) is passed through unchanged.
    """

    version = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)"

# Install the browser-impersonating opener as urllib's module-level default
# opener, so the urllib.urlopen() call in hitCount() goes through it.
urllib._urlopener = AppURLopener()

# Local alias for urllib.quote; hitCount() uses it to escape the query text
# before inserting it into the search URL.
url_quote = quote

def _parseHitCount( html ):
    """Pull the result count out of a lower-cased results page.

    The number is expected right after the text 'altavista found' or,
    failing that, after 'of <b>', and before the word 'result'.
    Returns 0 when no marker is present or the text in between is not
    an integer.
    """
    for tag in ( 'altavista found', 'of <b>' ):
        p = html.find( tag )
        if p >= 0:
            break
    else:
        # Neither marker occurs in the page.
        return 0
    html = html[p+len(tag):]

    # Cut at the word 'result'.  When it is missing keep the whole
    # remainder: slicing with the -1 that find() returns would silently
    # drop the last character (possibly the last digit).
    p = html.find( 'result' )
    if p >= 0:
        html = html[:p]

    # Drop '.' and ',' so grouped digits such as 1,234 parse as an integer.
    html = html.replace( '.', '' ).replace( ',', '' )

    try:
        return int( html )
    except ValueError:
        return 0


def hitCount( query ):
    """Return the number of hits AltaVista reports for `query`.

    query -- search expression; it is URL-quoted before being sent.

    Returns an integer, or 0 when the count cannot be found or parsed in
    the returned page.  Errors raised while opening or reading the URL
    are not caught here and propagate to the caller.
    """
    url = 'http://www.altavista.com/web/results?lang=en&aqb=&sc=on&aqa=%s&aqmode=s&lh=&dt=tmperiod&avkw=aapt&aqs=&rc=dmn&nbq=10&pg=aq&d2=0' % url_quote(query)

    sock = urllib.urlopen( url )
    try:
        # Lower-case once so the marker search is case-insensitive.
        html = sock.read().lower()
    finally:
        # Always release the connection, even if the read fails.
        sock.close()

    return _parseHitCount( html )

def countryList():
    """Return a new list of [country name, number] pairs.

    Every call builds fresh inner lists, so callers may overwrite the
    number in place (the script does so when a keyword is given) without
    affecting later calls.  The number is what the script uses as the
    country's score when no keyword is supplied.
    """
    table = (
        ("Andorra", 507368), ("United Arab Emirates", 485675), ("Afghanistan", 1659914), ("Antigua and Barbuda", 150533),
        ("Anguilla", 357580), ("Albania", 562171), ("Armenia", 546460), ("Netherlands Antilles", 244385), ("Angola", 539872),
        ("Antarctica", 612907), ("Argentina", 4614932), ("American Samoa", 203345), ("Austria", 4369057),
        ("Australia", 14428914), ("Anguila", 5840), ("Aruba", 570714), ("Azerbaijan", 479812), ("Bosnia", 769785),
        ("Barbados", 827197), ("Bangladesh", 950767), ("Belgium", 3853098), ("Burkina Faso", 353621),
        ("Bulgaria", 1341424), ("Bahrain", 496097), ("Burundi", 365045), ("Benin", 376006), ("Bermuda", 911995),
        ("Brunei", 525944), ("Bolivia", 1282330), ("Brazil", 3353612), ("Bahamas", 1266815), ("Bhutan", 313043),
        ("Bouvet Island", 23080), ("Botswana", 497482), ("Belarus", 915687), ("Belize", 638180), ("Canada", 22947345),
        ("Congo", 743767), ("Central African Republic", 99158), ("Congo Brazzavile", 129), ("Switzerland", 4464939),
        ("Ivory Coast", 159785), ("Cook Islands", 141587), ("Chile", 3273012), ("Cameroon", 365883),
        ("China", 10503149), ("Colombia", 2391704), ("Costa Rica", 2156026), ("Cuba", 2558546), ("Cape Verde", 149038),
        ("Christmas Island", 89484), ("Cyprus", 1206499), ("Czech Republic", 1661565), ("Germany", 13011450),
        ("Djibouti", 228802), ("Denmark", 2821104), ("Dominica", 442528), ("Dominican Republic", 769942),
        ("Algeria", 708433), ("Ecuador", 1503548), ("Estonia", 1104363), ("Egypt", 2919818), ("Western Sahara", 67282),
        ("Eritrea", 326046), ("Spain", 6262644), ("Ethiopia", 572384), ("Finland", 3029327), ("Fiji", 717135),
        ("Falkland Islands", 96205), ("Micronesia", 240281), ("Faroe Islands", 104229), ("France", 17079307),
        ("Gabon", 261195), ("Great Britain (UK*)", 49484), ("Grenada", 635651), ("Georgia", 6911046),
        ("French Guiana", 92279), ("Ghana", 741162), ("Gibraltar", 586670), ("Greenland", 457377),
        ("Gambia", 316286), ("Guinea", 937213), ("Guadeloupe", 390825), ("Equatorial Guinea", 117821),
        ("Greece", 3314249), ("Guatemala", 1252271), ("Guam", 505723), ("Guinea", 937213), ("Guyana", 361045),
        ("Hong Kong", 4372714), ("Honduras", 932766), ("Croatia", 901307), ("Haiti", 668931), ("Hungary", 2009608),
        ("Indonesia", 2669256), ("Ireland", 6484026), ("Israel", 6124467), ("India", 8285435), ("Iraq", 3962392),
        ("Iran", 1898788), ("Iceland", 927557), ("Italy", 8194116), ("Jamaica", 1503733), ("Jordan", 3596335),
        ("Japan", 11964871), ("Kenya", 1329593), ("Kyrgyzstan", 286245), ("Cambodia", 874170), ("Comoros", 128402),
        ("North Korea", 480082), ("South Korea", 861535), ("Kuwait", 902961), ("Kazakhstan", 545513),
        ("Laos", 683111), ("Lebanon", 1611341), ("Saint Lucia", 213232), ("Liechtenstein", 476879),
        ("Sri Lanka", 1255410), ("Liberia", 433926), ("Lesotho", 261276), ("Lithuania", 1010669),
        ("Luxembourg", 1509947), ("Latvia", 993776), ("Libya", 379545), ("Morocco", 875836), ("Monaco", 1211341),
        ("Moldova", 483230), ("Madagascar", 456664), ("Marshall Islands", 114943), ("Macedonia", 588851),
        ("Mali", 618361), ("Myanmar", 612176), ("Mongolia", 418530), ("Macau", 454230), ("Martinique", 408059),
        ("Mauritania", 234854), ("Montserrat", 272426), ("Malta", 1446525), ("Mauritius", 619848),
        ("Maldives", 399717), ("Malawi", 411174), ("Mexico", 12860749), ("Malaysia", 2896572),
        ("Mozambique", 445068), ("Namibia", 561881), ("New Caledonia", 196439), ("Niger", 379119),
        ("Norfolk Island", 75537), ("Nigeria", 1065105), ("Nicaragua", 942399), ("Netherlands", 4278864),
        ("Norway", 3049784), ("Nepal", 1255808), ("Nauru", 139532), ("Neutral Zone", 27949),
        ("New Zealand", 5220176), ("Oman", 538416), ("Panama", 1959774), ("Peru", 2534387), ("French Polynesia", 250512),
        ("Papua New Guinea", 357961), ("Philippines", 2321785), ("Pakistan", 2180449),
        ("Poland", 2836355), ("Puerto Rico", 2244142), ("Portugal", 4164322), ("Paraguay", 1056969),
        ("Qatar", 435277), ("Reunion", 1958013), ("Romania", 1626421), ("Russia", 5470901), ("Rwanda", 426131),
        ("Saudi Arabia", 1006404), ("Seychelles", 315412), ("Sudan", 636461), ("Sweden", 4009711),
        ("Singapore", 3994066), ("St. Helena", 175232), ("Slovenia", 974837), ("Slovak Republic", 189661),
        ("Sierra Leone", 368211), ("San Marino", 384320), ("Senegal", 740999), ("Somalia", 409418),
        ("Suriname", 236361), ("Sao Tome and Principe", 48854), ("El Salvador", 850296), ("Syria", 898835),
        ("Swaziland", 259978), ("Chad", 1250832), ("Togo", 333450), ("Thailand", 3746028), ("Tajikistan", 200424),
        ("Turkmenistan", 336623), ("Tunisia", 628396), ("Tonga", 351343), ("East Timor", 227194),
        ("Turkey", 3330430), ("Trinidad and Tobago", 237116), ("Tuvalu", 143394), ("Taiwan", 2969225),
        ("Tanzania", 703178), ("Ukraine", 1860363), ("Uganda", 676270), ("United Kingdom", 5432441),
        ("United States", 22451899), ("Uruguay", 1552015), ("Uzbekistan", 496764), ("Vatican City", 102054),
        ("Saint Vincent & the Grenadines", 9584), ("Venezuela", 2375470), ("Virgin Islands (British)", 21933),
        ("Virgin Islands", 769543), ("Vietnam", 3024465), ("Vanuatu", 279065), ("Wallis and Futuna Islands", 8060),
        ("Samoa", 471357), ("Yemen", 431068), ("Mayotte", 96884), ("Yugoslavia", 862796), ("South Africa", 3073302),
        ("Zambia", 497879), ("Zimbabwe", 995324),
    )
    # Expand into mutable two-element lists, one new list per entry.
    return [ [name, number] for name, number in table ]

def saveMatrix( matrix, countries, weights, size, keyw ):
    """Write the leading size x size corner of `matrix` to 'res_<keyw>.txt'.

    matrix    -- list of rows of numbers
    countries -- list of strings; the first `size` form the first line
    weights   -- list of strings; the first `size` form the second line
    size      -- number of rows and columns to write
    keyw      -- text inserted into the output file name

    The file holds comma-separated lines: the country names, the weights,
    then one line per matrix row with each value formatted as "%2.8f".
    An existing file of the same name is overwritten.
    """
    f = open( 'res_%s.txt' % keyw, 'w' )
    try:
        f.write( ','.join( countries[:size] ) + '\n' )
        f.write( ','.join( weights[:size] ) + '\n' )
        for y in range(size):
            row = matrix[y][:size]
            f.write( ','.join( [ "%2.8f" % val for val in row ] ) + '\n' )
    finally:
        # Close the handle even when a join/format/write above raises.
        f.close()


if 1:
    if len(sys.argv)>1:
        keyw = sys.argv[1]
    else:
        keyw = ''
        
    lst = countryList()
    if keyw!='':
        print 'getting counts for', keyw
        for c in lst:
            country = c[0]
            print 'counting ', country
            c[1] = hitCount( '"%s" NEAR "%s"' % (country, keyw) )
    file('scores_'+keyw, 'w').write( `lst` )
    lst.sort( lambda l1,l2: cmp( l2[1], l1[1] ) )
    matrix =[ [0.0]*len(lst) for i in lst]
    countries = [country.strip() for country, score in lst]
    weights = [str(score) for country, score in lst]
    rows=0
    for countryfrom, scorefrom in lst:
        scores = []
        cols=0
        for countryto, scoreto in lst[:rows]:
            print '*',
            searchfor = '"%s" NEAR "%s"' % (countryfrom, countryto)
            if keyw!='': searchfor + ' NEAR "%s"' % keyw
            hits = None
            while hits==None:
                try:
                    hits = hitCount(  searchfor )
                except:
                    hits = None
            score = hits / math.sqrt(scoreto*scorefrom)
            matrix[cols][rows] =  score
            matrix[rows][cols] =  score
            cols+=1
        rows+=1
        saveMatrix( matrix, countries, weights, rows, keyw )
        print ''
        print countryfrom, 'ready'
