extract_universities.py 2.06 KB
Newer Older
Florent Chehab's avatar
Florent Chehab committed
1 2 3 4
"""
Script to insert the country data in the database

IT HAS TO BE RUN INSIDE ./manage.py shell
Florent Chehab's avatar
Florent Chehab committed
5 6

TODO YURK. Use pandas @florent !!
Florent Chehab's avatar
Florent Chehab committed
7
"""
Florent Chehab's avatar
Florent Chehab committed
8

Florent Chehab's avatar
Florent Chehab committed
9
import csv
Florent Chehab's avatar
Florent Chehab committed
10 11
import os
import time
Florent Chehab's avatar
Florent Chehab committed
12

Florent Chehab's avatar
Florent Chehab committed
13
import reverse_geocoder as rg
Florent Chehab's avatar
Florent Chehab committed
14
from geopy.geocoders import Nominatim
Florent Chehab's avatar
Florent Chehab committed
15

16
# Both CSV files live in the "assets" directory that sits next to the
# directory containing this script (i.e. <script dir>/../assets).
_assets_dir = os.path.abspath(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "assets")
)

# Input: the raw list of destinations to geocode.
destinations_path = os.path.join(_assets_dir, "destinations.csv")

# Output: the geocoded destinations produced by this script.
destinations_extracted_path = os.path.join(
    _assets_dir, "destinations_extracted.csv"
)

# Fail early, naming the path that was looked up, rather than letting a later
# open() fail.
if not os.path.isfile(destinations_path):
    raise FileNotFoundError(
        "Missing file containing the destinations data: " + destinations_path
    )

28 29
def _geocode_with_retry(geolocator, query, max_attempts=10, delay=0.5):
    """Geocode *query*, retrying a bounded number of times on errors.

    Args:
        geolocator: object exposing a ``geocode(query)`` method.
        query: the text to look up.
        max_attempts: how many times ``geocode`` is tried before giving up.
        delay: seconds to wait after a failed attempt.

    Returns:
        Whatever ``geolocator.geocode(query)`` returns, or ``None`` when every
        attempt raised.
    """
    for _ in range(max_attempts):
        try:
            return geolocator.geocode(query)
        except Exception:
            # ``Exception`` (not a bare ``except``) so that Ctrl-C and
            # SystemExit still stop the script.
            print("error during query, retrying")
            time.sleep(delay)
    return None


# newline="" is what the csv module expects for files handed to
# csv.reader / csv.writer.
with open(destinations_path, "rt", newline="") as source, open(
    destinations_extracted_path, "w", newline=""
) as target:
    print("ini")
    reader = csv.reader(source)
    writer = csv.writer(target, quoting=csv.QUOTE_NONNUMERIC)
    # NOTE(review): recent geopy releases refuse to build Nominatim without a
    # custom user agent; confirm the identifier to use for this project.
    geolocator = Nominatim(user_agent="extract_universities")

    # Queries for which no location could be obtained.
    failed = []

    # The first input row is a header: drop it and write our own instead.
    # Nothing is written when the input file is empty.
    if next(reader, None) is not None:
        writer.writerow(["university", "city", "country", "lat", "lon"])

    for row in reader:
        # csv.reader yields an empty list for blank lines; skip them.
        if not row:
            continue

        # Only the third column (written out as "university") is used as the
        # geocoding query.
        query = row[2]
        location = _geocode_with_retry(geolocator, query)
        if location is None:
            failed.append(query)
            continue

        # Reverse-geocode the coordinates to get the "cc" value that goes in
        # the "country" column.
        coord = (location.latitude, location.longitude)
        res = rg.search(coord, verbose=False)
        line = [
            row[2],
            row[1],
            res[0]["cc"],
            location.latitude,
            location.longitude,
        ]
        print(line)
        writer.writerow(line)

    print(failed)