Skip to content
This repository was archived by the owner on Jan 23, 2022. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions server/scripts/create_psql_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@

# Database Configuration
DEGREE_PLANNER_DATABASE_NAME = "degree_planner"
# Default to the standard PostgreSQL port: int(os.getenv('PGPORT')) raises
# TypeError when the variable is unset, which made the script unrunnable
# without an explicit PGPORT in the environment.
PGPORT = int(os.getenv('PGPORT', '5432'))
PGHOST = os.getenv('PGHOST')
PGUSER = os.getenv('PGUSER')
PGPASSWORD = os.getenv('PGPASSWORD')

# SENG Sample Data
faculty = "Faculty of Engineering"
# NOTE(review): this view is a diff — the next two lines are the pre-change
# and post-change versions of the same assignment; the change commented
# `school` out. Confirm which one the final file keeps.
school = "School of Computer Science and Engineering"
#school = "School of Computer Science and Engineering"
program = "Bachelor of Engineering (Honours)"
major = "Software Engineering"
courses = [
Expand All @@ -44,9 +44,8 @@
# Create tables
conn = pg8000.connect(database=DEGREE_PLANNER_DATABASE_NAME, port=PGPORT, host=PGHOST, user=PGUSER, password=PGPASSWORD)
cur = conn.cursor()
# Exactly one CREATE per table. The diff left both old-schema and
# new-schema statements in view; with IF NOT EXISTS the first (obsolete)
# definition would win and later inserts into the new columns would fail,
# so only the new schema (faculties -> programs; programs with
# link/majors/minors) is kept.
cur.execute("CREATE TABLE IF NOT EXISTS faculties (name text primary key, programs text)")
cur.execute("CREATE TABLE IF NOT EXISTS programs (programid integer primary key, name text, link text, majors text, minors text)")
cur.execute("CREATE TABLE IF NOT EXISTS majors (majorid text primary key, name text, courses text)")
cur.execute("CREATE TABLE IF NOT EXISTS minors (minorid text primary key, name text, courses text)")
cur.execute("CREATE TABLE IF NOT EXISTS courses (code text, name text, description text, offering text, faculty text, school text, study_level text, gened integer, outline text, uoc integer)")
Expand All @@ -55,13 +54,10 @@

# Insert Dummy Data for Bachelor of Software Engineering
# Faculty
# The new schema links a faculty directly to its programs (the schools
# table was dropped in this change), so the second value must be the
# program name, not the school — the old call passed `school`, which no
# longer matches the `programs` column.
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO faculties (name, programs) VALUES (?, ?) ON CONFLICT DO NOTHING")
cur.execute(query, args((faculty, program)))
# Program
# The INSERT must match the new 5-column schema (programid, name, link,
# majors, minors): the old call used the singular column names and passed
# only 4 values for 5 placeholders, which fails at execute time.
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO programs (programid, name, link, majors, minors) VALUES (?, ?, ?, ?, ?) ON CONFLICT DO NOTHING")
cur.execute(query, args((3707, "Bachelor of Engineering (Honours)", "", "Software Engineering", "")))  # TODO: fill in the handbook link
# Major
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO majors (majorid, name, courses) VALUES (?, ?, ?) ON CONFLICT DO NOTHING")
Expand Down
193 changes: 83 additions & 110 deletions server/scripts/plannify_scraper.py
Original file line number Diff line number Diff line change
@@ -1,116 +1,89 @@
#!/usr/bin/python3

import re
import requests
import urllib.request
import time
import re, requests, time
from bs4 import BeautifulSoup

# Started by Jeremy Lim on 20/07/2019

# TO-DO LIST
# - Get timetable done
# - Get GenEd done
# - Majors
# - Degrees
# - Add to database

# Stripped down alphabet
# Presumably only the letters that actually have undergraduate
# course/specialisation listings in the legacy handbook — confirm.
course_alphabet = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']
spec_alphabet = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','R','S','T','V','W']
# A-Z browse endpoints; a letter is appended to get that letter's listing page.
course_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr='
spec_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr='

##### COURSES #####

# To print the info of each course
def print_course(code, link, name, cred):
    """Print a scraped course's fields to stdout, one labelled line each."""
    fields = (("CODE", code), ("LINK", link), ("NAME", name), ("CREDITS", cred))
    for label, value in fields:
        print(label + ": " + value)

# To go through each letter's links for courses
def run_course():
    """Scrape undergraduate courses from the legacy handbook A-Z pages.

    For each letter (limited to the first two while in development) the
    listing page is fetched; for every course row the course's own page is
    then fetched and its details are printed to stdout. Returns nothing.
    """
    for letter in course_alphabet[0:2]:  # TODO: widen to the full course_alphabet
        # Fetch and parse the listing page for this letter.
        response = requests.get(course_url + letter)
        course_soup = BeautifulSoup(response.text, "html.parser")

        # BUG FIX: `tr` was never assigned, so the loop below raised
        # NameError. Collect the table rows here; row 0 is the header row,
        # hence range(1, ...).
        tr = course_soup.find_all('tr')

        for i in range(1, len(tr)):
            counter = 0  # becomes non-zero when the GenEd marker is found
            td = tr[i].find_all('td')
            code = td[0].text
            link = td[1].find_all('a')[0]['href']
            name = td[1].find_all('a')[0].text
            cred = td[2].text

            print_course(code, link, name, cred)

            # Follow the course link and scrape the detail page.
            link_url = requests.get(link)
            link_soup = BeautifulSoup(link_url.text, "html.parser")

            p_data = link_soup.find_all('p')
            h_data = link_soup.find_all('h2')
            for p_instance in p_data:
                search = p_instance.findChildren()
                if len(search) > 0 and len(search[0].contents) > 0:
                    # Each field lives either in a second child tag or as
                    # trailing text after the label tag.
                    if search[0].text == 'Faculty:':
                        if len(search) > 1:
                            print("FACULTY: " + search[1].text)
                        else:
                            print("FACULTY: " + p_instance.contents[1].strip())

                    if search[0].text == 'School:':
                        if len(search) > 1:
                            print("SCHOOL: " + search[1].text)
                        else:
                            print("SCHOOL: " + p_instance.contents[1].strip())

                    if search[0].text == 'Career:':
                        if len(search) > 1:
                            print("CAREER: " + search[1].text)
                        else:
                            print("CAREER: " + p_instance.contents[1].strip())

                    # GenEd not working yet
                    if search[0].text == "Available for General Education:":
                        if len(search) > 1:
                            # NOTE(review): indentation was lost in this view;
                            # the break is assumed to sit inside the len check
                            # — confirm against the original file.
                            counter += 1
                            break

            for h_instance in h_data:
                if h_instance.text == "Description":
                    # Strip HTML tags from the sibling element holding the text.
                    desc_tags = str(h_instance.find_next_siblings()[0])
                    desc = str(re.sub("<.*?>", "", desc_tags))
                    print("DESCRIPTION: " + desc)

            # Report General Education availability for this course.
            if counter == 0:
                print("GENED: False\n")
            else:
                print("GENED: True\n")

##### SPECIALISATIONS (WIP) #####

def run_spec():
    """Scrape undergraduate specialisations from the legacy handbook (WIP).

    Fetches the A-Z specialisation listing for the first two letters and
    prints each specialisation's name and link, followed by a blank line.
    Only the first two table rows per page are processed while in
    development. Returns nothing.
    """
    for letter in spec_alphabet[0:2]:
        response = requests.get(spec_url + letter)
        spec_soup = BeautifulSoup(response.text, "html.parser")

        spec_tr = spec_soup.find_all('tr')  # row 0 is the table header
        # Removed the unused `counter` local the original reset each pass.
        for i in range(1, 3):  # TODO: iterate over all rows, not just two
            spec_td = spec_tr[i].find_all('td')  # cells of this row
            spec_name = spec_td[0].text
            spec_link = spec_td[0].find_all('a')[0]['href']
            print(spec_name)
            print(spec_link)
            print("")
# Base URL used to resolve the relative hrefs scraped from browse pages.
handbook_base_url = "https://www.handbook.unsw.edu.au"

# Browse pages for each UNSW faculty/board/school in the current handbook.
# The `id` query parameter is the handbook's internal node identifier.
dvc_aca_bos_url = "https://www.handbook.unsw.edu.au/DvcacademicBoardOfStudies/browse?id=5fa56ceb4f0093004aa6eb4f0310c7b3"
fac_art_des_url = "https://www.handbook.unsw.edu.au/FacultyOfArtDesign/browse?id=57a56ceb4f0093004aa6eb4f0310c7af"
fac_art_socsci_url = "https://www.handbook.unsw.edu.au/FacultyOfArtsAndSocialSciences/browse?id=d7a56ceb4f0093004aa6eb4f0310c7ac"
fac_bui_env_url = "https://www.handbook.unsw.edu.au/FacultyOfBuiltEnvironment/browse?id=5fa56ceb4f0093004aa6eb4f0310c7ae"
fac_eng_url = "https://www.handbook.unsw.edu.au/FacultyOfEngineering/browse?id=5fa56ceb4f0093004aa6eb4f0310c7af"
fac_law_url = "https://www.handbook.unsw.edu.au/FacultyOfLaw/browse?id=57a56ceb4f0093004aa6eb4f0310c7b0"
fac_med_url = "https://www.handbook.unsw.edu.au/FacultyOfMedicine/browse?id=5fa56ceb4f0093004aa6eb4f0310c7b0"
fac_sci_url = "https://www.handbook.unsw.edu.au/FacultyOfScience/browse?id=57a56ceb4f0093004aa6eb4f0310c7ae"
bus_sch_url = "https://www.handbook.unsw.edu.au/UnswBusinessSchool/browse?id=5a3a1d4f4f4d97404aa6eb4f0310c77a"
can_adfa_url = "https://www.handbook.unsw.edu.au/UnswCanberraAtAdfa/browse?id=5fa56ceb4f0093004aa6eb4f0310c7ad"
glob_url = "https://www.handbook.unsw.edu.au/UnswGlobal/browse?id=a9321f614ffd57009106fd501310c7eb"

# Every faculty page scrape_all() walks.
faculty_urls = [dvc_aca_bos_url, fac_art_des_url, fac_art_socsci_url, fac_bui_env_url, fac_eng_url, fac_law_url, fac_med_url, fac_sci_url, bus_sch_url, can_adfa_url, glob_url]

# Faculty scraping
## Given the soup for the faculty page, return the name of the faculty
def get_faculty_name(soup):
    """Return the heading element of a faculty browse page.

    NOTE(review): despite the name, this returns the matched element
    itself (or None when absent), not its text — confirm callers expect
    the element rather than a string.
    """
    heading_attrs = {"class": "a-browse-heading"}
    return soup.find("h2", attrs=heading_attrs)

## Given the soup for the faculty page, return course tuple (link, name, course code, uoc) for courses on the page
## TODO
def get_faculty_courses(soup):
    """Return (link, name, course code, uoc) tuples for the courses on a
    faculty browse page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the faculty page, returns program tuple (link, name, program code, uoc) for programs on the page
## TODO: Support double degrees
def get_faculty_programs(soup):
    """Return (link, name, program code, uoc) tuples for the programs on a
    faculty browse page.

    TODO: Support double degrees.
    """
    wrappers = soup.find_all("div", attrs={"class": "m-single-course-wrapper-browse"})
    programs = []
    for wrapper in wrappers:
        # Relative hrefs sometimes contain stray newlines; strip them
        # before prefixing the handbook base URL.
        link = handbook_base_url + wrapper.a.attrs["href"].replace("\n", "")
        name = wrapper.p.text
        code = wrapper.find("span", attrs={"class": "align-left"}).text
        uoc = wrapper.find("span", attrs={"class": None}).text
        programs.append((link, name, code, uoc))
    return programs

# Program scraping

## Given the soup for the program page, return the program tuple (name, code, uoc, faculty, ##TODO figure out rest from page info...##)
def get_program_info(soup):
    """Return the program tuple (name, code, uoc, faculty, ...) scraped
    from a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the program page, return the major tuple (link, name, major code, uoc, courses/requirement dict) for majors on the page
## TODO
def get_program_majors(soup):
    """Return (link, name, major code, uoc, courses/requirement dict)
    tuples for the majors on a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the program page, return the minor tuple (link, name, major code, uoc, course/requirement dict) for minors on the page
## TODO
def get_program_minors(soup):
    """Return (link, name, minor code, uoc, course/requirement dict)
    tuples for the minors on a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

# Course scraping
## Given the soup for the course page, return the course tuple (name, code, uoc, overview, [equivalent course codes], [exclusion course codes], outline link, offering, study level, school, faculty)
## TODO
def get_course(soup):
    """Return the full course tuple (name, code, uoc, overview,
    equivalents, exclusions, outline link, offering, study level, school,
    faculty) scraped from a course page.

    TODO: not yet implemented; currently returns None.
    """
    return None

# Scrape scope
## Scrape all of UNSW Handbook
def scrape_all():
    """Scrape every faculty listed in faculty_urls (all of the handbook)."""
    for url in faculty_urls:
        scrape_faculty(url)

## Scrape all programs in a faculty and the faculty data itself.
def scrape_faculty(faculty_url):
    """Scrape one faculty page, then every course and program it lists.

    NOTE(review): the scraped name/courses/programs are computed but not
    yet returned or stored. Also, get_faculty_courses currently returns
    None, so the course comprehension below raises TypeError until that
    helper is implemented.
    """
    page = requests.get(faculty_url)
    soup = BeautifulSoup(page.text, "html.parser")
    name = get_faculty_name(soup)
    courses = [scrape_course(course[0]) for course in get_faculty_courses(soup)]
    programs = [scrape_program(program[0]) for program in get_faculty_programs(soup)]

def scrape_program(program_url):
    """Fetch a program page and return its (majors, minors) pair."""
    response = requests.get(program_url)
    program_soup = BeautifulSoup(response.text, "html.parser")
    # Tuple elements evaluate left to right: majors first, then minors.
    return (get_program_majors(program_soup), get_program_minors(program_soup))

def scrape_course(course_url):
    """Fetch a course page and return its parsed course tuple."""
    response = requests.get(course_url)
    course_soup = BeautifulSoup(response.text, "html.parser")
    return get_course(course_soup)

if __name__ == "__main__":
    # NOTE(review): this view is a diff — run_course() appears to be the
    # pre-change entry point and scrape_faculty(fac_eng_url) its
    # replacement; confirm only one of the two survives in the final file.
    run_course()
    scrape_faculty(fac_eng_url)