Skip to content
This repository was archived by the owner on Jan 23, 2022. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions server/scripts/create_psql_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@

# Database Configuration
DEGREE_PLANNER_DATABASE_NAME = "degree_planner"
# Default to the standard PostgreSQL port: int(os.getenv('PGPORT')) raises
# TypeError when the variable is unset, which made the script unrunnable
# without an explicit PGPORT in the environment.
PGPORT = int(os.getenv('PGPORT', '5432'))
PGHOST = os.getenv('PGHOST')
PGUSER = os.getenv('PGUSER')
PGPASSWORD = os.getenv('PGPASSWORD')

# SENG Sample Data
faculty = "Faculty of Engineering"
# NOTE(review): this view is a diff — the next two lines are the pre-change
# and post-change versions of the same assignment; the change commented
# `school` out. Confirm which one the final file keeps.
school = "School of Computer Science and Engineering"
#school = "School of Computer Science and Engineering"
program = "Bachelor of Engineering (Honours)"
major = "Software Engineering"
courses = [
Expand All @@ -44,9 +44,8 @@
# Create tables
conn = pg8000.connect(database=DEGREE_PLANNER_DATABASE_NAME, port=PGPORT, host=PGHOST, user=PGUSER, password=PGPASSWORD)
cur = conn.cursor()
# Exactly one CREATE per table. The diff left both old-schema and
# new-schema statements in view; with IF NOT EXISTS the first (obsolete)
# definition would win and later inserts into the new columns would fail,
# so only the new schema (faculties -> programs; programs with
# link/majors/minors) is kept.
cur.execute("CREATE TABLE IF NOT EXISTS faculties (name text primary key, programs text)")
cur.execute("CREATE TABLE IF NOT EXISTS programs (programid integer primary key, name text, link text, majors text, minors text)")
cur.execute("CREATE TABLE IF NOT EXISTS majors (majorid text primary key, name text, courses text)")
cur.execute("CREATE TABLE IF NOT EXISTS minors (minorid text primary key, name text, courses text)")
cur.execute("CREATE TABLE IF NOT EXISTS courses (code text, name text, description text, offering text, faculty text, school text, study_level text, gened integer, outline text, uoc integer)")
Expand All @@ -55,13 +54,10 @@

# Insert Dummy Data for Bachelor of Software Engineering
# Faculty
# The new schema links a faculty directly to its programs (the schools
# table was dropped in this change), so the second value must be the
# program name, not the school — the old call passed `school`, which no
# longer matches the `programs` column.
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO faculties (name, programs) VALUES (?, ?) ON CONFLICT DO NOTHING")
cur.execute(query, args((faculty, program)))
# Program
# The INSERT must match the new 5-column schema (programid, name, link,
# majors, minors): the old call used the singular column names and passed
# only 4 values for 5 placeholders, which fails at execute time.
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO programs (programid, name, link, majors, minors) VALUES (?, ?, ?, ?, ?) ON CONFLICT DO NOTHING")
cur.execute(query, args((3707, "Bachelor of Engineering (Honours)", "", "Software Engineering", "")))  # TODO: fill in the handbook link
# Major
query, args = pg8000.core.convert_paramstyle("qmark", "INSERT INTO majors (majorid, name, courses) VALUES (?, ?, ?) ON CONFLICT DO NOTHING")
Expand Down
193 changes: 83 additions & 110 deletions server/scripts/plannify_scraper.py
Original file line number Diff line number Diff line change
@@ -1,116 +1,89 @@
#!/usr/bin/python3

import re
import requests
import urllib.request
import time
import re, requests, time
from bs4 import BeautifulSoup

# Started by Jeremy Lim on 20/07/2019

# TO-DO LIST
# - Get timetable done
# - Get GenEd done
# - Majors
# - Degrees
# - Add to database

# Stripped down alphabet
# Presumably only the letters that actually have undergraduate
# course/specialisation listings in the legacy handbook — confirm.
course_alphabet = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']
spec_alphabet = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','R','S','T','V','W']
# A-Z browse endpoints; a letter is appended to get that letter's listing page.
course_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr='
spec_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr='

##### COURSES #####

# To print the info of each course
def print_course(code, link, name, cred):
    """Print a scraped course's fields to stdout, one labelled line each."""
    fields = (("CODE", code), ("LINK", link), ("NAME", name), ("CREDITS", cred))
    for label, value in fields:
        print(label + ": " + value)

# To go through each letter's links for courses
def run_course():
    """Scrape undergraduate courses from the legacy handbook A-Z pages.

    For each letter (limited to the first two while in development) the
    listing page is fetched; for every course row the course's own page is
    then fetched and its details are printed to stdout. Returns nothing.
    """
    for letter in course_alphabet[0:2]:  # TODO: widen to the full course_alphabet
        # Fetch and parse the listing page for this letter.
        response = requests.get(course_url + letter)
        course_soup = BeautifulSoup(response.text, "html.parser")

        # BUG FIX: `tr` was never assigned, so the loop below raised
        # NameError. Collect the table rows here; row 0 is the header row,
        # hence range(1, ...).
        tr = course_soup.find_all('tr')

        for i in range(1, len(tr)):
            counter = 0  # becomes non-zero when the GenEd marker is found
            td = tr[i].find_all('td')
            code = td[0].text
            link = td[1].find_all('a')[0]['href']
            name = td[1].find_all('a')[0].text
            cred = td[2].text

            print_course(code, link, name, cred)

            # Follow the course link and scrape the detail page.
            link_url = requests.get(link)
            link_soup = BeautifulSoup(link_url.text, "html.parser")

            p_data = link_soup.find_all('p')
            h_data = link_soup.find_all('h2')
            for p_instance in p_data:
                search = p_instance.findChildren()
                if len(search) > 0 and len(search[0].contents) > 0:
                    # Each field lives either in a second child tag or as
                    # trailing text after the label tag.
                    if search[0].text == 'Faculty:':
                        if len(search) > 1:
                            print("FACULTY: " + search[1].text)
                        else:
                            print("FACULTY: " + p_instance.contents[1].strip())

                    if search[0].text == 'School:':
                        if len(search) > 1:
                            print("SCHOOL: " + search[1].text)
                        else:
                            print("SCHOOL: " + p_instance.contents[1].strip())

                    if search[0].text == 'Career:':
                        if len(search) > 1:
                            print("CAREER: " + search[1].text)
                        else:
                            print("CAREER: " + p_instance.contents[1].strip())

                    # GenEd not working yet
                    if search[0].text == "Available for General Education:":
                        if len(search) > 1:
                            # NOTE(review): indentation was lost in this view;
                            # the break is assumed to sit inside the len check
                            # — confirm against the original file.
                            counter += 1
                            break

            for h_instance in h_data:
                if h_instance.text == "Description":
                    # Strip HTML tags from the sibling element holding the text.
                    desc_tags = str(h_instance.find_next_siblings()[0])
                    desc = str(re.sub("<.*?>", "", desc_tags))
                    print("DESCRIPTION: " + desc)

            # Report General Education availability for this course.
            if counter == 0:
                print("GENED: False\n")
            else:
                print("GENED: True\n")

##### SPECIALISATIONS (WIP) #####

def run_spec():
    """Scrape undergraduate specialisations from the legacy handbook (WIP).

    Fetches the A-Z specialisation listing for the first two letters and
    prints each specialisation's name and link, followed by a blank line.
    Only the first two table rows per page are processed while in
    development. Returns nothing.
    """
    for letter in spec_alphabet[0:2]:
        response = requests.get(spec_url + letter)
        spec_soup = BeautifulSoup(response.text, "html.parser")

        spec_tr = spec_soup.find_all('tr')  # row 0 is the table header
        # Removed the unused `counter` local the original reset each pass.
        for i in range(1, 3):  # TODO: iterate over all rows, not just two
            spec_td = spec_tr[i].find_all('td')  # cells of this row
            spec_name = spec_td[0].text
            spec_link = spec_td[0].find_all('a')[0]['href']
            print(spec_name)
            print(spec_link)
            print("")
# Base URL used to resolve the relative hrefs scraped from browse pages.
handbook_base_url = "https://www.handbook.unsw.edu.au"

# Browse pages for each UNSW faculty/board/school in the current handbook.
# The `id` query parameter is the handbook's internal node identifier.
dvc_aca_bos_url = "https://www.handbook.unsw.edu.au/DvcacademicBoardOfStudies/browse?id=5fa56ceb4f0093004aa6eb4f0310c7b3"
fac_art_des_url = "https://www.handbook.unsw.edu.au/FacultyOfArtDesign/browse?id=57a56ceb4f0093004aa6eb4f0310c7af"
fac_art_socsci_url = "https://www.handbook.unsw.edu.au/FacultyOfArtsAndSocialSciences/browse?id=d7a56ceb4f0093004aa6eb4f0310c7ac"
fac_bui_env_url = "https://www.handbook.unsw.edu.au/FacultyOfBuiltEnvironment/browse?id=5fa56ceb4f0093004aa6eb4f0310c7ae"
fac_eng_url = "https://www.handbook.unsw.edu.au/FacultyOfEngineering/browse?id=5fa56ceb4f0093004aa6eb4f0310c7af"
fac_law_url = "https://www.handbook.unsw.edu.au/FacultyOfLaw/browse?id=57a56ceb4f0093004aa6eb4f0310c7b0"
fac_med_url = "https://www.handbook.unsw.edu.au/FacultyOfMedicine/browse?id=5fa56ceb4f0093004aa6eb4f0310c7b0"
fac_sci_url = "https://www.handbook.unsw.edu.au/FacultyOfScience/browse?id=57a56ceb4f0093004aa6eb4f0310c7ae"
bus_sch_url = "https://www.handbook.unsw.edu.au/UnswBusinessSchool/browse?id=5a3a1d4f4f4d97404aa6eb4f0310c77a"
can_adfa_url = "https://www.handbook.unsw.edu.au/UnswCanberraAtAdfa/browse?id=5fa56ceb4f0093004aa6eb4f0310c7ad"
glob_url = "https://www.handbook.unsw.edu.au/UnswGlobal/browse?id=a9321f614ffd57009106fd501310c7eb"

# Every faculty page scrape_all() walks.
faculty_urls = [dvc_aca_bos_url, fac_art_des_url, fac_art_socsci_url, fac_bui_env_url, fac_eng_url, fac_law_url, fac_med_url, fac_sci_url, bus_sch_url, can_adfa_url, glob_url]

# Faculty scraping
## Given the soup for the faculty page, return the name of the faculty
def get_faculty_name(soup):
    """Return the heading element of a faculty browse page.

    NOTE(review): despite the name, this returns the matched element
    itself (or None when absent), not its text — confirm callers expect
    the element rather than a string.
    """
    heading_attrs = {"class": "a-browse-heading"}
    return soup.find("h2", attrs=heading_attrs)

## Given the soup for the faculty page, return course tuple (link, name, course code, uoc) for courses on the page
## TODO
def get_faculty_courses(soup):
    """Return (link, name, course code, uoc) tuples for the courses on a
    faculty browse page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the faculty page, returns program tuple (link, name, program code, uoc) for programs on the page
## TODO: Support double degrees
def get_faculty_programs(soup):
    """Return (link, name, program code, uoc) tuples for the programs on a
    faculty browse page.

    TODO: Support double degrees.
    """
    wrappers = soup.find_all("div", attrs={"class": "m-single-course-wrapper-browse"})
    programs = []
    for wrapper in wrappers:
        # Relative hrefs sometimes contain stray newlines; strip them
        # before prefixing the handbook base URL.
        link = handbook_base_url + wrapper.a.attrs["href"].replace("\n", "")
        name = wrapper.p.text
        code = wrapper.find("span", attrs={"class": "align-left"}).text
        uoc = wrapper.find("span", attrs={"class": None}).text
        programs.append((link, name, code, uoc))
    return programs

# Program scraping

## Given the soup for the program page, return the program tuple (name, code, uoc, faculty, ##TODO figure out rest from page info...##)
def get_program_info(soup):
    """Return the program tuple (name, code, uoc, faculty, ...) scraped
    from a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the program page, return the major tuple (link, name, major code, uoc, courses/requirement dict) for majors on the page
## TODO
def get_program_majors(soup):
    """Return (link, name, major code, uoc, courses/requirement dict)
    tuples for the majors on a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

## Given the soup for the program page, return the minor tuple (link, name, major code, uoc, course/requirement dict) for minors on the page
## TODO
def get_program_minors(soup):
    """Return (link, name, minor code, uoc, course/requirement dict)
    tuples for the minors on a program page.

    TODO: not yet implemented; currently returns None.
    """
    return None

# Course scraping
## Given the soup for the course page, return the course tuple (name, code, uoc, overview, [equivalent course codes], [exclusion course codes], outline link, offering, study level, school, faculty)
## TODO
def get_course(soup):
    """Return the full course tuple (name, code, uoc, overview,
    equivalents, exclusions, outline link, offering, study level, school,
    faculty) scraped from a course page.

    TODO: not yet implemented; currently returns None.
    """
    return None

# Scrape scope
## Scrape all of UNSW Handbook
def scrape_all():
    """Scrape every faculty listed in faculty_urls (all of the handbook)."""
    for url in faculty_urls:
        scrape_faculty(url)

## Scrape all programs in a faculty and the faculty data itself.
def scrape_faculty(faculty_url):
    """Scrape one faculty page, then every course and program it lists.

    NOTE(review): the scraped name/courses/programs are computed but not
    yet returned or stored. Also, get_faculty_courses currently returns
    None, so the course comprehension below raises TypeError until that
    helper is implemented.
    """
    page = requests.get(faculty_url)
    soup = BeautifulSoup(page.text, "html.parser")
    name = get_faculty_name(soup)
    courses = [scrape_course(course[0]) for course in get_faculty_courses(soup)]
    programs = [scrape_program(program[0]) for program in get_faculty_programs(soup)]

def scrape_program(program_url):
    """Fetch a program page and return its (majors, minors) pair."""
    response = requests.get(program_url)
    program_soup = BeautifulSoup(response.text, "html.parser")
    # Tuple elements evaluate left to right: majors first, then minors.
    return (get_program_majors(program_soup), get_program_minors(program_soup))

def scrape_course(course_url):
    """Fetch a course page and return its parsed course tuple."""
    response = requests.get(course_url)
    course_soup = BeautifulSoup(response.text, "html.parser")
    return get_course(course_soup)

if __name__ == "__main__":
    # NOTE(review): this view is a diff — run_course() appears to be the
    # pre-change entry point and scrape_faculty(fac_eng_url) its
    # replacement; confirm only one of the two survives in the final file.
    run_course()
    scrape_faculty(fac_eng_url)