-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathschema.sql
More file actions
78 lines (71 loc) · 3.03 KB
/
schema.sql
File metadata and controls
78 lines (71 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
-- run once to create empty tables, then populate with data
-- creation order matters: parent tables before children (foreign key dependencies)
-- companies: stores each company once (jobs reference this via foreign key)
create table companies (
-- serial = auto-incrementing integer, primary key = unique row identifier
id serial primary key,
-- unique prevents duplicate company entries
name text not null unique,
website text
);
-- skills: master list of normalized skill names ("Python", not "python3")
-- junction table (job_skills) connects these to jobs
create table skills (
id serial primary key,
name text not null unique
);
-- jobs: each unique job posting, linked to one company
create table jobs (
id serial primary key,
-- references = foreign key, value must exist in companies.id
company_id integer not null references companies(id),
title text not null,
description text,
-- salary nullable because many postings don't include it
salary_min integer,
salary_max integer,
location text,
-- date = just date, no time component
posted_date date,
source_url text,
-- default now() = auto-set to current timestamp on insert
created_at timestamp default now()
);
-- job_skills: junction table for many-to-many (job <-> skill) relationship
-- each row = "this job requires this skill"
create table job_skills (
-- on delete cascade = if parent row deleted, delete this row too
job_id integer not null references jobs(id) on delete cascade,
skill_id integer not null references skills(id) on delete cascade,
-- composite primary key = (job_id, skill_id) pair must be unique
primary key (job_id, skill_id)
);
-- raw_postings: archive of scraped content before LLM processing
-- multiple raw_postings can point to same job (indeed + linkedin same posting)
create table raw_postings (
id serial primary key,
-- which site: "indeed", "linkedin", etc.
source text not null,
url text not null unique,
-- keep raw html/text for re-extraction if prompts improve
raw_content text,
scraped_at timestamp default now(),
-- pipeline flips to true after LLM extraction
processed boolean default false,
-- set after processing, nullable until then
-- on delete set null = keep archive even if job deleted
job_id integer references jobs(id) on delete set null
);
-- indexes: speed up queries by creating lookup structures
-- trade-off: faster reads, slightly slower writes, more disk space
-- filter/join on company_id
create index idx_jobs_company_id on jobs(company_id);
-- filter by location ("show montreal jobs")
create index idx_jobs_location on jobs(location);
-- junction table lookups from both directions
create index idx_job_skills_job_id on job_skills(job_id);
create index idx_job_skills_skill_id on job_skills(skill_id);
-- find unprocessed postings quickly
create index idx_raw_postings_processed on raw_postings(processed);
-- find which raw_postings belong to which job
create index idx_raw_postings_job_id on raw_postings(job_id);