-
Notifications
You must be signed in to change notification settings - Fork 48
SPOC-542: add snowflake-based distributed sequence access method #479
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
danolivo
wants to merge
1
commit into
main
Choose a base branch
from
spoc-542
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,183 @@ | ||
| /*------------------------------------------------------------------------- | ||
| * | ||
| * spock_seqam.h | ||
| * Distributed sequence access methods for Spock. | ||
| * | ||
| * Copyright (c) 2022-2026, pgEdge, Inc. | ||
| * | ||
| *------------------------------------------------------------------------- | ||
| */ | ||
| #ifndef SPOCK_SEQAM_H | ||
| #define SPOCK_SEQAM_H | ||
|
|
||
| #include "postgres.h" | ||
| #include "catalog/pg_sequence.h" | ||
| #include "commands/sequence.h" /* SeqTable, nextval_hook_type */ | ||
| #include "utils/relcache.h" | ||
|
|
||
| /* | ||
| * Catalog name and column positions for spock.sequence_kind. | ||
| * | ||
| * Keep these in sync with sql/spock--*.sql. The table is keyed on | ||
| * (nspname, relname); seqoid is a non-key cache column. | ||
| */ | ||
| #define CATALOG_SEQUENCE_KIND "sequence_kind" | ||
| #define CATALOG_SEQUENCE_KIND_OID_IDX "spock_sequence_kind_seqoid_idx" | ||
| #define Natts_sequence_kind 4 | ||
| #define Anum_sequence_kind_nspname 1 | ||
| #define Anum_sequence_kind_relname 2 | ||
| #define Anum_sequence_kind_kind 3 | ||
| #define Anum_sequence_kind_seqoid 4 | ||
|
|
||
| /* | ||
| * Hook entry for ALTER SEQUENCE ... RENAME and SET SCHEMA. Called from | ||
| * src/spock_executor.c:spock_object_access on OAT_POST_ALTER for a | ||
| * RELKIND_SEQUENCE relation (subId == 0). Resolves the row by seqoid via | ||
| * the secondary index and updates (nspname, relname) if the live sequence | ||
| * has been renamed or moved. | ||
| */ | ||
| extern void spock_seqam_relocate_sequence_record(Oid seqoid); | ||
|
|
||
| /* | ||
| * Registered methods. Identifiers are stable across versions and serve as | ||
| * the on-disk encoding of the `kind` column when we eventually move it from | ||
| * text to a more compact form. | ||
| * | ||
| * SPOCK_SEQAM_NMETHODS is a sentinel: it must remain one past the last real | ||
| * kind so the method-table array sizes correctly when a new kind is added. | ||
| */ | ||
| typedef enum SpockSeqAmKind | ||
| { | ||
| SPOCK_SEQAM_LOCAL = 0, /* fall through to in-core nextval */ | ||
| SPOCK_SEQAM_SNOWFLAKE = 1, | ||
| SPOCK_SEQAM_NMETHODS /* leave last */ | ||
| } SpockSeqAmKind; | ||
|
|
||
| /* | ||
| * Per-method dispatch table. Filled in by spock_seqam_register_methods() | ||
| * at extension load and never mutated after that. | ||
| */ | ||
| typedef struct SpockSeqAmMethod | ||
| { | ||
| const char *name; /* "snowflake", "local", ... */ | ||
| SpockSeqAmKind kind; | ||
|
|
||
| /* | ||
| * Hot-path nextval. Signature mirrors the SeqAM `nextval` callback | ||
| * (the in-flight Paquier patch's SequenceAmRoutine.nextval): receives | ||
| * the open sequence relation, the unpacked catalog options, and an | ||
| * IN/OUT *last for prefetch reporting. Returns the int64 value to be | ||
| * used as nextval()'s result. | ||
| * | ||
| * Methods that do not prefetch (Snowflake) must set *last to the | ||
| * returned value so the in-core CACHE fast path stays neutral. | ||
| */ | ||
| int64 (*nextval) (Relation rel, | ||
| int64 incby, int64 maxv, int64 minv, | ||
| int64 cache, bool cycle, | ||
| int64 *last); | ||
| } SpockSeqAmMethod; | ||
|
|
||
| /* | ||
| * GUC-controlled defaults. Defined in spock_seqam.c. | ||
| */ | ||
| extern int spock_seqam_default_kind; /* enum SpockSeqAmKind */ | ||
|
|
||
| /* | ||
| * Snowflake layout constants. | ||
| * | ||
| * Bit allocation is fixed. PGD chose the same split for SnowflakeId and the | ||
| * mathematics of (4096 values / node / ms) are sufficient for any OLTP | ||
| * workload; making it configurable is a footgun. | ||
| */ | ||
| #define SPOCK_SNOWFLAKE_TIMESTAMP_BITS 41 | ||
| #define SPOCK_SNOWFLAKE_NODE_BITS 10 | ||
| #define SPOCK_SNOWFLAKE_COUNTER_BITS 12 | ||
|
|
||
| #define SPOCK_SNOWFLAKE_NODE_SHIFT SPOCK_SNOWFLAKE_COUNTER_BITS | ||
| #define SPOCK_SNOWFLAKE_TIMESTAMP_SHIFT (SPOCK_SNOWFLAKE_NODE_BITS + SPOCK_SNOWFLAKE_COUNTER_BITS) | ||
|
|
||
| #define SPOCK_SNOWFLAKE_COUNTER_MASK ((UINT64CONST(1) << SPOCK_SNOWFLAKE_COUNTER_BITS) - 1) | ||
| #define SPOCK_SNOWFLAKE_NODE_MASK (((UINT64CONST(1) << SPOCK_SNOWFLAKE_NODE_BITS) - 1) \ | ||
| << SPOCK_SNOWFLAKE_NODE_SHIFT) | ||
|
|
||
| #define SPOCK_SNOWFLAKE_MAX_NODE_ID ((1 << SPOCK_SNOWFLAKE_NODE_BITS) - 1) | ||
|
|
||
| /* | ||
| * 2023-01-01 00:00:00 UTC as Unix milliseconds. The 41-bit ms timestamp | ||
| * field thus runs out around 2092. Chosen to match the standalone | ||
| * `snowflake` extension's epoch so values from both extensions decode | ||
| * identically (same bit layout, same epoch, same node-id derivation). | ||
| * | ||
| * Do NOT change this constant after a cluster has generated any | ||
| * snowflake values: every previously emitted value would appear to be | ||
| * from a different era and could compare incorrectly with newly | ||
| * generated ones. | ||
| */ | ||
| #define SPOCK_SNOWFLAKE_EPOCH_MS INT64CONST(1672531200000) | ||
|
|
||
| /* | ||
| * Magic word stamped in the special area of a sequence's heap page by | ||
| * the in-core sequence machinery. Hardcoded -- core declares the type | ||
| * privately in src/backend/commands/sequence.c. Validate on every read | ||
| * to catch page-layout drift across PG major versions at buildfarm- | ||
| * assert level. | ||
| */ | ||
| #define SPOCK_SEQUENCE_PAGE_MAGIC 0x1717 | ||
|
|
||
| typedef struct SpockSequencePageMagic | ||
| { | ||
| uint32 magic; | ||
| } SpockSequencePageMagic; | ||
|
|
||
| /* | ||
| * Pre-log window for WAL batching, in milliseconds. On every nextval we | ||
| * compare the emitted value against last_logged_threshold (stored in | ||
| * seq->log_cnt). When we cross the threshold, we WAL-log a reservation | ||
| * for the next SPOCK_SNOWFLAKE_LOG_INTERVAL_MS of generation. Crash | ||
| * recovery jumps last_value forward by at most this interval -- never | ||
| * produces duplicates. Same mechanism core PG uses for stock sequence | ||
| * cache, expressed in time instead of count. | ||
| */ | ||
| #define SPOCK_SNOWFLAKE_LOG_INTERVAL_MS 30 | ||
|
|
||
| /* | ||
| * Module entry point. Called from _PG_init() to register the nextval hook, | ||
| * install GUCs and the relcache invalidation callback. | ||
| */ | ||
| extern void spock_seqam_init(void); | ||
|
|
||
| /* | ||
| * Drop hook callback. Invoked from src/spock_executor.c:spock_object_access | ||
| * on OAT_DROP of a RELKIND_SEQUENCE relation. Deletes the spock.sequence_kind | ||
| * row if any. | ||
| */ | ||
| extern void spock_seqam_drop_sequence_record(Oid seqoid); | ||
|
|
||
| /* | ||
| * Lookup the catalog kind for a sequence by (nspname, relname). Returns | ||
| * false if the catalog is missing (extension not yet installed) or if no | ||
| * row exists for the sequence. Used by replication paths to skip | ||
| * managed-sequence values that must generate independently per node. | ||
| */ | ||
| extern bool spock_seqam_lookup_kind_by_name(const char *nspname, | ||
| const char *relname, | ||
| SpockSeqAmKind *kind_out); | ||
|
|
||
| /* | ||
| * SQL-callable, registered from sql/spock--*.sql. | ||
| */ | ||
| extern Datum spock_alter_sequence_set_kind(PG_FUNCTION_ARGS); | ||
| extern Datum spock_convert_all_sequences(PG_FUNCTION_ARGS); | ||
| extern Datum spock_sequence_hook_available(PG_FUNCTION_ARGS); | ||
|
|
||
| /* | ||
| * Forward declaration of the snowflake method. Defined in | ||
| * src/spock_seqam_snowflake.c. | ||
| */ | ||
| extern int64 spock_seqam_snowflake_nextval(Relation rel, | ||
| int64 incby, int64 maxv, int64 minv, | ||
| int64 cache, bool cycle, | ||
| int64 *last); | ||
|
|
||
| #endif /* SPOCK_SEQAM_H */ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,172 @@ | ||
| diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c | ||
| index fbcfcddb59e..b2430ea44b8 100644 | ||
| --- a/src/backend/commands/sequence.c | ||
| +++ b/src/backend/commands/sequence.c | ||
| @@ -95,6 +95,9 @@ static HTAB *seqhashtab = NULL; /* hash table for SeqTable items */ | ||
| */ | ||
| static SeqTableData *last_used_seq = NULL; | ||
|
|
||
| +/* Spock: nextval generator hook; see sequence.h for contract. */ | ||
| +nextval_hook_type nextval_hook = NULL; | ||
| + | ||
| static void fill_seq_with_data(Relation rel, HeapTuple tuple); | ||
| static void fill_seq_fork_with_data(Relation rel, HeapTuple tuple, ForkNumber forkNum); | ||
| static Relation lock_and_open_sequence(SeqTable seq); | ||
| @@ -637,24 +640,15 @@ nextval_internal(Oid relid, bool check_permissions) | ||
| { | ||
| SeqTable elm; | ||
| Relation seqrel; | ||
| - Buffer buf; | ||
| - Page page; | ||
| HeapTuple pgstuple; | ||
| Form_pg_sequence pgsform; | ||
| - HeapTupleData seqdatatuple; | ||
| - Form_pg_sequence_data seq; | ||
| int64 incby, | ||
| maxv, | ||
| minv, | ||
| cache, | ||
| - log, | ||
| - fetch, | ||
| last; | ||
| - int64 result, | ||
| - next, | ||
| - rescnt = 0; | ||
| + int64 result; | ||
| bool cycle; | ||
| - bool logit = false; | ||
|
|
||
| /* open and lock sequence */ | ||
| init_sequence(relid, &elm, &seqrel); | ||
| @@ -699,12 +693,51 @@ nextval_internal(Oid relid, bool check_permissions) | ||
| cycle = pgsform->seqcycle; | ||
| ReleaseSysCache(pgstuple); | ||
|
|
||
| + /* Spock: hook overrides; swap for rel->rd_sequenceam->nextval at SeqAM merge. */ | ||
| + if (nextval_hook != NULL) | ||
| + result = (*nextval_hook) (seqrel, incby, maxv, minv, cache, cycle, &last); | ||
| + else | ||
| + result = sequence_am_local_nextval(seqrel, incby, maxv, minv, | ||
| + cache, cycle, &last); | ||
| + | ||
| + /* save info in local cache */ | ||
| + elm->increment = incby; | ||
| + elm->last = result; /* last returned number */ | ||
| + elm->cached = last; /* last fetched number */ | ||
| + elm->last_valid = true; | ||
| + | ||
| + relation_close(seqrel, NoLock); | ||
| + last_used_seq = elm; | ||
| + return result; | ||
| +} | ||
| + | ||
| +/* | ||
| + * Spock: stock per-call generator, factored out of nextval_internal so the | ||
| + * nextval_hook can delegate to it. Caller owns seqrel and the per-session | ||
| + * cache state; *last receives the largest value reserved for the session. | ||
| + */ | ||
| +int64 | ||
| +sequence_am_local_nextval(Relation seqrel, | ||
| + int64 incby, int64 maxv, int64 minv, | ||
| + int64 cache, bool cycle, | ||
| + int64 *last) | ||
| +{ | ||
| + Buffer buf; | ||
| + Page page; | ||
| + HeapTupleData seqdatatuple; | ||
| + Form_pg_sequence_data seq; | ||
| + int64 log, | ||
| + fetch; | ||
| + int64 result, | ||
| + next, | ||
| + rescnt = 0; | ||
| + bool logit = false; | ||
| + | ||
| /* lock page' buffer and read tuple */ | ||
| seq = read_seq_tuple(seqrel, &buf, &seqdatatuple); | ||
| page = BufferGetPage(buf); | ||
|
|
||
| - elm->increment = incby; | ||
| - last = next = result = seq->last_value; | ||
| + *last = next = result = seq->last_value; | ||
| fetch = cache; | ||
| log = seq->log_cnt; | ||
|
|
||
| @@ -791,7 +824,7 @@ nextval_internal(Oid relid, bool check_permissions) | ||
| { | ||
| log--; | ||
| rescnt++; | ||
| - last = next; | ||
| + *last = next; | ||
| if (rescnt == 1) /* if it's first result - */ | ||
| result = next; /* it's what to return */ | ||
| } | ||
| @@ -800,13 +833,6 @@ nextval_internal(Oid relid, bool check_permissions) | ||
| log -= fetch; /* adjust for any unfetched numbers */ | ||
| Assert(log >= 0); | ||
|
|
||
| - /* save info in local cache */ | ||
| - elm->last = result; /* last returned number */ | ||
| - elm->cached = last; /* last fetched number */ | ||
| - elm->last_valid = true; | ||
| - | ||
| - last_used_seq = elm; | ||
| - | ||
| /* | ||
| * If something needs to be WAL logged, acquire an xid, so this | ||
| * transaction's commit will trigger a WAL flush and wait for syncrep. | ||
| @@ -862,7 +888,7 @@ nextval_internal(Oid relid, bool check_permissions) | ||
| } | ||
|
|
||
| /* Now update sequence tuple to the intended final state */ | ||
| - seq->last_value = last; /* last fetched number */ | ||
| + seq->last_value = *last; /* last fetched number */ | ||
| seq->is_called = true; | ||
| seq->log_cnt = log; /* how much is logged */ | ||
|
|
||
| @@ -870,8 +896,6 @@ nextval_internal(Oid relid, bool check_permissions) | ||
|
|
||
| UnlockReleaseBuffer(buf); | ||
|
|
||
| - relation_close(seqrel, NoLock); | ||
| - | ||
| return result; | ||
| } | ||
|
|
||
| diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h | ||
| index 9da23008101..0fe86199306 100644 | ||
| --- a/src/include/commands/sequence.h | ||
| +++ b/src/include/commands/sequence.h | ||
| @@ -20,6 +20,7 @@ | ||
| #include "nodes/parsenodes.h" | ||
| #include "parser/parse_node.h" | ||
| #include "storage/relfilenode.h" | ||
| +#include "utils/relcache.h" | ||
|
|
||
|
|
||
| typedef struct FormData_pg_sequence_data | ||
| @@ -55,6 +56,24 @@ extern int64 nextval_internal(Oid relid, bool check_permissions); | ||
| extern Datum nextval(PG_FUNCTION_ARGS); | ||
| extern List *sequence_options(Oid relid); | ||
|
|
||
| +/* | ||
| + * Spock: per-call value-generation hook. Signature matches Paquier's | ||
| + * upstream SeqAM nextval callback; the forward-port replaces the call | ||
| + * site with rel->rd_sequenceam->nextval(...). Single consumer; extension | ||
| + * must reject install when nextval_hook != NULL. | ||
| + */ | ||
| +typedef int64 (*nextval_hook_type) (Relation rel, | ||
| + int64 incby, int64 maxv, int64 minv, | ||
| + int64 cache, bool cycle, | ||
| + int64 *last); | ||
| +extern PGDLLIMPORT nextval_hook_type nextval_hook; | ||
| + | ||
| +/* Spock: stock per-call generator, factored out so hooks can delegate. */ | ||
| +extern int64 sequence_am_local_nextval(Relation rel, | ||
| + int64 incby, int64 maxv, int64 minv, | ||
| + int64 cache, bool cycle, | ||
| + int64 *last); | ||
| + | ||
| extern ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *stmt); | ||
| extern ObjectAddress AlterSequence(ParseState *pstate, AlterSeqStmt *stmt); | ||
| extern void SequenceChangePersistence(Oid relid, char newrelpersistence); | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initialize
lastdefensively before dispatching to the hook.lastis declared (line 31) but never initialized before&lastis passed tonextval_hookat line 47. The local-case helper writes*lastunconditionally on entry, but a third-party hook that returns without writing*last(e.g. an early-return error path that returns a sentinel) will causeelm->cached = last;a few lines below to read uninitialized stack memory and poison the in-core CACHE fast path for the remainder of the session. The header comment documents the contract ("Methods that do not prefetch ... must set *last to the returned value") but the cost of a defensive init is one line.🛡️ Suggested defensive init
🤖 Prompt for AI Agents