Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions app/Services/Cloning/AnonymizationEngine.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,28 @@ class AnonymizationEngine
{
private readonly Generator $faker;

public function __construct(string $locale = 'en_US')
private readonly string $runSalt;

/**
 * Create an engine bound to one Faker locale and one run-wide hashing salt.
 *
 * The run salt is the fallback prefix for `hash` columns that do not carry
 * a salt of their own. Because it is drawn fresh for every engine instance,
 * hashed identifiers from different runs cannot be joined or looked up in a
 * precomputed table, while a single run still maps equal source values to
 * equal hashes across all of its tables.
 *
 * Supply an explicit salt only when hashes must be reproducible across runs
 * (e.g. integration-test fixtures); production runs should leave it null.
 *
 * @param string      $locale  Locale handed to the Faker factory.
 * @param string|null $runSalt Explicit salt, or null to hex-encode 32 random bytes.
 */
public function __construct(string $locale = 'en_US', ?string $runSalt = null)
{
    $this->faker = Factory::create($locale);

    if ($runSalt === null) {
        // 32 random bytes, hex-encoded into a 64-character string.
        $runSalt = bin2hex(random_bytes(32));
    }

    $this->runSalt = $runSalt;
}

/**
 * Return the salt this engine falls back to for hash columns without their own salt.
 *
 * This is either the value passed to the constructor or the hex string the
 * constructor generated when none was given.
 */
public function getRunSalt(): string
{
return $this->runSalt;
}

/**
Expand Down Expand Up @@ -54,7 +73,9 @@ private function applyFake(ColumnCloningConfigData $config): mixed

private function applyHash(string $value, ColumnCloningConfigData $config): string
{
return hash($config->hashAlgorithm ?? 'sha256', ($config->hashSalt ?? '').$value);
$salt = $config->hashSalt ?? $this->runSalt;

return hash($config->hashAlgorithm ?? 'sha256', $salt.$value);
}

private function applyMask(string $value, ColumnCloningConfigData $config): string
Expand Down
7 changes: 6 additions & 1 deletion app/Services/Cloning/CloningRunOrchestrator.php
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ public function run(
}
}

// Single engine per run: shared per-run random salt across all tables
// preserves intra-run hash joinability while defeating cross-run linkability.
$engine = new AnonymizationEngine($config->options->fakerLocale);

// Transfer data
/** @var list<TableRunResultData> $tableResults */
$tableResults = [];
Expand Down Expand Up @@ -169,6 +173,7 @@ public function run(
$source,
$target,
$pkColumns,
$engine,
$keyRemapping,
$config->keyRemapping,
);
Expand Down Expand Up @@ -254,10 +259,10 @@ private function transferTable(
ConnectionData $source,
ConnectionData $target,
array $pkColumns,
AnonymizationEngine $engine,
?KeyRemappingService $keyRemapping = null,
?KeyRemappingConfigData $keyRemappingConfig = null,
): array {
$engine = new AnonymizationEngine($options->fakerLocale);
$sourceConn = $this->connector->open($source);
$targetConn = $this->connector->open($target);

Expand Down
8 changes: 6 additions & 2 deletions app/Services/Cloning/CloningYamlValidator.php
Original file line number Diff line number Diff line change
Expand Up @@ -383,8 +383,12 @@ private function validateColumnStrategy(string $prefix, string $strategy, array
$errors[] = sprintf("%s: 'hash' strategy requires 'algorithm' (one of: %s)", $prefix, implode(', ', self::VALID_HASH_ALGORITHMS));
}

if (! array_key_exists('salt', $config)) {
$errors[] = sprintf("%s: 'hash' strategy requires 'salt'", $prefix);
// 'salt' is optional — when omitted, the engine applies a
// per-run random salt that defeats cross-run linkability.
// An explicit string salt is honored for reproducible hashes
// across runs (e.g. integration-test fixtures).
if (array_key_exists('salt', $config) && ! is_string($config['salt'])) {
$errors[] = sprintf("%s: 'hash' strategy 'salt' must be a string when provided", $prefix);
}

break;
Expand Down
8 changes: 7 additions & 1 deletion app/Services/Cloning/CloningYamlWriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,13 @@ public function write(DumpResultData $result): string

case 'hash':
$lines[] = sprintf(' algorithm: %s', $column->hashAlgorithm ?? 'sha256');
$lines[] = sprintf(' salt: "%s"', addslashes($column->hashSalt ?? ''));
// Salt is only written when explicitly configured.
// An absent 'salt' key tells the engine to apply
// a per-run random salt (GDPR-aligned default).
if ($column->hashSalt !== null) {
$lines[] = sprintf(' salt: "%s"', addslashes($column->hashSalt));
}

break;

case 'mask':
Expand Down
7 changes: 6 additions & 1 deletion app/Services/Pii/PiiMatcherYamlWriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,12 @@ public function write(array $groups, string $path): void
$transformationData['algorithm'] = $t->hashAlgorithm;
}

$transformationData['salt'] = $t->hashSalt ?? '';
// Only write 'salt' when the matcher carries an explicit
// override. Omitting it instructs the engine to apply its
// per-run random salt (GDPR-aligned pseudonymization).
if ($t->hashSalt !== null) {
$transformationData['salt'] = $t->hashSalt;
}
} elseif ($t->strategy === 'mask') {
if ($t->visibleChars !== null) {
$transformationData['visible_chars'] = $t->visibleChars;
Expand Down
17 changes: 11 additions & 6 deletions docs/cloning-yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,19 +186,21 @@ age:

### `hash`

Replace the value with a deterministic one-way hash. The same input always produces the same output — useful for preserving referential integrity across tables without exposing real values.
Replace the value with a one-way hash. Within a single run, the same input always produces the same output — useful for preserving referential integrity across tables without exposing real values.

```yaml
password:
employee_id:
strategy: hash
algorithm: sha256
salt: ""
# salt is optional; omit it to let Clonio apply a per-run random salt.
```

| Field | Required | Values | Description |
|-------|:--------:|--------|-------------|
| `algorithm` | yes | `sha256` \| `sha512` \| `md5` \| `sha1` | PHP `hash()` algorithm. |
| `salt` | yes | string | Prefix prepended to the value before hashing. Use `""` for no salt. |
| `algorithm` | yes | `sha256` \| `sha512` \| `md5` \| `sha1` | PHP `hash()` algorithm. SHA-256 is recommended; SHA-1 / MD5 are accepted only for legacy use. |
| `salt` | no | string | Prefix prepended to the value before hashing. **When omitted, Clonio generates a random salt per run (32 random bytes, hex-encoded).** Hashes are stable inside one run (joins work) but unrelatable across runs (rainbow tables and cross-snapshot linking are defeated). Set an explicit salt only if reproducible hashes across runs are required (e.g. test fixtures). |

> **GDPR notice.** `hash` is a *pseudonymization* technique, not anonymization (GDPR Art. 4(5) / Recital 26). The output is still personal data and remains within the scope of the GDPR. For columns where re-identification by linkage must be impossible, prefer `fake` or `null`.

---

Expand Down Expand Up @@ -473,9 +475,12 @@ tables:
faker_method: date
faker_arguments: ["Y-m-d"]
password:
strategy: static
value: "REDACTED"
employee_id:
strategy: hash
algorithm: sha256
salt: "clonio"
# salt omitted on purpose — engine uses its per-run random salt.
internal_notes:
strategy: "null"
account_tag:
Expand Down
49 changes: 36 additions & 13 deletions specs/PRD-cloning-yaml-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ Define the canonical structure of a `.cloning.yaml` file. This document is the s
- **One file, one source** — each YAML file describes one source database connection. The target is always supplied at runtime (`--target`).
- **Human-editable** — the format is intentionally flat and readable; complex nesting is avoided.
- **Strict but forward-compatible** — `additionalProperties: false` at known paths; a `version` field allows future schema evolution.
- **Explicit over implicit** — every value that affects behaviour must be stated in the file. No hidden defaults are applied at runtime. A reader must be able to understand the full transfer configuration by reading the YAML alone, without knowing what the tool's built-in defaults are. The single exception to this rule is column listing: columns that are not listed under a table are implicitly treated as `keep` (see §4.3).
- **Explicit over implicit** — every value that affects behaviour must be stated in the file. No hidden defaults are applied at runtime. A reader must be able to understand the full transfer configuration by reading the YAML alone, without knowing what the tool's built-in defaults are. The exceptions to this rule are:
- **Column listing** — columns that are not listed under a table are implicitly treated as `keep` (see §4.3).
- **Hash `salt`** — when omitted on a `hash` strategy, the engine applies a per-run random salt (GDPR-aligned pseudonymization; see §5.3). This default is intentional security behaviour and cannot be expressed in the file itself.

---

Expand Down Expand Up @@ -128,19 +130,35 @@ email:

### 5.3 `hash`

Replace the value with a deterministic hash. The same input always produces the same output.
Replace the value with a hash. Within a single run, the same input always produces the same output — so foreign-key joins on hashed columns stay valid. Across runs, outputs differ when `salt` is omitted (see GDPR note).

> **GDPR note — pseudonymization, not anonymization.**
> Under GDPR Art. 4(5) / Recital 26 and WP29 Opinion 05/2014, a deterministic
> hash with a known or stable salt is **pseudonymization**: the data remains
> personal data and stays in GDPR scope, because re-identification via
> linkability or dictionary attacks is possible. To defeat cross-run
> linkability, **omit `salt`**: the engine generates a fresh random salt per
> run, kept only in memory. For high-risk PII (national IDs, payment data,
> credentials, biometric data), prefer `fake`, `static`, or `null` over `hash`.

```yaml
password:
# Recommended: omit salt — engine applies a per-run random salt.
loyalty_id:
strategy: hash
algorithm: sha256
salt: ""

# Discouraged unless you need stable cross-run identifiers (e.g. internal
# join keys you control); not GDPR-compliant for personal data.
employee_id:
strategy: hash
algorithm: sha256
salt: "internal-stable-key"
```

| Field | Type | Required | Description |
|-------|------|:--------:|-------------|
| `algorithm` | string | yes | PHP `hash()` algorithm: `sha256`, `sha512`, `md5`, `sha1` |
| `salt` | string | yes | Prefix prepended before hashing. Use `""` when no salt is desired. |
| `salt` | string | no | Explicit salt prepended before hashing. **Omit** for the GDPR-aligned default: a per-run random salt that defeats cross-run linkability. Set only when you need stable cross-run output. |

### 5.4 `mask`

Expand Down Expand Up @@ -461,7 +479,7 @@ A YAML language server hint can be placed at the top of every generated file:
},
"salt": {
"type": "string",
"description": "Salt prefix prepended before hashing. Required when strategy is 'hash'. Use empty string for no salt."
"description": "Optional salt prefix prepended before hashing. When omitted, the engine applies a per-run random salt (GDPR-aligned default that defeats cross-run linkability). Set only when stable cross-run output is required."
},
"visible_chars": {
"type": "integer",
Expand Down Expand Up @@ -490,7 +508,7 @@ A YAML language server hint can be placed at the top of every generated file:
},
{
"if": { "properties": { "strategy": { "const": "hash" } }, "required": ["strategy"] },
"then": { "required": ["algorithm", "salt"] }
"then": { "required": ["algorithm"] }
},
{
"if": { "properties": { "strategy": { "const": "mask" } }, "required": ["strategy"] },
Expand Down Expand Up @@ -554,14 +572,19 @@ tables:
faker_method: date
faker_arguments: ["Y-m-d"]
password:
# Credentials must never leak — replace with a fixed marker.
strategy: static
value: "REDACTED"
credit_card:
# Payment data: format-preserving synthetic value, never hashed.
strategy: fake
faker_method: creditCardNumber
faker_arguments: []
loyalty_id:
# Internal join key — hash with per-run random salt (no 'salt' field).
# Same input → same output within one run; differs across runs.
strategy: hash
algorithm: sha256
salt: "clonio"
credit_card:
strategy: mask
visible_chars: 4
mask_char: "*"
preserve_format: false
internal_notes:
strategy: "null"
account_tag:
Expand Down
53 changes: 31 additions & 22 deletions specs/PRD-pii-matchers.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,19 @@ transformation:
transformation:
strategy: hash
algorithm: sha256
salt: ""
# salt omitted → engine applies a per-run random salt at transform time
```

`salt` is optional. When absent, the cloning engine prepends a random salt — 32 random bytes, hex-encoded to 64 characters — that is generated once per `cloning:run`. This:

- Defeats **cross-run linkability** (two snapshots of the same source database produce different hashes for the same input).
- Defeats rainbow-table attacks against small input spaces (SSN, employee numbers, etc.).
- Preserves **intra-run referential integrity** — identical source values still hash to identical target values within a single run, so foreign-key joins on hashed columns continue to work.

Set an explicit `salt:` string only when reproducible hashes across runs are required (e.g. integration-test fixtures).

> **GDPR.** `hash` produces *pseudonymized* data, not anonymized data (GDPR Art. 4(5) / Recital 26). The output remains personal data and is still subject to the GDPR. For columns where any chance of linkage must be eliminated, use `fake`, `null`, or `static` instead.

### 7.3 `strategy: mask`

```yaml
Expand Down Expand Up @@ -192,7 +202,7 @@ The binary baseline is organised into six groups. When `matchers init` writes th
| `last_name` | Last Name | `/^(last[-_]?name\|sur[-_]?name\|family[-_]?name\|nachname\|nom)$/i` | `fake` → `lastName` |
| `full_name` | Person Name | `/^(full[-_]?name\|display[-_]?name\|name\|user[-_]?name\|nick[-_]?name)$/i` | `fake` → `name` |
| `date_of_birth` | Date of Birth | `/^(birth[-_]?date\|date[-_]?of[-_]?birth\|dob\|birthday\|geburtsdatum)$/i` | `fake` → `date` |
| `national_id` | National ID / SSN | `/^(ssn\|social[-_]?security\|national[-_]?id\|tax[-_]?id\|personal[-_]?id)$/i` | `hash` → `sha256` |
| `national_id` | National ID / SSN | `/^(ssn\|social[-_]?security\|national[-_]?id\|tax[-_]?id\|personal[-_]?id)$/i` | `fake` → `numerify('###-##-####')` |

### Group: `contact` — Contact Information

Expand All @@ -217,22 +227,22 @@ The binary baseline is organised into six groups. When `matchers init` writes th

| Matcher key | Name | Patterns (non-exhaustive) | Strategy |
|-------------|------|---------------------------|----------|
| `credit_card` | Credit Card Number | `/^(credit[-_]?card\|card[-_]?number\|cc[-_]?number\|payment[-_]?card\|pan)$/i` | `mask` (visible_chars: 4, mask_char: `*`, preserve_format: false) |
| `iban` | IBAN / Bank Account | `/^(iban\|bank[-_]?account\|kontonummer\|bic\|swift)$/i` | `mask` (visible_chars: 4, mask_char: `*`, preserve_format: false) |
| `credit_card` | Credit Card Number | `/^(credit[-_]?card\|card[-_]?number\|cc[-_]?number\|payment[-_]?card\|pan)$/i` | `fake` → `creditCardNumber` |
| `iban` | IBAN / Bank Account | `/^(iban\|bank[-_]?account\|kontonummer\|bic\|swift)$/i` | `fake` → `iban` |
| `company_name` | Company Name | `/^(company\|company[-_]?name\|organization\|org[-_]?name\|firma)$/i` | `fake` → `company` |

### Group: `authentication` — Authentication & Secrets

| Matcher key | Name | Patterns (non-exhaustive) | Strategy |
|-------------|------|---------------------------|----------|
| `password` | Password / Secret | `/^(password\|passwd\|pwd\|secret\|passwort)$/i` | `hash` → `sha256`, salt: `""` |
| `api_token` | API Token / Key | `/^(token\|api[-_]?key\|access[-_]?token\|refresh[-_]?token\|auth[-_]?token)$/i` | `hash` → `sha256`, salt: `""` |
| `password` | Password / Secret | `/^(password\|passwd\|pwd\|secret\|passwort)$/i` | `static` → `"REDACTED"` |
| `api_token` | API Token / Key | `/^(token\|api[-_]?key\|access[-_]?token\|refresh[-_]?token\|auth[-_]?token)$/i` | `static` → `"REDACTED"` |

### Group: `network` — Network & Technical

| Matcher key | Name | Patterns (non-exhaustive) | Strategy |
|-------------|------|---------------------------|----------|
| `ip_address` | IP Address | `/^(ip\|ip[-_]?addr(ess)?\|client[-_]?ip\|remote[-_]?ip\|user[-_]?ip)$/i` | `mask` (visible_chars: 0, mask_char: `*`, preserve_format: true) |
| `ip_address` | IP Address | `/^(ip\|ip[-_]?addr(ess)?\|client[-_]?ip\|remote[-_]?ip\|user[-_]?ip)$/i` | `fake` → `ipv4` |

---

Expand Down Expand Up @@ -282,9 +292,9 @@ groups:
patterns:
- "/^(ssn|social[-_]?security|national[-_]?id|tax[-_]?id|personal[-_]?id)$/i"
transformation:
strategy: hash
algorithm: sha256
salt: ""
strategy: fake
faker_method: numerify
faker_arguments: ["###-##-####"]

contact:
name: "Contact Information"
Expand Down Expand Up @@ -320,21 +330,19 @@ groups:
patterns:
- "/^(credit[-_]?card|card[-_]?number|cc[-_]?number|payment[-_]?card|pan)$/i"
transformation:
strategy: mask
visible_chars: 4
mask_char: "*"
preserve_format: false
strategy: fake
faker_method: creditCardNumber
faker_arguments: []

iban:
name: "IBAN / Bank Account"
enabled: true
patterns:
- "/^(iban|bank[-_]?account|kontonummer)$/i"
transformation:
strategy: mask
visible_chars: 4
mask_char: "*"
preserve_format: false
strategy: fake
faker_method: iban
faker_arguments: []

authentication:
name: "Authentication & Secrets"
Expand All @@ -345,9 +353,8 @@ groups:
patterns:
- "/^(password|passwd|pwd|secret|passwort)$/i"
transformation:
strategy: hash
algorithm: sha256
salt: ""
strategy: static
value: "REDACTED"

api_token:
name: "API Token / Key"
Expand All @@ -366,7 +373,9 @@ groups:
transformation:
strategy: hash
algorithm: sha256
salt: "loyalty"
# salt omitted on purpose — engine prepends a per-run random salt
# so joins stay valid within the run but the values are unrelatable
# to any other run / snapshot.
```

---
Expand Down
Loading