Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 62 additions & 10 deletions lib/doc/pivot-table.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,14 @@ function makePivotTable(worksheet, model) {
let {rows, columns, values, pages = []} = model;
const {metric, pageDefaults = {}} = model;

// Generate sharedItems for ALL fields in the source, not just the ones used by this pivot table
// This ensures Excel can properly display any field configuration
// Generate sharedItems only for fields used as pivot axes (rows, columns, pages).
// Value fields and unused fields use lightweight sharedItems (no enumeration) — Excel
// flags pivot caches that materialize sharedItems for unused/long-text fields with a
// "Repaired Records: PivotTable report from /xl/pivotCache/pivotCacheDefinition*.xml"
// warning on open (see protobi/exceljs#42).
const allHeaderNames = sourceSheet.getRow(1).values.slice(1);
const cacheFields = makeCacheFields(sourceSheet, allHeaderNames);
const axialFieldNames = [...rows, ...columns, ...pages];
const cacheFields = makeCacheFields(sourceSheet, axialFieldNames, allHeaderNames);

// let {rows, columns, values, pages} use indices instead of names;
// names can then be accessed via `pivotTable.cacheFields[index].name`.
Expand Down Expand Up @@ -124,23 +128,23 @@ function validate(worksheet, model) {
const allFields = [...model.rows, ...model.columns, ...model.values];
for (const pageField of pages) {
if (allFields.includes(pageField)) {
throw new Error(
`Page field "${pageField}" cannot also be used as a row, column, or value field.`,
);
const msg = `Page field "${pageField}" cannot also be used as a row, column, or value field.`;
throw new Error(msg);
}
}

// Validate pageDefaults reference valid page fields and values
if (model.pageDefaults) {
for (const [fieldName, defaultValue] of Object.entries(model.pageDefaults)) {
for (const fieldName of Object.keys(model.pageDefaults)) {
if (!pages.includes(fieldName)) {
throw new Error(`pageDefaults field "${fieldName}" is not in the pages array.`);
}
}
}
}

function makeCacheFields(worksheet, fieldNamesWithSharedItems) {
function makeCacheFields(worksheet, fieldNamesWithSharedItems, allFieldNames /* reserved */) {
// `allFieldNames` is reserved for future use; iteration uses worksheet.getRow(1)
// Cache fields are used in pivot tables to reference source data.
//
// Example
Expand Down Expand Up @@ -193,12 +197,60 @@ function makeCacheFields(worksheet, fieldNamesWithSharedItems) {
return toSortedArray(uniqueValues);
};

// Inspect a column's body values to derive lightweight sharedItems hints
// (containsBlank / containsString / containsNumber / longText) WITHOUT enumerating
// every unique value. Used for fields that aren't pivot axes — Excel can rebuild
// the index on refresh from the inline values written into pivotCacheRecords.
const inspect = columnIndex => {
const columnValues = worksheet.getColumn(columnIndex).values.slice(2);
const hints = {
containsBlank: false,
containsString: false,
containsNumber: false,
containsInteger: true, // assume true until proven false
hasNumber: false,
longText: false,
};
for (const value of columnValues) {
if (value === null || value === undefined) {
hints.containsBlank = true;
continue;
}
if (typeof value === 'number') {
hints.containsNumber = true;
hints.hasNumber = true;
if (!Number.isInteger(value)) hints.containsInteger = false;
} else if (typeof value === 'string') {
hints.containsString = true;
if (value.length > 255) hints.longText = true;
} else {
// fallback for dates/booleans/etc — treat as string-ish
hints.containsString = true;
}
}
if (!hints.hasNumber) hints.containsInteger = false;
return hints;
};

// make result
const result = [];
for (const columnIndex of range(1, names.length)) {
const name = names[columnIndex];
const sharedItems = nameToHasSharedItems[name] ? aggregate(columnIndex) : null;
result.push({name, sharedItems});
let sharedItems;
if (nameToHasSharedItems[name]) {
// Axial field — needs full enumeration so pivotCacheRecords can reference
// values via <x v="N"/> indices.
sharedItems = aggregate(columnIndex);
} else {
// Non-axial field — emit lightweight sharedItems (type hints only).
// pivotCacheRecords will inline values rather than reference indices.
sharedItems = null;
}
const cacheField = {name, sharedItems};
if (sharedItems === null) {
cacheField.hints = inspect(columnIndex);
}
result.push(cacheField);
}
return result;
}
Expand Down
52 changes: 40 additions & 12 deletions lib/xlsx/xform/pivot-table/cache-field.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
class CacheField {
constructor({name, sharedItems}) {
// string type
constructor({name, sharedItems, hints}) {
// string type (axial field — used as pivot row/column/page)
//
// {
// 'name': 'A',
Expand All @@ -9,14 +9,22 @@ class CacheField {
//
// or
//
// integer type
// non-axial field (value field or unused) — sharedItems is null and `hints`
// describes the column's value types so we can emit a lightweight
// <sharedItems .../> declaration without enumerating every unique value.
// This avoids triggering Excel's "Repaired Records: PivotTable report"
// warning on pivot caches with high-cardinality / long-text non-axial
// fields (see protobi/exceljs#42).
//
// {
// 'name': 'D',
// 'sharedItems': null
// 'sharedItems': null,
// 'hints': { containsBlank, containsString, containsNumber,
// containsInteger, hasNumber, longText }
// }
this.name = name;
this.sharedItems = sharedItems;
this.hints = hints || null;
}

// Helper function to escape XML special characters
Expand All @@ -34,20 +42,40 @@ class CacheField {
// PivotCache Field: http://www.datypic.com/sc/ooxml/e-ssml_cacheField-1.html
// Shared Items: http://www.datypic.com/sc/ooxml/e-ssml_sharedItems-1.html

// integer types
// Non-axial field — emit lightweight <sharedItems> based on hints from
// column inspection. pivotCacheRecords inlines values for these fields
// (see pivot-cache-records-xform.js + lib/doc/pivot-table.js).
if (this.sharedItems === null) {
// TK(2023-07-18): left out attributes... minValue="5" maxValue="45"
const hints = this.hints || {};
const attrs = [];
// Pure-numeric column: declare it explicitly so Excel doesn't infer mixed.
if (hints.containsNumber && !hints.containsString) {
attrs.push('containsSemiMixedTypes="0"');
attrs.push('containsString="0"');
attrs.push('containsNumber="1"');
if (hints.containsInteger) attrs.push('containsInteger="1"');
if (hints.containsBlank) attrs.push('containsBlank="1"');
} else {
// Pure-string or mixed column. Excel's parser is strict about which
// attributes appear together; the safest minimal form is just the
// hints that matter — no count, no enumerated items.
if (hints.containsBlank) attrs.push('containsBlank="1"');
if (hints.longText) attrs.push('longText="1"');
// Mixed string+number: omit containsString/containsNumber to let Excel
// infer from the inline records; declaring both can disagree with the
// record contents and trigger repair.
}
const attrStr = attrs.length ? ` ${attrs.join(' ')}` : '';
return `<cacheField name="${this.escapeXml(this.name)}" numFmtId="0">
<sharedItems containsSemiMixedTypes="0" containsString="0" containsNumber="1" containsInteger="1" />
<sharedItems${attrStr} />
</cacheField>`;
}

// Check if all items are numeric
const allNumeric = this.sharedItems.every(
item =>
typeof item === 'number' ||
(typeof item === 'string' && !isNaN(item) && item.trim() !== ''),
);
const isNumericItem = item =>
typeof item === 'number' ||
(typeof item === 'string' && !Number.isNaN(Number(item)) && item.trim() !== '');
const allNumeric = this.sharedItems.every(isNumericItem);

if (allNumeric) {
// numeric types - use <n> tags
Expand Down
14 changes: 12 additions & 2 deletions lib/xlsx/xform/pivot-table/pivot-cache-definition-xform.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ class PivotCacheDefinitionXform extends BaseXform {
render(xmlStream, model) {
const {sourceSheet, cacheFields} = model;

// recordCount must match the count attribute of <pivotCacheRecords> — the
// number of data rows in the source sheet (header excluded). Excel uses
// this to validate the cache; a mismatch is one of the triggers for the
// "Repaired Records: PivotTable report from /xl/pivotCache/
// pivotCacheDefinition*.xml" warning (see protobi/exceljs#42).
const recordCount = Math.max(0, sourceSheet.getSheetValues().length - 2);

xmlStream.openXml(XmlStream.StdDocAttributes);
xmlStream.openNode(this.tag, {
...PivotCacheDefinitionXform.PIVOT_CACHE_DEFINITION_ATTRIBUTES,
Expand All @@ -31,7 +38,7 @@ class PivotCacheDefinitionXform extends BaseXform {
createdVersion: '8',
refreshedVersion: '8',
minRefreshableVersion: '3',
recordCount: cacheFields.length + 1,
recordCount,
});

xmlStream.openNode('cacheSource', {type: 'worksheet'});
Expand All @@ -43,7 +50,10 @@ class PivotCacheDefinitionXform extends BaseXform {

xmlStream.openNode('cacheFields', {count: cacheFields.length});
// Note: keeping this pretty-printed for now to ease debugging.
xmlStream.writeXml(cacheFields.map(cacheField => new CacheField(cacheField).render()).join('\n '));
const cacheFieldsXml = cacheFields
.map(cacheField => new CacheField(cacheField).render())
.join('\n ');
xmlStream.writeXml(cacheFieldsXml);
xmlStream.closeNode();

xmlStream.closeNode();
Expand Down
162 changes: 162 additions & 0 deletions spec/integration/issues/issue-42-pivot-cache-repair.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Regression test for protobi/exceljs#42:
// "[BUG] Pivot cache triggers Excel repair"
//
// When a workbook contains a pivot table whose source sheet has fields that
// are NOT used as pivot axes (rows/columns/pages) and contain high-cardinality
// or long-text values (e.g. a "Description" column), Excel opens the file with:
//
// "Repaired Records: PivotTable report from /xl/pivotCache/
// pivotCacheDefinition1.xml part (PivotTable cache)"
//
// Root cause: pivotCacheDefinition fully materialized <sharedItems> for every
// source column, including non-axial fields. Excel's strict cache validator
// flags this — and it's also wasteful (1500-row long-text columns produced
// ~900KB cache definitions that were never indexed by anything).
//
// Fix: only enumerate sharedItems for fields actually used as pivot axes.
// Non-axial fields get a lightweight <sharedItems .../> declaration with type
// hints; pivotCacheRecords inlines their values (<n>/<s>/<m>) rather than
// referencing them by index. recordCount on the cache definition is also
// corrected to match the data row count (was previously cacheFields.length+1).

const fs = require('fs');
const {promisify} = require('util');

const fsReadFileAsync = promisify(fs.readFile);

const JSZip = require('jszip');

const ExcelJS = verquire('exceljs');

const TEST_XLSX_FILEPATH = './spec/out/wb.issue-42.xlsx';

function findCacheField(xml, name) {
const re = new RegExp(`<cacheField name="${name}"[^>]*>[\\s\\S]*?</cacheField>`);
return xml.match(re);
}

describe('Workbook', () => {
describe('Pivot Tables (issue #42 — Excel repair on cache definition)', () => {
let zip;
let cacheDefXml;
let cacheRecordsXml;
const ROW_COUNT = 200;

before(async () => {
const workbook = new ExcelJS.Workbook();
const dataSheet = workbook.addWorksheet('Data');
dataSheet.columns = [
{header: 'Repo', key: 'repo', width: 20},
{header: 'Group', key: 'group', width: 20},
{header: 'Severity', key: 'severity', width: 20},
// Description: long-text, high-cardinality, NOT used as a pivot axis.
// This is the field that triggers the bug.
{header: 'Description', key: 'description', width: 80},
{header: 'Counter', key: 'count', width: 10},
];

for (let i = 0; i < ROW_COUNT; i++) {
dataSheet.addRow({
repo: `repo-${i % 10}`,
group: `group-${i % 20}`,
severity: ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW', 'INFO'][i % 5],
description: `long-description-text-${i}-${'x'.repeat(300)}`,
count: 1,
});
}

const pivotSheet = workbook.addWorksheet('Summary');
pivotSheet.addPivotTable({
sourceSheet: dataSheet,
rows: ['Repo', 'Group'],
columns: ['Severity'],
values: ['Counter'],
metric: 'sum',
});

await workbook.xlsx.writeFile(TEST_XLSX_FILEPATH);
const buffer = await fsReadFileAsync(TEST_XLSX_FILEPATH);
zip = await JSZip.loadAsync(buffer);
cacheDefXml = await zip.file('xl/pivotCache/pivotCacheDefinition1.xml').async('string');
cacheRecordsXml = await zip.file('xl/pivotCache/pivotCacheRecords1.xml').async('string');
});

it('emits a pivot cache definition', () => {
expect(cacheDefXml).to.be.a('string');
expect(cacheDefXml).to.include('<pivotCacheDefinition');
});

it('declares recordCount equal to the source data row count', () => {
// Was previously `cacheFields.length + 1` (= 6 for this fixture);
// Excel expects it to match the <pivotCacheRecords count="..."> attribute,
// which is the number of data rows.
const match = cacheDefXml.match(/recordCount="(\d+)"/);
expect(match).to.not.be.null();
expect(Number(match[1])).to.equal(ROW_COUNT);

const recordsCountMatch = cacheRecordsXml.match(/<pivotCacheRecords[^>]*count="(\d+)"/);
expect(recordsCountMatch).to.not.be.null();
expect(Number(recordsCountMatch[1])).to.equal(ROW_COUNT);
});

it('does NOT enumerate sharedItems for the non-axial Description field', () => {
// The Description cacheField should appear with a lightweight
// <sharedItems ... /> (self-closing or with attributes only) — it must
// NOT contain <s v="long-description-..."> children.
const descMatch = findCacheField(cacheDefXml, 'Description');
expect(descMatch, 'Description cacheField missing').to.not.be.null();
const descBlock = descMatch[0];
expect(descBlock).to.not.include('<s v="long-description-text-');
// Should be a self-closing sharedItems element.
expect(descBlock).to.match(/<sharedItems[^>]*\/>/);
});

it('does NOT enumerate sharedItems for the non-axial Counter field', () => {
const counterMatch = findCacheField(cacheDefXml, 'Counter');
expect(counterMatch).to.not.be.null();
const counterBlock = counterMatch[0];
expect(counterBlock).to.not.match(/<n v="\d+" \/>/);
expect(counterBlock).to.match(/<sharedItems[^>]*\/>/);
});

it('keeps full sharedItems enumeration for axial Repo/Group/Severity fields', () => {
for (const fieldName of ['Repo', 'Group', 'Severity']) {
const block = findCacheField(cacheDefXml, fieldName);
expect(block, `${fieldName} cacheField missing`).to.not.be.null();
// Axial fields keep full <sharedItems count="N"> ... </sharedItems>
expect(block[0]).to.include('<sharedItems');
expect(block[0]).to.include('</sharedItems>');
expect(block[0]).to.match(/<s v="/);
}
});

it('inlines values (not <x v="N"/> indices) in pivotCacheRecords for non-axial fields', () => {
// Each row should have 5 cells: <x>(Repo) <x>(Group) <x>(Severity) <s>(Desc) <n>(Counter)
// Pull the first <r>...</r> record and verify shape.
const firstRecord = cacheRecordsXml.match(/<r>([\s\S]*?)<\/r>/);
expect(firstRecord, 'no <r> record found').to.not.be.null();
const body = firstRecord[1];
// Three <x> indices for the three axial fields:
const xCount = (body.match(/<x v="\d+"/g) || []).length;
expect(xCount).to.equal(3);
// One inline <s> for Description:
expect(body).to.match(/<s v="long-description-text-/);
// One inline <n> for Counter:
expect(body).to.match(/<n v="1"/);
});

it('produces a cache definition substantially smaller than the bug version', () => {
// Pre-fix, this fixture (200 rows × ~325-char descriptions) produced a
// ~75 KB pivotCacheDefinition. Post-fix it should be a few KB at most,
// because the giant Description sharedItems block is gone.
expect(cacheDefXml.length).to.be.lessThan(20000);
});

it('round-trip-loads the resulting workbook without throwing', async () => {
const reloaded = new ExcelJS.Workbook();
await reloaded.xlsx.readFile(TEST_XLSX_FILEPATH);
expect(reloaded.worksheets.map(ws => ws.name)).to.include('Data');
expect(reloaded.worksheets.map(ws => ws.name)).to.include('Summary');
});
});
});