Skip to content

Commit 03202e3

Browse files
authored
Throw on iconv() failure and default fputcsv escape to empty. (#149)
Three related changes to the row writer in `_generateRow()` / `_transcode()`: 1. iconv() returning false (e.g. unsupported target encoding, malformed source bytes) previously got concatenated into the accumulator where `(string)false === ''` silently produced an empty row. Now throws a CakeException in `strict` mode. 2. PHP 8.4 deprecates passing a non-empty `$escape` to fputcsv(). The shipped default was the legacy backslash, which triggers E_DEPRECATED on every render under 8.4+. Default is now `''` (RFC 4180-compliant: quotes inside a field are escaped by doubling them rather than by a preceding backslash). Users on legacy PHP-style escaping can still set `escape => '\\'`. 3. Per review: add a configurable `transcodingMode` option so apps that cannot fix bad input can opt out of the throw. Three class-constant values: - `TRANSCODING_MODE_STRICT` (default): throws on iconv failure. - `TRANSCODING_MODE_IGNORE`: passes `//IGNORE` to iconv so unconvertible characters are dropped and the rest of the row is preserved. For mbstring, `mb_substitute_character('none')` is set for the duration of the call. - `TRANSCODING_MODE_TRANSLITERATE`: passes `//TRANSLIT//IGNORE` so accented Latin etc. transliterate to ASCII; mbstring falls back to ignore (no transliteration support). The transcoding logic moves into a new protected `_transcode()` hook so apps can override the policy per-view. The iconv warning emitted right before a strict-mode failure is consumed by a scoped `set_error_handler()` so the user sees the clear CakeException rather than two near-duplicate signals; this also keeps PHPUnit's failOnWarning bar happy. Tests: iconv strict throw, ignore drops unconvertible chars, transliterate converts accented Latin, and fputcsv default emits RFC 4180 escaping with no E_DEPRECATED.
1 parent 7b69f8a commit 03202e3

2 files changed

Lines changed: 233 additions & 8 deletions

File tree

src/View/CsvView.php

Lines changed: 110 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,30 @@ class CsvView extends SerializedView
105105
*/
106106
public const EXTENSION_MBSTRING = 'mbstring';
107107

108+
/**
109+
* Transcoding mode: throw on any unconvertible byte / character (default).
110+
*
111+
* @var string
112+
*/
113+
public const TRANSCODING_MODE_STRICT = 'strict';
114+
115+
/**
116+
* Transcoding mode: silently drop unconvertible characters and keep going.
117+
* Maps to iconv's `//IGNORE` suffix and mbstring's substitute-char `'none'`.
118+
*
119+
* @var string
120+
*/
121+
public const TRANSCODING_MODE_IGNORE = 'ignore';
122+
123+
/**
124+
* Transcoding mode: transliterate where possible, ignore otherwise.
125+
* Maps to iconv's `//TRANSLIT//IGNORE` suffix. For mbstring this falls
126+
* back to ignore (mbstring has no transliteration).
127+
*
128+
* @var string
129+
*/
130+
public const TRANSCODING_MODE_TRANSLITERATE = 'transliterate';
131+
108132
/**
109133
* List of bom signs for encodings.
110134
*
@@ -137,7 +161,10 @@ class CsvView extends SerializedView
137161
* - 'delimiter': (default ',') CSV Delimiter, defaults to comma
138162
* - 'enclosure': (default '"') CSV Enclosure for use with fputcsv()
139163
* - 'newline': (default '\n') CSV Newline replacement for use with fputcsv()
140-
* - 'escape': (default '\\') CSV escape character for use with fputcsv()
164+
* - 'escape': (default '') CSV escape character for use with fputcsv().
165+
* Empty string is RFC 4180 compliant and avoids PHP 8.4's
166+
* deprecation warning for non-empty escape values. Set to '\\' for
167+
* legacy PHP-style escaping (will emit E_DEPRECATED on PHP 8.4+).
141168
* - 'eol': (default '\n') End-of-line character the csv
142169
* - 'bom': (default false) Adds BOM (byte order mark) header
143170
* - 'setSeparator': (default false) Adds sep=[_delimiter] in the first line
@@ -148,6 +175,12 @@ class CsvView extends SerializedView
148175
* When true, sets `bom => true`, `eol => "\r\n"`, and `csvEncoding => 'UTF-8'`.
149176
* These specific keys are forced; if you need a different combination
150177
* do not enable `excel` and set them individually instead.
178+
* - 'transcodingMode': (default 'strict') How to handle source bytes that
179+
* cannot be encoded in the target encoding. One of:
180+
* - 'strict': throw a CakeException naming the source/target encoding.
181+
* - 'ignore': silently drop unconvertible characters and continue.
182+
* - 'transliterate': transliterate where possible (e.g. é → e), ignore
183+
* otherwise. For iconv only; mbstring falls back to 'ignore'.
151184
*
152185
* @var array<string, mixed>
153186
*/
@@ -159,7 +192,7 @@ class CsvView extends SerializedView
159192
'delimiter' => ',',
160193
'enclosure' => '"',
161194
'newline' => "\n",
162-
'escape' => '\\',
195+
'escape' => '',
163196
'eol' => PHP_EOL,
164197
'null' => '',
165198
'bom' => false,
@@ -168,6 +201,7 @@ class CsvView extends SerializedView
168201
'dataEncoding' => 'UTF-8',
169202
'transcodingExtension' => self::EXTENSION_ICONV,
170203
'excel' => false,
204+
'transcodingMode' => self::TRANSCODING_MODE_STRICT,
171205
];
172206

173207
/**
@@ -431,12 +465,7 @@ protected function _generateRow(?array $row = null): string|false
431465
$dataEncoding = $this->getConfig('dataEncoding');
432466
$csvEncoding = $this->getConfig('csvEncoding');
433467
if ($dataEncoding !== $csvEncoding) {
434-
$extension = $this->getConfig('transcodingExtension');
435-
if ($extension === static::EXTENSION_ICONV) {
436-
$csv = iconv($dataEncoding, $csvEncoding, $csv);
437-
} elseif ($extension === static::EXTENSION_MBSTRING) {
438-
$csv = mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
439-
}
468+
$csv = $this->_transcode($csv, $dataEncoding, $csvEncoding);
440469
}
441470

442471
// BOM must be added after encoding
@@ -461,4 +490,77 @@ protected function getBom(string $csvEncoding): string
461490

462491
return $this->bomMap[$csvEncoding] ?? '';
463492
}
493+
494+
/**
495+
* Transcode a row's worth of CSV between encodings, honoring the
496+
* configured `transcodingMode` (strict / ignore / transliterate).
497+
*
498+
* @param string $csv The current CSV chunk.
499+
* @param string $dataEncoding Source encoding.
500+
* @param string $csvEncoding Target encoding.
501+
* @return string Transcoded CSV chunk.
502+
* @throws \Cake\Core\Exception\CakeException When mode is `strict` and the
503+
* transcoder reports a conversion failure.
504+
*/
505+
protected function _transcode(string $csv, string $dataEncoding, string $csvEncoding): string
506+
{
507+
$extension = $this->getConfig('transcodingExtension');
508+
$mode = $this->getConfig('transcodingMode');
509+
510+
if ($extension === static::EXTENSION_ICONV) {
511+
$targetSpec = match ($mode) {
512+
static::TRANSCODING_MODE_IGNORE => $csvEncoding . '//IGNORE',
513+
static::TRANSCODING_MODE_TRANSLITERATE => $csvEncoding . '//TRANSLIT//IGNORE',
514+
default => $csvEncoding,
515+
};
516+
// iconv() emits an E_NOTICE / E_WARNING immediately before returning
517+
// false on unconvertible input. Install a no-op handler for the
518+
// duration of the call so we surface the failure via our own
519+
// (strict-mode) exception below rather than as two near-duplicate
520+
// signals. PHPUnit's own error handler is restored on `finally`.
521+
set_error_handler(static fn(): bool => true, E_NOTICE | E_WARNING);
522+
try {
523+
$converted = iconv($dataEncoding, $targetSpec, $csv);
524+
} finally {
525+
restore_error_handler();
526+
}
527+
if ($converted === false) {
528+
if ($mode === static::TRANSCODING_MODE_STRICT) {
529+
throw new CakeException(sprintf(
530+
'iconv() failed to transcode row from `%s` to `%s`. '
531+
. 'Check that the source data is valid `%s` and that both '
532+
. 'encodings are supported by your iconv build, or set '
533+
. '`transcodingMode` to `ignore` or `transliterate` to '
534+
. 'tolerate unconvertible characters.',
535+
$dataEncoding,
536+
$csvEncoding,
537+
$dataEncoding,
538+
));
539+
}
540+
541+
return '';
542+
}
543+
544+
return $converted;
545+
}
546+
547+
if ($extension === static::EXTENSION_MBSTRING) {
548+
$previousSubstitute = null;
549+
if ($mode !== static::TRANSCODING_MODE_STRICT) {
550+
$previousSubstitute = mb_substitute_character();
551+
mb_substitute_character('none');
552+
}
553+
try {
554+
$converted = mb_convert_encoding($csv, $csvEncoding, $dataEncoding);
555+
} finally {
556+
if ($previousSubstitute !== null) {
557+
mb_substitute_character($previousSubstitute);
558+
}
559+
}
560+
561+
return $converted;
562+
}
563+
564+
return $csv;
565+
}
464566
}

tests/TestCase/View/CsvViewTest.php

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,4 +641,127 @@ public function testExcelPresetOverridesIndividualKeys()
641641
$this->assertStringStartsWith($bom, $output);
642642
$this->assertStringEndsWith("\r\n", $output);
643643
}
644+
645+
/**
646+
* The default `escape` value is `''` (RFC 4180 compliant) to avoid
647+
* PHP 8.4's deprecation warning for any non-empty escape passed to
648+
* `fputcsv()`. Rendering a row with a quote in it must produce
649+
* doubled-quote escaping rather than legacy backslash escaping, and
650+
* must not raise E_DEPRECATED.
651+
*
652+
* @return void
653+
*/
654+
public function testDefaultEscapeIsRfc4180()
655+
{
656+
$deprecations = [];
657+
set_error_handler(function ($severity, $message) use (&$deprecations) {
658+
$deprecations[] = $message;
659+
}, E_DEPRECATED | E_USER_DEPRECATED);
660+
661+
try {
662+
$data = [['contains "quote"']];
663+
$this->view->set(['data' => $data])
664+
->setConfig(['serialize' => 'data']);
665+
$output = $this->view->render();
666+
} finally {
667+
restore_error_handler();
668+
}
669+
670+
// RFC 4180: quote is escaped by doubling, not by backslash.
671+
$this->assertSame('"contains ""quote"""' . PHP_EOL, $output);
672+
$this->assertSame(
673+
[],
674+
$deprecations,
675+
'fputcsv() raised an unexpected deprecation: ' . implode(', ', $deprecations),
676+
);
677+
}
678+
679+
public function testIconvFailureThrows()
680+
{
681+
if (!extension_loaded('iconv')) {
682+
$this->markTestSkipped('The iconv extension is not available.');
683+
}
684+
685+
$data = [['hello']];
686+
$this->view->set(['data' => $data])
687+
->setConfig([
688+
'serialize' => 'data',
689+
'dataEncoding' => 'UTF-8',
690+
// Bogus target encoding name. iconv returns false for this.
691+
'csvEncoding' => 'NOT-A-REAL-ENCODING',
692+
'transcodingExtension' => CsvView::EXTENSION_ICONV,
693+
]);
694+
695+
try {
696+
$this->view->render();
697+
$this->fail('Expected exception for iconv() returning false.');
698+
} catch (Exception $e) {
699+
$previous = $e->getPrevious() ?? $e;
700+
$this->assertInstanceOf(CakeException::class, $previous);
701+
$this->assertStringContainsString(
702+
'iconv() failed to transcode',
703+
$previous->getMessage(),
704+
);
705+
}
706+
}
707+
708+
/**
709+
* `transcodingMode => 'ignore'` must keep generating the CSV when iconv
710+
* cannot convert a character: the unconvertible character is dropped and
711+
* the rest of the row is preserved instead of throwing.
712+
*
713+
* @return void
714+
*/
715+
public function testIconvIgnoreModeDropsUnconvertibleChars()
716+
{
717+
if (!extension_loaded('iconv')) {
718+
$this->markTestSkipped('The iconv extension is not available.');
719+
}
720+
721+
// `あ` cannot be represented in ASCII; in ignore mode it is dropped.
722+
$data = [['hello あ world']];
723+
$this->view->set(['data' => $data])
724+
->setConfig([
725+
'serialize' => 'data',
726+
'dataEncoding' => 'UTF-8',
727+
'csvEncoding' => 'ASCII',
728+
'transcodingExtension' => CsvView::EXTENSION_ICONV,
729+
'transcodingMode' => CsvView::TRANSCODING_MODE_IGNORE,
730+
]);
731+
732+
$output = $this->view->render();
733+
$this->assertStringContainsString('hello ', $output);
734+
$this->assertStringContainsString(' world', $output);
735+
$this->assertStringNotContainsString('', $output);
736+
}
737+
738+
/**
739+
* `transcodingMode => 'transliterate'` must convert what it can (e.g.
740+
* accented Latin → ASCII equivalents).
741+
*
742+
* @return void
743+
*/
744+
public function testIconvTransliterateModeConvertsAccentedChars()
745+
{
746+
if (!extension_loaded('iconv')) {
747+
$this->markTestSkipped('The iconv extension is not available.');
748+
}
749+
750+
$data = [['café Möhre']];
751+
$this->view->set(['data' => $data])
752+
->setConfig([
753+
'serialize' => 'data',
754+
'dataEncoding' => 'UTF-8',
755+
'csvEncoding' => 'ASCII',
756+
'transcodingExtension' => CsvView::EXTENSION_ICONV,
757+
'transcodingMode' => CsvView::TRANSCODING_MODE_TRANSLITERATE,
758+
]);
759+
760+
$output = $this->view->render();
761+
// iconv//TRANSLIT typically produces `cafe` and `Mohre`; the exact
762+
// output varies by libiconv build, but neither é nor ö should
763+
// survive.
764+
$this->assertStringNotContainsString('é', $output);
765+
$this->assertStringNotContainsString('ö', $output);
766+
}
644767
}

0 commit comments

Comments
 (0)