Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
<PackageVersion Include="Xunit.SkippableFact" Version="1.5.61" />
<PackageVersion Include="XunitXml.TestLogger" Version="8.0.0" />
<PackageVersion Include="FluentAssertions" Version="7.2.1" /> <!-- Latest 7.x version under Apache license. -->
<PackageVersion Include="System.Text.Encoding.CodePages" Version="10.0.5" />
<!-- Other packages -->
<PackageVersion Include="System.Resources.Extensions" Version="10.0.5" />
<!-- Needed for PDFsharp-GDI. -->
Expand Down
21 changes: 15 additions & 6 deletions src/foundation/src/PDFsharp/src/PdfSharp/Pdf.Content/CLexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -234,20 +234,28 @@ static char LogError(char ch)
}

var name = Token;
// Check token for UTF-8 encoding.
// Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
for (int idx = 0; idx < name.Length; idx++)
{
// If the two top most significant bits are set this identifies a 2, 3, or 4
// byte UTF-8 encoding sequence.
if ((name[idx] & 0xC0) == 0xC0)
if ((name[idx] & 0x80) != 0)
{
// Special characters in Name objects use UTF-8 encoding.
// Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
var length = name.Length;
var bytes = new byte[length];
for (int idx2 = 0; idx2 < length; idx2++)
bytes[idx2] = (byte)name[idx2];

var decodedName = Encoding.UTF8.GetString(bytes);
string decodedName;
try
{
// Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
decodedName = StrictUtf8.GetString(bytes);
}
catch (DecoderFallbackException)
{
// Fallback to ANSI code page encoding if UTF-8 decoding fails.
decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
}
_token.Clear();
_token.Append(decodedName);
break;
Expand Down Expand Up @@ -498,6 +506,7 @@ public CSymbol ScanNumber()
return Symbol = CSymbol.Real; // CLexer returns "Real" because there is no "LongInteger".
}

/// <summary>
/// UTF-8 encoding that emits no byte order mark and throws on invalid byte sequences
/// instead of silently substituting replacement characters.
/// </summary>
static readonly UTF8Encoding StrictUtf8 = new(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];

/// <summary>
Expand Down
21 changes: 15 additions & 6 deletions src/foundation/src/PDFsharp/src/PdfSharp/Pdf.IO/Lexer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -247,20 +247,28 @@ static char LogError(char ch)
}

var name = Token;
// Check for UTF-8 encoding.
// Check for non-ASCII bytes that may indicate a multi-byte or legacy ANSI encoding.
for (int idx = 0; idx < name.Length; idx++)
{
// If the two top most significant bits are set this identifies a 2, 3, or 4
// byte UTF-8 encoding sequence.
if ((name[idx] & 0xC0) == 0xC0)
if ((name[idx] & 0x80) != 0)
{
// Special characters in Name objects use UTF-8 encoding.
// Special characters in Name objects may use UTF-8 or a legacy ANSI encoding.
var length = name.Length;
var bytes = new byte[length];
for (int idx2 = 0; idx2 < length; idx2++)
bytes[idx2] = (byte)name[idx2];

var decodedName = Encoding.UTF8.GetString(bytes);
string decodedName;
try
{
// Try strict UTF-8 first; throws DecoderFallbackException on invalid sequences.
decodedName = StrictUtf8.GetString(bytes);
}
catch (DecoderFallbackException)
{
// Fallback to ANSI code page encoding if UTF-8 decoding fails.
decodedName = PdfEncoders.AnsiCodepageEncoding.GetString( bytes );
}
_token.Clear();
_token.Append(decodedName);
break;
Expand Down Expand Up @@ -558,6 +566,7 @@ int TryReadReference()
}
}

/// <summary>
/// UTF-8 encoding that emits no byte order mark and throws on invalid byte sequences
/// instead of silently substituting replacement characters.
/// </summary>
static readonly UTF8Encoding StrictUtf8 = new(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
static readonly double[] PowersOf10 = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, 100_000_000, 1_000_000_000, 10_000_000_000];

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,36 @@ namespace PdfSharp.Pdf.Internal
/// </summary>
public static class PdfEncoders
{
/// <summary>
/// Gets the PDFsharp specific encoder RawEncoding.
/// Raw encoding allows working with strings instead of byte arrays.
/// A raw encoded string is equivalent to a byte array of the same length
/// where each string character represents one byte.
/// Therefore, each character of a raw string has a value less than 256.
/// </summary>
public static Encoding RawEncoding => _rawEncoding ??= new RawEncoding();
/// <summary>
/// Static initializer. Calls RegisterCodePages unless the assembly was compiled for .NET Framework,
/// or a netstandard build detects at run time that it is hosted on .NET Framework.
/// </summary>
static PdfEncoders()
{
#if !NETFRAMEWORK
#if NETSTANDARD
// A netstandard2.0 build can also run on .NET Framework, so the host runtime is detected at run time.
// .NET Framework does not need CodePagesEncodingProvider, and the component may be absent there, so the call itself is avoided.
if (System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription.StartsWith(".NET Framework", StringComparison.OrdinalIgnoreCase))
return;
#endif
RegisterCodePages();
#endif
}
#if !NETFRAMEWORK
/// <summary>
/// Registers <see cref="CodePagesEncodingProvider"/> so that legacy encodings like Shift-JIS (CP932),
/// GBK (CP936), Big5 (CP950), EUC-KR (CP949), etc. can be obtained through
/// <see cref="Encoding.GetEncoding(int)"/>.
/// </summary>
/// <remarks>
/// This is the only member that references <see cref="CodePagesEncodingProvider"/>. It is marked
/// <c>NoInlining</c> so the JIT cannot fold it into the static constructor; the reference is then
/// resolved only when this method is actually invoked, which the static constructor avoids when it
/// detects .NET Framework at run time.
/// </remarks>
[System.Runtime.CompilerServices.MethodImpl(System.Runtime.CompilerServices.MethodImplOptions.NoInlining)]
private static void RegisterCodePages()
{
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
}
#endif

/// <summary>
/// Gets the PDFsharp specific encoder RawEncoding.
/// Raw encoding allows working with strings instead of byte arrays.
/// A raw encoded string is equivalent to a byte array of the same length
/// where each string character represents one byte.
/// Therefore, each character of a raw string has a value less than 256.
/// </summary>
public static Encoding RawEncoding
{
    get
    {
        // Created on first access and cached in the shared backing field.
        _rawEncoding ??= new RawEncoding();
        return _rawEncoding;
    }
}
static Encoding? _rawEncoding;

internal static Encoding ByteStringEncoding => _rawEncoding ??= new RawEncoding(); // new name??
Expand Down Expand Up @@ -64,6 +86,20 @@ public static Encoding WinAnsiEncoding
public static Encoding UnicodeEncoding => _unicodeEncoding ??= Encoding.Unicode;
static Encoding? _unicodeEncoding;

/// <summary>
/// Gets the encoding for the ANSI code page reported by the current culture.
/// </summary>
/// <remarks>
/// The code page is read from <c>CultureInfo.CurrentCulture.TextInfo.ANSICodePage</c> on every access,
/// so the result follows the culture of the calling thread. On .NET 5 or later a code page of 0
/// yields <c>Encoding.Latin1</c>; on all other targets the value is passed to
/// <see cref="Encoding.GetEncoding(int)"/> unchanged.
/// </remarks>
public static Encoding AnsiCodepageEncoding
{
    get
    {
        int ansiCodePage = CultureInfo.CurrentCulture.TextInfo.ANSICodePage;
#if NET5_0_OR_GREATER
        // The culture reports no ANSI code page.
        if (ansiCodePage == 0)
            return Encoding.Latin1;
#endif
        return Encoding.GetEncoding(ansiCodePage);
    }
}

///// <summary>
///// Encodes a string from a byte array. Each character gets the code of the corresponding byte.
///// </summary>
Expand Down
4 changes: 4 additions & 0 deletions src/foundation/src/PDFsharp/src/PdfSharp/PdfSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@
<InternalsVisibleTo Include="PdfSharp.Graphics.Pdf, PublicKey=$(PDFsharpStronNamePublicKey)" />
</ItemGroup>

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="System.Text.Encoding.CodePages" />
</ItemGroup>

<ItemGroup>
<Compile Include="..\..\..\shared\src\PdfSharp.Shared\dotnet\GetSubArray.cs" Link="Properties\GetSubArray(included).cs" />
<Compile Include="..\..\..\shared\src\PdfSharp.System\Properties\FloatOrDouble.cs" Link="Properties\FloatOrDouble.cs" />
Expand Down
65 changes: 65 additions & 0 deletions src/foundation/src/PDFsharp/tests/PdfSharp.Tests/IO/LexerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,71 @@ public void Scan_ObjRef_Tests(string text, (int, int) objID/*, bool testAsLong,

}

[Fact]
public void ScanName_UTF8_encoded_name_is_decoded_correctly()
{
    // Input is '/' + the UTF-8 bytes of "日本語" (E6 97 A5 E6 9C AC E8 AA 9E) + a terminating space.
    byte[] input = [(byte)'/', .. System.Text.Encoding.UTF8.GetBytes("日本語"), (byte)' '];

    var lexer = CreateLexerFromBytes(input);
    var scanned = lexer.ScanName();

    // The token returned by ScanName keeps the leading '/'.
    scanned.Should().Be(Symbol.Name);
    lexer.Token.Should().Be("/日本語");
}

[Fact]
public void ScanName_NonUTF8_bytes_do_not_throw()
{
    // E3 starts a 3-byte UTF-8 sequence but only one continuation byte (81) follows,
    // so the name is not valid UTF-8. Scanning must still succeed and yield some string.
    var input = new byte[] { (byte)'/', 0xE3, 0x81, (byte)' ' };

    var lexer = CreateLexerFromBytes(input);
    var scanned = lexer.ScanName();

    scanned.Should().Be(Symbol.Name);
    lexer.Token.Should().NotBeNullOrEmpty().And.StartWith("/");
}

#if NET8_0_OR_GREATER
[SkippableFact]
public void ScanName_ShiftJIS_name_is_decoded_on_ShiftJIS_default_encoding()
{
    // The ANSI fallback for names that are not valid UTF-8 is selected through the current
    // culture's ANSI code page, so the test runs under ja-JP (code page 932).
    // Encoding.Default cannot be used as a precondition: on .NET Core / .NET 5+ it is always UTF-8,
    // which would make this test skip unconditionally.
    System.Globalization.CultureInfo japanese;
    try
    {
        japanese = System.Globalization.CultureInfo.GetCultureInfo("ja-JP");
    }
    catch (System.Globalization.CultureNotFoundException)
    {
        // E.g. globalization-invariant mode: the culture cannot be created.
        Skip.If(true, "Requires the ja-JP culture to be available.");
        return;
    }
    Skip.If(japanese.TextInfo.ANSICodePage != 932,
        "Requires the ja-JP culture to report ANSI code page 932 (Shift-JIS).");

    // Shift-JIS encoding of "日語": 93 FA 8C EA. The sequence is not valid UTF-8 (0x93 is a
    // continuation byte in lead position), so the strict UTF-8 attempt fails first.
    // '本' (96 7B) is deliberately not used: its trail byte 0x7B is '{', a PDF delimiter
    // character that cannot appear unescaped inside a name.
    byte[] bytes = [(byte)'/', 0x93, 0xFA, 0x8C, 0xEA, (byte)' '];

    var previousCulture = System.Globalization.CultureInfo.CurrentCulture;
    System.Globalization.CultureInfo.CurrentCulture = japanese;
    try
    {
        var lexer = CreateLexerFromBytes(bytes);
        var symbol = lexer.ScanName();

        // ScanName returns the token INCLUDING the leading '/'.
        symbol.Should().Be(Symbol.Name);
        lexer.Token.Should().Be("/日語");
    }
    finally
    {
        // Do not leak the culture change into other tests running on this thread.
        System.Globalization.CultureInfo.CurrentCulture = previousCulture;
    }
}
#endif

/// <summary>
/// Creates a lexer that reads the given raw bytes from an in-memory stream.
/// </summary>
static Lexer CreateLexerFromBytes(byte[] bytes)
    => new Lexer(new MemoryStream(bytes), null);

Lexer CreateLexer(string text)
{
var pdfString = new PdfString(text, PdfStringEncoding.RawEncoding);
Expand Down