diff --git a/src/UglyToad.PdfPig.Fonts/GlyphList.cs b/src/UglyToad.PdfPig.Fonts/GlyphList.cs index 23a6d0b31..aa264b054 100644 --- a/src/UglyToad.PdfPig.Fonts/GlyphList.cs +++ b/src/UglyToad.PdfPig.Fonts/GlyphList.cs @@ -13,7 +13,7 @@ public class GlyphList { /// - /// .notdef. + /// .notdef name. /// public const string NotDefined = ".notdef"; @@ -37,7 +37,7 @@ public class GlyphList public static GlyphList AdditionalGlyphList => LazyAdditionalGlyphList.Value; private static readonly Lazy LazyZapfDingbatsGlyphList = new Lazy(() => GlyphListFactory.Get("zapfdingbats")); - + /// /// Zapf Dingbats. /// @@ -84,6 +84,7 @@ public string UnicodeCodePointToName(int unicodeValue) /// /// Get the unicode value for the glyph name. + /// See . /// public string NameToUnicode(string name) { @@ -103,25 +104,47 @@ public string NameToUnicode(string name) } string unicode; - // Remove suffixes + // 1. Drop all the characters from the glyph name starting with the first occurrence of a period (U+002E FULL STOP), if any. if (name.IndexOf('.') > 0) { unicode = NameToUnicode(name.Substring(0, name.IndexOf('.'))); } - else if (name.StartsWith("uni") && name.Length == 7) + // 2. Split the remaining string into a sequence of components, using underscore (U+005F LOW LINE) as the delimiter. + else if (name.IndexOf('_') > 0) + { + /* + * MOZILLA-3136-0.pdf + * 68-1990-01_A.pdf + * TIKA-2054-0.pdf + */ + var sb = new StringBuilder(); + foreach (var s in name.Split('_')) + { + sb.Append(NameToUnicode(s)); + } + + unicode = sb.ToString(); + } + // Otherwise, if the component is of the form ‘uni’ (U+0075, U+006E, and U+0069) followed by a sequence of uppercase hexadecimal + // digits (0–9 and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), if the length of that sequence is a multiple + // of four, and if each group of four digits represents a value in the ranges 0000 through D7FF or E000 through FFFF, then + // interpret each as a Unicode scalar value and map the component to the string made of those scalar values. Note that the range + // and digit-length restrictions mean that the ‘uni’ glyph name prefix can be used only with UVs in the Basic Multilingual Plane (BMP). + else if (name.StartsWith("uni") && (name.Length - 3) % 4 == 0) { // test for Unicode name in the format uniXXXX where X is hex int nameLength = name.Length; var uniStr = new StringBuilder(); - var foundUnicode = true; for (int chPos = 3; chPos + 4 <= nameLength; chPos += 4) { - if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var codePoint)) + if (!int.TryParse(name.AsSpanOrSubstring(chPos, 4), + NumberStyles.HexNumber, + CultureInfo.InvariantCulture, + out var codePoint)) { - foundUnicode = false; - break; + return null; } if (codePoint > 0xD7FF && codePoint < 0xE000) @@ -132,33 +155,30 @@ public string NameToUnicode(string name) uniStr.Append((char)codePoint); } - if (!foundUnicode) - { - return null; - } - unicode = uniStr.ToString(); } - else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length == 5) + // Otherwise, if the component is of the form ‘u’ (U+0075) followed by a sequence of four to six uppercase hexadecimal digits (0–9 + // and A–F, meaning U+0030 through U+0039 and U+0041 through U+0046), and those digits represents a value in the ranges 0000 through + // D7FF or E000 through 10FFFF, then interpret it as a Unicode scalar value and map the component to the string made of this scalar value. + else if (name.StartsWith("u", StringComparison.Ordinal) && name.Length >= 5 && name.Length <= 7) { - // test for an alternate Unicode name representation uXXXX var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.HexNumber, CultureInfo.InvariantCulture); if (codePoint > 0xD7FF && codePoint < 0xE000) { - throw new InvalidFontFormatException( - $"Unicode character name with disallowed code area: {name}"); + throw new InvalidFontFormatException($"Unicode character name with disallowed code area: {name}"); } unicode = char.ConvertFromUtf32(codePoint); } + // Ad-hoc special cases else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4) { // name representation cXXX var codePoint = int.Parse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture); - System.Diagnostics.Debug.Assert(codePoint > 0); unicode = char.ConvertFromUtf32(codePoint); } + // Otherwise, map the component to an empty string. else { return null; diff --git a/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs b/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs index 836f0d9cb..d53c7e720 100644 --- a/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs +++ b/src/UglyToad.PdfPig.Tests/Fonts/Encodings/GlyphListTests.cs @@ -57,7 +57,7 @@ public void NameToUnicodeRemovesSuffix() { var list = new GlyphList(new Dictionary { - {"Boris", "B"} + { "Boris", "B" } }); var result = list.NameToUnicode("Boris.Special"); @@ -70,7 +70,7 @@ public void NameToUnicodeConvertsHexAndUsesHexValue() { var list = new GlyphList(new Dictionary { - {"B", "X"} + { "B", "X" } }); var result = list.NameToUnicode("uni0042"); @@ -83,12 +83,27 @@ public void NameToUnicodeConvertsShortHexAndUsesHexValue() { var list = new GlyphList(new Dictionary { - {"E", "Æ"} + { "E", "Æ" } }); var result = list.NameToUnicode("u0045"); Assert.Equal("E", result); } + + + [Fact(Skip = "TODO - String don't match")] + public void NameToUnicodeConvertAglSpecification() + { + // https://github.com/adobe-type-tools/agl-specification?tab=readme-ov-file#3-examples + var list = new GlyphList(new Dictionary + { + { "Lcommaaccent", "\u013B" } + }); + + var result = list.NameToUnicode("Lcommaaccent_uni20AC0308_u1040C.alternate"); + + Assert.Equal("\u013B\u20AC\u0308\u1040C", result); + } } } diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf new file mode 100644 index 000000000..97f1bc732 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/TIKA-2054-0.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs index e0b8479b3..6a52587df 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs @@ -6,6 +6,41 @@ public class GithubIssuesTests { + [Fact] + public void Issue945() + { + // Odd ligatures names + var path = IntegrationHelpers.GetDocumentPath("MOZILLA-3136-0.pdf"); + using (var document = PdfDocument.Open(path)) + { + var page = document.GetPage(2); + Assert.Contains("ff", page.Letters.Select(l => l.Value)); + } + + path = IntegrationHelpers.GetDocumentPath("68-1990-01_A.pdf"); + using (var document = PdfDocument.Open(path)) + { + var page = document.GetPage(7); + Assert.Contains("fi", page.Letters.Select(l => l.Value)); + } + + path = IntegrationHelpers.GetDocumentPath("TIKA-2054-0.pdf"); + using (var document = PdfDocument.Open(path)) + { + var page = document.GetPage(3); + Assert.Contains("fi", page.Letters.Select(l => l.Value)); + + page = document.GetPage(4); + Assert.Contains("ff", page.Letters.Select(l => l.Value)); + + page = document.GetPage(6); + Assert.Contains("fl", page.Letters.Select(l => l.Value)); + + page = document.GetPage(16); + Assert.Contains("ffi", page.Letters.Select(l => l.Value)); + } + } + [Fact] public void Issue943() {