From 94b4b0757419f2890dbc070ee7653d57092c43cb Mon Sep 17 00:00:00 2001 From: Thomas Leitner Date: Thu, 17 Oct 2024 00:40:22 +0200 Subject: [PATCH] Fix TrueTypeWrapper to correctly encode multiple codepoints referring to the same glyph When multiple codepoints are mapped to the same glyph, the result is erroneous due to the double usage of identity maps for the charcode to CID mapping and the CID to GID mapping. To fix this, * glyph caching in #glyph must be performed not only on the glyph ID but also on the supplied string; * glyph encoding in #encode must be done using custom charcodes and not the glyph ID; * the custom mapping of charcodes to CIDs must be encoded using a custom CID CMap for the /Encoding entry (in most cases; sometimes it is still possible to use the identity encoding). --- CHANGELOG.md | 2 + lib/hexapdf/font/true_type_wrapper.rb | 57 +++++++++++++++------ test/hexapdf/font/test_true_type_wrapper.rb | 20 ++++++-- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21793122..e8ec8909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ * Parsing of invalid `)` character in PDF objects and content streams * Handling of files that contain stream length values that are indirect objects that do not exist +* [HexaPDF::Font::TrueTypeWrapper] to correctly handle the situation when + multiple codepoints refer to the same glyph ID ## 0.47.0 - 2024-09-07 diff --git a/lib/hexapdf/font/true_type_wrapper.rb b/lib/hexapdf/font/true_type_wrapper.rb index 610715f8..df0cfeab 100644 --- a/lib/hexapdf/font/true_type_wrapper.rb +++ b/lib/hexapdf/font/true_type_wrapper.rb @@ -57,6 +57,10 @@ module Font class TrueTypeWrapper # Represents a single glyph of the wrapped font. + # + # Since some characters/strings may be mapped to the same glyph id by the font's builtin cmap + # table, it is possible that different Glyph instances with the same #id but different #str + # exist. 
class Glyph # The associated TrueTypeWrapper object. @@ -152,6 +156,7 @@ def initialize(document, font, pdf_object: nil, subset: true) @id_to_glyph = {} @codepoint_to_glyph = {} @encoded_glyphs = {} + @last_char_code = 0 end # Returns the type of the font, i.e. :TrueType. @@ -179,14 +184,15 @@ def subset? !@subsetter.nil? end - # Returns a Glyph object for the given glyph ID. + # Returns a Glyph object for the given glyph ID and +str+ pair. # - # The optional argument +str+ should be the string representation of the glyph. Only use it if - # it is known, + # The optional argument +str+ should be the string representation of the glyph. It is possible + # that multiple strings map to the same glyph (e.g. hyphen and soft-hyphen could be + # represented by the same glyph). # # Note: Although this method is public, it should normally not be used by application code! def glyph(id, str = nil) - @id_to_glyph[id] ||= + @id_to_glyph[[id, str]] ||= if id >= 0 && id < @wrapped_font[:maxp].num_glyphs Glyph.new(self, id, str || (+'' << (@cmap.gid_to_code(id) || 0xFFFD))) else @@ -228,14 +234,12 @@ def decode_codepoint(codepoint) # Encodes the glyph and returns the code string. 
def encode(glyph) - (@encoded_glyphs[glyph.id] ||= + (@encoded_glyphs[glyph] ||= begin raise HexaPDF::MissingGlyphError.new(glyph) if glyph.kind_of?(InvalidGlyph) - if @subsetter - [[@subsetter.use_glyph(glyph.id)].pack('n'), glyph] - else - [[glyph.id].pack('n'), glyph] - end + @subsetter.use_glyph(glyph.id) if @subsetter + @last_char_code += 1 + [[@last_char_code].pack('n'), @last_char_code] end)[0] end @@ -286,7 +290,7 @@ def create_pdf_object(document) Supplement: 0}, CIDToGIDMap: :Identity}) dict = document.add({Type: :Font, Subtype: :Type0, BaseFont: cid_font[:BaseFont], - Encoding: :'Identity-H', DescendantFonts: [cid_font]}) + DescendantFonts: [cid_font]}) dict.font_wrapper = self document.register_listener(:complete_objects) do @@ -294,6 +298,7 @@ def create_pdf_object(document) embed_font(dict, document) complete_width_information(dict) create_to_unicode_cmap(dict, document) + add_encoding_information_cmap(dict, document) end dict @@ -306,7 +311,7 @@ def update_font_name(dict) return unless @subsetter tag = +'' - data = @encoded_glyphs.each_with_object(''.b) {|(id, v), s| s << id.to_s << v[0] } + data = @encoded_glyphs.each_with_object(''.b) {|(g, v), s| s << g.id.to_s << v[0] } hash = Digest::MD5.hexdigest(data << @wrapped_font.font_name).to_i(16) while hash != 0 && tag.length < 6 hash, mod = hash.divmod(UPPERCASE_LETTERS.length) @@ -336,8 +341,8 @@ def embed_font(dict, document) # Adds the /DW and /W fields to the CIDFont dictionary. def complete_width_information(dict) default_width = glyph(3, " ").width.to_i - widths = @encoded_glyphs.reject {|_, v| v[1].width == default_width }.map do |id, v| - [(@subsetter ? @subsetter.subset_glyph_id(id) : id), v[1].width] + widths = @encoded_glyphs.reject {|g, _| g.width == default_width }.map do |g, _| + [(@subsetter ? @subsetter.subset_glyph_id(g.id) : g.id), g.width] end.sort! 
dict[:DescendantFonts].first.set_widths(widths, default_width: default_width) end @@ -346,9 +351,10 @@ def complete_width_information(dict) # correctly. def create_to_unicode_cmap(dict, document) stream = HexaPDF::StreamData.new do - mapping = @encoded_glyphs.keys.map! do |id| + mapping = @encoded_glyphs.map do |glyph, (_, char_code)| # Using 0xFFFD as mentioned in Adobe #5411, last line before section 1.5 - [(@subsetter ? @subsetter.subset_glyph_id(id) : id), @cmap.gid_to_code(id) || 0xFFFD] + # TODO: glyph.str assumed to consist of single char, No support for multiple chars + [char_code, glyph.str.ord || 0xFFFD] end.sort_by!(&:first) HexaPDF::Font::CMap.create_to_unicode_cmap(mapping) end @@ -357,6 +363,25 @@ def create_to_unicode_cmap(dict, document) dict[:ToUnicode] = stream_obj end + # Adds the /Encoding entry to the +dict+. + # + # This can either be the identity mapping or, if some Unicode codepoints are mapped to the + # same glyph, a custom CMap. + def add_encoding_information_cmap(dict, document) + mapping = @encoded_glyphs.map do |glyph, (_, char_code)| + # Using 0xFFFD as mentioned in Adobe #5411, last line before section 1.5 + [char_code, (@subsetter ? @subsetter.subset_glyph_id(glyph.id) : glyph.id)] + end.sort_by!(&:first) + if mapping.all? 
{|char_code, cid| char_code == cid } + dict[:Encoding] = :'Identity-H' + else + stream = HexaPDF::StreamData.new { HexaPDF::Font::CMap.create_cid_cmap(mapping) } + stream_obj = document.add({}, stream: stream) + stream_obj.set_filter(:FlateDecode) + dict[:Encoding] = stream_obj + end + end + end end diff --git a/test/hexapdf/font/test_true_type_wrapper.rb b/test/hexapdf/font/test_true_type_wrapper.rb index 14aa8cf5..60de396e 100644 --- a/test/hexapdf/font/test_true_type_wrapper.rb +++ b/test/hexapdf/font/test_true_type_wrapper.rb @@ -71,6 +71,12 @@ glyph.inspect) end + it "caches glyphs based on the id and string" do + glyph = @font_wrapper.glyph(17) + assert_same(glyph, @font_wrapper.glyph(17)) + refute_same(glyph, @font_wrapper.glyph(17, "1")) + end + it "invokes font.on_missing_glyph for missing glyphs" do glyph = @font_wrapper.glyph(9999) assert_kind_of(HexaPDF::Font::InvalidGlyph, glyph) @@ -99,14 +105,18 @@ assert_equal([1].pack('n'), code) code = @font_wrapper.encode(@font_wrapper.glyph(10)) assert_equal([2].pack('n'), code) + code = @font_wrapper.encode(@font_wrapper.glyph(10, "o")) + assert_equal([3].pack('n'), code) end it "returns the encoded glyph ID for fonts that are not subset" do @font_wrapper = HexaPDF::Font::TrueTypeWrapper.new(@doc, @font, subset: false) code = @font_wrapper.encode(@font_wrapper.glyph(3)) - assert_equal([3].pack('n'), code) + assert_equal([1].pack('n'), code) code = @font_wrapper.encode(@font_wrapper.glyph(10)) - assert_equal([10].pack('n'), code) + assert_equal([2].pack('n'), code) + code = @font_wrapper.encode(@font_wrapper.glyph(10, "o")) + assert_equal([3].pack('n'), code) end it "raises an error if an InvalidGlyph is encoded" do @@ -180,14 +190,18 @@ it "with fonts that are not subset (only differences to other case)" do @font_wrapper = HexaPDF::Font::TrueTypeWrapper.new(@doc, @font, subset: false) @font_wrapper.encode(@font_wrapper.glyph(3)) + @font_wrapper.encode(@font_wrapper.glyph(3, "-")) glyph = 
@font_wrapper.decode_utf8('H').first @font_wrapper.encode(glyph) @doc.dispatch_message(:complete_objects) dict = @font_wrapper.pdf_object - assert_equal(HexaPDF::Font::CMap.create_to_unicode_cmap([[3, ' '.ord], [glyph.id, 'H'.ord]]), + assert_equal(HexaPDF::Font::CMap.create_to_unicode_cmap([[1, ' '.ord], [2, '-'.ord], + [3, 'H'.ord]]), dict[:ToUnicode].stream) + assert_equal(HexaPDF::Font::CMap.create_cid_cmap([[1, 3], [2, 3], [3, glyph.id]]), + dict[:Encoding].stream) assert_equal([glyph.id, [glyph.width]], dict[:DescendantFonts][0][:W].value) end end