196 lines
8.4 KiB
Diff
196 lines
8.4 KiB
Diff
From 6c330d56fdf54cb002b8a84b33f73d2a8dd40879 Mon Sep 17 00:00:00 2001
|
|
From: Tavmjong Bah <tavmjong@free.fr>
|
|
Date: Fri, 8 Nov 2024 15:03:12 +0100
|
|
Subject: [PATCH] Fixes two problems related to emojis in PDF import.
|
|
|
|
1. UTF-8 conversion.
|
|
|
|
If a "ToUnicode" table is included in an OpenType font in a PDF
|
|
file, one can find the Unicode code point that corresponds to a
|
|
given glyph (or group of glyphs). This is often the only way one
|
|
can reconstruct text from a PDF (which might contain only glyphs
|
|
and glyph positions). For Emoji, the code points are outside the
|
|
"Basic Plane" (code points that can be encoded by four or fewer
|
|
hexadecimal digits) and are located in the "Supplementary
|
|
Multilingual Plane", a.k.a. "Plane 1". Code points in "Plane 1" are
|
|
represented by a hexadecimal number of the form "1xxxx", where 'x'
|
|
is any hex digit.
|
|
|
|
Inkscape's PDF import code takes a Unicode code point and converts
|
|
it to its UTF-8 representation. This code assumes that the code
|
|
point can be represented by a gunichar2 which is typedef'ed from a
|
|
guint16. The glib function g_utf16_to_utf8 is then used for the
|
|
conversion. This is incorrect: a single guint16 can only represent
|
|
a 4 hex-digit code point and then not all possible values (some
|
|
values are used to indicate that a second 16-bit value is being used
|
|
to enable encoding a code point outside the basic plane).
|
|
|
|
We already use std::wstring_convert<> and std::codecvt<> earlier in
|
|
the same function to build up a string to store the original text
|
|
in the 'aria-label' attribute. I changed the code to reuse that
|
|
result. Note that these are deprecated and will be removed in C++26
|
|
so we'll eventually need to find a different solution.
|
|
|
|
2. Empty paths.
|
|
|
|
Emoji fonts usually have color. There are four competing methods of
|
|
embedding color font data in an OpenType font. Two of these use
|
|
bitmaps. If a font has only bitmap glyph data then there is no
|
|
vector data to build a path. If there is no path, Inkscape's
|
|
current code returns 'nullptr' instead of a pointer to a Path
|
|
node. This triggers an assert. Simply removing the assert leads to
|
|
other problems down the line. The simplest solution is to return a
|
|
Path node with an empty "d" attribute. This also allows one to
|
|
store the original text in the "aria-label" attribute of the Path
|
|
node.
|
|
|
|
In the future, we should be able to render bitmaps from fonts in
|
|
the same way that we render SVG OpenType fonts (which
|
|
caches the glyphs as bitmaps).
|
|
|
|
Unfixed problems:
|
|
|
|
If the "Noto-Color-Emoji" font is present, it will be used for
|
|
rendering emoji even if "Noto-Emoji" or "Symbola" is specified. It
|
|
will also be used as a fallback font for rendering emoji.
|
|
"Noto-Color-Emoji" has no "glyf" table and thus lacks vectorized
|
|
paths. This leads to empty paths. It would be good to block the use
|
|
of "Noto-Color-Emoji" in this case. It's not clear how easy it
|
|
would be to do this.
|
|
|
|
What is even stranger is that the terminal will not show an Emoji
|
|
if "Noto-Color-Emoji" is not installed even though the glyph that
|
|
is shown is not from "Noto-Color-Emoji"! (It's from "Symbola".)
|
|
|
|
There appears to be incorrect logic in SvgBuilder::_flushTextPath().
|
|
If the style changes inside the function, the existing node is
|
|
replaced, effectively orphaning the previous node. Whether this
|
|
actually happens in PDF input is unknown.
|
|
---
|
|
.../internal/pdfinput/svg-builder.cpp | 54 +++++++++----------
|
|
1 file changed, 26 insertions(+), 28 deletions(-)
|
|
|
|
diff --git a/src/extension/internal/pdfinput/svg-builder.cpp b/src/extension/internal/pdfinput/svg-builder.cpp
|
|
index 73f57f04952..bb2ce5145e8 100644
|
|
--- a/src/extension/internal/pdfinput/svg-builder.cpp
|
|
+++ b/src/extension/internal/pdfinput/svg-builder.cpp
|
|
@@ -1250,7 +1250,7 @@ void SvgBuilder::updateFont(GfxState *state, std::shared_ptr<CairoFont> cairo_fo
|
|
TRACE(("updateFont()\n"));
|
|
updateTextMatrix(state, flip); // Ensure that we have a text matrix built
|
|
|
|
- auto font = state->getFont();
|
|
+ auto font = state->getFont(); // GfxFont
|
|
auto font_id = font->getID()->num;
|
|
|
|
auto new_font_size = state->getFontSize();
|
|
@@ -1711,6 +1711,10 @@ void SvgBuilder::_setTextStyle(Inkscape::XML::Node *node, GfxState *state, SPCSS
|
|
/**
|
|
* Renders the text as a path object using cairo and returns the node object.
|
|
*
|
|
+ * If the path is empty (e.g. due to trying to render a color bitmap font),
|
|
+ * return path node with empty "d" attribute. The aria attribute will still
|
|
+ * contain the original text.
|
|
+ *
|
|
* cairo_font - The font that cairo can use to convert text to path.
|
|
* font_size - The size of the text when drawing the path.
|
|
* transform - The matrix which will place the text on the page, this is critical
|
|
@@ -1722,8 +1726,13 @@ Inkscape::XML::Node *SvgBuilder::_renderText(std::shared_ptr<CairoFont> cairo_fo
|
|
const Geom::Affine &transform,
|
|
cairo_glyph_t *cairo_glyphs, unsigned int count)
|
|
{
|
|
- if (!cairo_glyphs || !cairo_font || _aria_label.empty())
|
|
- return nullptr;
|
|
+ Inkscape::XML::Node *path = _addToContainer("svg:path");
|
|
+ path->setAttribute("d", "");
|
|
+
|
|
+ if (!cairo_glyphs || !cairo_font || _aria_label.empty()) {
|
|
+ std::cerr << "SvgBuilder::_renderText: Invalid argument!" << std::endl;
|
|
+ return path;
|
|
+ }
|
|
|
|
// The surface isn't actually used, no rendering in cairo takes place.
|
|
cairo_surface_t *surface = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, _width, _height);
|
|
@@ -1738,16 +1747,17 @@ Inkscape::XML::Node *SvgBuilder::_renderText(std::shared_ptr<CairoFont> cairo_fo
|
|
|
|
// Failing to render text.
|
|
if (!pathv) {
|
|
- g_warning("Failed to render PDF text!");
|
|
- return nullptr;
|
|
+ std::cerr << "SvgBuilder::_renderText: Failed to render PDF text! " << _aria_label << std::endl;
|
|
+ return path;
|
|
}
|
|
|
|
auto textpath = sp_svg_write_path(*pathv);
|
|
- if (textpath.empty())
|
|
- return nullptr;
|
|
-
|
|
- Inkscape::XML::Node *path = _addToContainer("svg:path");
|
|
path->setAttribute("d", textpath);
|
|
+
|
|
+ if (textpath.empty()) {
|
|
+ std::cerr << "SvgBuilder::_renderText: Empty path! " << _aria_label << std::endl;
|
|
+ }
|
|
+
|
|
return path;
|
|
}
|
|
|
|
@@ -1785,7 +1795,7 @@ void SvgBuilder::endString(GfxState *state)
|
|
* dx, dy: Advance of glyph.
|
|
* originX, originY
|
|
* code: 8-bit char code, 16 bit CID, or Unicode of glyph.
|
|
- * u: Unicode mapping of character.
|
|
+ * u: Unicode mapping of character. "Unicode" is an unsigned int.
|
|
*/
|
|
void SvgBuilder::addChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY,
|
|
CharCode code, int /*nBytes*/, Unicode const *u, int uLen)
|
|
@@ -1801,9 +1811,13 @@ void SvgBuilder::addChar(GfxState *state, double x, double y, double dx, double
|
|
}
|
|
_aria_space = false;
|
|
|
|
+ std::string utf8_code;
|
|
static std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv1;
|
|
+ // Note std::wstring_convert and std::codecvt_utf are deprecated and will be removed in C++26.
|
|
if (u) {
|
|
- _aria_label += conv1.to_bytes(*u);
|
|
+ // 'u' maybe null if there is not a "ToUnicode" table in the PDF!
|
|
+ utf8_code = conv1.to_bytes(*u);
|
|
+ _aria_label += utf8_code;
|
|
}
|
|
|
|
// Skip control characters, found in LaTeX generated PDFs
|
|
@@ -1835,6 +1849,7 @@ void SvgBuilder::addChar(GfxState *state, double x, double y, double dx, double
|
|
}
|
|
|
|
SvgGlyph new_glyph;
|
|
+ new_glyph.code = utf8_code;
|
|
new_glyph.is_space = is_space;
|
|
new_glyph.delta = delta;
|
|
new_glyph.position = Geom::Point( x - originX, y - originY );
|
|
@@ -1849,23 +1864,6 @@ void SvgBuilder::addChar(GfxState *state, double x, double y, double dx, double
|
|
}
|
|
_text_position += delta;
|
|
|
|
- // Convert the character to UTF-8 since that's our SVG document's encoding
|
|
- {
|
|
- gunichar2 uu[8] = {0};
|
|
-
|
|
- for (int i = 0; i < uLen; i++) {
|
|
- uu[i] = u[i];
|
|
- }
|
|
-
|
|
- gchar *tmp = g_utf16_to_utf8(uu, uLen, nullptr, nullptr, nullptr);
|
|
- if ( tmp && *tmp ) {
|
|
- new_glyph.code = tmp;
|
|
- } else {
|
|
- new_glyph.code.clear();
|
|
- }
|
|
- g_free(tmp);
|
|
- }
|
|
-
|
|
// Copy current style if it has changed since the previous glyph
|
|
if (_invalidated_style || _glyphs.empty()) {
|
|
_invalidated_style = false;
|
|
--
|
|
GitLab
|
|
|