From 112220da86222ba4cf9054ac53685c735cc4502d Mon Sep 17 00:00:00 2001 From: tnegre Date: Fri, 3 Jan 2025 10:08:56 +0100 Subject: [PATCH 1/5] BUG: when there are two identical HTML tags in a substitution string, everything is taken between first opening tag and last closing tag. Write test to reproduce. --- test/phpunit/ODFTest.php | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test/phpunit/ODFTest.php b/test/phpunit/ODFTest.php index a7d972bf3450d..a019d5754e2ac 100644 --- a/test/phpunit/ODFTest.php +++ b/test/phpunit/ODFTest.php @@ -296,33 +296,39 @@ public function testODFconvertVarToOdf() 'charset' => null, 'expected' => utf8_encode('text with intricatedtags'), ], + 24 => [ + 'to_convert' => "text with two (strong) tags", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode('text with two (strong) tags'), + ], // One can also pass html-encoded string to the method - 24 => [ + 25 => [ 'to_convert' => 'One&two', 'encode' => true, 'charset' => null, 'expected' => 'One&two', ], - 25 => [ + 26 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => false, 'charset' => 'UTF-8', 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\'', ], - 26 => [ + 27 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => true, 'charset' => 'UTF-8', 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'', ], - 27 => [ + 28 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => false, 'charset' => null, 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\''), ], - 28 => [ + 29 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => true, 'charset' => null, @@ -341,20 +347,20 @@ public function testODFconvertVarToOdf() // Following tests reflect the current behavior. They may evolve if the method behavior changes. // The method removes hyperlinks and tags that are not dealt with. - 29 => [ + 30 => [ 'to_convert' => '123 trucmachin > truc < troc > tracbla bla', 'encode' => true, 'charset' => null, 'expected' => "123 trucmachin > truc < troc > tracbla bla", ], - 30 => [ + 31 => [ 'to_convert' => '123

Title

bla', 'encode' => true, 'charset' => null, 'expected' => "123 Title bla", ], // HTML should not take \n into account, but only
. - 31 => [ + 32 => [ 'to_convert' => "text with strong text , a line\nbreak and underlined words with it@lic sp&ciàlchärs éè l'", 'encode' => false, 'charset' => 'UTF-8', From b31962f7e2835a294d3079985ba2fe3426152e59 Mon Sep 17 00:00:00 2001 From: tnegre Date: Fri, 3 Jan 2025 10:41:35 +0100 Subject: [PATCH 2/5] fix --- htdocs/includes/odtphp/odf.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/htdocs/includes/odtphp/odf.php b/htdocs/includes/odtphp/odf.php index 5a83223468672..17dc178b6a21b 100644 --- a/htdocs/includes/odtphp/odf.php +++ b/htdocs/includes/odtphp/odf.php @@ -46,8 +46,8 @@ class Odf public $userdefined=array(); const PIXEL_TO_CM = 0.026458333; - const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/s'; - const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(.*)<\/\1>))/'; + const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>([^(<\1>)]*)<\/\1>))/s'; + const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>([^(<\1>)]*)<\/\1>))/'; /** @@ -1017,4 +1017,4 @@ public function getvalue($valuename) $this->contentXml = preg_replace($searchreg, "", $this->contentXml); return $matches[1]; } -} \ No newline at end of file +} From 61b03b0c0575456e098c2e7a2628ba385c1d179d Mon Sep 17 00:00:00 2001 From: tnegre Date: Fri, 3 Jan 2025 10:50:30 +0100 Subject: [PATCH 3/5] Testing ODF : revert parameters order for assertEquals() --- test/phpunit/ODFTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/phpunit/ODFTest.php b/test/phpunit/ODFTest.php index a019d5754e2ac..ab1a2c74d48ce 100644 --- a/test/phpunit/ODFTest.php +++ b/test/phpunit/ODFTest.php @@ -377,7 +377,7 @@ public function testODFconvertVarToOdf() } else { $res = $odf->convertVarToOdf($case['to_convert'], $case['encode']); } - $this->assertEquals($res, $case['expected']); + $this->assertEquals($case['expected'], $res); } print __METHOD__." result=".$result."\n"; From 78bf506790c18a227c8b9936919f92e4a0b46df4 Mon Sep 17 00:00:00 2001 From: tnegre Date: Fri, 3 Jan 2025 11:14:17 +0100 Subject: [PATCH 4/5] fix tests in error --- htdocs/includes/odtphp/odf.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htdocs/includes/odtphp/odf.php b/htdocs/includes/odtphp/odf.php index 17dc178b6a21b..f5678d838a389 100644 --- a/htdocs/includes/odtphp/odf.php +++ b/htdocs/includes/odtphp/odf.php @@ -46,8 +46,8 @@ class Odf public $userdefined=array(); const PIXEL_TO_CM = 0.026458333; - const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>([^(<\1>)]*)<\/\1>))/s'; - const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>([^(<\1>)]*)<\/\1>))/'; + const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1>).)*)<\/\1>))/s'; + const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1>).)*)<\/\1>))/'; /** From 7e82f70c77aa42718cb6009d72cb5d806ca7175e Mon Sep 17 00:00:00 2001 From: tnegre Date: Thu, 9 Jan 2025 11:05:45 +0100 Subject: [PATCH 5/5] add case when HTML tags contain attributes --- htdocs/includes/odtphp/odf.php | 4 ++-- test/phpunit/ODFTest.php | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/htdocs/includes/odtphp/odf.php b/htdocs/includes/odtphp/odf.php index f5678d838a389..39f595f493443 100644 --- a/htdocs/includes/odtphp/odf.php +++ b/htdocs/includes/odtphp/odf.php @@ -46,8 +46,8 @@ class Odf public $userdefined=array(); const PIXEL_TO_CM = 0.026458333; - const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1>).)*)<\/\1>))/s'; - const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1>).)*)<\/\1>))/'; + const FIND_TAGS_REGEX = '/<([A-Za-z0-9]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1(\s.*)?>).)*)<\/\1>))/s'; + const FIND_ENCODED_TAGS_REGEX = '/<([A-Za-z]+)(?:\s([A-Za-z]+(?:\-[A-Za-z]+)?(?:=(?:".*?")|(?:[0-9]+))))*(?:(?:\s\/>)|(?:>(((?!<\1(\s.*)?>).)*)<\/\1>))/'; /** diff --git a/test/phpunit/ODFTest.php b/test/phpunit/ODFTest.php index ab1a2c74d48ce..2badd77baf6d5 100644 --- a/test/phpunit/ODFTest.php +++ b/test/phpunit/ODFTest.php @@ -302,33 +302,39 @@ public function testODFconvertVarToOdf() 'charset' => null, 'expected' => utf8_encode('text with two (strong) tags'), ], + 25 => [ + 'to_convert' => "text with two (strong) tags and intricated underline ", + 'encode' => true, + 'charset' => null, + 'expected' => utf8_encode('text with two (strong) tags and intricated underline '), + ], // One can also pass html-encoded string to the method - 25 => [ + 26 => [ 'to_convert' => 'One&two', 'encode' => true, 'charset' => null, 'expected' => 'One&two', ], - 26 => [ + 27 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => false, 'charset' => 'UTF-8', 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\'', ], - 27 => [ + 28 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => true, 'charset' => 'UTF-8', 'expected' => 'text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l'', ], - 28 => [ + 29 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => false, 'charset' => null, 'expected' => utf8_encode('text with strong, emphasis and underlined words with it@lic sp&ciàlchärs éè l\''), ], - 29 => [ + 30 => [ 'to_convert' => "text with <strong>strong, </strong><em>emphasis</em> and <u>underlined</u> words with <i>it@lic sp&ciàlchärs éè l'</i>", 'encode' => true, 'charset' => null, @@ -347,20 +353,20 @@ public function testODFconvertVarToOdf() // Following tests reflect the current behavior. They may evolve if the method behavior changes. // The method removes hyperlinks and tags that are not dealt with. - 30 => [ + 31 => [ 'to_convert' => '123 trucmachin > truc < troc > tracbla bla', 'encode' => true, 'charset' => null, 'expected' => "123 trucmachin > truc < troc > tracbla bla", ], - 31 => [ + 32 => [ 'to_convert' => '123

Title

bla', 'encode' => true, 'charset' => null, 'expected' => "123 Title bla", ], // HTML should not take \n into account, but only
. - 32 => [ + 33 => [ 'to_convert' => "text with strong text , a line\nbreak and underlined words with it@lic sp&ciàlchärs éè l'", 'encode' => false, 'charset' => 'UTF-8',