From 971af819e882ab975bc880e8ea1d0f2b526dc40e Mon Sep 17 00:00:00 2001 From: Tim Davies Date: Sat, 19 Oct 2024 22:18:21 +0100 Subject: [PATCH] Better support for emoji with zero width connectors (#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The [zero-width joiner](https://emojipedia.org/zero-width-joiner#technical) (ZWJ, U+200D) is a special character used to combine multiple emoji into a single emoji character on screen. For example: 👩‍👩‍👧‍👧. This patches the code to skip any rune that immediately follows a zero-width joiner, to avoid parsing parts of the emoji as separate Unicode characters/breaking it into multiple emoji. This is based on https://github.com/tmdvs/Go-Emoji-Utils/issues/12#issuecomment-1362747872 --- emoji.go | 2 +- search.go | 17 +++++++++++++++-- tests/search_test.go | 14 ++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/emoji.go b/emoji.go index 84977b9..176ef03 100644 --- a/emoji.go +++ b/emoji.go @@ -86,7 +86,7 @@ func RemoveAll(input string) string { matches := FindAll(input) for _, item := range matches { - emo := item.Match.(Emoji) + emo := item.Match rs := []rune(emo.Value) for _, r := range rs { input = strings.ReplaceAll(input, string([]rune{r}), "") diff --git a/search.go b/search.go index 2402511..a80dc8a 100644 --- a/search.go +++ b/search.go @@ -9,7 +9,7 @@ import ( // SearchResult - Occurrence of an emoji in a string type SearchResult struct { - Match interface{} + Match Emoji Occurrences int Locations [][]int } @@ -43,7 +43,7 @@ func Find(emojiString string, input string) (result SearchResult, err error) { // Loop through emoji present in input and if any match the // emoji we're looking for we'll return the result for _, r := range allEmoji { - if r.Match.(Emoji).Key == emoji.Key { + if r.Match.Key == emoji.Key { result = r return } @@ -71,6 +71,15 @@ func FindAll(input string) (detectedEmojis SearchResults) { continue } + // If the previous rune was a zero width 
joiner we'll skip this one + // [Github issue](https://github.com/tmdvs/Go-Emoji-Utils/issues/12#issuecomment-1362747872) + if index >= 1 { + previousRune := []rune{runes[index-1]} + if isRuneZeroWidthJoiner(previousRune) { + continue + } + } + // Grab the initial hex value of this run hexKey := utils.RunesToHexKey([]rune{r}) @@ -162,3 +171,7 @@ func findEmoji(term string, list map[string]Emoji) (results map[string]Emoji) { } return } + +func isRuneZeroWidthJoiner(r []rune) bool { + return utils.RunesToHexKey(r) == "200D" +} diff --git a/tests/search_test.go b/tests/search_test.go index 520f9bf..2607ac5 100644 --- a/tests/search_test.go +++ b/tests/search_test.go @@ -31,6 +31,9 @@ func TestRemoveAllEmoji(t *testing.T) { totalUniqueEmoji := len(matches) assert.Equal(t, totalUniqueEmoji, 6, "There should be six different emoji, found: %v", matches) + assert.Equal(t, matches[0].Match.Value, "๐Ÿ˜„", "The first emoji should be ๐Ÿ˜„") + assert.Equal(t, matches[1].Match.Value, "๐Ÿท", "The second emoji should be ๐Ÿท") + assert.Equal(t, matches[5].Match.Value, "๐Ÿฅฐ", "The second emoji should be ๐Ÿฅฐ") emojiRemoved := emoji.RemoveAll(str) assert.Equal(t, "This is a string with some emoji!", emojiRemoved, "There should be no emoji") @@ -57,6 +60,14 @@ func TestNumericalKeycaps(t *testing.T) { assert.Equal(t, 11, totalUniqueEmoji, "There should be 11 unique emoji") } +func TestFamilyEmoji(t *testing.T) { + str := "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆfamily emoji" + matches := emoji.FindAll(str) + totalUniqueEmoji := len(matches) + + assert.Equal(t, 1, totalUniqueEmoji, "There should be 1 unique emoji") +} + func TestRemoveAllEmojiChinese(t *testing.T) { str := "่ตทๅŽ็‰นๅœจ๐Ÿ‡ซ๐Ÿ‡ท้˜Ÿ็š„ไฝœ็”จๆ›ด ๅ“ˆๅ“ˆๅ“ˆ" @@ -65,6 +76,7 @@ func TestRemoveAllEmojiChinese(t *testing.T) { totalUniqueEmoji := len(matches) assert.Equal(t, totalUniqueEmoji, 1, "There should be one emoji") + assert.Equal(t, matches[0].Match.Value, "๐Ÿ‡ซ๐Ÿ‡ท", "The emoji should be ๐Ÿ‡ซ๐Ÿ‡ท") emojiRemoved := 
emoji.RemoveAll(str) assert.Equal(t, "่ตทๅŽ็‰นๅœจ้˜Ÿ็š„ไฝœ็”จๆ›ด ๅ“ˆๅ“ˆๅ“ˆ", emojiRemoved, "There should be no emoji") @@ -79,6 +91,8 @@ func TestRemoveAllEmojiChineseEnglishMixed(t *testing.T) { totalUniqueEmoji := len(matches) assert.Equal(t, totalUniqueEmoji, 8, "There should be one emoji") + assert.Equal(t, matches[0].Match.Value, "๐Ÿคฎ", "The first emoji should be ๐Ÿคฎ") + assert.Equal(t, matches[4].Match.Value, "๐Ÿค ", "The fifth emoji should be ๐Ÿค ") emojiRemoved := emoji.RemoveAll(str) assert.Equal(t, "woๆญฆๆ–Œello aๆญฆๆ–Œ g ood peoello", emojiRemoved, "There should be no emoji")