Skip to content

Commit

Permalink
ICU collation for strings #17.
Browse files Browse the repository at this point in the history
Implemented using the "golang.org/x/text/collate" package. A new
configuration API is exposed to set a collator object, which is
responsible for compiling strings into sort-keys.

Note that the SetTextCollator() API is not stable.

Another caveat, AFAIK, of using golang.org/x/text/collate is that
a sort-key cannot be converted back to the original string.
  • Loading branch information
prataprc committed Apr 13, 2018
1 parent 84a4f87 commit 936382f
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 4 deletions.
2 changes: 1 addition & 1 deletion cbor_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

package gson

import "strconv"
import "math"
import "strconv"
import "encoding/binary"

// nullBin holds the bytes of the literal text "null".
var nullBin = []byte("null")
Expand Down
4 changes: 4 additions & 0 deletions collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import "encoding/json"
import "math/big"
import "bytes"

import "golang.org/x/text/collate"

// Collation order for supported types. Applications desiring different
// ordering between types can initialize these byte values before
// instantiating a config object.
Expand Down Expand Up @@ -34,6 +36,8 @@ type collateConfig struct {
enc *json.Encoder
buf *bytes.Buffer
zf *big.Float
tcltbuffer *collate.Buffer
textcollator *collate.Collator
}

// Collate abstraction for value encoded into binary-collation.
Expand Down
140 changes: 140 additions & 0 deletions collate_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
package gson

import "sort"
import "bytes"
import "strings"
import "testing"
import "reflect"

import "golang.org/x/text/collate"
import "golang.org/x/text/language"

func TestCollateReset(t *testing.T) {
config := NewDefaultConfig()
clt := config.NewCollate(make([]byte, 0, 1024))
Expand Down Expand Up @@ -49,6 +54,117 @@ func TestCollateEmpty(t *testing.T) {
}()
}

// TestAlternateSortTypes collates the same four Chinese words under several
// language tags and checks the order obtained by sorting their collated
// bytes. Each test case lists comma separated tags that are all expected to
// produce the same order.
func TestAlternateSortTypes(t *testing.T) {
	testCases := []struct {
		lang string // comma separated language tags, each tried in turn
		in   testtxtclts
		want []string
	}{{
		lang: "zh,cmn,zh-Hant-u-co-pinyin,zh-HK-u-co-pinyin,zh-pinyin",
		in: testtxtclts{
			&testtxtclt{in: "爸爸"}, &testtxtclt{in: "妈妈"},
			&testtxtclt{in: "儿子"}, &testtxtclt{in: "女儿"},
		},
		want: []string{"爸爸", "儿子", "妈妈", "女儿"},
	}, {
		lang: "zh-Hant,zh-u-co-stroke,zh-Hant-u-co-stroke",
		in: testtxtclts{
			&testtxtclt{in: "爸爸"}, &testtxtclt{in: "妈妈"},
			&testtxtclt{in: "儿子"}, &testtxtclt{in: "女儿"},
		},
		want: []string{"儿子", "女儿", "妈妈", "爸爸"},
	}}

	for _, tc := range testCases {
		for _, tag := range strings.Split(tc.lang, ",") {
			collator := collate.New(language.MustParse(tag))
			config := NewDefaultConfig().SetTextCollator(collator)

			// Sort a private copy so that every tag starts from the order
			// declared in the table instead of the previous tag's result.
			items := make(testtxtclts, len(tc.in))
			copy(items, tc.in)

			// Remember the input words: printing the item slice itself
			// would only show pointer values.
			input := make([]string, 0, len(items))
			for _, item := range items {
				item.collate(config)
				input = append(input, item.in)
			}
			sort.Sort(items)

			got := make([]string, 0, len(items))
			for _, item := range items {
				got = append(got, item.in)
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("%v %v expected %v; got %v", tag, input, tc.want, got)
			}
		}
	}
}

// TestTextNocase collates upper and lower case letters with an English
// collator and checks the order obtained by sorting their collated bytes.
//
// NOTE(review): the collator is created without collate.IgnoreCase although
// the test name suggests a case-insensitive comparison, and the expected
// order lists "B" before "b" - confirm this is the intended behaviour.
func TestTextNocase(t *testing.T) {
	testCases := []struct {
		lang string // comma separated language tags, each tried in turn
		in   testtxtclts
		want []string
	}{{
		lang: "en",
		in: testtxtclts{
			&testtxtclt{in: "B"}, &testtxtclt{in: "b"},
			&testtxtclt{in: "a"}, &testtxtclt{in: "A"},
		},
		want: []string{"a", "A", "B", "b"},
	}}

	for _, tc := range testCases {
		for _, tag := range strings.Split(tc.lang, ",") {
			collator := collate.New(language.MustParse(tag))
			config := NewDefaultConfig().SetTextCollator(collator)

			// Sort a private copy so the table entry keeps its declared
			// order for every tag.
			items := make(testtxtclts, len(tc.in))
			copy(items, tc.in)

			// Remember the input words: printing the item slice itself
			// would only show pointer values.
			input := make([]string, 0, len(items))
			for _, item := range items {
				item.collate(config)
				input = append(input, item.in)
			}
			sort.Sort(items)

			got := make([]string, 0, len(items))
			for _, item := range items {
				got = append(got, item.in)
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("%v %v expected %v; got %v", tag, input, tc.want, got)
			}
		}
	}
}

// TestTextGermanSwedish collates "a", "z" and "ä" with a German ("de") and a
// Swedish ("sv") collator and checks that sorting the collated bytes places
// "ä" before "z" for German and after "z" for Swedish.
func TestTextGermanSwedish(t *testing.T) {
	testCases := []struct {
		lang string // comma separated language tags, each tried in turn
		in   testtxtclts
		want []string
	}{{
		lang: "de",
		in: testtxtclts{
			&testtxtclt{in: "a"}, &testtxtclt{in: "z"}, &testtxtclt{in: "ä"},
		},
		want: []string{"a", "ä", "z"},
	}, {
		lang: "sv",
		in: testtxtclts{
			&testtxtclt{in: "a"}, &testtxtclt{in: "z"}, &testtxtclt{in: "ä"},
		},
		want: []string{"a", "z", "ä"},
	}}

	for _, tc := range testCases {
		for _, tag := range strings.Split(tc.lang, ",") {
			collator := collate.New(language.MustParse(tag))
			config := NewDefaultConfig().SetTextCollator(collator)

			// Sort a private copy so the table entry keeps its declared
			// order for every tag.
			items := make(testtxtclts, len(tc.in))
			copy(items, tc.in)

			// Remember the input words: printing the item slice itself
			// would only show pointer values.
			input := make([]string, 0, len(items))
			for _, item := range items {
				item.collate(config)
				input = append(input, item.in)
			}
			sort.Sort(items)

			got := make([]string, 0, len(items))
			for _, item := range items {
				got = append(got, item.in)
			}
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("%v %v expected %v; got %v", tag, input, tc.want, got)
			}
		}
	}
}

// ByteSlices is the sort type for a slice of []byte.
type ByteSlices [][]byte
Expand All @@ -64,3 +180,27 @@ func (bs ByteSlices) Less(i, j int) bool {
// Swap exchanges the byte slices held at index i and index j.
func (bs ByteSlices) Swap(i, j int) {
	tmp := bs[i]
	bs[i] = bs[j]
	bs[j] = tmp
}

// testtxtclt is a test item pairing an input string with its collated form.
type testtxtclt struct {
// in is the input string to be collated.
in string
// clt holds the collated bytes of in, filled in by the collate method.
clt []byte
}

// collate converts item.in into its binary-collation using config and
// stores the resulting bytes in item.clt.
func (item *testtxtclt) collate(config *Config) {
	value := config.NewValue(item.in)
	collated := value.Tocollate(config.NewCollate(nil))
	item.clt = collated.Bytes()
}

// testtxtclts is a list of test items ordered by their collated bytes. It
// provides Len, Less and Swap so that it can be handed to sort.Sort.
type testtxtclts []*testtxtclt

// Len reports the number of items.
func (items testtxtclts) Len() int { return len(items) }

// Less reports whether the collated bytes of item i order strictly before
// the collated bytes of item j.
func (items testtxtclts) Less(i, j int) bool {
	left, right := items[i].clt, items[j].clt
	return bytes.Compare(left, right) == -1
}

// Swap exchanges the items at index i and index j.
func (items testtxtclts) Swap(i, j int) {
	tmp := items[i]
	items[i] = items[j]
	items[j] = tmp
}
15 changes: 13 additions & 2 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import "fmt"
import "math/big"
import "encoding/json"

import "golang.org/x/text/collate"

// NumberKind how to treat numbers.
type NumberKind byte

Expand Down Expand Up @@ -72,12 +74,15 @@ func NewDefaultConfig() *Config {
}

// init allocates the working state held by config and returns config. It
// creates a fresh 1K scratch buffer with a JSON encoder writing into it, a
// zero big.Float with 64 bits of precision, a fresh text-collation buffer,
// and a new set of memory pools sized from the configured strlen, numkeys,
// itemlen and ptrlen.
func (config *Config) init() *Config {
	// collateConfig
	config.buf = bytes.NewBuffer(make([]byte, 0, 1024)) // start with 1K
	config.enc = json.NewEncoder(config.buf)
	config.zf = big.NewFloat(0)
	config.zf.SetPrec(64)
	config.tcltbuffer = &collate.Buffer{}
	// mempools, built exactly once from the configured sizes.
	a, b, c, d := config.strlen, config.numkeys, config.itemlen, config.ptrlen
	config.pools = newMempool(a, b, c, d)
	return config
}

Expand Down Expand Up @@ -135,6 +140,12 @@ func (config Config) SetMaxkeys(n int) *Config {
return config.init()
}

// SetTextCollator sets the collator used to compile string values into
// sort-keys while collating. When no collator is set, strings are collated
// from their raw bytes. The receiver is left untouched; a new Config
// carrying the collator is returned.
//
// Note that this API is not stable.
func (config Config) SetTextCollator(collator *collate.Collator) *Config {
	config.textcollator = collator
	// Re-initialize, like the other setters, so that the returned copy owns
	// its own buffers instead of sharing them with the original config.
	return config.init()
}

// ResetPools configure a new set of pools with specified size, instead
// of using the default size: MaxStringLen, MaxKeys, MaxCollateLen, and,
// MaxJsonpointerLen.
Expand Down
9 changes: 8 additions & 1 deletion util.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,16 @@ func collateString(str string, code []byte, config *Config) (n int) {
code[0], code[1] = TypeMissing, Terminator
return 2
}
strcode := str2bytes(str)
if config.textcollator != nil {
config.tcltbuffer.Reset()
strcode = config.textcollator.Key(config.tcltbuffer, strcode)
strcode = strcode[:len(strcode)-1] // return text is null terminated
}

code[n] = TypeString
n++
n += suffixEncodeString(str2bytes(str), code[n:])
n += suffixEncodeString(strcode, code[n:])
code[n] = Terminator
n++
return n
Expand Down

0 comments on commit 936382f

Please sign in to comment.