-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit b817e65
Showing
9 changed files
with
282 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
name: golangci-lint | ||
on: | ||
push: | ||
branches: | ||
- master | ||
- main | ||
pull_request: | ||
|
||
permissions: | ||
contents: read | ||
pull-requests: read | ||
checks: write | ||
|
||
jobs: | ||
golangci: | ||
name: lint | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- uses: actions/setup-go@v5 | ||
with: | ||
go-version: '1.22' | ||
cache: false | ||
- name: golangci-lint | ||
uses: golangci/golangci-lint-action@v4 | ||
with: | ||
version: v1.56 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
name: test | ||
on: | ||
push: | ||
branches: | ||
- master | ||
- main | ||
pull_request: | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v3 | ||
|
||
- name: Set up Go | ||
uses: actions/setup-go@v4 | ||
with: | ||
go-version: '1.22' | ||
|
||
- name: Build | ||
run: go build . | ||
|
||
- name: Test | ||
run: go test -v ./... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# If you prefer the allow list template instead of the deny list, see community template: | ||
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore | ||
# | ||
# Binaries for programs and plugins | ||
*.exe | ||
*.exe~ | ||
*.dll | ||
*.so | ||
*.dylib | ||
|
||
# Test binary, built with `go test -c` | ||
*.test | ||
|
||
# Output of the go coverage tool, specifically when used with LiteIDE | ||
*.out | ||
|
||
# Dependency directories (remove the comment below to include it) | ||
# vendor/ | ||
|
||
# Go workspace file | ||
go.work |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Copyright (c) 2024 Vadim Yakshigulov | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# stopwords-iso | ||
stopwords-iso is a Go package that removes stop words from text. | ||
|
||
## Example | ||
|
||
You can remove stopwords by language | ||
```go | ||
package main | ||
import ( | ||
"fmt" | ||
sw "github.com/toadharvard/stopwords-iso" | ||
) | ||
|
||
func main() { | ||
stopwordsMapping, _ := sw.NewStopwordsMapping() | ||
|
||
originalString := "This is a sample string with some stopwords." | ||
language := "en" | ||
|
||
clearedString := stopwordsMapping.ClearStringByLang(originalString, language) | ||
fmt.Printf("Cleared string: %s\n", clearedString) | ||
} | ||
``` | ||
|
||
or remove all stopwords from all supported languages | ||
|
||
```go | ||
package main | ||
import ( | ||
"fmt" | ||
sw "github.com/toadharvard/stopwords-iso" | ||
) | ||
func main() { | ||
stopwordsMapping, _ := sw.NewStopwordsMapping() | ||
|
||
originalString := "the book on the table y la pluma es de ella und da Licht ist aus et la porte est ouverte и я it's" | ||
|
||
clearedString := stopwordsMapping.ClearString(originalString) | ||
fmt.Printf("Cleared string: %s\n", clearedString) | ||
} | ||
``` | ||
|
||
## Supported languages | ||
This package uses the [stopwords-iso](https://github.com/stopwords-iso/) words pack. All languages supported by stopwords-iso are listed here: https://github.com/stopwords-iso/stopwords-iso?tab=readme-ov-file#credits | ||
|
||
## License | ||
|
||
Distributed under the MIT license. | ||
See [LICENSE](LICENSE) for more information. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module github.com/toadharvard/stopwords-iso | ||
|
||
go 1.22.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package main | ||
|
||
import ( | ||
"encoding/json" | ||
"os" | ||
"regexp" | ||
"slices" | ||
"strings" | ||
) | ||
|
||
// StopwordsMapping maps a language code to the list of stopwords for that
// language.
type StopwordsMapping map[string][]string

// ISOCode639_1 is an alias for string used for language codes passed to
// ClearStringByLang (documented there as ISO 639-1 format).
type ISOCode639_1 = string

// wordSegmenter matches runs of Unicode letters, spacing and non-spacing
// marks, apostrophes, underscores and hyphens. The hyphen is placed last in
// the character class so it cannot be misread as a range operator.
var wordSegmenter = regexp.MustCompile(`[\pL\p{Mc}\p{Mn}_'-]+`)
|
||
// standardizeSpaces splits str on whitespace and rejoins the pieces with a
// single space, which also drops leading and trailing whitespace.
func standardizeSpaces(str string) string {
	parts := strings.Fields(str)
	return strings.Join(parts, " ")
}
|
||
// NewStopwordsMapping initializes a new StopwordsMapping from a JSON file. | ||
// | ||
// Returns: | ||
// - StopwordsMapping: a map containing language to stopwords mapping. | ||
// - error: an error object if an error occurred while reading or unmarshaling the JSON file. | ||
func NewStopwordsMapping() (StopwordsMapping, error) { | ||
jsonFile, err := os.ReadFile("stopwords-iso.json") | ||
if err != nil { | ||
return *new(StopwordsMapping), err | ||
} | ||
|
||
mapping := make(map[string][]string) | ||
json.Unmarshal(jsonFile, &mapping) | ||
return mapping, nil | ||
} | ||
|
||
// ClearStringByLang clears the given string by removing all stopwords in the specified language. | ||
// | ||
// Parameters: | ||
// - str: the string to be cleared. | ||
// - language: the language of the stopwords to be removed in ISO 639-1 format. | ||
// | ||
// Return: | ||
// - string: the cleared string. | ||
func (m *StopwordsMapping) ClearStringByLang(str string, language ISOCode639_1) string { | ||
language = strings.ToLower(language) | ||
|
||
str = standardizeSpaces(str) | ||
|
||
words := wordSegmenter.FindAllString(str, -1) | ||
|
||
filtered := []string{} | ||
for _, word := range words { | ||
if !slices.Contains((*m)[language], strings.ToLower(word)) { | ||
filtered = append(filtered, word) | ||
} | ||
} | ||
return strings.Join(filtered, " ") | ||
} | ||
|
||
// ClearString clears the given string by removing stopwords for all languages. | ||
// | ||
// Parameters: | ||
// - str: the string to be cleared. | ||
// | ||
// Returns: | ||
// - string: the cleared string. | ||
func (m *StopwordsMapping) ClearString(str string) string { | ||
for language := range *m { | ||
str = m.ClearStringByLang(str, language) | ||
} | ||
return str | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package main | ||
|
||
import ( | ||
"os" | ||
"testing" | ||
) | ||
|
||
func TestNewStopwordsMapping(t *testing.T) { | ||
mapping, err := NewStopwordsMapping() | ||
if err != nil { | ||
t.Errorf("NewStopwordsMapping() error = %v, wantErr %v", err, false) | ||
} | ||
if len(mapping) == 0 { | ||
t.Errorf("NewStopwordsMapping() returned an empty mapping") | ||
} | ||
// Test if the mapping contains expected languages | ||
if _, exists := mapping["en"]; !exists { | ||
t.Errorf("NewStopwordsMapping() should contain 'en' key for English stopwords") | ||
} | ||
// More specific tests can be added here | ||
} | ||
|
||
func TestClearStringByLang(t *testing.T) { | ||
mapping, _ := NewStopwordsMapping() // Assuming that NewStopwordsMapping works correctly | ||
|
||
tests := []struct { | ||
name string | ||
str string | ||
language ISOCode639_1 | ||
want string | ||
}{ | ||
{"English", "this is a special", "en", "special"}, | ||
{"Non-Existent Language", "this should not change", "xx", "this should not change"}, | ||
{"Case Insensitivity", "THIS is A Google", "EN", "Google"}, | ||
{"No Stopwords", "uniqueword", "en", "uniqueword"}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
if got := mapping.ClearStringByLang(tt.str, tt.language); got != tt.want { | ||
t.Errorf("StopwordsMapping.ClearStringByLang() got = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestStopwordsMapping_ClearString(t *testing.T) { | ||
mapping, _ := NewStopwordsMapping() | ||
|
||
testStr := "А ИЛИ ОН it's ok vadim" | ||
want := "vadim" | ||
|
||
got := mapping.ClearString(testStr) | ||
if got != want { | ||
t.Errorf("StopwordsMapping.ClearString() got = %v, want %v", got, want) | ||
} | ||
} | ||
|
||
func TestNewStopwordsMapping_Error(t *testing.T) { | ||
// Temporarily rename the stopwords file to simulate an error | ||
os.Rename("stopwords-iso.json", "stopwords-iso_backup.json") | ||
defer os.Rename("stopwords-iso_backup.json", "stopwords-iso.json") | ||
|
||
_, err := NewStopwordsMapping() | ||
if err == nil { | ||
t.Errorf("Expected an error when stopwords-iso.json does not exist") | ||
} | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.