Skip to content

Commit

Permalink
feat 🚀: initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
toadharvard committed Mar 25, 2024
0 parents commit b817e65
Show file tree
Hide file tree
Showing 9 changed files with 282 additions and 0 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Lint workflow: runs golangci-lint on pushes to master/main and on every
# pull request.
name: golangci-lint

on:
  push:
    branches: [master, main]
  pull_request:

# Token permissions requested for this workflow.
permissions:
  contents: read
  pull-requests: read
  checks: write

jobs:
  golangci:
    name: lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.22'
          # NOTE(review): setup-go caching is disabled here, presumably so
          # that the golangci-lint action can manage its own cache; confirm.
          cache: false
      - name: golangci-lint
        uses: golangci/golangci-lint-action@v4
        with:
          version: v1.56
24 changes: 24 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Test workflow: builds the module and runs its tests on pushes to
# master/main and on every pull request.
name: test
on:
  push:
    branches:
      - master
      - main
  pull_request:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # checkout and setup-go use the same major versions as the lint
      # workflow so both pipelines run on the same action releases.
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.22'

      - name: Build
        run: go build .

      - name: Test
        run: go test -v ./...
21 changes: 21 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# If you prefer the allow list template instead of the deny list, see community template:
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
#
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, built with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Dependency directories (remove the comment below to include it)
# vendor/

# Go workspace file
go.work
19 changes: 19 additions & 0 deletions LICENCE
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
Copyright (c) 2024 Vadim Yakshigulov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
47 changes: 47 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# stopwords-iso
stopwords-iso is a Go package that removes stop words from text.

## Example

You can remove stopwords by language
```go
package main
import (
"fmt"

sw "github.com/toadharvard/stopwords-iso"
)

func main() {
stopwordsMapping, _ := sw.NewStopwordsMapping()

originalString := "This is a sample string with some stopwords."
language := "en"

clearedString := stopwordsMapping.ClearStringByLang(originalString, language)
fmt.Printf("Cleared string: %s\n", clearedString)
}
```

or remove all stopwords from all supported languages

```go
package main
import (
"fmt"

sw "github.com/toadharvard/stopwords-iso"
)
func main() {
stopwordsMapping, _ := sw.NewStopwordsMapping()

originalString := "the book on the table y la pluma es de ella und da Licht ist aus et la porte est ouverte и я it's"

clearedString := stopwordsMapping.ClearString(originalString)
fmt.Printf("Cleared string: %s\n", clearedString)
}
```

## Supported languages
This package uses the [stopwords-iso](https://github.com/stopwords-iso/) words pack. All languages supported by stopwords-iso are listed here: https://github.com/stopwords-iso/stopwords-iso?tab=readme-ov-file#credits

## License

Distributed under the MIT license.
See [LICENCE](LICENCE) for more information.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/toadharvard/stopwords-iso

go 1.22.0
72 changes: 72 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package main

import (
"encoding/json"
"os"
"regexp"
"slices"
"strings"
)

// StopwordsMapping maps a language code to the list of stopwords for that
// language.
type StopwordsMapping map[string][]string
// ISOCode639_1 is an alias for string, used for parameters that carry a
// language code in ISO 639-1 format (for example "en").
type ISOCode639_1 = string

// wordSegmenter matches a single token: a run of one or more characters that
// are Unicode letters (\pL), spacing or non-spacing combining marks (\p{Mc},
// \p{Mn}), or one of the literal characters '-', '_' and '\''. Any other
// character, such as a digit or other punctuation, is not part of a token.
var wordSegmenter = regexp.MustCompile(`[\pL\p{Mc}\p{Mn}-_']+`)

// standardizeSpaces trims leading and trailing whitespace from str and
// collapses every internal run of whitespace into a single space.
func standardizeSpaces(str string) string {
	var b strings.Builder
	for i, field := range strings.Fields(str) {
		if i > 0 {
			b.WriteByte(' ')
		}
		b.WriteString(field)
	}
	return b.String()
}

// NewStopwordsMapping initializes a new StopwordsMapping from a JSON file.
//
// Returns:
// - StopwordsMapping: a map containing language to stopwords mapping.
// - error: an error object if an error occurred while reading or unmarshaling the JSON file.
func NewStopwordsMapping() (StopwordsMapping, error) {
jsonFile, err := os.ReadFile("stopwords-iso.json")
if err != nil {
return *new(StopwordsMapping), err
}

mapping := make(map[string][]string)
json.Unmarshal(jsonFile, &mapping)

Check failure on line 32 in main.go

View workflow job for this annotation

GitHub Actions / lint

Error return value of `json.Unmarshal` is not checked (errcheck)
return mapping, nil
}

// ClearStringByLang removes from str every word that is listed as a stopword
// for the given language.
//
// The input is whitespace-normalized and split into tokens with
// wordSegmenter, so characters that the segmenter does not match are not
// carried over to the result. Tokens are lowercased only for the comparison;
// the ones that are kept retain their original case and are joined with
// single spaces. A language with no entry in the mapping removes nothing.
//
// Parameters:
//   - str: the string to be cleared.
//   - language: the stopword language in ISO 639-1 format; it is lowercased
//     before the lookup.
//
// Returns:
//   - string: the cleared string.
func (m *StopwordsMapping) ClearStringByLang(str string, language ISOCode639_1) string {
	langKey := strings.ToLower(language)
	tokens := wordSegmenter.FindAllString(standardizeSpaces(str), -1)

	isStopword := func(token string) bool {
		return slices.Contains((*m)[langKey], strings.ToLower(token))
	}
	return strings.Join(slices.DeleteFunc(tokens, isStopword), " ")
}

// ClearString removes the stopwords of every language in the mapping from
// str by applying ClearStringByLang once per language, feeding the result of
// one pass into the next. With an empty mapping, str is returned unchanged.
//
// Parameters:
//   - str: the string to be cleared.
//
// Returns:
//   - string: the cleared string.
func (m *StopwordsMapping) ClearString(str string) string {
	cleared := str
	for lang := range *m {
		cleared = m.ClearStringByLang(cleared, lang)
	}
	return cleared
}
68 changes: 68 additions & 0 deletions main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package main

import (
"os"
"testing"
)

// TestNewStopwordsMapping checks that the stopwords file loads without an
// error, yields a non-empty mapping and contains an entry for English.
func TestNewStopwordsMapping(t *testing.T) {
	loaded, loadErr := NewStopwordsMapping()
	if loadErr != nil {
		t.Errorf("NewStopwordsMapping() error = %v, wantErr %v", loadErr, false)
	}

	if len(loaded) == 0 {
		t.Errorf("NewStopwordsMapping() returned an empty mapping")
	}

	// English serves as the representative language that must be present.
	_, hasEnglish := loaded["en"]
	if !hasEnglish {
		t.Errorf("NewStopwordsMapping() should contain 'en' key for English stopwords")
	}
}

// TestClearStringByLang runs ClearStringByLang over a table of inputs that
// cover plain English text, an unknown language code, an upper-case language
// code with mixed-case words, and a string without stopwords.
func TestClearStringByLang(t *testing.T) {
	mapping, err := NewStopwordsMapping()
	if err != nil {
		// Without the stopword data every case below would fail with a
		// misleading message, so stop at the real cause.
		t.Fatalf("NewStopwordsMapping() error = %v", err)
	}

	tests := []struct {
		name     string
		str      string
		language ISOCode639_1
		want     string
	}{
		{"English", "this is a special", "en", "special"},
		{"Non-Existent Language", "this should not change", "xx", "this should not change"},
		{"Case Insensitivity", "THIS is A Google", "EN", "Google"},
		{"No Stopwords", "uniqueword", "en", "uniqueword"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := mapping.ClearStringByLang(tt.str, tt.language); got != tt.want {
				t.Errorf("StopwordsMapping.ClearStringByLang() got = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestStopwordsMapping_ClearString checks that ClearString removes stopwords
// of several languages from one mixed-language input and keeps the only
// word that is expected not to be a stopword.
func TestStopwordsMapping_ClearString(t *testing.T) {
	mapping, err := NewStopwordsMapping()
	if err != nil {
		// An unloaded mapping would make the comparison below fail for the
		// wrong reason, so report the load error directly.
		t.Fatalf("NewStopwordsMapping() error = %v", err)
	}

	testStr := "А ИЛИ ОН it's ok vadim"
	want := "vadim"

	got := mapping.ClearString(testStr)
	if got != want {
		t.Errorf("StopwordsMapping.ClearString() got = %v, want %v", got, want)
	}
}

func TestNewStopwordsMapping_Error(t *testing.T) {
// Temporarily rename the stopwords file to simulate an error
os.Rename("stopwords-iso.json", "stopwords-iso_backup.json")

Check failure on line 61 in main_test.go

View workflow job for this annotation

GitHub Actions / lint

Error return value of `os.Rename` is not checked (errcheck)
defer os.Rename("stopwords-iso_backup.json", "stopwords-iso.json")

Check failure on line 62 in main_test.go

View workflow job for this annotation

GitHub Actions / lint

Error return value of `os.Rename` is not checked (errcheck)

_, err := NewStopwordsMapping()
if err == nil {
t.Errorf("Expected an error when stopwords-iso.json does not exist")
}
}
1 change: 1 addition & 0 deletions stopwords-iso.json

Large diffs are not rendered by default.

0 comments on commit b817e65

Please sign in to comment.