
New languages support (#16)
Co-authored-by: Andrea Sterbini <[email protected]>
3 people authored Sep 13, 2022
1 parent 806cda8 commit 4bc9fe0
Showing 6 changed files with 122 additions and 46 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -29,7 +29,7 @@ pip install spacy-wordnet


### Supported languages
- We currently support Spanish, English and Portuguese, but we welcome contributions in order to add and test new languages supported by spaCy and NLTK.
+ Almost all Open Multi Wordnet languages are supported.

## Usage

@@ -49,12 +49,12 @@ import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator

- # Load an spacy model (for one of the supported languages: Spanish, English or Portuguese)
+ # Load an spacy model
nlp = spacy.load('en_core_web_sm')
# Spacy 3.x
nlp.add_pipe("spacy_wordnet", after='tagger', config={'lang': nlp.lang})
nlp.add_pipe("spacy_wordnet", after='tagger')
# Spacy 2.x
- # nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')
+ # nlp.add_pipe(WordnetAnnotator(nlp, name="spacy_wordnet"), after='tagger')
token = nlp('prices')[0]

# wordnet object link spacy token with nltk wordnet interface by giving acces to
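The README snippet is truncated at this point in the diff. As a minimal sketch of where it leads, assuming spaCy 3.x, an installed en_core_web_sm model, and the accessors the README documents (synsets(), lemmas(), wordnet_domains()):

import spacy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("spacy_wordnet", after="tagger")  # language is now taken from nlp.lang

token = nlp("prices")[0]

# The `wordnet` token extension wraps the NLTK WordNet interface
# for the pipeline's language.
print(token._.wordnet.synsets())
print(token._.wordnet.lemmas())
print(token._.wordnet.wordnet_domains())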
1 change: 1 addition & 0 deletions setup.cfg
@@ -27,6 +27,7 @@ setup_requires = pyscaffold>=3.1a0,<3.2a0
# Add here dependencies of your project (semicolon/line-separated), e.g.
install_requires =
nltk>=3.3,<3.6
+     spacy>=2
# The usage of test_requires is discouraged, see `Dependency Management` docs
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
77 changes: 61 additions & 16 deletions spacy_wordnet/__utils__.py
@@ -3,24 +3,65 @@
from collections import defaultdict
from typing import Optional, List, Dict, Set

- from nltk.corpus.reader.wordnet import \
-     ADJ as WN_ADJ, \
-     ADV as WN_ADV, \
-     NOUN as WN_NOUN, \
-     VERB as WN_VERB, Synset
+ from nltk.corpus.reader.wordnet import (
+     ADJ as WN_ADJ,
+     ADV as WN_ADV,
+     NOUN as WN_NOUN,
+     VERB as WN_VERB,
+     Synset,
+ )

from spacy.parts_of_speech import ADJ, ADV, NOUN, VERB, AUX

from spacy_wordnet import get_package_basepath

- __DEFAULT_LANG = 'spa'
- __WN_LANGUAGES_MAPPING = dict(es=__DEFAULT_LANG, en='eng', pt='por')
+ # The Open Multi Wordnet corpus contains the following languages:
+ # als arb bul cat cmn dan ell eus fas fin fra glg heb hrv ind ita jpn nld nno nob pol por qcn slv spa swe tha zsm
+ # ('deu' can be found in Extended Open Multi Wordnet)
+ # the available spacy languages are:
+ # af am ar bg bn ca cs da de el en es et eu fa fi fr ga gu he hi hr hu hy id is it ja kn ko ky lb lij
+ # lt lv mk ml mr nb ne nl pl pt ro ru sa si sk sl sq sr sv ta te th ti tl tn tr tt uk ur vi xx yo zh
+ # then the full mapping is
+ __DEFAULT_LANG = "spa"
+ __WN_LANGUAGES_MAPPING = dict(
+     es=__DEFAULT_LANG,
+     en="eng",
+     fr="fra",
+     it="ita",
+     pt="por",
+     de="deu",
+     # other languages from omw
+     sq="als",  # Albanian
+     ar="arb",  # Arabic
+     bg="bul",  # Bulgarian
+     ca="cat",  # Catalan
+     zh="cmn",  # Chinese Open Wordnet
+     da="dan",  # Danish
+     el="ell",  # Greek
+     eu="eus",  # Basque
+     fa="fas",  # Persian
+     fi="fin",  # Finnish
+     # ?? ='glg',  # Galician
+     he="heb",  # Hebrew
+     hr="hrv",  # Croatian
+     id="ind",  # Indonesian
+     ja="jpn",  # Japanese
+     nl="nld",  # Dutch
+     # no ='nno',  # Norwegian
+     # nb ='nob',  # Norwegian Bokmal
+     pl="pol",  # Polish
+     # ?? ='qcn',  # Chinese (Taiwan)
+     sl="slv",  # Slovenian
+     sv="swe",  # Swedish
+     th="tha",  # Thai
+     ml="zsm",  # Malayalam
+ )
__WN_POS_MAPPING = {
    ADJ: WN_ADJ,
    NOUN: WN_NOUN,
    ADV: WN_ADV,
    VERB: WN_VERB,
-     AUX: WN_VERB
+     AUX: WN_VERB,
}


@@ -32,18 +32,22 @@ def fetch_wordnet_lang(lang: Optional[str] = None) -> str:
language = __WN_LANGUAGES_MAPPING.get(lang, None)

if not language:
-         raise Exception('Language {} not supported'.format(lang))
+         raise Exception("Language {} not supported".format(lang))

return language


def _persist_wordnet_domains(path: str, domains: Dict[str, Set[str]]):
-     with open(path, 'w') as file:
-         file.writelines(['{}\n'.format('\t'.join([k, ' '.join(v)])) for k, v in domains.items()])
+     with open(path, "w") as file:
+         file.writelines(
+             ["{}\n".format("\t".join([k, " ".join(v)])) for k, v in domains.items()]
+         )
file.close()


- def _load_wordnet_domains_from_ppv(path: str, threshold: float = 0.00009) -> Dict[str, List[str]]:
+ def _load_wordnet_domains_from_ppv(
+     path: str, threshold: float = 0.00009
+ ) -> Dict[str, List[str]]:
def domain_name_from_filename(filename: str) -> str:
name, _ = os.path.splitext(os.path.basename(filename))
return name
@@ -52,8 +97,8 @@ def domain_name_from_filename(filename: str) -> str:

for filename in glob.glob(path):
domain_name = domain_name_from_filename(filename)
-         for line in open(filename, 'r'):
-             ssid, weight = line.strip().split('\t')
+         for line in open(filename, "r"):
+             ssid, weight = line.strip().split("\t")
if float(weight) >= threshold:
domains_by_ssid[ssid].append(domain_name)

@@ -63,8 +108,8 @@ def domain_name_from_filename(filename: str) -> str:
def _load_wordnet_domains_from_txt(filepath: str) -> Dict[str, List[str]]:
domains_by_ssid = defaultdict(list)
for filename in glob.glob(filepath):
-         for line in open(filename, 'r'):
-             row = line.strip().split(' ')
+         for line in open(filename, "r"):
+             row = line.strip().split(" ")
ssid = row[0]
domains = row[2:]
domains_by_ssid[ssid].extend(domains)
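For reference, a quick sketch of how the widened mapping behaves, using the fetch_wordnet_lang helper shown in the hunk above:

from spacy_wordnet.__utils__ import fetch_wordnet_lang

# spaCy ISO codes translate to Open Multi Wordnet codes
assert fetch_wordnet_lang("en") == "eng"
assert fetch_wordnet_lang("ja") == "jpn"
assert fetch_wordnet_lang("es") == "spa"

# Codes outside the mapping still raise, as before
try:
    fetch_wordnet_lang("xx")
except Exception as error:
    print(error)  # Language xx not supported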
18 changes: 12 additions & 6 deletions spacy_wordnet/wordnet_annotator.py
@@ -1,31 +1,37 @@
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token
from spacy.parts_of_speech import *
from spacy.language import Language

from spacy_wordnet.wordnet_domains import Wordnet, load_wordnet_domains

try:

@Language.factory("spacy_wordnet", default_config={"lang": "en"})
def wordnet_annotator(nlp, name, lang: str):
return WordnetAnnotator(lang=lang)

@Language.factory("spacy_wordnet", default_config={})
def wordnet_annotator(nlp, name):
return WordnetAnnotator(nlp=nlp, name=name)

except AttributeError:

pass # spacy 2.x


class WordnetAnnotator(object):
__FIELD = "wordnet"

-     def __init__(self, lang: str = "es"):
+     def __init__(self, nlp: Language, name: str):
Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
load_wordnet_domains()
-         self.__lang = lang
+         self.__lang = nlp.lang

def __call__(self, doc: Doc):
for token in doc:
wordnet = Wordnet(token=token, lang=self.__lang)
token._.set(WordnetAnnotator.__FIELD, wordnet)

return doc


+ if hasattr(Language, "factory"):
+     # SpaCy 3.x
+     Language.factory("wordnet")(WordnetAnnotator)
49 changes: 39 additions & 10 deletions spacy_wordnet/wordnet_domains.py
@@ -3,36 +3,53 @@

from spacy_wordnet.__utils__ import *

- __WN_DOMAINS_PATH = os.path.join(get_package_basepath(), 'data/wordnet_domains.txt')
+ __WN_DOMAINS_PATH = os.path.join(get_package_basepath(), "data/wordnet_domains.txt")

__WN_DOMAINS_BY_SSID = defaultdict(list)


def wordnet_domains_path() -> str:
return __WN_DOMAINS_PATH


def load_wordnet_domains(path: Optional[str] = wordnet_domains_path()):
if __WN_DOMAINS_BY_SSID:
return

-     for line in open(path, 'r'):
-         ssid, domains = line.strip().split('\t')
-         __WN_DOMAINS_BY_SSID[ssid] = domains.split(' ')
+     for line in open(path, "r"):
+         ssid, domains = line.strip().split("\t")
+         __WN_DOMAINS_BY_SSID[ssid] = domains.split(" ")


def get_domains_for_synset(synset: Synset) -> List[str]:
-     ssid = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())
+     ssid = "{}-{}".format(str(synset.offset()).zfill(8), synset.pos())
return __WN_DOMAINS_BY_SSID.get(ssid, [])


class Wordnet(object):

-     def __init__(self, token: Token, lang: str = 'es'):
+     # # TODO: add serialization
+     # def to_disk(self, path):
+     #     # save:
+     #     #   __token?
+     #     #   __lang?
+     #     #   __synsets
+     #     #   __lemmas
+     #     #   __wordnet_domains
+     #     pass
+     # def from_disk(self, path):
+     #     pass
+
+     def __init__(self, token: Token, lang: str = "es"):
self.__token = token
self.__lang = fetch_wordnet_lang(lang)
self.__synsets = self.__find_synsets(token, self.__lang)
self.__lemmas = self.__find_lemmas()
self.__wordnet_domains = self.__find_wordnet_domains()

+     def lang(self):
+         return self.__lang

def synsets(self):
return self.__synsets

@@ -46,7 +63,9 @@ def wordnet_domains_for_synset(self, synset: Synset):
return get_domains_for_synset(synset)

def wordnet_synsets_for_domain(self, domains: List[str]):
-         return [synset for synset in self.synsets() if self.__has_domains(synset, domains)]
+         return [
+             synset for synset in self.synsets() if self.__has_domains(synset, domains)
+         ]

@staticmethod
def __find_synsets(token: Token, lang: str):
@@ -56,7 +75,9 @@ def __find_synsets(token: Token, lang: str):
word_variants.append(token.lemma_)

for word in word_variants:
-             token_synsets = wn.synsets(word, pos=spacy2wordnet_pos(token.pos), lang=lang)
+             token_synsets = wn.synsets(
+                 word, pos=spacy2wordnet_pos(token.pos), lang=lang
+             )
if token_synsets:
return token_synsets

@@ -67,7 +88,15 @@ def __has_domains(synset: Synset, domains: List[str]) -> bool:
return not set(domains).isdisjoint(get_domains_for_synset(synset))

def __find_wordnet_domains(self):
-         return [domain for synset in self.synsets() for domain in get_domains_for_synset(synset)]
+         return [
+             domain
+             for synset in self.synsets()
+             for domain in get_domains_for_synset(synset)
+         ]

def __find_lemmas(self):
-         return [lemma for synset in self.synsets() for lemma in synset.lemmas(lang=self.__lang)]
+         return [
+             lemma
+             for synset in self.synsets()
+             for lemma in synset.lemmas(lang=self.__lang)
+         ]
15 changes: 5 additions & 10 deletions tests/test_wordnet_annotator.py
@@ -1,12 +1,7 @@
import unittest
- from collections import defaultdict

import spacy

- import numpy as np

- from itertools import product

from spacy_wordnet.wordnet_annotator import WordnetAnnotator


@@ -19,11 +14,11 @@ def __init__(self, *args, **kwargs):

try:
# Add wordnet component
self.nlp_en.add_pipe("spacy_wordnet", config={"lang": self.nlp_en.lang})
self.nlp_es.add_pipe("spacy_wordnet", config={"lang": self.nlp_es.lang})
except TypeError: # spacy 2.x
self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
self.nlp_en.add_pipe("spacy_wordnet")
self.nlp_es.add_pipe("spacy_wordnet")
except (ValueError, TypeError): # spacy 2.x
self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en, name="spacy_wordnet"))
self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es, name="spacy_wordnet"))

def test_english_annotations(self):

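The fixture above mirrors the version split in wordnet_annotator.py. A minimal smoke test in the same style, assuming the model used by the suite is installed:

import unittest

import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator


class WordnetAnnotatorSmokeTest(unittest.TestCase):
    def test_token_gets_wordnet_extension(self):
        nlp = spacy.load("en_core_web_sm")
        try:
            nlp.add_pipe("spacy_wordnet")  # spacy 3.x
        except (ValueError, TypeError):  # spacy 2.x
            nlp.add_pipe(WordnetAnnotator(nlp, name="spacy_wordnet"))

        token = nlp("prices")[0]
        self.assertIsNotNone(token._.wordnet)
        self.assertTrue(token._.wordnet.synsets())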

