from __future__ import annotations
import html
import re
from lingua import IsoCode639_1, LanguageDetectorBuilder
from onegov.core.orm import find_models
from typing import Any, Generic, TypeVar, TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Callable, Iterator, Sequence
from lingua import ConfidenceValue
from onegov.search.mixins import Searchable
# Type variables used by the annotations in this module. ``T`` is referenced
# by ``searchable_sqlalchemy_models`` and ``iter_subclasses``, ``T_co`` is the
# covariant result type of ``classproperty``.
T = TypeVar('T')
T_co = TypeVar('T_co', covariant=True)
# XXX this is doubly defined in onegov.org.utils, maybe move to a common
# regex module in onegov.core
# matches a '#' followed by at least three word characters, unless the '#'
# is directly preceded by a word character or a slash
HASHTAG = re.compile(r'(?<![\w/])#\w{3,}')
def searchable_sqlalchemy_models(
    base: type[T]
) -> Iterator[type[Searchable]]:
    """ Searches through the given SQLAlchemy base and yields the classes
    of all SQLAlchemy models found which inherit from the
    :class:`onegov.search.mixins.Searchable` interface.

    :param base:
        The base class which is handed to
        :func:`onegov.core.orm.find_models` to look up the models.

    :return:
        An iterator over the model classes for which
        ``issubclass(cls, Searchable)`` holds.

    """

    # XXX circular imports
    from onegov.search import Searchable

    yield from find_models(  # type:ignore[misc]
        base, lambda cls: issubclass(cls, Searchable)
    )
# characters which may not appear in an index name: backslash, slash,
# question mark, double quote, angle brackets, pipe, whitespace, comma,
# uppercase ASCII letters and colon
_invalid_index_characters = re.compile(r'[\\/?"<>|\s,A-Z:]+')


def is_valid_index_name(name: str) -> bool:
    """ Checks if the given name is a valid elasticsearch index name.

    Elasticsearch does its own checks, but we can do it earlier and we are
    a bit stricter.

    A name is rejected if it starts with an underscore or a dot, if it
    contains any character matched by ``_invalid_index_characters`` or if
    it contains a wildcard (``*``).

    :param name:
        The index name to check.

    :return:
        True if none of the checks above rejected the name.

    """

    if name.startswith(('_', '.')):
        return False

    if _invalid_index_characters.search(name):
        return False

    # wildcards are never part of a concrete index name
    return '*' not in name
def is_valid_type_name(name: str) -> bool:
    """ Checks if the given name is a valid type name.

    The type name may be part of the index name, so the same check as for
    index names is used.

    :param name:
        The type name to check.

    :return:
        The result of :func:`is_valid_index_name` for the given name.

    """
    return is_valid_index_name(name)
def normalize_index_segment(segment: str, allow_wildcards: bool) -> str:
    """ Turns the given segment into a string usable inside an index name.

    The segment is lowercased, every run of characters matched by
    ``_invalid_index_characters`` is replaced by a single underscore and
    dots as well as dashes are replaced by underscores.

    :param segment:
        The text to normalize.

    :param allow_wildcards:
        If False, wildcards (``*``) are replaced by underscores too,
        otherwise they are kept.

    :return:
        The normalized segment.

    """
    valid = _invalid_index_characters.sub('_', segment.lower())

    if not allow_wildcards:
        valid = valid.replace('*', '_')

    return valid.replace('.', '_').replace('-', '_')
class classproperty(Generic[T_co]):  # noqa: N801
    """ Descriptor which evaluates the wrapped function with the owning
    class whenever the attribute is looked up, be it on the class or on
    one of its instances.

    The wrapped function may additionally be decorated with
    ``classmethod``, in which case the underlying function is used.

    """

    def __init__(self, f: Callable[[type[Any]], T_co]) -> None:
        if isinstance(f, classmethod):
            # unwrap classmethod decorator which is used for typing
            f = f.__func__  # type:ignore[unreachable]

        # the getter has to be stored, __get__ calls it with the owner
        self.f = f

    def __get__(self, obj: object | None, owner: type[object]) -> T_co:
        # the instance (if any) is ignored, only the owner class is passed
        return self.f(owner)
def iter_subclasses(baseclass: type[T]) -> Iterator[type[T]]:
    """ Yields the direct subclasses of the given class, each one followed
    by its own direct subclasses.

    Classes further down the hierarchy are not visited. A class which
    inherits from more than one of the visited classes is yielded once
    per parent.

    :param baseclass:
        The class whose subclasses are iterated.

    """
    for subclass in baseclass.__subclasses__():
        yield subclass

        # FIXME: Why are we only iterating two levels of inheritance?
        yield from subclass.__subclasses__()
class LanguageDetector:
    """ Detects languages with the help of lingua-language-detector.

    :param supported_languages:
        The ISO 639-1 codes of the languages the detector is built for.
        The first entry is used as fallback by :meth:`detect`.

    """

    def __init__(self, supported_languages: Sequence[str]) -> None:
        self.supported_languages = supported_languages

        # the detector is restricted to the supported languages
        self.detector = LanguageDetectorBuilder.from_iso_codes_639_1(*(
            IsoCode639_1.from_str(language)
            for language in supported_languages
        )).build()

    def detect(self, text: str) -> str:
        """ Returns the lowercased ISO 639-1 code name of the language
        detected for the given text.

        If the detector returns no language, the first supported language
        is returned as given to the constructor.

        """
        language = self.detector.detect_language_of(text)

        if language is None:
            # fallback to the first supported language
            return self.supported_languages[0]

        return language.iso_code_639_1.name.lower()

    def probabilities(self, text: str) -> list[ConfidenceValue]:
        """ Returns the confidence values the detector computes for the
        given text.

        """
        return self.detector.compute_language_confidence_values(text)