Skip to content

Commit 544ea39

Browse files
authored
Merge pull request #932 from gavishpoddar/language
Optional Language Detect
2 parents 44e8624 + b8dcf7b commit 544ea39

25 files changed

+1218
-25
lines changed

dateparser/__init__.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88

99
@apply_settings
10-
def parse(date_string, date_formats=None, languages=None, locales=None, region=None, settings=None):
10+
def parse(date_string, date_formats=None, languages=None, locales=None,
11+
region=None, settings=None, detect_languages_function=None):
1112
"""Parse date and time from given date string.
1213
1314
:param date_string:
@@ -39,6 +40,12 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
3940
Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
4041
:type settings: dict
4142
43+
:param detect_languages_function:
44+
A function for language detection that takes as input a string (the `date_string`) and
45+
a `confidence_threshold`, and returns a list of detected language codes.
46+
Note: this function is only used if ``languages`` and ``locales`` are not provided.
47+
:type detect_languages_function: function
48+
4249
:return: Returns :class:`datetime <datetime.datetime>` representing parsed date if successful, else returns None
4350
:rtype: :class:`datetime <datetime.datetime>`.
4451
:raises:
@@ -47,9 +54,9 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
4754
"""
4855
parser = _default_parser
4956

50-
if languages or locales or region or not settings._default:
57+
if languages or locales or region or detect_languages_function or not settings._default:
5158
parser = DateDataParser(languages=languages, locales=locales,
52-
region=region, settings=settings)
59+
region=region, settings=settings, detect_languages_function=detect_languages_function)
5360

5461
data = parser.get_date_data(date_string, date_formats)
5562

dateparser/conf.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from datetime import datetime
33
from functools import wraps
44

5+
from dateparser.data.languages_info import language_order
56
from .parser import date_order_chart
67
from .utils import registry
78

@@ -25,6 +26,8 @@ class Settings:
2526
* `NORMALIZE`
2627
* `RETURN_TIME_AS_PERIOD`
2728
* `PARSERS`
29+
* `DEFAULT_LANGUAGES`
30+
* `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD`
2831
"""
2932

3033
_default = True
@@ -129,6 +132,28 @@ def _check_parsers(setting_name, setting_value):
129132
_check_repeated_values(setting_name, setting_value)
130133

131134

135+
def _check_default_languages(setting_name, setting_value):
    """
    Validate a setting whose value is a collection of language codes.

    :param setting_name:
        Name of the setting being validated; only used in the error message.
    :type setting_name: str

    :param setting_value:
        The language codes to validate against ``language_order``.
    :type setting_value: list

    :raises:
        ``SettingValidationError`` if any code is not present in ``language_order``.
    """
    supported_languages = set(language_order)

    # Collect offenders in the order they were given (each one once) so the
    # error message is deterministic instead of depending on set iteration order.
    unsupported_languages = []
    for language in setting_value:
        if language not in supported_languages and language not in unsupported_languages:
            unsupported_languages.append(language)

    if unsupported_languages:
        raise SettingValidationError(
            "Found invalid languages in the '{}' setting: {}".format(
                setting_name, ', '.join(map(repr, unsupported_languages))
            )
        )

    # Remaining validation is delegated to the shared helper
    # (presumably it rejects duplicated entries -- see its definition).
    _check_repeated_values(setting_name, setting_value)
144+
145+
146+
def _check_between_0_and_1(setting_name, setting_value):
147+
is_valid = 0 <= setting_value <= 1
148+
if not is_valid:
149+
raise SettingValidationError(
150+
'{} is not a valid value for {}. It can take values between 0 and '
151+
'1.'.format(
152+
setting_value, setting_name,
153+
)
154+
)
155+
156+
132157
def check_settings(settings):
133158
"""
134159
Check if provided settings are valid, if not it raises `SettingValidationError`.
@@ -193,6 +218,14 @@ def check_settings(settings):
193218
'PREFER_LOCALE_DATE_ORDER': {
194219
'type': bool
195220
},
221+
'DEFAULT_LANGUAGES': {
222+
'type': list,
223+
'extra_check': _check_default_languages
224+
},
225+
'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': {
226+
'type': float,
227+
'extra_check': _check_between_0_and_1
228+
},
196229
}
197230

198231
modified_settings = settings._mod_settings # check only modified settings

dateparser/custom_language_detection/__init__.py

Whitespace-only changes.
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import os
2+
3+
import fasttext
4+
5+
from dateparser_cli.fasttext_manager import fasttext_downloader
6+
from dateparser_cli.utils import dateparser_model_home, create_data_model_home
7+
from dateparser_cli.exceptions import FastTextModelNotFoundException
8+
9+
10+
_supported_models = ["large.bin", "small.bin"]
11+
_DEFAULT_MODEL = "small"
12+
13+
14+
class _FastTextCache:
15+
model = None
16+
17+
18+
def _downloaded_fasttext_models():
    """Return the supported model file names found in the model home, in ``_supported_models`` order."""
    present = set(os.listdir(dateparser_model_home))
    return [model_name for model_name in _supported_models if model_name in present]


def _load_fasttext_model():
    """
    Return the fastText model, loading (and if necessary downloading) it once.

    The loaded model is cached on ``_FastTextCache.model`` and reused by later
    calls. When no supported model file is present in ``dateparser_model_home``,
    the default model is downloaded first.

    :return: The object returned by ``fasttext.load_model``.

    :raises:
        ``FastTextModelNotFoundException`` if no usable model file exists, even
        after attempting the download.
    """
    if _FastTextCache.model is not None:
        return _FastTextCache.model

    create_data_model_home()
    downloaded_models = _downloaded_fasttext_models()
    if not downloaded_models:
        fasttext_downloader(_DEFAULT_MODEL)
        # Look again exactly once: retrying indefinitely would never terminate
        # if the download does not produce a supported model file.
        downloaded_models = _downloaded_fasttext_models()
        if not downloaded_models:
            raise FastTextModelNotFoundException('Fasttext model file not found')

    # With several models available the choice follows _supported_models order
    # rather than the arbitrary order of the directory listing.
    model_path = os.path.join(dateparser_model_home, downloaded_models[0])
    if not os.path.isfile(model_path):
        raise FastTextModelNotFoundException('Fasttext model file not found')

    _FastTextCache.model = fasttext.load_model(model_path)
    return _FastTextCache.model
34+
35+
36+
def detect_languages(text, confidence_threshold):
    """
    Detect the languages of ``text`` using the fastText model.

    :param text:
        The string whose language should be detected.
    :type text: str

    :param confidence_threshold:
        Only predictions with a probability strictly greater than this value are kept.
    :type confidence_threshold: float

    :return: The predicted language codes, with the ``__label__`` marker removed.
    :rtype: list
    """
    model = _load_fasttext_model()

    # Line breaks are flattened before prediction (presumably because the
    # model expects single-line input -- confirm against the fastText docs).
    single_line = text.replace('\n', ' ').replace('\r', '')

    prediction = model.predict(single_line)
    labels = prediction[0]
    probabilities = prediction[1]
    return [
        label.replace("__label__", "")
        for label, probability in zip(labels, probabilities)
        if probability > confidence_threshold
    ]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import langdetect
2+
3+
4+
# The _Factory below holds a private detector factory, which avoids setting the library's global state
5+
# but still get consistent results.
6+
# See: https://github.com/Mimino666/langdetect
7+
8+
class _Factory:
9+
data = None
10+
11+
12+
def _init_factory():
    """
    Create and configure the shared langdetect factory on first use.

    Does nothing when ``_Factory.data`` is already set. Otherwise a new
    ``DetectorFactory`` is created, the language profiles are loaded from
    ``PROFILES_DIRECTORY`` and the seed is fixed to 0.
    """
    if _Factory.data is not None:
        return

    # Configure a local instance first and publish it only once it is fully
    # set up; if loading the profiles raises, no half-initialised factory is
    # cached and the next call retries from scratch.
    factory = langdetect.detector_factory.DetectorFactory()
    factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
    factory.seed = 0
    _Factory.data = factory
17+
18+
19+
def _get_language_probablities(text):
    """
    Return langdetect's language probabilities for ``text``.

    :param text:
        The string to analyse.
    :type text: str

    :return: Whatever ``get_probabilities()`` of a freshly created detector returns.
    """
    _init_factory()

    # A new detector is created from the shared factory for every call.
    language_detector = _Factory.data.create()
    language_detector.append(text)
    probabilities = language_detector.get_probabilities()
    return probabilities
24+
25+
26+
def detect_languages(text, confidence_threshold):
    """
    Detect the languages of ``text`` using langdetect.

    :param text:
        The string whose language should be detected.
    :type text: str

    :param confidence_threshold:
        Only candidates with a probability strictly greater than this value are kept.
    :type confidence_threshold: float

    :return: The detected language codes; empty when langdetect cannot analyse the text.
    :rtype: list
    """
    try:
        candidates = _get_language_probablities(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # This exception can be produced with empty strings or inputs without
        # letters like `10-10-2021`. As this could be really common, it is ignored.
        return []

    return [
        candidate.lang
        for candidate in candidates
        if candidate.prob > confidence_threshold
    ]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from dateparser.data.languages_info import language_map
2+
3+
4+
def map_languages(language_codes):
    """
    Translate detected language codes into the supported language codes.

    Codes that are not keys of ``language_map`` are dropped; every other code
    contributes all entries that ``language_map`` lists for it.

    :param language_codes:
        A list of language codes, e.g. ['en', 'es'] in ISO 639 Standard.
    :type language_codes: list

    :return: Returns list[str] representing supported languages
    :rtype: list[str]
    """
    supported_codes = []
    for detected_code in language_codes:
        if detected_code in language_map:
            supported_codes.extend(language_map[detected_code])
    return supported_codes

0 commit comments

Comments
 (0)