This repository has been archived on 2024-05-31. You can view files and clone it, but cannot push or open issues or pull requests.
devicehub-teal/ereuse_devicehub/ereuse_utils/getter.py

172 lines
5.1 KiB
Python

"""Functions to get values from dictionaries and list encoded key-value
strings with meaningful indentations.
Values obtained from these functions are sanitized and automatically
(or explicitly set) casted. Sanitization includes removing unnecessary
whitespaces and removing useless keywords (in the context of
computer hardware) from the texts.
"""
import re
from itertools import chain
from typing import Any, Iterable, Set, Type, Union
from unittest.mock import DEFAULT
import boltons.iterutils
import yaml
from ereuse_devicehub.ereuse_utils.text import clean
def dict(
d: dict,
path: Union[str, tuple],
remove: Set[str] = set(),
default: Any = DEFAULT,
type: Type = None,
):
"""Gets a value from the dictionary and sanitizes it.
Values are patterned and compared against sets
of meaningless characters for device hardware.
:param d: A dictionary potentially containing the value.
:param path: The key or a tuple-path where the value should be.
:param remove: Remove these words if found.
:param default: A default value to return if not found. If not set,
an exception is raised.
:param type: Enforce a type on the value (like ``int``). By default
dict tries to guess the correct type.
"""
try:
v = boltons.iterutils.get_path(d, (path,) if isinstance(path, str) else path)
except KeyError:
return _default(path, default)
else:
return sanitize(v, remove, type=type)
def kv(
iterable: Iterable[str],
key: str,
default: Any = DEFAULT,
sep=':',
type: Type = None,
) -> Any:
"""Key-value. Gets a value from an iterable representing key values in the
form of a list of strings lines, for example an ``.ini`` or yaml file,
if they are opened with ``.splitlines()``.
:param iterable: An iterable of strings.
:param key: The key where the value should be.
:param default: A default value to return if not found. If not set,
an exception is raised.
:param sep: What separates the key from the value in the line.
Usually ``:`` or ``=``.
:param type: Enforce a type on the value (like ``int``). By default
dict tries to guess the correct type.
"""
for line in iterable:
try:
k, value, *_ = line.strip().split(sep)
except ValueError:
continue
else:
if key == k:
return sanitize(value, type=type)
return _default(key, default)
def indents(iterable: Iterable[str], keyword: str, indent=' '):
"""For a given iterable of strings, returns blocks of the same
left indentation.
For example:
foo1
bar1
bar2
foo2
foo2
For that text, this method would return ``[bar1, bar2]`` for passed-in
keyword ``foo1``.
:param iterable: A list of strings representing lines.
:param keyword: The title preceding the indentation.
:param indent: Which characters makes the indentation.
"""
section_pos = None
for i, line in enumerate(iterable):
if not line.startswith(indent):
if keyword in line:
section_pos = i
elif section_pos is not None:
yield iterable[section_pos:i]
section_pos = None
return
def _default(key, default):
if default is DEFAULT:
raise IndexError('Value {} not found.'.format(key))
else:
return default
"""Gets"""
TO_REMOVE = {'none', 'prod', 'o.e.m', 'oem', r'n/a', 'atapi', 'pc', 'unknown'}
"""Delete those *words* from the value"""
assert all(v.lower() == v for v in TO_REMOVE), 'All words need to be lower-case'
REMOVE_CHARS_BETWEEN = '(){}[]'
"""
Remove those *characters* from the value.
All chars inside those are removed. Ex: foo (bar) => foo
"""
CHARS_TO_REMOVE = '*'
"""Remove the characters.
'*' Needs to be removed or otherwise it is interpreted
as a glob expression by regexes.
"""
MEANINGLESS = {
'to be filled',
'system manufacturer',
'system product',
'sernum',
'xxxxx',
'system name',
'not specified',
'modulepartnumber',
'system serial',
'0001-067a-0000',
'partnum',
'manufacturer',
'0000000',
'fffff',
'jedec id:ad 00 00 00 00 00 00 00',
'012000',
'x.x',
'sku',
}
"""Discard a value if any of these values are inside it. """
assert all(v.lower() == v for v in MEANINGLESS), 'All values need to be lower-case'
def sanitize(value, remove=set(), type=None):
if value is None:
return None
remove = remove | TO_REMOVE
regex = r'({})\W'.format('|'.join(s for s in remove))
val = re.sub(regex, '', value, flags=re.IGNORECASE)
val = '' if val.lower() in remove else val # regex's `\W` != whole string
val = re.sub(r'\([^)]*\)', '', val) # Remove everything between
for char_to_remove in chain(REMOVE_CHARS_BETWEEN, CHARS_TO_REMOVE):
val = val.replace(char_to_remove, '')
val = clean(val)
if val and not any(meaningless in val.lower() for meaningless in MEANINGLESS):
return type(val) if type else yaml.load(val, Loader=yaml.SafeLoader)
else:
return None