from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING
from dapla_metadata.datasets.external_sources.external_sources import GetExternalSource
from dapla_metadata.datasets.utility.enums import SupportedLanguages
if TYPE_CHECKING:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from klass.classes.classification import KlassClassification
logger = logging.getLogger(__name__)
[docs]
@dataclass
class CodeListItem:
    """A single entry in a code list: one code and its titles per language.

    Attributes:
        titles: Titles for the item, keyed by language.
        code: The code the titles belong to.
    """

    titles: dict[SupportedLanguages, str]
    code: str

    def get_title(self, language: SupportedLanguages) -> str:
        """Look up the item's title for a language, with a fallback.

        Args:
            language: The language to get the title for.

        Returns:
            The title stored for ``language`` when there is one. Otherwise the
            Norwegian Bokmål title is used for Norwegian Bokmål and Norwegian
            Nynorsk requests, and the English title for every other language.
            When the fallback title is missing as well, the failure is logged
            and an empty string is returned.
        """
        try:
            return self.titles[language]
        except KeyError:
            # No direct hit: pick the fallback language for this request.
            norwegian_variants = (
                SupportedLanguages.NORSK_BOKMÅL,
                SupportedLanguages.NORSK_NYNORSK,
            )
            if language in norwegian_variants:
                fallback = SupportedLanguages.NORSK_BOKMÅL
            else:
                fallback = SupportedLanguages.ENGLISH
            try:
                return self.titles[fallback]
            except KeyError:
                logger.exception(
                    "Could not find title for subject %s and language: %s",
                    self,
                    language.name,
                )
                return ""
[docs]
class CodeList(GetExternalSource):
    """Retrieve a classification (code list) from Klass.

    The classification is fetched by ID in Norwegian Bokmål and English and
    exposed as a list of ``CodeListItem`` objects through ``classifications``.

    Attributes:
        _classifications: Cached list of code list items; empty until the
            fetched data has been converted.
        classification_id: The ID of the classification to retrieve.
        classifications_dataframes: The fetched dataframes keyed by language,
            or None when nothing has been retrieved.
    """

    def __init__(
        self,
        executor: ThreadPoolExecutor,
        classification_id: int | None,
    ) -> None:
        """Initialize the CodeList with the given classification ID and executor.

        Args:
            executor: An instance of ThreadPoolExecutor to manage the asynchronous
                execution of data fetching.
            classification_id: The ID of the classification to retrieve.
        """
        # State is set up before the base initializer runs.
        # NOTE(review): presumably the base class schedules
        # `_fetch_data_from_external_source` on the executor from its
        # `__init__`, which would need `classification_id` - confirm.
        self._classifications: list[CodeListItem] = []
        self.classification_id = classification_id
        self.classifications_dataframes: (
            dict[SupportedLanguages, pd.DataFrame] | None
        ) = None
        super().__init__(executor)

    def _fetch_data_from_external_source(
        self,
    ) -> dict[SupportedLanguages, pd.DataFrame] | None:
        """Fetch the classification from Klass in every fetched language.

        Returns:
            A dictionary mapping each fetched language (Norwegian Bokmål and
            English) to the dataframe of codes for the classification ID.
            If fetching any language raises, the exception is logged and None
            is returned.
        """
        classifications_dataframes: dict[SupportedLanguages, pd.DataFrame] = {}
        for language in (
            SupportedLanguages.NORSK_BOKMÅL,
            SupportedLanguages.ENGLISH,
        ):
            try:
                classifications_dataframes[language] = (
                    KlassClassification(
                        str(self.classification_id),
                        language.lower(),  # type: ignore [arg-type]
                    )
                    .get_codes()
                    .data
                )
            except Exception:  # noqa: PERF203
                logger.exception(
                    "Exception while getting classifications from Klass",
                )
                return None
        # Return only once all languages are fetched; returning from inside
        # the loop would stop after the first language.
        return classifications_dataframes

    def _extract_titles(
        self,
        dataframes: dict[SupportedLanguages, pd.DataFrame],
    ) -> list[dict[SupportedLanguages, str]]:
        """Extract the titles of every row for each language.

        Rows are matched by position, with the Norwegian Bokmål dataframe
        deciding how many rows there are.

        Args:
            dataframes: A dictionary mapping languages to dataframes of
                classification data. Must contain Norwegian Bokmål.

        Returns:
            One dictionary per row, mapping each language to that row's title.
            The value is None when a dataframe has no "name" column or has
            fewer rows than the Norwegian Bokmål dataframe.
        """
        row_count = len(dataframes[SupportedLanguages.NORSK_BOKMÅL])
        # Read each "name" column once as a plain list, so that lookups are
        # positional and do not depend on the dataframe's index labels.
        names_by_language = {
            language: (frame["name"].to_list() if "name" in frame else None)
            for language, frame in dataframes.items()
        }
        list_of_titles = []
        for position in range(row_count):
            titles = {}
            for language, names in names_by_language.items():
                if names is not None and position < len(names):
                    titles[language] = names[position]
                else:
                    titles[language] = None
            list_of_titles.append(titles)
        return list_of_titles

    def _create_code_list_from_dataframe(
        self,
        classifications_dataframes: dict[SupportedLanguages, pd.DataFrame],
    ) -> list[CodeListItem]:
        """Create a list of CodeListItem objects from the classification dataframes.

        Titles are taken from all dataframes; codes are taken from the
        Norwegian Bokmål dataframe.

        Args:
            classifications_dataframes: A dictionary mapping languages to
                dataframes of classification data. Must contain Norwegian Bokmål.

        Returns:
            A list of CodeListItem objects pairing each row's titles with its
            code. The code is None when there is no "code" column.
        """
        classification_names = self._extract_titles(classifications_dataframes)
        reference_frame = classifications_dataframes[SupportedLanguages.NORSK_BOKMÅL]
        classification_codes: list
        if "code" in reference_frame:
            classification_codes = reference_frame["code"].to_list()
        else:
            classification_codes = [None] * len(classification_names)
        return [
            CodeListItem(titles, code)
            for titles, code in zip(
                classification_names,
                classification_codes,
                strict=False,
            )
        ]

    def _get_classification_dataframe_if_loaded(self) -> bool:
        """Make sure the classifications are populated from the fetched data.

        If the classifications are not cached yet, the fetched dataframes are
        requested through ``retrieve_external_data`` and converted to
        ``CodeListItem`` objects.

        Returns:
            True if classifications are already cached or were extracted by
            this call, False if no data was available.
        """
        if self._classifications:
            # Already converted on an earlier call; nothing more to do.
            return True
        self.classifications_dataframes = self.retrieve_external_data()
        if self.classifications_dataframes is None:
            logger.warning(
                "Thread is not done. Cannot get classifications from the dataframe.",
            )
            return False
        self._classifications = self._create_code_list_from_dataframe(
            self.classifications_dataframes,
        )
        logger.debug(
            "Thread finished. found %s classifications",
            len(self._classifications),
        )
        return True

    @property
    def classifications(self) -> list[CodeListItem]:
        """Get the list of classifications.

        Returns:
            A list of CodeListItem objects; empty if no data could be loaded.
        """
        self._get_classification_dataframe_if_loaded()
        logger.debug("Got %s classifications subjects", len(self._classifications))
        return self._classifications