Source code for dapla_metadata.datasets.statistic_subject_mapping
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import TYPE_CHECKING
import bs4
import requests
from bs4 import BeautifulSoup
from bs4 import ResultSet
from dapla_metadata.datasets.external_sources.external_sources import GetExternalSource
from dapla_metadata.datasets.utility.enums import SupportedLanguages
if TYPE_CHECKING:
from concurrent.futures import ThreadPoolExecutor
logger = logging.getLogger(__name__)
[docs]
@dataclass
class Subject:
"""Base class for Primary and Secondary subjects.
A statistical subject is a related grouping of statistics.
"""
titles: dict[str, str]
subject_code: str
[docs]
def get_title(self, language: SupportedLanguages) -> str:
"""Get the title in the given language."""
try:
return self.titles[
(
# Adjust to language codes in the StatisticSubjectMapping structure.
"no"
if language
in [
SupportedLanguages.NORSK_BOKMÃ…L,
SupportedLanguages.NORSK_NYNORSK,
]
else "en"
)
]
except KeyError:
logger.exception(
"Could not find title for subject %s and language: %s",
self,
language.name,
)
return ""
[docs]
@dataclass
class SecondarySubject(Subject):
"""Data structure for secondary subjects or 'delemne'."""
statistic_short_names: list[str]
[docs]
@dataclass
class PrimarySubject(Subject):
"""Data structure for primary subjects or 'hovedemne'."""
secondary_subjects: list[SecondarySubject]
[docs]
class StatisticSubjectMapping(GetExternalSource):
"""Provide mapping between statistic short name and primary and secondary subject."""
def __init__(
self,
executor: ThreadPoolExecutor,
source_url: str | None,
) -> None:
"""Retrieve the statistical structure document from the given URL.
Initializes the mapping based on values in the statistical structure document sourced at `source_url`.
Args:
executor: The ThreadPoolExecutor which will run the job of fetching the statistical structure document.
source_url: The URL from which to fetch the statistical structure document.
"""
self.source_url = source_url
self._statistic_subject_structure_xml: ResultSet | None = None
self._primary_subjects: list[PrimarySubject] = []
super().__init__(executor)
[docs]
def get_secondary_subject(self, statistic_short_name: str | None) -> str | None:
"""Looks up the secondary subject for the given statistic short name in the mapping dict.
Returns the secondary subject string if found, else None.
"""
for p in self.primary_subjects:
for s in p.secondary_subjects:
if statistic_short_name in s.statistic_short_names:
logger.debug("Got %s from %s", s, statistic_short_name)
return s.subject_code
logger.debug("No secondary subject found for %s", statistic_short_name)
return None
@staticmethod
def _extract_titles(titles_xml: bs4.element.Tag) -> dict[str, str]:
titles = {}
for title in titles_xml.find_all("tittel"):
titles[title["sprak"]] = title.text
return titles
def _fetch_data_from_external_source(self) -> ResultSet | None:
"""Fetch statistical structure document from source_url.
Returns a BeautifulSoup ResultSet.
"""
if not self.source_url:
logger.debug("No statistic subject url supplied")
return None
try:
response = requests.get(str(self.source_url), timeout=30)
response.encoding = "utf-8"
logger.debug("Got response %s from %s", response, self.source_url)
soup = BeautifulSoup(response.text, features="xml")
return soup.find_all("hovedemne")
except requests.exceptions.RequestException:
logger.exception("Exception while fetching statistical structure")
return None
def _parse_statistic_subject_structure_xml(
self,
statistical_structure_xml: ResultSet,
) -> list[PrimarySubject]:
primary_subjects: list[PrimarySubject] = []
for p in statistical_structure_xml:
secondary_subjects: list[SecondarySubject] = [
SecondarySubject(
self._extract_titles(s.titler),
s["emnekode"],
[
statistikk["kortnavn"]
for statistikk in s.find_all("Statistikk")
if statistikk["isPrimaerPlassering"] == "true"
],
)
for s in p.find_all("delemne")
]
primary_subjects.append(
PrimarySubject(
self._extract_titles(p.titler),
p["emnekode"],
secondary_subjects,
),
)
return primary_subjects
@property
def primary_subjects(self) -> list[PrimarySubject]:
"""Getter for primary subjects."""
if not self._primary_subjects:
self._parse_xml_if_loaded()
logger.debug("Got %s primary subjects", len(self._primary_subjects))
return self._primary_subjects
def _parse_xml_if_loaded(self) -> bool:
"""Checks if the xml is loaded, then parses the xml if it is loaded.
Returns `True` if it is loaded and parsed.
"""
if self.check_if_external_data_is_loaded():
self._statistic_subject_structure_xml = self.retrieve_external_data()
if self._statistic_subject_structure_xml is not None:
self._primary_subjects = self._parse_statistic_subject_structure_xml(
self._statistic_subject_structure_xml,
)
logger.debug(
"Thread finished. Parsed %s primary subjects",
len(self._primary_subjects),
)
return True
logger.warning("Thread is not done. Cannot parse xml.")
return False