Source code for dapla_metadata.datasets.utility.urn

"""Validate, parse and render URNs."""

import logging
import re
from collections.abc import Iterable
from dataclasses import dataclass
from enum import Enum
from enum import auto
from typing import Literal

from pydantic import AnyUrl

from dapla_metadata._shared.config import get_dapla_environment
from dapla_metadata._shared.enums import DaplaEnvironment
from dapla_metadata.datasets.utility.utils import VariableListType

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Leading part shared by every "unsupported URL" message.
URN_ERROR_MESSAGE_BASE = "The URL is not in a supported format"

# Full message; expects the ``field_name``, ``short_name`` and ``value``
# placeholders to be filled in with ``str.format``.
URN_ERROR_MESSAGE_TEMPLATE = "".join(
    (
        URN_ERROR_MESSAGE_BASE,
        " for field '{field_name}' of variable '{short_name}'. URL: '{value}'. Please contact Team Metadata if this URL should be supported.",
    )
)


# Base URL for variable definitions; expects the ``subdomain`` and ``domain``
# placeholders to be filled in with ``str.format``.
VARDEF_URL_TEMPLATE = "https://{subdomain}.{domain}/variable-definitions"


class SsbNaisDomains(str, Enum):
    """Domain names available on SSB's Nais instance.

    Each member's value is the bare domain (no scheme, no subdomain).
    Member order is significant to code that iterates over this enum.
    """

    # Test domains
    TEST_EXTERNAL = "test.ssb.no"
    TEST_INTERNAL = "intern.test.ssb.no"

    # Production domains
    PROD_EXTERNAL = "ssb.no"
    PROD_INTERNAL = "intern.ssb.no"
class ReferenceUrlTypes(Enum):
    """Broad category of a URL.

    Useful as a selector when a URL has to be constructed from a URN for a
    specific context.
    """

    API = auto()
    FRONTEND = auto()
# Allowed values for selecting between the publicly available and the internal variant of a URL.
UrlVisibility = Literal["public", "internal"]
@dataclass
class UrnConverter:
    """Converts URLs to URNs and vice versa.

    Attributes:
        urn_base: The format for the URN, up to the identifier.
        id_pattern: A capturing group pattern which matches identifiers for this resource.
        url_bases: The list of all the different URL representations for a resource. There will
            typically be a number of URL representations for a particular resource, depending on
            which system or technology they are accessed through and other technical factors.
            This list defines which concrete URLs can be considered equivalent to a URN.
            Each entry is treated as a literal URL prefix (not a regular expression).
    """

    urn_base: str
    id_pattern: str
    url_bases: list[tuple[ReferenceUrlTypes, str]]

    def _extract_id(self, url: str, pattern: re.Pattern[str]) -> str | None:
        """Return the first captured group if ``pattern`` matches at the start of ``url``, else None."""
        if match := pattern.match(url):
            return match.group(1)
        return None

    def _build_pattern(self, url_base: str) -> re.Pattern[str]:
        """Compile a pattern matching ``<url_base>/<identifier>`` at the start of a string."""
        # The URL base is a literal prefix. Escape it so that characters such as '.'
        # only match themselves and look-alike hosts are not accepted.
        return re.compile(f"^{re.escape(url_base)}/{self.id_pattern}")

    def get_urn(self, identifier: str) -> str:
        """Build a URN for the given identifier."""
        return f"{self.urn_base}:{identifier}"

    def get_url(
        self,
        identifier: str,
        url_type: ReferenceUrlTypes,
        visibility: Literal["public", "internal"] = "public",
    ) -> str | None:
        """Build concrete URL to reference a resource.

        There are typically multiple URLs used to refer to one resource, this method attempts to support known variations.

        The first entry in ``url_bases`` which satisfies all requirements is used:
        the URL type must match, the presence of ".intern." in the URL must agree with
        the requested visibility, and the presence of ".test." in the URL must agree
        with whether the current Dapla environment is TEST.

        Args:
            identifier (str): The identifier of the resource the URL refers to.
            url_type (ReferenceUrlTypes): The representation type of the URL
            visibility (UrlVisibility, optional): Whether the URL should be that which is publicly available or not. Defaults to "public".

        Returns:
            str | None: The concrete URL. None if we cannot satisfy the supplied requirements.
        """
        wants_internal = visibility == "internal"
        for base_type, base_url in self.url_bases:
            if base_type != url_type:
                continue
            if (".intern." in base_url) is not wants_internal:
                continue
            # Looked up lazily so the environment is only consulted when a
            # candidate has passed the cheaper checks.
            in_test_environment = get_dapla_environment() == DaplaEnvironment.TEST
            if (".test." in base_url) is in_test_environment:
                return f"{base_url}/{identifier}"
        return None

    def get_id(self, urn_or_url: str | AnyUrl) -> str | None:
        """Get an identifier from a URN or URL.

        Args:
            urn_or_url (str | AnyUrl): The URN or URL referring to a particular resource

        Returns:
            str | None: The identifier for the resource, or None if it cannot be extracted.
        """
        text = str(urn_or_url)
        # Require the ':' separator as part of the prefix. Otherwise a value that merely
        # begins with the URN base would be returned unchanged as if it were an identifier.
        urn_prefix = f"{self.urn_base}:"
        if text.startswith(urn_prefix):
            return text.removeprefix(urn_prefix)
        return self._extract_id_from_url(urn_or_url)

    def is_id(self, value: str) -> bool:
        """Check if the value is an identifier for this URN type.

        Args:
            value (str): The value to check.

        Returns:
            bool: True if the whole value matches ``id_pattern``, otherwise False.
        """
        if not isinstance(value, str):
            # Mypy thinks it's impossible to reach this branch, but there are no guarantees in Python.
            return False  # type: ignore [unreachable]
        # fullmatch rather than '^...$': '$' also matches just before a trailing newline.
        return re.fullmatch(self.id_pattern, value) is not None

    def _extract_id_from_url(self, url: str | AnyUrl) -> str | None:
        """Return the identifier from the first URL base which matches ``url``, or None."""
        text = str(url)
        for _, url_base in self.url_bases:
            if identifier := self._extract_id(text, self._build_pattern(url_base)):
                return identifier
        return None

    def convert_url_to_urn(self, url: str | AnyUrl) -> AnyUrl | None:
        """Convert a URL to a generalized URN for that same resource.

        Args:
            url (str | AnyUrl): The URL to convert.

        Returns:
            AnyUrl | None: The URN or None if it can't be converted.
        """
        if str(url).startswith(self.urn_base):
            # In this case the value is already in the expected format and nothing needs to be done.
            return AnyUrl(url)
        if identifier := self._extract_id_from_url(url):
            return AnyUrl(self.get_urn(identifier))
        return None
# Converter for variable definitions. URL bases are generated for every Nais
# domain: first all API ("metadata" subdomain) URLs, then all frontend
# ("catalog" subdomain) URLs.
vardef_urn_converter = UrnConverter(
    urn_base="urn:ssb:variable-definition:vardef",
    id_pattern=r"([a-zA-Z0-9-_]{8})",  # 8 character Nanoid with default alphabet
    url_bases=[
        (
            url_type,
            VARDEF_URL_TEMPLATE.format(subdomain=subdomain, domain=nais_domain.value),
        )
        for url_type, subdomain in (
            (ReferenceUrlTypes.API, "metadata"),
            (ReferenceUrlTypes.FRONTEND, "catalog"),
        )
        for nais_domain in SsbNaisDomains
    ],
)

# Converter for classifications. Identifiers are 1 to 5 digits.
_KLASS_URL_BASES = [
    (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/klass/klassifikasjoner"),
    (ReferenceUrlTypes.FRONTEND, "https://www.ssb.no/en/klass/klassifikasjoner"),
    (ReferenceUrlTypes.API, "https://data.ssb.no/api/klass/v1/classifications"),
]
klass_urn_converter = UrnConverter(
    urn_base="urn:ssb:classification:klass",
    id_pattern=r"([0-9]{1,5})",
    url_bases=_KLASS_URL_BASES,
)
def convert_uris_to_urns(
    variables: VariableListType, field_name: str, converters: Iterable[UrnConverter]
) -> None:
    """Where URIs are recognized URLs, convert them to URNs.

    Each converter is tried in order and the first one which produces a URN wins.
    Where the value is not a known URL we preserve the value as it is and log an
    ERROR level message. Variables where the field is missing or empty are skipped.

    Args:
        variables (VariableListType): The list of variables. Modified in place.
        field_name (str): The name of the field which has URLs to convert to URNs
        converters (Iterable[UrnConverter]): One or more converters which implement
            conversion of URLs into one specific URN format. These will typically be
            specific to an individual metadata reference system.
    """
    # Materialize once: a one-shot iterator would otherwise be exhausted after
    # the first variable, leaving no converters for the remaining ones.
    available_converters = tuple(converters)

    for v in variables:
        field = getattr(v, field_name, None)
        if not field:
            continue

        # Try every converter, not only the first. A converter returns None
        # for URLs it does not recognize, and the next one must then be tried.
        for converter in available_converters:
            if urn := converter.convert_url_to_urn(field):
                setattr(v, field_name, urn)
                break
        else:
            # No converter recognized the value; leave it untouched and report it.
            logger.error(
                URN_ERROR_MESSAGE_TEMPLATE.format(
                    field_name=field_name,
                    short_name=v.short_name,
                    value=field,
                )
            )