# comicbox.transforms.identifiers
# Source listing for module comicbox.transforms.identifiers
"""Identifier Fields."""

from typing import Any
from urllib.parse import urlparse

from loguru import logger

from comicbox.enums.comicbox import IdSources
from comicbox.fields.xml_fields import get_cdata
from comicbox.identifiers.identifiers import (
    IDENTIFIER_PARTS_MAP,
    create_identifier,
    get_id_source_from_url,
)
from comicbox.identifiers.urns import (
    parse_string_identifier,
    to_urn_string,
)
from comicbox.schemas.comicbox import (
    ID_KEY_KEY,
    ID_SOURCE_KEY,
    ID_URL_KEY,
    IDENTIFIER_PRIMARY_SOURCE_KEY,
    IDENTIFIERS_KEY,
)
from comicbox.transforms.spec import MetaSpec

# Dotted key path: the identifier-primary-source key followed by the id-source
# key, joined with a ".".
PRIMARY_ID_SOURCE_KEYPATH = f"{IDENTIFIER_PRIMARY_SOURCE_KEY}.{ID_SOURCE_KEY}"


def create_identifier_primary_source(
    id_source: IdSources,
) -> dict:
    """Build the primary-source mapping for an identifier source.

    The mapping always stores ``id_source`` under ``ID_SOURCE_KEY``. When
    ``IDENTIFIER_PARTS_MAP`` holds a truthy entry for the source and that
    entry exposes a truthy ``url_prefix``, the prefix is also stored under
    ``ID_URL_KEY``.
    """
    primary_source: dict[str, Any] = {ID_SOURCE_KEY: id_source}
    parts = IDENTIFIER_PARTS_MAP.get(id_source)
    if not parts:
        return primary_source
    url_prefix = parts.url_prefix
    if url_prefix:
        primary_source[ID_URL_KEY] = url_prefix
    return primary_source


def _identifier_to_cb(native_identifier: str, naked_id_source: Any) -> tuple[str, dict]:
    """Convert one native identifier string into a comicbox identifier.

    The string is parsed with ``parse_string_identifier`` and the pieces are
    handed to ``create_identifier``; ``naked_id_source`` is forwarded to both.

    Returns a ``(source string, identifier)`` pair. The source string is the
    parsed source's ``value``, or ``""`` when parsing yields no source.
    """
    parsed_source, parsed_type, parsed_key = parse_string_identifier(
        native_identifier, naked_id_source
    )
    if parsed_source:
        source_value = parsed_source.value
    else:
        source_value = ""
    identifier = create_identifier(
        source_value,
        parsed_key,
        id_type=parsed_type,
        default_id_source_str=naked_id_source,
    )
    return source_value, identifier


def identifiers_to_cb(
    native_identifiers: set[str] | None, naked_id_source: Any
) -> dict:
    """Parse identifier struct from a string or sequence."""
    comicbox_identifiers = {}
    if native_identifiers:
        for native_identifier in native_identifiers:
            try:
                id_source_str, identifier = _identifier_to_cb(
                    native_identifier, naked_id_source
                )
                comicbox_identifiers[id_source_str] = identifier
            except Exception as exc:
                logger.warning(f"Parsing identifier {native_identifier}: {exc}")
    return comicbox_identifiers


def identifiers_transform_to_cb(
    identifiers_tag: str, naked_id_source: IdSources
) -> MetaSpec:
    """Build the MetaSpec that maps a native identifiers tag to comicbox.

    The spec maps ``identifiers_tag`` onto ``IDENTIFIERS_KEY`` and converts
    the tag's value with ``identifiers_to_cb``, using ``naked_id_source`` as
    its second argument.
    """

    def to_cb(native_identifiers: set[str]) -> dict[str, dict[str, str]]:
        # Closure binds naked_id_source so the spec callable takes one argument.
        return identifiers_to_cb(native_identifiers, naked_id_source)

    tag_map = {IDENTIFIERS_KEY: identifiers_tag}
    return MetaSpec(key_map=tag_map, spec=to_cb)


def _identifiers_from_cb(comicbox_identifiers: dict[str, dict[str, str]]) -> set:
    """Render comicbox identifiers as a set of URN strings.

    Only sources that are members of ``IdSources`` are considered. A source
    contributes a URN when its identifier exists, has a truthy ``ID_KEY_KEY``
    entry, and ``to_urn_string`` returns a truthy string. Every URN is built
    with the id type ``"issue"``.
    """
    urns = set()
    for source in IdSources:
        cb_identifier = comicbox_identifiers.get(source.value)
        if not cb_identifier:
            continue
        key = cb_identifier.get(ID_KEY_KEY)
        if not key:
            continue
        urn = to_urn_string(source.value, "issue", key)
        if urn:
            urns.add(urn)
    return urns


def identifiers_transform_from_cb(identifiers_tag: str) -> MetaSpec:
    """Build the MetaSpec that maps comicbox identifiers to a native tag.

    The spec maps ``IDENTIFIERS_KEY`` onto ``identifiers_tag`` and converts
    the value with ``_identifiers_from_cb``.
    """
    tag_map = {identifiers_tag: IDENTIFIERS_KEY}
    return MetaSpec(key_map=tag_map, spec=_identifiers_from_cb)


def _parse_unknown_url(url_str: str) -> tuple[str, dict]:
    """Parse a url from an unrecognized source into an ad-hoc identifier.

    The url's network location becomes the source string. The identifier key
    is the path (unless it is empty or just ``"/"``) followed by ``?query``
    and ``#fragment`` when present. The original url is always recorded
    under ``ID_URL_KEY`` when parsing succeeds.

    Returns:
        A ``(source string, identifier)`` pair. If anything raises while
        parsing, the failure is logged at debug level and ``("", {})`` is
        returned.
    """
    identifier = {}
    try:
        url = urlparse(url_str)
        id_source_str = url.netloc
        id_key = ""
        if url.path and url.path != "/":
            id_key += url.path
        if url.query:
            id_key += "?" + url.query
        if url.fragment:
            id_key += "#" + url.fragment
        if id_key:
            identifier[ID_KEY_KEY] = id_key
        # urlparse() returns a six-field named tuple, which is always truthy,
        # so the former `if url:` guard never skipped this assignment.
        identifier[ID_URL_KEY] = url_str
    except Exception:
        # Deliberately best effort: an unparsable url yields no identifier.
        logger.debug(f"Unparsable url: {url_str}")
        id_source_str = ""
    return id_source_str, identifier


def url_to_cb(
    native_url: str | dict,
) -> tuple[str, dict]:
    """Turn one native url value into a ``(source string, identifier)`` pair.

    The url text is extracted with ``get_cdata``; an empty result yields
    ``("", {})``. If the url's source is a member of ``IdSources`` with an
    entry in ``IDENTIFIER_PARTS_MAP``, that entry parses the url and the
    identifier is built with ``create_identifier``. Whenever that path is
    unavailable or produces a falsy identifier, the url is parsed with
    ``_parse_unknown_url`` instead.
    """
    raw_url = get_cdata(native_url)
    if not raw_url:
        return "", {}

    source_name = get_id_source_from_url(raw_url)
    try:
        known_source = IdSources(source_name)
    except ValueError:
        # Not a recognized source; handled by the fallback below.
        known_source = None

    parts = IDENTIFIER_PARTS_MAP.get(known_source) if known_source else None
    if parts:
        id_type, id_key = parts.parse_url_path(raw_url)
        identifier = create_identifier(
            source_name, id_key, id_type=id_type, url=raw_url
        )
        if identifier:
            return source_name, identifier

    return _parse_unknown_url(raw_url)


def urls_to_cb(urls: Any) -> dict:
    """Parse url values into a comicbox identifiers mapping.

    Each url is converted with ``url_to_cb``. Results with an empty source
    string or an empty identifier are dropped; the rest are stored under
    their source string, so a later url for the same source replaces an
    earlier one. Falsy input yields an empty dict.
    """
    identifiers = {}
    for url in urls or ():
        source_key, identifier = url_to_cb(url)
        if not (source_key and identifier):
            continue
        identifiers[source_key] = identifier
    return identifiers


def urls_transform_to_cb(urls_tag: str) -> MetaSpec:
    """Build the MetaSpec that maps a native urls tag to comicbox identifiers.

    The spec maps ``urls_tag`` onto ``IDENTIFIERS_KEY`` and converts the
    value with ``urls_to_cb``.
    """
    tag_map = {IDENTIFIERS_KEY: urls_tag}
    return MetaSpec(key_map=tag_map, spec=urls_to_cb)


def url_from_cb(
    id_source_str: str,
    comicbox_identifier: dict,
) -> str:
    """Return the url for one comicbox identifier.

    A truthy url already stored under ``ID_URL_KEY`` is returned as is.
    Otherwise, when the identifier has a truthy ``ID_KEY_KEY`` entry, a new
    identifier is built with ``create_identifier`` from the source string and
    key, and its ``ID_URL_KEY`` entry (default ``""``) is returned. With
    neither a url nor a key, the stored falsy url value is returned.
    """
    stored_url = comicbox_identifier.get(ID_URL_KEY, "")
    if stored_url:
        return stored_url
    id_key = comicbox_identifier.get(ID_KEY_KEY)
    if not id_key:
        return stored_url
    rebuilt = create_identifier(id_source_str, id_key)
    return rebuilt.get(ID_URL_KEY, "")


def _urls_from_cb(comicbox_identifiers: dict[str, dict[str, str]]) -> set:
    """Render comicbox identifiers as a set of url strings.

    Each ``(source, identifier)`` pair is converted with ``url_from_cb``;
    falsy results are left out.
    """
    return {
        url
        for source_key, identifier in comicbox_identifiers.items()
        if (url := url_from_cb(source_key, identifier))
    }


def urls_transform_from_cb(urls_tag: str) -> MetaSpec:
    """Build the MetaSpec that maps comicbox identifiers to a native urls tag.

    The spec maps ``IDENTIFIERS_KEY`` onto ``urls_tag`` and converts the
    value with ``_urls_from_cb``.
    """
    tag_map = {urls_tag: IDENTIFIERS_KEY}
    return MetaSpec(key_map=tag_map, spec=_urls_from_cb)