Source code for intan.io._canonicalizer

import re
import difflib

# 1) Official label set: maps each canonical key (lowercase, no separators)
#    to its CamelCase display name. Keys are derived from the display names,
#    so adding a class only requires adding its CamelCase name below.
CANON = {
    display_name.lower(): display_name
    for display_name in (
        "Rest",
        "HandOpen",
        "HandClose",
        "FingersOpen",
        "FingersClose",
        "IndexFlexion",
        "IndexExtension",
        "ThumbFlexion",
        "ThumbExtension",
        "MiddleFlexion",
        "MiddleExtension",
        "RingFlexion",
        "RingExtension",
        "PinkyFlexion",
        "PinkyExtension",
        "TwoFingerPinch",
        "ThreeFingerChuck",
    )
}

# 2) Common synonyms / file-name aliases → canonical key.
#    Keys must already be in normalized form (lowercase, with no whitespace,
#    underscores or hyphens), because lookups are done on the normalized token;
#    a key containing a separator can never match.
SYNONYMS = {
    # FingersOpen / FingersClose are folded into HandOpen / HandClose.
    # Remove these two entries to keep them as separate classes.
    "fingersopen": "handopen",
    "fingersclose": "handclose",
    "resting": "rest",
    "null": "rest",
    "nan": "rest",                      # treat missing labels as Rest
    "index": "indexflexion",
    "index2": "indexflexion",
    "2fingerpinch": "twofingerpinch",
    "2fingerinch": "twofingerpinch",    # misspelled alias kept for backward compatibility
    "3fingerchuck": "threefingerchuck",
    # typos that show up:
    "pinkyfelxion": "pinkyflexion",
    "pinkyflexon": "pinkyflexion",
    # Variants that differ only by spacing or casing (e.g. "Pin ky Flexion")
    # need no entry here: normalization already reduces them to the canonical key.
}

def _normalize_token(s: str) -> str:
    """strip, remove separators, lowercase."""
    return re.sub(r"[\s_-]+", "", (s or "").strip()).lower()


[docs]
def canonical_label(
    s: str,
    *,
    style: str = "lower",     # "camel" -> CamelCase name from CANON; "lower" -> canonical key
    allow_fuzzy: bool = True,
    fuzzy_cutoff: float = 0.88,
    strict: bool = False,     # if True, raise on unknown
) -> str:
    """
    Canonicalize a raw label string to the official label namespace.

    Resolution order:

    1. Normalize the input (strip, drop separators, lowercase).
    2. Apply ``SYNONYMS`` to the normalized token.
    3. If the token is a ``CANON`` key, return it.
    4. Otherwise, if ``allow_fuzzy`` is true, take the closest ``CANON`` key
       whose similarity is at least ``fuzzy_cutoff`` and apply ``SYNONYMS``
       to it, so a misspelled label resolves to the same class as its
       correctly spelled form.
    5. Otherwise raise (``strict=True``) or return a best-effort fallback.

    Parameters
    ----------
    s : raw label text.
    style : ``"camel"`` returns the CamelCase name (e.g. ``"PinkyFlexion"``);
        any other value returns the canonical lowercase key
        (e.g. ``"pinkyflexion"``).
    allow_fuzzy : enable the fuzzy-matching stage.
    fuzzy_cutoff : minimum similarity in ``[0, 1]`` passed to
        ``difflib.get_close_matches``.
    strict : raise instead of falling back when the label is unknown.

    Returns
    -------
    The canonical label in the requested style. For unknown labels with
    ``strict=False`` the result is derived from the normalized token and is
    not guaranteed to be one of the official classes.

    Raises
    ------
    ValueError
        If the label cannot be resolved and ``strict`` is true.
    """
    tok = _normalize_token(s)

    # map synonyms / aliases onto their canonical key
    tok = SYNONYMS.get(tok, tok)

    if tok in CANON:
        return CANON[tok] if style == "camel" else tok

    if allow_fuzzy:
        # try fuzzy against known keys
        match = difflib.get_close_matches(tok, CANON.keys(), n=1, cutoff=fuzzy_cutoff)
        if match:
            hit = match[0]
            # A fuzzy hit must go through the same synonym mapping as an exact
            # hit; otherwise a typo of an aliased label would land on a
            # different class than the correct spelling does.
            resolved = SYNONYMS.get(hit, hit)
            key = resolved if resolved in CANON else hit
            return CANON[key] if style == "camel" else key

    if strict:
        raise ValueError(f"Unknown label: {s!r} → {tok!r}")

    # last resort: make a readable CamelCase from the normalized token
    # (still stable, but not guaranteed to be one of the official classes)
    fallback_camel = re.sub(r"([a-z])([0-9])", r"\1 \2", tok).title().replace(" ", "")
    return fallback_camel if style == "camel" else tok
Source code for intan.io._canonicalizer

python-intan

Navigation

Related Topics