"""Label canonicalization helpers (module ``intan.io._canonicalizer``)."""

import re
import difflib

# 1) Official label set, written once in CamelCase.
#    CANON maps each normalized key (the lowercased name) to its CamelCase
#    display form, e.g. "handopen" -> "HandOpen".
CANON = {
    name.lower(): name
    for name in (
        "Rest",
        "HandOpen",
        "HandClose",
        "FingersOpen",
        "FingersClose",
        "IndexFlexion",
        "IndexExtension",
        "ThumbFlexion",
        "ThumbExtension",
        "MiddleFlexion",
        "MiddleExtension",
        "RingFlexion",
        "RingExtension",
        "PinkyFlexion",
        "PinkyExtension",
        "TwoFingerPinch",
        "ThreeFingerChuck",
    )
}

# 2) Common synonyms / file-name aliases -> canonical key.
#    Lookups happen AFTER normalization, so every key must already be in
#    normalized form: lowercase, with no whitespace, "_" or "-".
#    Values are canonical keys (the lowercase keys of CANON).
SYNONYMS = {
    # FingersOpen / FingersClose are currently folded into HandOpen / HandClose.
    # To keep them as separate classes, change the targets to "fingersopen" /
    # "fingersclose" (or delete these two entries).
    "fingersopen": "handopen",
    "fingersclose": "handclose",
    "resting": "rest",
    "null": "rest",
    "nan": "rest",                      # treat missing labels as Rest; change to "unknown" if you prefer
    "index": "indexflexion",
    "index2": "indexflexion",
    "2fingerpinch": "twofingerpinch",   # digit-prefixed spelling (too far from the key for the default fuzzy cutoff)
    "2fingerinch": "twofingerpinch",    # misspelled variant, kept for backward compatibility
    "3fingerchuck": "threefingerchuck",
    # typos that show up:
    "pinkyfelxion": "pinkyflexion",
    "pinkyflexon": "pinkyflexion",
    # NOTE: this key contains a space, so it can never equal a normalized token
    # (normalization removes whitespace, which already turns "pin kyflexion"
    # into the canonical "pinkyflexion"). Kept only so the mapping's contents
    # stay backward-compatible.
    "pin kyflexion": "pinkyflexion",
}

def _normalize_token(s: str) -> str:
    """strip, remove separators, lowercase."""
    return re.sub(r"[\s_-]+", "", (s or "").strip()).lower()

def canonical_label(
    s: str,
    *,
    style: str = "lower",  # "camel" -> CamelCase name from CANON; "lower" -> canonical key
    allow_fuzzy: bool = True,
    fuzzy_cutoff: float = 0.88,
    strict: bool = False,  # if True, raise on unknown
) -> str:
    """Canonicalize a raw label string to the official namespace.

    Resolution order:

    1. Normalize ``s`` with ``_normalize_token``.
    2. Replace the token by its ``SYNONYMS`` target, if it has one.
    3. If the token is a key of ``CANON``, return it.
    4. Otherwise, if ``allow_fuzzy`` is true, take the closest ``CANON`` key
       found by ``difflib.get_close_matches`` (at most one, similarity at
       least ``fuzzy_cutoff``) and return it.
    5. Otherwise raise if ``strict`` is true, else return a best-effort
       rendering of the normalized token.

    Args:
        s: Raw label text.
        style: ``"camel"`` returns the CamelCase name from ``CANON``
            (e.g. ``"PinkyFlexion"``). Any other value is treated as
            ``"lower"`` and returns the canonical key (e.g. ``"pinkyflexion"``).
        allow_fuzzy: Whether to try approximate matching against the
            ``CANON`` keys when there is no exact match.
        fuzzy_cutoff: Minimum similarity passed to
            ``difflib.get_close_matches`` as ``cutoff``.
        strict: If true, raise instead of returning a fallback for a label
            that could not be resolved.

    Returns:
        The canonical label in the requested style. For an unresolved label
        with ``strict=False`` this is the normalized token itself
        (``"lower"``) or a title-cased rendering of it (``"camel"``); neither
        is guaranteed to be one of the official classes.

    Raises:
        ValueError: If ``strict`` is true and the label matches no canonical
            key (exactly, via a synonym, or via fuzzy matching when enabled).
    """
    tok = _normalize_token(s)

    # Map synonyms first (this includes "nan" -> "rest" for missing labels).
    tok = SYNONYMS.get(tok, tok)

    if tok in CANON:
        return CANON[tok] if style == "camel" else tok

    if allow_fuzzy:
        # Approximate match against the known canonical keys.
        match = difflib.get_close_matches(tok, CANON.keys(), n=1, cutoff=fuzzy_cutoff)
        if match:
            m = match[0]
            # Resolve the fuzzy hit through SYNONYMS too, so a misspelled label
            # lands on the same class as its exact spelling (e.g. a typo of
            # "fingersopen" follows the same alias as "fingersopen" itself).
            alias = SYNONYMS.get(m, m)
            if alias in CANON:
                m = alias
            return CANON[m] if style == "camel" else m

    if strict:
        raise ValueError(f"Unknown label: {s!r} (normalized: {tok!r})")

    # Last resort: make a readable CamelCase from the normalized token
    # (still stable, but not guaranteed to be one of the official classes).
    fallback_camel = re.sub(r"([a-z])([0-9])", r"\1 \2", tok).title().replace(" ", "")
    return fallback_camel if style == "camel" else tok