Skip to content

glazing.framenet.converter

Converting FrameNet XML to JSON Lines.

converter

FrameNet XML to JSON Lines converter.

This module provides conversion from FrameNet XML format to JSON Lines format using the glazing FrameNet models. Supports both full frame files and lu files.

CLASS DESCRIPTION
FrameNetConverter

Convert FrameNet XML files to JSON Lines format.

FUNCTION DESCRIPTION
convert_frame_file

Convert a single frame XML file to Frame model.

convert_lu_file

Convert a single lexical unit XML file to LexicalUnit model.

convert_frames_directory

Convert all frame files in a directory to JSON Lines.

Examples:

>>> from pathlib import Path
>>> from glazing.framenet.converter import FrameNetConverter
>>> converter = FrameNetConverter()
>>> frame = converter.convert_frame_file("frame/Abandonment.xml")
>>> print(frame.name)
Abandonment
>>> # Convert entire directory
>>> converter.convert_frames_directory(
...     input_dir="framenet_v17/frame",
...     output_file="frames.jsonl"
... )

Classes

FrameNetConverter(namespace: str = 'http://framenet.icsi.berkeley.edu', validate_schema: bool = False)

Convert FrameNet XML files to JSON Lines format.

PARAMETER DESCRIPTION
namespace

FrameNet XML namespace URI.

TYPE: str DEFAULT: "http://framenet.icsi.berkeley.edu"

validate_schema

Whether to validate against DTD/XSD.

TYPE: bool DEFAULT: False

ATTRIBUTE DESCRIPTION
namespace

FrameNet XML namespace.

TYPE: str

ns

Namespace mapping for XPath.

TYPE: dict[str, str]

METHOD DESCRIPTION
convert_frame_file

Convert a frame XML file to Frame model.

convert_lu_file

Extract valence patterns, semantic type references, and annotation sets from a lexical unit XML file.

convert_frames_directory

Convert all frames in a directory to JSON Lines.

convert_frame_relations_file

Convert frRelation.xml to frame relation mappings.

convert_semtypes_file

Convert semTypes.xml to JSON Lines.

convert_fulltext_file

Convert a fulltext XML file to Sentence models.

convert_fulltext_directory

Convert all fulltext files in a directory to JSON Lines.

Initialize the converter.

PARAMETER DESCRIPTION
namespace

FrameNet XML namespace URI.

TYPE: str DEFAULT: 'http://framenet.icsi.berkeley.edu'

validate_schema

Whether to validate XML against schema.

TYPE: bool DEFAULT: False

Source code in src/glazing/framenet/converter.py
def __init__(
    self,
    namespace: str = "http://framenet.icsi.berkeley.edu",
    validate_schema: bool = False,
) -> None:
    """Set up namespace handling and the validation flag.

    Parameters
    ----------
    namespace : str
        FrameNet XML namespace URI. A falsy value (e.g. an empty string)
        yields an empty namespace mapping and an empty tag prefix.
    validate_schema : bool
        Whether to validate XML against schema.
    """
    # Both derived values depend on the same truthiness test, so branch once.
    if namespace:
        ns_map = {"fn": namespace}
        tag_prefix = f"{{{namespace}}}"
    else:
        ns_map = {}
        tag_prefix = ""

    self.namespace = namespace
    self.ns = ns_map
    self.validate_schema = validate_schema
    self._ns_prefix = tag_prefix
Functions
convert_frame_file(filepath: Path | str) -> Frame

Convert a frame XML file to Frame model.

PARAMETER DESCRIPTION
filepath

Path to frame XML file.

TYPE: Path | str

RETURNS DESCRIPTION
Frame

Parsed Frame model instance.

Examples:

>>> converter = FrameNetConverter()
>>> frame = converter.convert_frame_file("frame/Abandonment.xml")
>>> print(f"Frame: {frame.name} (ID: {frame.id})")
Frame: Abandonment (ID: 2031)
Source code in src/glazing/framenet/converter.py
def convert_frame_file(self, filepath: Path | str) -> Frame:
    """Parse one frame XML file into a ``Frame`` model.

    The root element's attributes supply the frame ID, name and the
    creation/modification metadata. The ``definition`` child and every
    direct ``FE`` child are passed to the converter's private parsers.

    Parameters
    ----------
    filepath : Path | str
        Location of the frame XML file.

    Returns
    -------
    Frame
        Model built from the file contents.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> frame = converter.convert_frame_file("frame/Abandonment.xml")
    >>> print(f"Frame: {frame.name} (ID: {frame.id})")
    Frame: Abandonment (ID: 2031)
    """
    xml_path = Path(filepath)

    # Schema validation is opt-in; otherwise a plain parse is enough.
    if self.validate_schema:
        root = parse_with_schema(xml_path)
    else:
        root = etree.parse(str(xml_path)).getroot()

    attrs = parse_attributes(root, {"ID": int})

    def text_or_none(key: str) -> str | None:
        # Missing or empty attributes are reported as None, never as "".
        value = attrs.get(key)
        return str(value) if value else None

    # Child tags are namespace-qualified only when a namespace is configured.
    qualifier = f"{{{self.namespace}}}" if self.namespace else ""

    definition = self._parse_definition(root.find(f"{qualifier}definition"))
    frame_elements = [
        self._parse_frame_element(fe_node) for fe_node in root.findall(f"{qualifier}FE")
    ]

    return Frame(
        id=int(attrs.get("ID", 0)),
        name=str(attrs.get("name", "")),
        definition=definition,
        frame_elements=frame_elements,
        created_by=text_or_none("cBy"),
        created_date=self._parse_datetime(text_or_none("cDate")),
        modified_date=self._parse_datetime(text_or_none("mDate")),
    )
convert_frame_relations_file(filepath: Path | str) -> dict[int, list[FrameRelation]]

Convert frRelation.xml to frame relation mappings.

Parses the frame relation types and individual frame relations, creating FrameRelation objects grouped by frame ID.

PARAMETER DESCRIPTION
filepath

Path to frRelation.xml file.

TYPE: Path | str

RETURNS DESCRIPTION
dict[int, list[FrameRelation]]

Dictionary mapping frame IDs to their FrameRelation objects.

Examples:

>>> converter = FrameNetConverter()
>>> relations = converter.convert_frame_relations_file("frRelation.xml")
>>> print(f"Found relations for {len(relations)} frames")
Source code in src/glazing/framenet/converter.py
def convert_frame_relations_file(self, filepath: Path | str) -> dict[int, list[FrameRelation]]:
    """Read frRelation.xml and group its relations by frame ID.

    Each ``frameRelationType`` whose name appears in
    ``FRAME_RELATION_TYPE_MAP`` is expanded into ``FrameRelation`` objects.
    Every relation is filed under its sub-frame ID and, when the map
    supplies a super-side type, under its super-frame ID as well.
    Relations or FE relations that fail model construction with a
    ``ValueError`` or ``TypeError`` are dropped without notice.

    Parameters
    ----------
    filepath : Path | str
        Path to frRelation.xml file.

    Returns
    -------
    dict[int, list[FrameRelation]]
        Frame ID mapped to the relations recorded for that frame.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> relations = converter.convert_frame_relations_file("frRelation.xml")
    >>> print(f"Found relations for {len(relations)} frames")
    """
    root = etree.parse(str(Path(filepath))).getroot()

    grouped: dict[int, list[FrameRelation]] = {}

    for type_node in root.findall(self._tag("frameRelationType")):
        type_name = type_node.get("name", "")

        # Relation types without a mapping are not converted.
        if type_name not in FRAME_RELATION_TYPE_MAP:
            continue

        sub_type, super_type = FRAME_RELATION_TYPE_MAP[type_name]

        for rel_node in type_node.findall(self._tag("frameRelation")):
            sub_frame_id = int(rel_node.get("subID", "0"))
            sup_frame_id = int(rel_node.get("supID", "0"))
            sub_frame_name = rel_node.get("subFrameName", "")
            super_frame_name = rel_node.get("superFrameName", "")
            relation_id = int(rel_node.get("ID", "0"))

            # FE-level links between the two frames.
            fe_links: list[FERelation] = []
            for fe_node in rel_node.findall(self._tag("FERelation")):
                try:
                    fe_links.append(
                        FERelation(  # type: ignore[call-arg]
                            sub_fe_id=int(fe_node.get("subID", "0")),
                            sub_fe_name=fe_node.get("subFEName"),
                            super_fe_id=int(fe_node.get("supID", "0")),
                            super_fe_name=fe_node.get("superFEName"),
                        )
                    )
                except (ValueError, TypeError):
                    continue

            # The sub-frame perspective is always recorded; the super-frame
            # perspective only when the map provides a type for it.
            perspectives = [(sub_type, sub_frame_id)]
            if super_type is not None:
                perspectives.append((super_type, sup_frame_id))

            for relation_type, owner_id in perspectives:
                try:
                    relation = FrameRelation(
                        id=relation_id,
                        type=relation_type,  # type: ignore[arg-type]
                        sub_frame_id=sub_frame_id,
                        sub_frame_name=sub_frame_name,
                        super_frame_id=sup_frame_id,
                        super_frame_name=super_frame_name,
                        fe_relations=fe_links,
                    )
                    grouped.setdefault(owner_id, []).append(relation)
                except (ValueError, TypeError):
                    pass

    return grouped
convert_frames_directory(input_dir: Path | str, output_file: Path | str, pattern: str = '*.xml') -> int

Convert all frame files in a directory to JSON Lines with lexical units.

This method parses frame XML files and associates them with lexical units from luIndex.xml (expected to be in the parent directory of input_dir). It also loads frame relations from frRelation.xml and enriches LUs with valence patterns and semantic types from individual lu/*.xml files.

PARAMETER DESCRIPTION
input_dir

Directory containing frame XML files.

TYPE: Path | str

output_file

Output JSON Lines file path.

TYPE: Path | str

pattern

File pattern to match.

TYPE: str DEFAULT: "*.xml"

RETURNS DESCRIPTION
int

Number of frames converted.

Examples:

>>> converter = FrameNetConverter()
>>> count = converter.convert_frames_directory(
...     "framenet_v17/frame",
...     "frames.jsonl"
... )
>>> print(f"Converted {count} frames")
Converted 1221 frames
Source code in src/glazing/framenet/converter.py
def convert_frames_directory(
    self,
    input_dir: Path | str,
    output_file: Path | str,
    pattern: str = "*.xml",
) -> int:
    """Convert all frame files in a directory to JSON Lines with lexical units.

    This method parses frame XML files and associates them with lexical units
    from luIndex.xml (expected to be in the parent directory of input_dir).
    It also loads frame relations from frRelation.xml and enriches LUs with
    valence patterns and semantic types from individual lu/*.xml files.

    Parameters
    ----------
    input_dir : Path | str
        Directory containing frame XML files.
    output_file : Path | str
        Output JSON Lines file path.
    pattern : str, default="*.xml"
        File pattern to match.

    Returns
    -------
    int
        Number of frames converted.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> count = converter.convert_frames_directory(
    ...     "framenet_v17/frame",
    ...     "frames.jsonl"
    ... )
    >>> print(f"Converted {count} frames")
    'Converted 1221 frames'
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    # First, parse all frames
    frames: list[Frame] = []
    errors: list[tuple[Path, Exception]] = []

    for xml_file in sorted(input_dir.glob(pattern)):
        try:
            frame = self.convert_frame_file(xml_file)
            frames.append(frame)
        except (etree.XMLSyntaxError, ValueError, TypeError) as e:
            errors.append((xml_file, e))

    # Load lexical units from luIndex.xml (in parent directory)
    parent_dir = input_dir.parent if input_dir.name == "frame" else input_dir
    lu_index_path = parent_dir / "luIndex.xml"

    lexical_units: list[LexicalUnit] = []
    if lu_index_path.exists():
        try:
            lexical_units = self.convert_lu_index_file(lu_index_path)
        except (etree.XMLSyntaxError, ValueError, TypeError) as e:
            print(f"Warning: Failed to load lexical units from {lu_index_path}: {e}")

    # Associate LUs with frames by frame_id
    lu_by_frame: dict[int, list[LexicalUnit]] = {}
    for lu in lexical_units:
        if lu.frame_id not in lu_by_frame:
            lu_by_frame[lu.frame_id] = []
        lu_by_frame[lu.frame_id].append(lu)

    # Update frames with their lexical units
    for frame in frames:
        frame.lexical_units = lu_by_frame.get(frame.id, [])

    # Load frame relations from frRelation.xml
    fr_relation_path = parent_dir / "frRelation.xml"
    if fr_relation_path.exists():
        try:
            relations_by_frame = self.convert_frame_relations_file(fr_relation_path)
            for frame in frames:
                frame.frame_relations = relations_by_frame.get(frame.id, [])
        except (etree.XMLSyntaxError, ValueError, TypeError) as e:
            print(f"Warning: Failed to load frame relations from {fr_relation_path}: {e}")

    # Enrich LUs with valence patterns and semtypes from individual lu/*.xml files
    lu_dir = parent_dir / "lu"
    if lu_dir.is_dir():
        for frame in frames:
            for lu in frame.lexical_units:
                lu_file = lu_dir / f"lu{lu.id}.xml"
                if lu_file.exists():
                    try:
                        valence_patterns, semtypes, _annotation_sets = self.convert_lu_file(
                            lu_file
                        )
                        if valence_patterns:
                            lu.valence_patterns = valence_patterns
                        if semtypes:
                            lu.semtypes = semtypes
                    except (etree.XMLSyntaxError, ValueError, TypeError) as e:
                        print(f"Warning: Failed to parse LU file {lu_file}: {e}")
                        continue

    # Write frames with LUs to output file
    count = 0
    with output_file.open("w", encoding="utf-8") as f:
        for frame in frames:
            json_line = frame.model_dump_json(exclude_none=True)
            f.write(json_line + "\n")
            count += 1

    # If there were any errors, raise an exception with details
    if errors:
        error_details = "\n".join(f"  - {file}: {error}" for file, error in errors)
        total_files = count + len(errors)
        error_msg = (
            f"Failed to convert {len(errors)} out of {total_files} files:\n{error_details}"
        )
        raise RuntimeError(error_msg)

    return count
convert_fulltext_directory(input_dir: Path | str, output_file: Path | str, pattern: str = '*.xml') -> int

Convert all fulltext files in a directory to JSON Lines.

PARAMETER DESCRIPTION
input_dir

Directory containing fulltext XML files.

TYPE: Path | str

output_file

Output JSON Lines file path.

TYPE: Path | str

pattern

File pattern to match.

TYPE: str DEFAULT: "*.xml"

RETURNS DESCRIPTION
int

Number of sentences converted.

Examples:

>>> converter = FrameNetConverter()
>>> count = converter.convert_fulltext_directory(
...     "framenet_v17/fulltext",
...     "fulltext.jsonl"
... )
>>> print(f"Converted {count} sentences")
Source code in src/glazing/framenet/converter.py
def convert_fulltext_directory(
    self,
    input_dir: Path | str,
    output_file: Path | str,
    pattern: str = "*.xml",
) -> int:
    """Convert all fulltext files in a directory to JSON Lines.

    Parameters
    ----------
    input_dir : Path | str
        Directory containing fulltext XML files.
    output_file : Path | str
        Output JSON Lines file path.
    pattern : str, default="*.xml"
        File pattern to match.

    Returns
    -------
    int
        Number of sentences converted.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> count = converter.convert_fulltext_directory(
    ...     "framenet_v17/fulltext",
    ...     "fulltext.jsonl"
    ... )
    >>> print(f"Converted {count} sentences")
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    count = 0
    errors: list[tuple[Path, Exception]] = []

    with output_file.open("w", encoding="utf-8") as f:
        for xml_file in sorted(input_dir.glob(pattern)):
            try:
                sentences = self.convert_fulltext_file(xml_file)
                for sentence in sentences:
                    json_line = sentence.model_dump_json(exclude_none=True)
                    f.write(json_line + "\n")
                    count += 1
            except (etree.XMLSyntaxError, ValueError, TypeError) as e:
                errors.append((xml_file, e))

    if errors:
        error_details = "\n".join(f"  - {file}: {error}" for file, error in errors)
        total_files = len(list(input_dir.glob(pattern)))
        error_msg = (
            f"Failed to convert {len(errors)} out of {total_files} files:\n{error_details}"
        )
        raise RuntimeError(error_msg)

    return count
convert_fulltext_file(filepath: Path | str) -> list[Sentence]

Convert a fulltext/*.xml file to Sentence models.

Parses annotated corpus sentences with their annotation sets, layers, and labels.

PARAMETER DESCRIPTION
filepath

Path to fulltext XML file.

TYPE: Path | str

RETURNS DESCRIPTION
list[Sentence]

List of parsed Sentence models.

Examples:

>>> converter = FrameNetConverter()
>>> sentences = converter.convert_fulltext_file("fulltext/ANC__110CYL067.xml")
>>> print(f"Found {len(sentences)} sentences")
Source code in src/glazing/framenet/converter.py
def convert_fulltext_file(self, filepath: Path | str) -> list[Sentence]:
    """Extract ``Sentence`` models from one fulltext XML file.

    Only ``sentence`` elements that are direct children of the root and
    carry both an ``ID`` attribute and non-empty ``text`` are converted.
    Annotation sets that fail to parse are skipped; a sentence whose model
    cannot be built is reported with a printed warning and skipped.

    Parameters
    ----------
    filepath : Path | str
        Path to fulltext XML file.

    Returns
    -------
    list[Sentence]
        Sentences in document order.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> sentences = converter.convert_fulltext_file("fulltext/ANC__110CYL067.xml")
    >>> print(f"Found {len(sentences)} sentences")
    """
    root = etree.parse(str(Path(filepath))).getroot()

    def optional_int(raw: str | None) -> int | None:
        # Absent or empty attributes map to None instead of failing.
        return int(raw) if raw else None

    collected: list[Sentence] = []

    for sent_node in root.findall(self._tag("sentence")):
        raw_id = sent_node.get("ID")
        if not raw_id:
            continue
        sent_id = int(raw_id)

        text_node = sent_node.find(self._tag("text"))
        if text_node is None or not text_node.text:
            continue

        # Annotation sets are best-effort: a bad one does not sink the sentence.
        annosets: list[AnnotationSet] = []
        for annoset_node in sent_node.findall(self._tag("annotationSet")):
            try:
                parsed = self._parse_annotation_set(annoset_node, sent_id)
            except (ValueError, TypeError):
                continue
            if parsed is not None:
                annosets.append(parsed)

        try:
            collected.append(
                Sentence(
                    id=sent_id,
                    text=text_node.text,
                    paragNo=optional_int(sent_node.get("paragNo")),
                    sentNo=optional_int(sent_node.get("sentNo")),
                    corpID=optional_int(sent_node.get("corpID")),
                    docID=optional_int(sent_node.get("docID")),
                    apos=optional_int(sent_node.get("aPos")),
                    annotation_sets=annosets,
                )
            )
        except (ValueError, TypeError) as exc:
            print(f"Warning: Failed to parse sentence {sent_id}: {exc}")

    return collected
convert_lu_file(filepath: Path | str) -> tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]]

Convert an individual lu/*.xml file to extract valence patterns and semtypes.

Parses valence patterns (FE realizations and their syntactic patterns), semantic type references, and annotation sets from a lexical unit file.

PARAMETER DESCRIPTION
filepath

Path to individual lu XML file (e.g., lu/lu10.xml).

TYPE: Path | str

RETURNS DESCRIPTION
tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]]

Tuple of (valence_patterns, semtypes, annotation_sets).

Examples:

>>> converter = FrameNetConverter()
>>> patterns, semtypes, annosets = converter.convert_lu_file("lu/lu10.xml")
>>> print(f"Found {len(patterns)} valence patterns")
Source code in src/glazing/framenet/converter.py
def convert_lu_file(
    self, filepath: Path | str
) -> tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]]:
    """Convert an individual lu/*.xml file to extract valence patterns and semtypes.

    Parses valence patterns (FE realizations and their syntactic patterns),
    semantic type references, and annotation sets from a lexical unit file.

    Parameters
    ----------
    filepath : Path | str
        Path to individual lu XML file (e.g., lu/lu10.xml).

    Returns
    -------
    tuple[list[ValencePattern], list[SemTypeRef], list[AnnotationSet]]
        Tuple of (valence_patterns, semtypes, annotation_sets).
        ``valence_patterns`` contains at most one entry: a single
        ``ValencePattern`` is built only when at least one FE realization
        was parsed, and its ``patterns`` list is always empty because
        FEGroupRealization parsing is not implemented here.

    Raises
    ------
    ValueError
        If a ``total``, ``totalAnnotated`` or ``ID`` attribute converted
        with ``int()`` outside the guarded blocks is not a valid integer.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> patterns, semtypes, annosets = converter.convert_lu_file("lu/lu10.xml")
    >>> print(f"Found {len(patterns)} valence patterns")
    """
    filepath = Path(filepath)

    tree = etree.parse(str(filepath))
    root = tree.getroot()

    # Parse semantic types (direct children of root).
    # Entries missing a name or ID, or failing model construction, are skipped.
    semtypes: list[SemTypeRef] = []
    for semtype_elem in root.findall(self._tag("semType")):
        st_name = semtype_elem.get("name")
        st_id = semtype_elem.get("ID")
        if st_name and st_id:
            try:
                semtypes.append(SemTypeRef(name=st_name, id=int(st_id)))
            except (ValueError, TypeError):
                continue

    # Parse valence patterns from <valences> element
    valence_patterns: list[ValencePattern] = []
    valences_elem = root.find(self._tag("valences"))
    if valences_elem is not None:
        # Parse FE realizations
        fe_realizations: list[FERealization] = []
        for fe_real_elem in valences_elem.findall(self._tag("FERealization")):
            # Converted before the FE-name check and outside any try block,
            # so a malformed "total" raises even for realizations that would
            # otherwise be skipped.
            fe_real_total = int(fe_real_elem.get("total", "0"))

            # Get FE name from child <FE> element
            fe_child = fe_real_elem.find(self._tag("FE"))
            fe_name = fe_child.get("name", "") if fe_child is not None else ""

            # Realizations without a named FE are ignored.
            if not fe_name:
                continue

            # Parse patterns within this FE realization
            patterns: list[ValenceRealizationPattern] = []
            for pattern_elem in fe_real_elem.findall(self._tag("pattern")):
                pattern_total = int(pattern_elem.get("total", "0"))

                # Parse valence units; units rejected by the model are dropped.
                valence_units: list[ValenceUnit] = []
                for vu_elem in pattern_elem.findall(self._tag("valenceUnit")):
                    try:
                        vu = ValenceUnit(
                            GF=vu_elem.get("GF", ""),
                            PT=vu_elem.get("PT", ""),
                            FE=vu_elem.get("FE", ""),
                        )
                        valence_units.append(vu)
                    except (ValueError, TypeError):
                        continue

                # Parse annotation set IDs (unguarded: a non-integer ID raises).
                anno_set_ids: list[int] = []
                for anno_elem in pattern_elem.findall(self._tag("annoSet")):
                    anno_id = anno_elem.get("ID")
                    if anno_id:
                        anno_set_ids.append(int(anno_id))

                # Keep only patterns with at least one valid unit and a
                # positive total.
                if valence_units and pattern_total > 0:
                    try:
                        patterns.append(
                            ValenceRealizationPattern(
                                valence_units=valence_units,
                                anno_set_ids=anno_set_ids,
                                total=pattern_total,
                            )
                        )
                    except (ValueError, TypeError):
                        continue

            # The realization is recorded even when no pattern survived.
            try:
                fe_realizations.append(
                    FERealization(
                        fe_name=fe_name,
                        total=fe_real_total,
                        patterns=patterns,
                    )
                )
            except (ValueError, TypeError):
                continue

        # Build a single ValencePattern if we have FE realizations
        if fe_realizations:
            # Total annotated is read from the root element's
            # "totalAnnotated" attribute, defaulting to 0 when absent.
            total_annotated = int(root.get("totalAnnotated", "0"))

            # Placeholder for FEGroupRealization / ValenceAnnotationPattern
            # entries: nothing below populates this list, so it is passed
            # on empty. (Not all LU files have these elements.)
            valence_anno_patterns: list[ValenceAnnotationPattern] = []

            valence_patterns.append(
                ValencePattern(
                    total_annotated=total_annotated,
                    fe_realizations=fe_realizations,
                    patterns=valence_anno_patterns,
                )
            )

    # Parse annotation sets (from <subCorpus> sections)
    annotation_sets: list[AnnotationSet] = []
    # Annotation sets in lu files are nested inside subCorpus > sentence > annotationSet
    # We collect them but don't return full sentences here
    for subcorpus_elem in root.findall(self._tag("subCorpus")):
        for sentence_elem in subcorpus_elem.findall(self._tag("sentence")):
            # A sentence without an ID is treated as sentence 0.
            sent_id = int(sentence_elem.get("ID", "0"))
            for annoset_elem in sentence_elem.findall(self._tag("annotationSet")):
                try:
                    annoset = self._parse_annotation_set(annoset_elem, sent_id)
                    if annoset is not None:
                        annotation_sets.append(annoset)
                except (ValueError, TypeError):
                    continue

    return valence_patterns, semtypes, annotation_sets
convert_lu_index_file(filepath: Path | str) -> list[LexicalUnit]

Convert luIndex.xml to a list of LexicalUnit models.

PARAMETER DESCRIPTION
filepath

Path to luIndex.xml file.

TYPE: Path | str

RETURNS DESCRIPTION
list[LexicalUnit]

List of parsed LexicalUnit models.

Examples:

>>> converter = FrameNetConverter()
>>> lus = converter.convert_lu_index_file("framenet_v17/luIndex.xml")
>>> print(f"Loaded {len(lus)} lexical units")
Loaded 13575 lexical units
Source code in src/glazing/framenet/converter.py
def convert_lu_index_file(self, filepath: Path | str) -> list[LexicalUnit]:
    """Load the lexical units listed in luIndex.xml.

    Each ``lu`` element directly under the root is parsed individually. An
    entry that fails to parse is reported with a printed warning and
    skipped, so one bad entry does not abort the whole index.

    Parameters
    ----------
    filepath : Path | str
        Path to luIndex.xml file.

    Returns
    -------
    list[LexicalUnit]
        Successfully parsed lexical units, in document order.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> lus = converter.convert_lu_index_file("framenet_v17/luIndex.xml")
    >>> print(f"Loaded {len(lus)} lexical units")
    Loaded 13575 lexical units
    """
    index_path = Path(filepath)

    # Schema validation is opt-in; otherwise a plain parse is enough.
    if self.validate_schema:
        root = parse_with_schema(index_path)
    else:
        root = etree.parse(str(index_path)).getroot()

    # Qualify the tag only when a namespace is configured.
    lu_tag = "lu" if not self.namespace else f"{{{self.namespace}}}lu"

    parsed_units = []
    for lu_node in root.findall(lu_tag):
        try:
            parsed_units.append(self._parse_lu_from_index(lu_node))
        except (ValueError, KeyError, TypeError) as exc:
            lu_name = lu_node.get("name", "unknown")
            print(f"Warning: Failed to parse LU '{lu_name}': {exc}")

    return parsed_units
convert_semtypes_file(filepath: Path | str, output_file: Path | str) -> int

Convert semTypes.xml to JSON Lines format.

Parses the semantic type hierarchy and writes each type as a JSON line.

PARAMETER DESCRIPTION
filepath

Path to semTypes.xml file.

TYPE: Path | str

output_file

Output JSON Lines file path.

TYPE: Path | str

RETURNS DESCRIPTION
int

Number of semantic types converted.

Examples:

>>> converter = FrameNetConverter()
>>> count = converter.convert_semtypes_file("semTypes.xml", "semtypes.jsonl")
>>> print(f"Converted {count} semantic types")
Source code in src/glazing/framenet/converter.py
def convert_semtypes_file(self, filepath: Path | str, output_file: Path | str) -> int:
    """Write the semantic types in semTypes.xml as JSON Lines.

    Every ``semType`` element directly under the root that has both an ID
    and a name becomes one ``SemanticType``. A missing or blank definition
    is replaced by a generated placeholder, and a missing abbreviation
    falls back to the type name. Root-type fields are left unset.

    Parameters
    ----------
    filepath : Path | str
        Path to semTypes.xml file.
    output_file : Path | str
        Output JSON Lines file path.

    Returns
    -------
    int
        Number of semantic types converted.

    Examples
    --------
    >>> converter = FrameNetConverter()
    >>> count = converter.convert_semtypes_file("semTypes.xml", "semtypes.jsonl")
    >>> print(f"Converted {count} semantic types")
    """
    source = Path(filepath)
    destination = Path(output_file)

    root = etree.parse(str(source)).getroot()

    # Resolve the qualified tag names once, up front.
    semtype_tag = self._tag("semType")
    definition_tag = self._tag("definition")
    supertype_tag = self._tag("superType")

    collected: list[SemanticType] = []

    for node in root.findall(semtype_tag):
        raw_id = node.get("ID")
        name = node.get("name", "")
        abbrev = node.get("abbrev", "")

        if not (raw_id and name):
            continue

        # Definition text, with a placeholder when absent or whitespace-only.
        def_node = node.find(definition_tag)
        has_text = def_node is not None and def_node.text
        definition_text = def_node.text.strip() if has_text else ""
        definition_text = definition_text or f"Semantic type: {name}"

        # The parent link is recorded only when the super type carries an ID.
        parent_id = None
        parent_name = None
        parent_node = node.find(supertype_tag)
        if parent_node is not None:
            raw_parent_id = parent_node.get("supID")
            if raw_parent_id:
                parent_id = int(raw_parent_id)
                parent_name = parent_node.get("superTypeName")

        try:
            collected.append(
                SemanticType(
                    id=int(raw_id),
                    name=name,
                    abbrev=abbrev or name,
                    definition=definition_text,
                    super_type_id=parent_id,
                    super_type_name=parent_name,
                    root_type_id=None,
                    root_type_name=None,
                )
            )
        except (ValueError, TypeError) as exc:
            print(f"Warning: Failed to parse semantic type '{name}': {exc}")

    # One JSON document per line.
    with destination.open("w", encoding="utf-8") as sink:
        for record in collected:
            sink.write(record.model_dump_json(exclude_none=True) + "\n")

    return len(collected)

Functions

convert_frame_file(filepath: Path | str) -> Frame

Convert a single frame XML file to Frame model.

PARAMETER DESCRIPTION
filepath

Path to frame XML file.

TYPE: Path | str

RETURNS DESCRIPTION
Frame

Parsed Frame model.

Source code in src/glazing/framenet/converter.py
def convert_frame_file(filepath: Path | str) -> Frame:
    """Parse one frame XML file with a default-configured converter.

    Convenience wrapper that builds a ``FrameNetConverter`` with its
    default arguments and delegates to its ``convert_frame_file`` method.

    Parameters
    ----------
    filepath : Path | str
        Path to frame XML file.

    Returns
    -------
    Frame
        Parsed Frame model.
    """
    return FrameNetConverter().convert_frame_file(filepath)

convert_frames_directory(input_dir: Path | str, output_file: Path | str, pattern: str = '*.xml') -> int

Convert all frames in a directory to JSON Lines.

PARAMETER DESCRIPTION
input_dir

Directory with frame XML files.

TYPE: Path | str

output_file

Output JSON Lines file.

TYPE: Path | str

pattern

File pattern to match.

TYPE: str DEFAULT: '*.xml'

RETURNS DESCRIPTION
int

Number of frames converted.

Source code in src/glazing/framenet/converter.py
def convert_frames_directory(
    input_dir: Path | str,
    output_file: Path | str,
    pattern: str = "*.xml",
) -> int:
    """Convert a directory of frames with a default-configured converter.

    Convenience wrapper that builds a ``FrameNetConverter`` with its
    default arguments and delegates to its ``convert_frames_directory``
    method.

    Parameters
    ----------
    input_dir : Path | str
        Directory with frame XML files.
    output_file : Path | str
        Output JSON Lines file.
    pattern : str
        File pattern to match.

    Returns
    -------
    int
        Number of frames converted.
    """
    return FrameNetConverter().convert_frames_directory(input_dir, output_file, pattern)