Skip to content

glazing.propbank.converter

Converting PropBank XML to JSON Lines.

converter

PropBank XML to JSON Lines converter.

This module provides conversion from PropBank XML format to JSON Lines format using the glazing PropBank models.

CLASS DESCRIPTION
PropBankConverter

Convert PropBank XML files to JSON Lines format.

FUNCTION DESCRIPTION
convert_frameset_file

Convert a single frameset XML file to Frameset model.

convert_framesets_directory

Convert all frameset files in a directory to JSON Lines.

Examples:

>>> from pathlib import Path
>>> from glazing.propbank.converter import PropBankConverter
>>> converter = PropBankConverter()
>>> frameset = converter.convert_frameset_file("frames/abandon.xml")
>>> print(frameset.predicate_lemma)
abandon

Classes

PropBankConverter(validate_schema: bool = False)

Convert PropBank XML files to JSON Lines format.

PARAMETER DESCRIPTION
validate_schema

Whether to validate against DTD.

TYPE: bool DEFAULT: False

ATTRIBUTE DESCRIPTION
validate_schema

Whether to validate XML against schema.

TYPE: bool

METHOD DESCRIPTION
convert_frameset_file

Convert a frameset XML file to Frameset model.

convert_framesets_directory

Convert all framesets in a directory to JSON Lines.

Initialize the converter.

PARAMETER DESCRIPTION
validate_schema

Whether to validate XML against DTD.

TYPE: bool DEFAULT: False

Source code in src/glazing/propbank/converter.py
def __init__(self, validate_schema: bool = False) -> None:
    """Initialize the converter.

    Parameters
    ----------
    validate_schema : bool, default=False
        Whether to validate XML against DTD. ``convert_frameset_file``
        reads this flag to choose between ``parse_with_schema`` (with
        ``schema_type="dtd"``) and a plain ``etree.parse``.
    """
    # Only stored here; nothing is loaded or validated at construction time.
    self.validate_schema = validate_schema
Functions
convert_combined_frameset_file(filepath: Path | str) -> list[Frameset]

Convert a combined frameset XML file with multiple predicates.

Handles files like AMR-UMR-91-rolesets.xml where a single `<frameset>` root contains multiple `<predicate>` children.

PARAMETER DESCRIPTION
filepath

Path to combined frameset XML file.

TYPE: Path | str

RETURNS DESCRIPTION
list[Frameset]

List of parsed Frameset model instances, one per predicate.

Source code in src/glazing/propbank/converter.py
def convert_combined_frameset_file(self, filepath: Path | str) -> list[Frameset]:
    """Parse a multi-predicate frameset XML file into Frameset models.

    Intended for files such as AMR-UMR-91-rolesets.xml, in which one
    <frameset> root holds several <predicate> children. Each predicate
    becomes its own Frameset.

    Parameters
    ----------
    filepath : Path | str
        Path to combined frameset XML file.

    Returns
    -------
    list[Frameset]
        One Frameset per <predicate> element, in document order.

    Notes
    -----
    Rolesets whose parsing raises ``ValidationError``, ``ValueError`` or
    ``TypeError`` are silently dropped; the enclosing predicate is still
    emitted with whatever rolesets did parse.
    """
    source_path = Path(filepath)
    raw_xml = source_path.read_text(encoding="utf-8")
    patched_xml = self._fix_xml_errors(raw_xml, source_path)

    document_root = etree.parse(BytesIO(patched_xml.encode("utf-8"))).getroot()

    results: list[Frameset] = []
    for pred in document_root.findall("predicate"):
        parsed_rolesets = []
        for roleset_elem in pred.findall("roleset"):
            try:
                parsed = self._parse_roleset(roleset_elem)
            except (ValidationError, ValueError, TypeError):
                # Skip rolesets with non-standard values (e.g., AMR-specific types)
                continue
            parsed_rolesets.append(parsed)

        # Keep only notes that actually carry text.
        note_texts = [n.text for n in pred.findall("note") if n.text]

        results.append(
            Frameset(
                predicate_lemma=pred.get("lemma", ""),
                rolesets=parsed_rolesets,
                notes=note_texts,
            )
        )

    return results
convert_frameset_file(filepath: Path | str) -> Frameset

Convert a frameset XML file to Frameset model.

PARAMETER DESCRIPTION
filepath

Path to frameset XML file.

TYPE: Path | str

RETURNS DESCRIPTION
Frameset

Parsed Frameset model instance.

Examples:

>>> converter = PropBankConverter()
>>> frameset = converter.convert_frameset_file("frames/abandon.xml")
>>> print(f"Predicate: {frameset.predicate_lemma}")
Predicate: abandon
Source code in src/glazing/propbank/converter.py
def convert_frameset_file(self, filepath: Path | str) -> Frameset:
    """Convert a frameset XML file to Frameset model.

    The file is read as UTF-8, passed through ``self._fix_xml_errors``,
    and then parsed. When ``self.validate_schema`` is true the repaired
    XML is parsed via ``parse_with_schema(..., schema_type="dtd")``;
    otherwise it is parsed directly from memory with ``etree.parse``.

    Parameters
    ----------
    filepath : Path | str
        Path to frameset XML file.

    Returns
    -------
    Frameset
        Parsed Frameset model instance built from the first
        ``<predicate>`` child of the root element.

    Raises
    ------
    ValueError
        If the root element has no ``<predicate>`` child.

    Examples
    --------
    >>> converter = PropBankConverter()
    >>> frameset = converter.convert_frameset_file("frames/abandon.xml")
    >>> print(f"Predicate: {frameset.predicate_lemma}")
    Predicate: abandon
    """
    filepath = Path(filepath)

    # Read the raw XML and repair known errors before parsing.
    xml_content = self._fix_xml_errors(filepath.read_text(encoding="utf-8"), filepath)

    if self.validate_schema:
        # parse_with_schema is given a file path, so the repaired content
        # has to be written to a temporary file first.
        tmp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".xml", encoding="utf-8", delete=False
        )
        try:
            # Writing happens inside the try so the temp file is removed
            # even if the write itself fails (delete=False means nothing
            # else will clean it up).
            with tmp:
                tmp.write(xml_content)
            root = parse_with_schema(tmp.name, schema_type="dtd")
        finally:
            Path(tmp.name).unlink(missing_ok=True)
    else:
        root = etree.parse(BytesIO(xml_content.encode("utf-8"))).getroot()

    # Only the first <predicate> element is used.
    predicate_elem = root.find("predicate")
    if predicate_elem is None:
        error_msg = f"No predicate element found in {filepath}"
        raise ValueError(error_msg)

    predicate_lemma = predicate_elem.get("lemma", "")

    rolesets = [
        self._parse_roleset(roleset) for roleset in predicate_elem.findall("roleset")
    ]

    # NOTE(review): notes are collected from the root element here, whereas
    # convert_combined_frameset_file collects them from each <predicate>.
    # Confirm this difference is intentional.
    notes = [note.text for note in root.findall("note") if note.text]

    return Frameset(predicate_lemma=predicate_lemma, rolesets=rolesets, notes=notes)
convert_framesets_directory(input_dir: Path | str, output_file: Path | str, pattern: str = '*.xml') -> int

Convert all frameset files in a directory to JSON Lines.

Also processes combined frameset files (e.g., AMR-UMR-91-rolesets.xml) found in the parent directory.

PARAMETER DESCRIPTION
input_dir

Directory containing frameset XML files.

TYPE: Path | str

output_file

Output JSON Lines file path.

TYPE: Path | str

pattern

File pattern to match.

TYPE: str DEFAULT: "*.xml"

RETURNS DESCRIPTION
int

Number of framesets converted.

Examples:

>>> converter = PropBankConverter()
>>> count = converter.convert_framesets_directory(
...     "propbank-frames/frames",
...     "framesets.jsonl"
... )
>>> print(f"Converted {count} framesets")
Converted 5559 framesets
Source code in src/glazing/propbank/converter.py
def convert_framesets_directory(
    self,
    input_dir: Path | str,
    output_file: Path | str,
    pattern: str = "*.xml",
) -> int:
    """Convert all frameset files in a directory to JSON Lines.

    Also processes combined frameset files (e.g., AMR-UMR-91-rolesets.xml)
    found in the parent directory.

    Parameters
    ----------
    input_dir : Path | str
        Directory containing frameset XML files.
    output_file : Path | str
        Output JSON Lines file path.
    pattern : str, default="*.xml"
        File pattern to match.

    Returns
    -------
    int
        Number of framesets converted.

    Examples
    --------
    >>> converter = PropBankConverter()
    >>> count = converter.convert_framesets_directory(
    ...     "propbank-frames/frames",
    ...     "framesets.jsonl"
    ... )
    >>> print(f"Converted {count} framesets")
    'Converted 5559 framesets'
    """
    input_dir = Path(input_dir)
    output_file = Path(output_file)

    count = 0
    errors: list[tuple[Path, Exception]] = []

    with output_file.open("w", encoding="utf-8") as f:
        # Convert individual frameset files
        for xml_file in sorted(input_dir.glob(pattern)):
            try:
                frameset = self.convert_frameset_file(xml_file)
                json_line = frameset.model_dump_json(exclude_none=True)
                f.write(json_line + "\n")
                count += 1
            except (etree.XMLSyntaxError, ValueError, TypeError) as e:
                errors.append((xml_file, e))

        # Also process combined frameset files in parent directory
        amr_file = input_dir.parent / "AMR-UMR-91-rolesets.xml"
        if amr_file.exists():
            try:
                amr_framesets = self.convert_combined_frameset_file(amr_file)
                for frameset in amr_framesets:
                    json_line = frameset.model_dump_json(exclude_none=True)
                    f.write(json_line + "\n")
                    count += 1
            except (etree.XMLSyntaxError, ValueError, TypeError) as e:
                errors.append((amr_file, e))

    # If there were any errors, raise an exception with details
    if errors:
        error_details = "\n".join(f"  - {file}: {error}" for file, error in errors)
        total_files = count + len(errors)
        error_msg = (
            f"Failed to convert {len(errors)} out of {total_files} files:\n{error_details}"
        )
        raise RuntimeError(error_msg)

    return count

Functions

convert_frameset_file(filepath: Path | str) -> Frameset

Convert a single frameset XML file to Frameset model.

PARAMETER DESCRIPTION
filepath

Path to frameset XML file.

TYPE: Path | str

RETURNS DESCRIPTION
Frameset

Parsed Frameset model.

Source code in src/glazing/propbank/converter.py
def convert_frameset_file(filepath: Path | str) -> Frameset:
    """Parse one frameset XML file using a default ``PropBankConverter``.

    Module-level convenience wrapper: a fresh converter is created with
    its default arguments and its ``convert_frameset_file`` method does
    the work.

    Parameters
    ----------
    filepath : Path | str
        Path to frameset XML file.

    Returns
    -------
    Frameset
        Parsed Frameset model.
    """
    return PropBankConverter().convert_frameset_file(filepath)

convert_framesets_directory(input_dir: Path | str, output_file: Path | str, pattern: str = '*.xml') -> int

Convert all framesets in a directory to JSON Lines.

PARAMETER DESCRIPTION
input_dir

Directory with frameset XML files.

TYPE: Path | str

output_file

Output JSON Lines file.

TYPE: Path | str

pattern

File pattern to match.

TYPE: str DEFAULT: '*.xml'

RETURNS DESCRIPTION
int

Number of framesets converted.

Source code in src/glazing/propbank/converter.py
def convert_framesets_directory(
    input_dir: Path | str,
    output_file: Path | str,
    pattern: str = "*.xml",
) -> int:
    """Write every frameset in a directory to a JSON Lines file.

    Module-level convenience wrapper: a fresh ``PropBankConverter`` is
    created with its default arguments and the call is forwarded to its
    ``convert_framesets_directory`` method.

    Parameters
    ----------
    input_dir : Path | str
        Directory with frameset XML files.
    output_file : Path | str
        Output JSON Lines file.
    pattern : str
        File pattern to match.

    Returns
    -------
    int
        Number of framesets converted.
    """
    default_converter = PropBankConverter()
    framesets_written = default_converter.convert_framesets_directory(
        input_dir, output_file, pattern
    )
    return framesets_written