Skip to content

glazing.references.extractor

Extracting references from datasets.

extractor

Reference extraction from linguistic datasets.

This module provides functionality to extract cross-references from FrameNet, PropBank, VerbNet, and WordNet data models and build efficient indices for mapping lookups.

CLASS DESCRIPTION
ReferenceExtractor

Main class for extracting and indexing cross-dataset references.

Notes

The extractor builds bidirectional indices for efficient lookup of mappings between datasets. All extracted references include confidence scores and metadata where available.

Classes

ReferenceExtractor()

Extract and index cross-references from linguistic datasets.

This class provides methods to extract cross-references from loaded dataset models and build efficient indices for mapping lookups.

ATTRIBUTE DESCRIPTION
mapping_index

Bidirectional index for all extracted mappings.

TYPE: MappingIndex

verbnet_refs

VerbNet member cross-references by verbnet_key.

TYPE: dict[str, VerbNetCrossRefs]

propbank_refs

PropBank roleset cross-references by roleset_id.

TYPE: dict[str, PropBankCrossRefs]

framenet_relations

FrameNet frame relations by frame_id.

TYPE: dict[int, list[FrameRelation]]

wordnet_sense_index

WordNet sense key to synset offset mapping.

TYPE: dict[str, str]

METHOD DESCRIPTION
extract_all

Extract references from all datasets.

extract_verbnet_references

Extract VerbNet member cross-references.

extract_propbank_references

Extract PropBank roleset cross-references.

extract_framenet_relations

Extract FrameNet frame and FE relations.

extract_wordnet_mappings

Build WordNet sense and synset indices.

Initialize the reference extractor.

Source code in src/glazing/references/extractor.py
def __init__(self) -> None:
    """Create an extractor with empty reference stores and a fresh mapping index."""
    # Per-dataset stores; all start empty and are filled by the extract_* methods.
    self.verbnet_refs: dict[str, VerbNetCrossRefs] = {}
    self.propbank_refs: dict[str, PropBankCrossRefs] = {}
    self.wordnet_sense_index: dict[str, str] = {}
    # defaultdict: reading an unknown frame id yields (and stores) an empty list.
    self.framenet_relations: dict[int, list[FrameRelation]] = defaultdict(list)

    # Shared index that receives every CrossReference produced by extraction.
    self.mapping_index = MappingIndex()
Functions
extract_all(framenet: list[Frame] | None = None, propbank: list[Frameset] | None = None, verbnet: list[VerbClass] | None = None, wordnet: tuple[list[Synset], list[Sense]] | None = None) -> None

Extract references from all provided datasets.

PARAMETER DESCRIPTION
framenet

FrameNet frames to process.

TYPE: list[Frame] | None DEFAULT: None

propbank

PropBank framesets to process.

TYPE: list[Frameset] | None DEFAULT: None

verbnet

VerbNet classes to process.

TYPE: list[VerbClass] | None DEFAULT: None

wordnet

WordNet synsets and senses to process.

TYPE: tuple[list[Synset], list[Sense]] | None DEFAULT: None

Source code in src/glazing/references/extractor.py
def extract_all(
    self,
    framenet: list[Frame] | None = None,
    propbank: list[Frameset] | None = None,
    verbnet: list[VerbClass] | None = None,
    wordnet: tuple[list[Synset], list[Sense]] | None = None,
) -> None:
    """Run every extractor for which data was supplied.

    Datasets are handled in a fixed order: VerbNet, PropBank, FrameNet and
    finally WordNet. An argument that is ``None`` or otherwise falsy (for
    example an empty list) is skipped.

    Parameters
    ----------
    framenet : list[Frame] | None, default=None
        FrameNet frames to process.
    propbank : list[Frameset] | None, default=None
        PropBank framesets to process.
    verbnet : list[VerbClass] | None, default=None
        VerbNet classes to process.
    wordnet : tuple[list[Synset], list[Sense]] | None, default=None
        WordNet synsets and senses to process, as a ``(synsets, senses)`` pair.
    """
    # Extractors that take the dataset as their only argument, in call order.
    single_argument_steps = (
        (verbnet, self.extract_verbnet_references),
        (propbank, self.extract_propbank_references),
        (framenet, self.extract_framenet_relations),
    )
    for dataset, run_extractor in single_argument_steps:
        if dataset:
            run_extractor(dataset)

    # WordNet is the odd one out: its data arrives as a pair to be unpacked.
    if not wordnet:
        return
    wn_synsets, wn_senses = wordnet
    self.extract_wordnet_mappings(wn_synsets, wn_senses)
extract_framenet_relations(frames: list[Frame]) -> None

Extract frame relations and FE mappings from FrameNet.

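Stores each frame's relations by frame ID, indexes inheritance relations ("Inherits from" / "Is Inherited by") as direct FrameNet-to-FrameNet mappings, and processes each frame's lexical units. No frame element mappings are created in the method body shown below.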

PARAMETER DESCRIPTION
frames

FrameNet frames to process.

TYPE: list[Frame]

Source code in src/glazing/references/extractor.py
def extract_framenet_relations(self, frames: list[Frame]) -> None:
    """Extract frame relations and FE mappings from FrameNet.

    Processes frame-to-frame relations and frame element mappings.

    Parameters
    ----------
    frames : list[Frame]
        FrameNet frames to process.
    """
    for frame in frames:
        # Store frame relations
        self.framenet_relations[frame.id] = frame.frame_relations

        # Index frame relations
        for relation in frame.frame_relations:
            if relation.type in ["Inherits from", "Is Inherited by"]:
                # Create inheritance mapping
                source_id = relation.sub_frame_id or frame.id
                target_id = relation.super_frame_id or frame.id

                if source_id and target_id and source_id != target_id:
                    mapping = CrossReference(
                        source_dataset="framenet",
                        source_id=str(source_id),
                        source_version="1.7",
                        target_dataset="framenet",
                        target_id=str(target_id),
                        mapping_type="direct",
                        confidence=MappingConfidence(
                            score=1.0,
                            method="inheritance",
                            factors={"inheritance_score": 1.0},
                        ),
                        metadata=MappingMetadata(
                            created_date=datetime.now(UTC),
                            created_by="framenet",
                            version="1.7",
                            validation_status="validated",
                            notes=f"Frame relation: {relation.type}",
                        ),
                    )
                    self.mapping_index.add_mapping(mapping)

        # Extract lexical unit WordNet mappings if available
        for lu in frame.lexical_units:
            self._extract_lu_mappings(lu, frame.name)
extract_propbank_references(framesets: list[Frameset]) -> None

Extract cross-references from PropBank framesets.

Processes rolesets to extract VerbNet and FrameNet mappings via rolelinks and lexlinks.

PARAMETER DESCRIPTION
framesets

PropBank framesets to process.

TYPE: list[Frameset]

Source code in src/glazing/references/extractor.py
def extract_propbank_references(self, framesets: list[Frameset]) -> None:
    """Collect cross-reference records for every PropBank roleset.

    For each roleset a ``PropBankCrossRefs`` record is stored in
    ``self.propbank_refs`` under the roleset id. The record reuses the
    roleset's ``lexlinks``, gathers the ``rolelinks`` of all of its roles,
    and leaves ``wn_mappings`` empty. The roleset is then passed to
    ``_index_propbank_mappings`` so it can be added to the mapping index.

    Parameters
    ----------
    framesets : list[Frameset]
        PropBank framesets to process.
    """
    # Walk the rolesets of all framesets as one flat stream, in order.
    all_rolesets = (rs for fs in framesets for rs in fs.rolesets)

    for roleset in all_rolesets:
        cross_refs = PropBankCrossRefs(
            roleset_id=roleset.id,
            rolelinks=[],
            lexlinks=roleset.lexlinks,
            wn_mappings=[],  # PropBank doesn't directly map to WordNet
        )

        # Rolelinks live on the individual roles; flatten them in role order.
        cross_refs.rolelinks.extend(
            link for role in roleset.roles for link in role.rolelinks
        )

        self.propbank_refs[roleset.id] = cross_refs
        self._index_propbank_mappings(roleset)
extract_verbnet_references(verb_classes: list[VerbClass]) -> None

Extract cross-references from VerbNet classes.

Processes VerbNet members to extract FrameNet, PropBank, and WordNet mappings. Handles subclasses recursively.

PARAMETER DESCRIPTION
verb_classes

VerbNet classes to process.

TYPE: list[VerbClass]

Source code in src/glazing/references/extractor.py
def extract_verbnet_references(self, verb_classes: list[VerbClass]) -> None:
    """Extract cross-references from each supplied VerbNet class.

    Every class is handed, in order, to ``_extract_class_references``. That
    helper is not shown here; per the original documentation it extracts the
    FrameNet, PropBank and WordNet mappings of the class members and also
    descends into subclasses.

    Parameters
    ----------
    verb_classes : list[VerbClass]
        VerbNet classes to process.
    """
    process_class = self._extract_class_references
    for top_level_class in verb_classes:
        process_class(top_level_class)
extract_wordnet_mappings(synsets: list[Synset], senses: list[Sense]) -> None

Build WordNet sense and synset indices.

Creates mappings between sense keys and synset offsets for cross-reference resolution.

PARAMETER DESCRIPTION
synsets

WordNet synsets to index.

TYPE: list[Synset]

senses

WordNet senses to index.

TYPE: list[Sense]

Source code in src/glazing/references/extractor.py
def extract_wordnet_mappings(self, synsets: list[Synset], senses: list[Sense]) -> None:
    """Build WordNet sense and synset indices.

    Creates mappings between sense keys and synset offsets for
    cross-reference resolution.

    Parameters
    ----------
    synsets : list[Synset]
        WordNet synsets to index.
    senses : list[Sense]
        WordNet senses to index.
    """
    # Build sense key to synset offset index
    for sense in senses:
        self.wordnet_sense_index[sense.sense_key] = sense.synset_offset

    # Index synsets by offset for fast lookup
    synset_index = {synset.offset: synset for synset in synsets}

    # Create internal WordNet mappings (sense to synset)
    for sense in senses:
        if sense.synset_offset in synset_index:
            mapping = CrossReference(
                source_dataset="wordnet",
                source_id=sense.sense_key,
                source_version="3.1",
                target_dataset="wordnet",
                target_id=sense.synset_offset,
                mapping_type="direct",
                confidence=MappingConfidence(
                    score=1.0,
                    method="internal",
                    factors={},
                ),
                metadata=MappingMetadata(
                    created_date=datetime.now(UTC),
                    created_by="wordnet",
                    version="3.1",
                    validation_status="validated",
                    notes="Sense to synset mapping",
                ),
            )
            self.mapping_index.add_mapping(mapping)
get_mappings_for_entity(entity_id: str, source_dataset: DatasetType) -> list[CrossReference]

Get all mappings for a specific entity.

PARAMETER DESCRIPTION
entity_id

Entity identifier in the source dataset.

TYPE: str

source_dataset

Source dataset type.

TYPE: DatasetType

RETURNS DESCRIPTION
list[CrossReference]

All mappings from the specified entity.

Source code in src/glazing/references/extractor.py
def get_mappings_for_entity(
    self, entity_id: str, source_dataset: DatasetType
) -> list[CrossReference]:
    """Look up the mappings that start at a given entity.

    The forward index is queried with the key ``"<source_dataset>:<entity_id>"``.
    When an entry exists, the value held by the index is returned as-is (no
    copy is made here); otherwise a new empty list is returned.

    Parameters
    ----------
    entity_id : str
        Entity identifier in the source dataset.
    source_dataset : DatasetType
        Source dataset type.

    Returns
    -------
    list[CrossReference]
        All mappings from the specified entity, or an empty list if none
        are indexed.
    """
    forward_index = self.mapping_index.forward_index
    lookup_key = f"{source_dataset}:{entity_id}"
    return forward_index.get(lookup_key, [])
get_reverse_mappings(entity_id: str, target_dataset: DatasetType) -> list[CrossReference]

Get all mappings targeting a specific entity.

PARAMETER DESCRIPTION
entity_id

Entity identifier in the target dataset.

TYPE: str

target_dataset

Target dataset type.

TYPE: DatasetType

RETURNS DESCRIPTION
list[CrossReference]

All mappings to the specified entity.

Source code in src/glazing/references/extractor.py
def get_reverse_mappings(
    self, entity_id: str, target_dataset: DatasetType
) -> list[CrossReference]:
    """Look up the mappings that point at a given entity.

    The reverse index is queried with the key ``"<target_dataset>:<entity_id>"``.
    When an entry exists, the value held by the index is returned as-is (no
    copy is made here); otherwise a new empty list is returned.

    Parameters
    ----------
    entity_id : str
        Entity identifier in the target dataset.
    target_dataset : DatasetType
        Target dataset type.

    Returns
    -------
    list[CrossReference]
        All mappings to the specified entity, or an empty list if none are
        indexed.
    """
    reverse_index = self.mapping_index.reverse_index
    lookup_key = f"{target_dataset}:{entity_id}"
    return reverse_index.get(lookup_key, [])