Skip to content

glazing.initialize

Initialization and setup functions.

initialize

Initialize glazing by downloading and converting all datasets.

This module provides functionality to automatically download and convert all linguistic datasets on first use or installation.

FUNCTION DESCRIPTION
check_initialization

Check if datasets are initialized.

get_default_data_dir

Get the default data directory for glazing.

get_default_data_path

Get the default path for a converted data file.

initialize_datasets

Download and convert all datasets.

main

Set up all datasets. Downloads raw data and converts to JSON Lines format.

Classes

Functions

check_initialization(data_dir: Path | None = None) -> bool

Check if datasets are initialized.

PARAMETER DESCRIPTION
data_dir

Data directory to check.

TYPE: Path | None DEFAULT: None

RETURNS DESCRIPTION
bool

True if initialized, False otherwise.

Source code in src/glazing/initialize.py
def check_initialization(data_dir: Path | None = None) -> bool:
    """Check if datasets are initialized.

    Parameters
    ----------
    data_dir : Path | None
        Data directory to check.

    Returns
    -------
    bool
        True if initialized, False otherwise.
    """
    if data_dir is None:
        data_dir = get_default_data_dir()

    marker_file = data_dir / ".initialized"
    return marker_file.exists()

get_default_data_dir() -> Path

Get the default data directory for glazing.

RETURNS DESCRIPTION
Path

Default data directory path.

Source code in src/glazing/initialize.py
def get_default_data_dir() -> Path:
    """Get the default data directory for glazing.

    Resolution order:

    1. ``GLAZING_DATA_DIR``, used as-is (no ``glazing`` suffix appended).
    2. ``$XDG_DATA_HOME/glazing`` when ``XDG_DATA_HOME`` is set.
    3. ``~/.local/share/glazing`` otherwise.

    An unset or empty environment variable is skipped. A leading ``~`` in
    either variable is expanded to the user's home directory.

    Returns
    -------
    Path
        Default data directory path.
    """
    # Check GLAZING_DATA_DIR first (used in Docker and for custom installations)
    glazing_data = os.environ.get("GLAZING_DATA_DIR")
    if glazing_data:
        # Values coming from a Dockerfile or .env file never pass through a
        # shell, so "~" must be expanded here or it becomes a literal directory.
        return Path(glazing_data).expanduser()

    # Use XDG_DATA_HOME if available, otherwise ~/.local/share
    xdg_data = os.environ.get("XDG_DATA_HOME")
    if xdg_data:
        base_dir = Path(xdg_data).expanduser()
    else:
        base_dir = Path.home() / ".local" / "share"

    return base_dir / "glazing"

get_default_data_path(filename: str | None = None) -> Path

Get the default path for a converted data file.

PARAMETER DESCRIPTION
filename

Filename to append to the converted data directory. If None, returns the converted directory path.

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
Path

Path to the data file or directory.

Source code in src/glazing/initialize.py
def get_default_data_path(filename: str | None = None) -> Path:
    """Get the default path for a converted data file.

    Converted files live in the ``converted`` subdirectory of the default
    data directory.

    Parameters
    ----------
    filename : str | None, optional
        Name of a file inside the converted data directory. When None or
        empty, the converted directory itself is returned.

    Returns
    -------
    Path
        Path to the data file or directory.
    """
    converted_dir = get_default_data_dir() / "converted"

    # No (or empty) filename: the caller wants the directory itself.
    if not filename:
        return converted_dir

    return converted_dir / filename

initialize_datasets(data_dir: Path | None = None, force: bool = False, verbose: bool = True) -> bool

Download and convert all datasets.

PARAMETER DESCRIPTION
data_dir

Directory to store data. If None, uses default.

TYPE: Path | None DEFAULT: None

force

Force re-download even if data exists.

TYPE: bool DEFAULT: False

verbose

Print progress messages.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
bool

True if successful, False otherwise.

Source code in src/glazing/initialize.py
def initialize_datasets(
    data_dir: Path | None = None, force: bool = False, verbose: bool = True
) -> bool:
    """Download and convert all datasets.

    Parameters
    ----------
    data_dir : Path | None
        Directory to store data. If None, uses default.
    force : bool
        Force re-download even if data exists.
    verbose : bool
        Print progress messages.

    Returns
    -------
    bool
        True if successful, False otherwise.
    """
    if data_dir is None:
        data_dir = get_default_data_dir()

    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # Check if already initialized (unless force is True)
    marker_file = data_dir / ".initialized"
    if marker_file.exists() and not force:
        if verbose:
            click.echo("Datasets already initialized. Use --force to re-download.")
        return True

    if verbose:
        click.echo(f"Initializing glazing datasets in {data_dir}")
        click.echo("=" * 60)

    # Process each dataset
    datasets = ["verbnet", "propbank", "wordnet", "framenet"]
    results = [_process_dataset(name, data_dir, verbose) for name in datasets]
    success = all(results)

    # Create marker file
    if success:
        marker_file.touch()
        if verbose:
            click.echo("\n" + "=" * 60)
            click.echo("✅ All datasets successfully initialized!")
            click.echo(f"Data location: {data_dir}")
    elif verbose:
        click.echo("\n⚠️  Some datasets failed to initialize", err=True)

    return success

main(data_dir: str | Path | None, force: bool, quiet: bool) -> None

Set up all datasets. Downloads raw data and converts to JSON Lines format.

Source code in src/glazing/initialize.py
@click.command()
@click.option(
    "--data-dir",
    type=click.Path(),
    help="Directory to store datasets (default: ~/.local/share/glazing)",
)
@click.option("--force", is_flag=True, help="Force re-download even if data exists")
@click.option("--quiet", is_flag=True, help="Suppress output messages")
def main(data_dir: str | Path | None, force: bool, quiet: bool) -> None:
    """Set up all datasets. Downloads raw data and converts to JSON Lines format."""
    # Command-line entry point: the option values are forwarded to
    # initialize_datasets and its boolean result becomes the exit status.

    # Leave None untouched so initialize_datasets falls back to its default.
    target_dir = None if data_dir is None else Path(data_dir)

    succeeded = initialize_datasets(
        data_dir=target_dir,
        force=force,
        verbose=not quiet,
    )

    # Exit status 0 on success, 1 if any dataset failed.
    exit_code = 0 if succeeded else 1
    sys.exit(exit_code)