Pipelines

Pipelines is a library designed to generate and evaluate data analysis pipelines.

Transformation interface

Pipelines.trainFunction
train(repository::Repository, card::Card, source; schema = nothing)::CardState

Return a trained model for a given card on a given table in the database repository.db.

source
Pipelines.evaluateFunction
evaluate(
    repository::Repository,
    card::Card,
    state::CardState,
    (source, destination)::Pair,
    id::AbstractString;
    schema = nothing
)

Replace table destination in the database repository.db with the outcome of executing the card on the table source. The new table destination will have an additional column id, to be joined with the row number of the original table.

Here, state represents the result of train(repository, card, source; schema). See also train.

source
Pipelines.get_inputsFunction
get_inputs(c::Card; invert::Bool = false, train::Bool = !invert)::Vector{String}

Return the list of inputs for a given card.

source
Pipelines.get_outputsFunction
get_outputs(c::Card; invert::Bool = false)::Vector{String}

Return the list of outputs for a given card.

source

Pipeline computation

Pipelines.NodeType
Node(
    card::Card,
    state = CardState();
    update::Bool = true
)

Generate a Node object from a Card.

source
Pipelines.train!Function
train!(
    repository::Repository,
    node::Node,
    table::AbstractString;
    schema = nothing
)

Train node on a given table in repository. The field state of node is modified.

See also evaljoin, train_evaljoin!.

source
Pipelines.evaljoinFunction
evaljoin(
    repository::Repository,
    nodes::AbstractVector,
    table::AbstractString,
    [keep_vars];
    schema = nothing
)

evaljoin(
    repository::Repository,
    node::Node,
    (source, destination)::Pair,
    [keep_vars];
    schema = nothing
)

Replace table in the database repository.db with the outcome of executing all the transformations in nodes, without training the nodes. The resulting outputs of the pipeline are joined with the original columns keep_vars (defaults to keeping all columns).

If only a node is provided, then one should pass both source and destination tables.

See also train!, train_evaljoin!.

Return pipeline graph and metadata.

source
Pipelines.train_evaljoin!Function
train_evaljoin!(
    repository::Repository,
    nodes::AbstractVector,
    table::AbstractString,
    [keep_vars];
    schema = nothing
)

train_evaljoin!(
    repository::Repository,
    node::Node,
    (source, destination)::Pair,
    [keep_vars];
    schema = nothing
)

Replace table in the database repository.db with the outcome of executing all the transformations in nodes, after having trained the nodes. The resulting outputs of the pipeline are joined with the original columns keep_vars (defaults to keeping all columns).

If only a node is provided, then one should pass both source and destination tables.

See also train!, evaljoin.

Return pipeline graph and metadata.

source

Pipeline reports

Pipelines.reportFunction
report(repository::Repository, nodes::AbstractVector)

Create default reports for all nodes referring to a given repository. Each node must be of type Node.

source
report(::Repository, ::Card, ::CardState)

Overload this method (replacing Card with a specific card type) to implement a default report for a given card type.

source

Pipeline visualizations

Pipelines.visualizeFunction
visualize(repository::Repository, nodes::AbstractVector)

Create default visualizations for all nodes referring to a given repository. Each node must be of type Node.

source
visualize(::Repository, ::Card, ::CardState)

Overload this method (replacing Card with a specific card type) to implement a default visualization for a given card type.

source

Cards

Pipelines.SplitCardType
struct SplitCard <: Card
    type::String
    label::String
    method::String
    splitter::SplittingMethod
    order_by::Vector{String}
    by::Vector{String}
    output::String
end

Card to split the data into two groups according to a given function splitter.

Currently supported methods are

  • tiles (requires tiles argument, e.g., tiles = [1, 1, 2, 1, 1, 2]),
  • percentile (requires percentile argument, e.g., percentile = 0.9).
source
Pipelines.RescaleCardType
struct RescaleCard <: Card
    type::String
    label::String
    by::Vector{String}
    inputs::Vector{String}
    targets::Vector{String}
    partition::Union{String, Nothing}
    suffix::String
    target_suffix::Union{String, Nothing}
end

Card to rescale one or more columns according to a given rescaler. The supported methods are

  • zscore,
  • maxabs,
  • minmax,
  • log,
  • logistic.

The resulting rescaled variable is added to the table under the name "$(originalname)_$(suffix)".

source
Pipelines.ClusterCardType
struct ClusterCard <: Card
    type::String
    label::String
    method::String
    clusterer::ClusteringMethod
    inputs::Vector{String}
    weights::Union{String, Nothing}
    partition::Union{String, Nothing}
    output::String
end

Cluster inputs based on clusterer. Save resulting column as output.

source
Pipelines.DimensionalityReductionCardType
struct DimensionalityReductionCard <: Card
    type::String
    label::String
    method::String
    projector::ProjectionMethod
    inputs::Vector{String}
    partition::Union{String, Nothing}
    n_components::Int
    output::String
end

Project inputs based on projector. Save resulting column as output.

source
Pipelines.GLMCardType
struct GLMCard <: Card
  type::String
  label::String
  distribution_name::String
  distribution::Distribution
  link_name::Union{String, Nothing}
  link::Link
  inputs::Vector{Any}
  target::String
  formula::FormulaTerm
  weights::Union{String, Nothing}
  partition::Union{String, Nothing}
  suffix::String
end

Run a Generalized Linear Model (GLM) based on formula.

source
Pipelines.InterpCardType
struct InterpCard <: Card
    type::String
    label::String
    method::String
    interpolator::InterpolationMethod
    input::String
    targets::Vector{String}
    partition::Union{String, Nothing} = nothing
    suffix::String = "hat"
end

Interpolate targets based on input.

source
Pipelines.GaussianEncodingCardType
struct GaussianEncodingCard <: Card

Defines a card for applying Gaussian transformations to a specified column.

Fields:

  • type::String: Card type, i.e., "gaussian_encoding".
  • label::String: Label to represent the card in a UI.
  • method::String: Name of the processing method (see below).
  • temporal_preprocessor::TemporalProcessingMethod: Transformation to process a given column (see below).
  • input::String: Name of the column to transform.
  • n_components::Int: Number of Gaussian curves to generate.
  • lambda::Float64: Coefficient for scaling the standard deviation.
  • suffix::String: Suffix added to the output column names.

Notes:

  • The method field determines the preprocessing applied to the column.
  • No automatic selection based on column type. The user must ensure compatibility:
    • "identity": Assumes the column is numeric.
    • "dayofyear": Assumes the column is a date or timestamp.
    • "hourofday": Assumes the column is a time or timestamp.

Methods:

  • Defined in the TEMPORAL_PREPROCESSING_METHODS dictionary:
    • "identity": No transformation.
    • "dayofyear": Applies the SQL dayofyear function.
    • "hourofday": Applies the SQL hour function.
    • "minuteofhour": Computes the minute within the hour.
    • "minuteofday": Computes the minute within the day.

Train:

  • Returns: SimpleTable (Dict{String, AbstractVector}) with Gaussian parameters:
    • σ: Standard deviation for Gaussian transformations.
    • d: Normalization value.
    • μ_1, μ_2, ..., μ_n: Gaussian means.

Evaluate:

  • Steps:
    1. Preprocesses the column using the specified method.
    2. Temporarily registers the Gaussian parameters (params_tbl) using with_table.
    3. Joins the source table with the params table via a CROSS JOIN.
    4. Computes Gaussian-transformed columns.
    5. Selects only the required columns (original and transformed).
    6. Replaces the target table with the final results.
source
Pipelines.StreamlinerCardType
struct StreamlinerCard <: Card
    type::String
    label::String
    model_name::String
    model::Model
    training_name::String
    training::Training
    order_by::Vector{String}
    inputs::Vector{String}
    targets::Vector{String}
    partition::Union{String, Nothing} = nothing
    suffix::String = "hat"
end

Run a Streamliner model, predicting targets from inputs.

source
Pipelines.WildCardType
struct WildCard{train, evaluate} <: Card
    type::String
    label::String
    order_by::Vector{String}
    inputs::Vector{String}
    targets::Vector{String}
    weights::Union{String, Nothing}
    partition::Union{String, Nothing}
    outputs::Vector{String}
end

Custom card that uses arbitrary training and evaluation functions.

source

Card registration

Pipelines.CardConfigType
@kwdef struct CardConfig{T <: Card}
    key::String
    label::String
    needs_targets::Bool
    needs_order::Bool
    allows_weights::Bool
    allows_partition::Bool
    widget_configs::StringDict = StringDict()
    methods::StringDict = StringDict()
end

Configuration used to register a card.

source