diff --git a/docs/advanced/model_optimization.rst b/docs/advanced/model_optimization.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9465353c2e18a836dc86557a277d5258f9502f5e
--- /dev/null
+++ b/docs/advanced/model_optimization.rst
@@ -0,0 +1,127 @@
+========================
+hls4ml Optimization API
+========================
+
+Pruning and weight sharing are effective techniques to reduce model footprint and computational requirements. The hls4ml Optimization API introduces hardware-aware pruning and weight sharing.
+By defining custom objectives, the algorithm solves a Knapsack optimization problem aimed at maximizing model performance, while keeping the target resource(s) at a minimum. Out-of-the box objectives include network sparsity, GPU FLOPs, Vivado DSPs, memory utilization etc.
+
+The code block below showcases three use cases of the hls4ml Optimization API - network sparsity (unstructured pruning), GPU FLOPs (structured pruning) and Vivado DSP utilization (pattern pruning). First, we start with unstructured pruning:
+
+.. code-block:: Python
+    from sklearn.metrics import accuracy_score
+    from tensorflow.keras.optimizers import Adam
+    from tensorflow.keras.metrics import CategoricalAccuracy
+    from tensorflow.keras.losses import CategoricalCrossentropy
+    from hls4ml.optimization.keras import optimize_model
+    from hls4ml.optimization.keras.utils import get_model_sparsity
+    from hls4ml.optimization.attributes import get_attributes_from_keras_model
+    from hls4ml.optimization.objectives import ParameterEstimator
+    from hls4ml.optimization.scheduler import PolynomialScheduler
+    # Define baseline model and load data
+    # X_train, y_train = ...
+    # X_val, y_val = ...
+    # X_test, y_test = ...
+    # baseline_model = ...
+    # Evaluate baseline model
+    y_baseline = baseline_model.predict(X_test)
+    acc_base = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_baseline, axis=1))
+    sparsity, layers = get_model_sparsity(baseline_model)
+    print(f'Baseline Keras accuracy: {acc_base}')
+    print(f'Baseline Keras sparsity, overall: {sparsity}')
+    print(f'Baseline Keras sparsity, per-layer: {layers}')
+    # Defining training parameters
+    # Epochs refers to the number of maximum epochs to train a model, after imposing some sparsity
+    # If the model is pre-trained, a good rule of thumb is to use between a 1/3 and 1/2 of the number of epochs used to train baseline model
+    epochs = 10
+    batch_size = 128
+    metric = 'accuracy'
+    optimizer = Adam()
+    loss_fn = CategoricalCrossentropy(from_logits=True)
+
+    # Define the metric to monitor, as well as if its increasing or decreasing
+    # This disctinction allows us to optimize both regression and classification models
+    # In regression, e.g. minimize validation MSE & for classification e.g. maximize accuracy
+    metric, increasing = CategoricalAccuracy(), True
+    # Relative tolerance (rtol) is the the relative loss in metric the optimized model is allowed to incur
+    rtol = 0.975
+
+    # A scheduler defines how the sparsity is incremented at each step
+    # In this case, the maximum sparsity is 50% and it will be applied at a polynomially decreasing rate, for 10 steps
+    # If the final sparsity is unspecified, it is set to 100%
+    # The optimization algorithm stops either when (i) the relative drop in performance is below threshold or (ii) final sparsity reached
+    scheduler = PolynomialScheduler(5, final_sparsity=0.5)
+    # Get model attributes
+    model_attributes = get_attributes_from_keras_model(baseline_model)
+
+    # Optimize model
+    # ParameterEstimator is the objective and, in this case, the objective is to minimize the total number of parameters
+    optimized_model = optimize_model(
+        baseline_model, model_attributes, ParameterEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+    )
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    sparsity, layers = get_model_sparsity(optimized_model)
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+    print(f'Optimized Keras sparsity, overall: {sparsity}')
+    print(f'Opimized Keras sparsity, per-layer: {layers}')
+
+In a similar manner, it is possible to target GPU FLOPs or Vivado DSPs. However, in that case, sparsity is not equivalent to model sparsity.
+Instead, it is the sparsity of the target resource. As an example: Starting with a network utilizing 512 DSPs and a final sparsity of 50%; the optimized network will use 256 DSPs.
+
+To optimize GPU FLOPs, the code is similar to above:
+.. code-block:: Python
+    from hls4ml.optimization.objectives.gpu_objectives import GPUFLOPEstimator
+
+    # Optimize model
+    # Note the change from ParameterEstimator to GPUFLOPEstimator
+    optimized_model = optimize_model(
+        baseline_model, model_attributes, GPUFLOPEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs, optimizer, loss_fn, metric, increasing, rtol
+    )
+
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+    # Note the difference in total number of parameters
+    # Optimizing GPU FLOPs is equivalent to removing entire structures (filters, neurons) from the network
+    print(baseline_model.summary())
+    print(optimized_model.summary())
+
+Finally, optimizing Vivado DSPs is possible, given a hls4ml config:
+.. code-block:: Python
+    from hls4ml.utils.config import config_from_keras_model
+    from hls4ml.optimization.objectives.vivado_objectives import VivadoDSPEstimator
+
+    # Note the change from optimize_model to optimize_keras_for_hls4ml
+    # The function optimize_keras_for_hls4ml acts as a wrapper for the function, parsing hls4ml config to model attributes
+    from hls4ml.optimization import optimize_keras_for_hls4ml
+
+    # Create hls4ml config
+    default_reuse_factor = 4
+    default_precision = 'ac_fixed<16, 6>'
+    hls_config = config_from_keras_model(baseline_model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor)
+    hls_config['IOType'] = 'io_parallel'
+     hls_config['Model']['Strategy'] = 'Resource'   # Strategy must be present for optimisation
+
+    # Optimize model
+    # Note the change from ParameterEstimator to VivadoDSPEstimator
+    optimized_model = optimize_keras_for_hls4ml(
+        baseline_model, model_attributes, VivadoDSPEstimator, scheduler,
+        X_train, y_train, X_val, y_val, batch_size, epochs,
+        optimizer, loss_fn, metric, increasing, rtol
+    )
+
+    # Evaluate optimized model
+    y_optimized = optimized_model.predict(X_test)
+    acc_optimized = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_optimized, axis=1))
+    print(f'Optimized Keras accuracy: {acc_optimized}')
+
+There are two more Vivado "optimizers" - VivadoFFEstimator, aimed at reducing register utilisation and VivadoMultiObjectiveEstimator, aimed at optimising BRAM and DSP utilisation.
+Note, to ensure DSPs are optimized, "unrolled" Dense multiplication must be used before synthesing HLS, by modifying the config:
+.. code-block:: Python
+    hls_config = config_from_keras_model(optimized_model)
+    hls_config['Model']['DenseResourceImplementation'] = 'Unrolled'
+    # Any addition hls4ml config, such as strategy, reuse factor etc...
diff --git a/hls4ml/optimization/__init__.py b/hls4ml/optimization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8301f58925ce028bbba15a356e2ef04196f4d27
--- /dev/null
+++ b/hls4ml/optimization/__init__.py
@@ -0,0 +1,107 @@
+import numpy as np
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+from hls4ml.optimization.keras import optimize_model
+
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
+
+
+def optimize_keras_for_hls4ml(
+    keras_model,
+    hls_config,
+    objective,
+    scheduler,
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    batch_size,
+    epochs,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    rtol,
+    callbacks=None,
+    ranking_metric='l1',
+    local=False,
+    verbose=False,
+    rewinding_epochs=1,
+    cutoff_bad_trials=3,
+    directory='hls4ml-optimization',
+    tuner='Bayesian',
+    knapsack_solver='CBC_MIP',
+    regularization_range=default_regularization_range,
+):
+    '''
+    Top-level function for optimizing a Keras model, given hls4ml config and a hardware objective(s)
+
+    Args:
+    - keras_model (keras.Model): Model to be optimized
+    - hls_config (dict): hls4ml configuration, obtained from hls4ml.utils.config.config_from_keras_model(...)
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+        Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
+        Sparsity scheduler, choose between constant, polynomial and binary
+    - X_train (np.array): Training inputs
+    - y_train (np.array): Training labels
+    - X_val (np.array): Validation inputs
+    - y_val (np.array): Validation labels
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+    - optimizer (keras.optimizers.Optimizer or equivalent-string description): Optimizer used during training
+    - loss_fn (keras.losses.Loss or equivalent loss description): Loss function used during training
+    - validation_metric (keras.metrics.Metric or equivalent loss description): Validation metric, used as a baseline
+    - increasing (boolean): If the metric improves with increased values;
+        e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance;
+        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+
+    Kwargs:
+    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+    - ranking_metric (string): Metric used for rannking weights and structures;
+        currently supported l1, l2, saliency and Oracle
+    - local (boolean): Layer-wise or global pruning
+    - verbose (boolean): Display debug logs during model optimization
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+        allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+        should model pruning / weight sharing stop
+    - directory (string): Directory to store temporary results
+    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+        default usually works well; for very large networks, greedy algorithm might be more suitable
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+
+    # Extract model attributes
+    model_attributes = get_attributes_from_keras_model_and_hls4ml_config(keras_model, hls_config)
+
+    # Optimize model
+    return optimize_model(
+        keras_model,
+        model_attributes,
+        objective,
+        scheduler,
+        X_train,
+        y_train,
+        X_val,
+        y_val,
+        batch_size,
+        epochs,
+        optimizer,
+        loss_fn,
+        validation_metric,
+        increasing,
+        rtol,
+        callbacks=callbacks,
+        ranking_metric=ranking_metric,
+        local=local,
+        verbose=verbose,
+        rewinding_epochs=rewinding_epochs,
+        cutoff_bad_trials=cutoff_bad_trials,
+        directory=directory,
+        tuner=tuner,
+        knapsack_solver=knapsack_solver,
+        regularization_range=regularization_range,
+    )
diff --git a/hls4ml/optimization/attributes.py b/hls4ml/optimization/attributes.py
new file mode 100644
index 0000000000000000000000000000000000000000..672e2706a0c33d79b6f704c334335d52b30409ed
--- /dev/null
+++ b/hls4ml/optimization/attributes.py
@@ -0,0 +1,246 @@
+import numpy as np
+
+import hls4ml
+from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS
+
+
+class hls4mlAttributes:
+    '''
+    A class for storing hls4ml information of a single layer
+
+    Args:
+    - n_in (int): Number of inputs (rows) for Dense matrix multiplication
+    - n_out (int): Number of outputs (cols) for Dense matrix multiplication
+    - io_type (string): io_parallel or io_stream
+    - strategy (string): Resource or Latency
+    - weight_precision (FixedPrecisionType): Layer weight precision
+    - output_precision (FixedPrecisionType): Layer output precision
+    - reuse_factor (int): Layer reuse factor
+    - parallelization_factor (int): Layer parallelization factor - [applicable to io_parallel Conv2D]
+    '''
+
+    def __init__(
+        self, n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor=1
+    ):
+        if not isinstance(weight_precision, (FixedPrecisionType, IntegerPrecisionType)):
+            raise Exception('Layer weight precision is not in valid format')
+
+        if not isinstance(output_precision, (FixedPrecisionType, IntegerPrecisionType)):
+            raise Exception('Layer weight precision is not in valid format')
+
+        if strategy not in ('Latency', 'latency', 'Resource', 'resource'):
+            raise Exception('Unknown layer strategy')
+
+        if io_type not in ('io_parallel', 'io_stream'):
+            raise Exception('Unknown IO type')
+
+        self.n_in = n_in
+        self.n_out = n_out
+        self.io_type = io_type
+        self.strategy = strategy
+        self.weight_precision = weight_precision
+        self.output_precision = output_precision
+        self.reuse_factor = reuse_factor
+        self.parallelization_factor = parallelization_factor
+
+
+class OptimizationAttributes:
+    '''
+    A class for storing layer optimization attributes
+
+    Args:
+        - structure_type (enum): Targeted structure - unstructured, structured, pattern, block
+        - pruning (boolean): Should pruning be applied to the layer
+        - weight_sharing (boolean): Should weight sharing be applied to the layer
+        - block_shape (tuple): Block shape if structure_type == block
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - consecutive_patterns (int): How many consecutive patterns are grouped together if structure_type == pattern
+
+    Notes:
+        - In the case of hls4ml, pattern_offset is equivalent to the number of weights processed in parallel
+        - The pattern_offset is n_in * n_out / reuse_factor; default case (=1) is equivalent to no unrolling
+    '''
+
+    def __init__(
+        self,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        pruning=False,
+        weight_sharing=False,
+        block_shape=(1, 1),
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown structure type')
+
+        self.structure_type = structure_type
+        self.pruning = pruning
+        self.weight_sharing = weight_sharing
+        self.block_shape = block_shape
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+
+class LayerAttributes:
+    '''
+    A class for storing layer information
+
+    Args:
+        - name (string): Layer name
+        - layer_type (keras.Layer): Layer type (e.g. Dense, Conv2D etc.)
+        - inbound_layers (list): List of parent nodes, identified by name
+        - weight_shape (tuple): Layer weight shape
+        - input_shape (tuple): Layer input shape
+        - output_shape (tuple): Layer output shape
+        - optimizable (bool): Should optimizations (pruning, weight sharing) be applied to this layer
+        - optimization_attributes (OptimizationAttributes): Type of optimization,
+            pruning or weight sharing, block shape and pattern offset
+        - args (dict): Additional information,
+            e.g. hls4mlAttributes; dictionary so it can be generic enough for different platforms
+    '''
+
+    def __init__(
+        self,
+        name,
+        layer_type,
+        inbound_layers,
+        weight_shape,
+        input_shape,
+        output_shape,
+        optimizable,
+        optimization_attributes,
+        args,
+    ):
+        self.name = name
+        self.layer_type = layer_type
+        self.inbound_layers = inbound_layers
+        self.weight_shape = weight_shape
+        self.input_shape = input_shape
+        self.output_shape = output_shape
+        self.optimizable = optimizable
+        self.optimization_attributes = optimization_attributes
+        self.args = args
+
+    def update_args(self, updates):
+        self.args.update(updates)
+
+    def __str__(self):
+        return (
+            f'name: {self.name}, '
+            f'layer_type: {self.layer_type}, '
+            f'inbound_layers: {self.inbound_layers}, '
+            f'weight_shape: {self.weight_shape}, '
+            f'input_shape: {self.input_shape}, '
+            f'output_shape: {self.output_shape}, '
+            f'optimizable: {self.optimizable}, '
+            f'optimization_attributes: {self.optimization_attributes}, '
+            f'args: {self.args}, '
+        )
+
+
+def get_attributes_from_keras_model(model):
+    '''
+    Given a Keras model, builds a dictionary of class attributes
+    Additional arguments (e.g. reuse factor), depend on the target hardware platform and are inserted later
+    Per-layer pruning sype (structured, pattern etc.), depend on the pruning objective and are inserted later
+
+    Args:
+        - model (keras.model): Model to extract attributes from
+
+    Return:
+        - model_attributes (dict): Each key corresponds to a layer name, values are instances of LayerAttribute
+    '''
+    is_sequential = model.__class__.__name__ == 'Sequential'
+    model_attributes = {}
+
+    for i, layer in enumerate(model.layers):
+        inbound_layers = []
+        if is_sequential and i > 0:
+            inbound_layers.append(model.layers[i - 1])
+        elif not is_sequential:
+            nodes = model.get_config()['layers'][i]['inbound_nodes']
+            if len(nodes) > 0:
+                inbound_layers.append(node[0] for node in nodes[0])
+
+        layer_weights = layer.get_weights()
+        weight_shape = layer_weights[0].shape if len(layer_weights) > 0 else ()
+
+        model_attributes[layer.name] = LayerAttributes(
+            layer.name,
+            layer.__class__,
+            inbound_layers,
+            weight_shape,
+            layer.input_shape[1:],
+            layer.output_shape[1:],
+            False,
+            OptimizationAttributes(),
+            {},
+        )
+
+    return model_attributes
+
+
+def get_attributes_from_keras_model_and_hls4ml_config(model, config):
+    '''
+    Given a Keras model and hls4ml configuration, builds a dictionary of class attributes
+    Per-layer pruning sype (structured, pruning etc.), depend on the pruning objective and are inserted later
+
+    Args:
+        - model (keras.model): Model to extract attributes from
+        - config (dict): hls4ml dictionary
+
+    Return:
+        - model_attributes (dict): Each key corresponds to a layer name, values are LayerAttribute instances
+    '''
+
+    # Extract Keras attributes
+    model_attributes = get_attributes_from_keras_model(model)
+
+    # Extract hls4ml attributes
+    io_type = config['IOType']
+    default_reuse_factor = config['Model']['ReuseFactor']
+    default_strategy = config['Model']['Strategy']
+    default_precision = config['Model']['Precision']
+
+    # Build dictionary
+    for layer in model_attributes:
+        if model_attributes[layer].layer_type in SUPPORTED_LAYERS:
+            n_in, n_out = __get_layer_mult_size(model_attributes[layer])
+            layer_config = config['LayerName'][layer] if layer in config['LayerName'] else {}
+            reuse_factor = layer_config['ReuseFactor'] if 'ReuseFactor' in layer_config else default_reuse_factor
+            parallelization_factor = layer_config['ParallelizationFactor'] if 'ParallelizationFactor' in layer_config else 1
+            strategy = layer_config['Strategy'] if 'Strategy' in layer_config else default_strategy
+            weight_precision = (
+                layer_config['Precision']['weight'] if 'weight' in layer_config['Precision'] else default_precision
+            )
+            weight_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(weight_precision)
+            output_precision = (
+                layer_config['Precision']['result'] if 'result' in layer_config['Precision'] else default_precision
+            )
+            output_precision = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(output_precision)
+
+            hls4ml_attributes = hls4mlAttributes(
+                n_in, n_out, io_type, strategy, weight_precision, output_precision, reuse_factor, parallelization_factor
+            )
+            model_attributes[layer].update_args({'hls4ml_attributes': hls4ml_attributes})
+
+    return model_attributes
+
+
+def __get_layer_mult_size(attributes):
+    '''
+    Helper function to calculate layer multiplication size
+    '''
+    if 'Dense' in attributes.layer_type.__name__:
+        n_in = np.prod(attributes.input_shape)
+        n_out = np.prod(attributes.output_shape)
+        return n_in, n_out
+
+    if 'Conv' in attributes.layer_type.__name__:
+        n_in = np.prod(attributes.weight_shape[0:-2])
+        n_out = attributes.weight_shape[-1]
+        return n_in, n_out
+
+    raise Exception(f'Cannot get mult size for layer {attributes.name}')
diff --git a/hls4ml/optimization/config.py b/hls4ml/optimization/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..0879c47f6259a1c601674157720eae64d6f572d9
--- /dev/null
+++ b/hls4ml/optimization/config.py
@@ -0,0 +1,47 @@
+from enum import Enum
+
+'''
+A list of currently supported structures when optimizing (pruning, weight sharing)
+For more information, see attributes.py
+
+1. Unstructured:
+    - Pruning: Y
+    - Weight sharing: N
+    - Description: Removes (zeroes out) individual weights
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+2. Structured:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a structure:
+        - Dense: Neurons, determined by their outgoing connections (columns in Keras weight tensors)
+        - Conv2D: Filters (structures of size filt_width x filt_height x n_chan)
+        - Notes:
+            - For Dense, it was also possible optimize by incoming connections (rows);
+                However, removing zero neurons becomes harder because of Keras Surgeon
+            - For Conv2D, significant literature explored pruning channels; currently not supported
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+3. Pattern:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a group
+       Groups are determined by a variable, n, and every n-th weight in the flattened,
+       Transposed (Resource) weight tensor is collected and stored in the same group
+       Equivalent to pruning/quantizing weight processed by the same DSP in hls4ml
+    - Supports: All layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+4. Block:
+    - Pruning: Y
+    - Weight sharing: Y
+    - Description: Zeroes out or quantizes all the weights in a block of size (w, h)
+    - Supports: All rank-2 (e.g. Dense, but not Conv2D) layers in SUPPORTED_LAYERS (hls4ml.optimization.keras)
+
+'''
+
+
+class SUPPORTED_STRUCTURES(Enum):
+    UNSTRUCTURED = 'unstructured'
+    STRUCTURED = 'structured'
+    PATTERN = 'pattern'
+    BLOCK = 'block'
diff --git a/hls4ml/optimization/keras/__init__.py b/hls4ml/optimization/keras/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a07f0adac7c83c8e49cc0d083c49a2964711e205
--- /dev/null
+++ b/hls4ml/optimization/keras/__init__.py
@@ -0,0 +1,359 @@
+import os
+import time
+
+import numpy as np
+import tensorflow as tf
+
+# Enables printing of loss tensors during custom training loop
+from tensorflow.python.ops.numpy_ops import np_config
+
+import hls4ml.optimization.keras.utils as utils
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.builder import build_optimizable_model, remove_custom_regularizers
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS, TMP_DIRECTORY
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.keras.reduction import reduce_model
+from hls4ml.optimization.scheduler import OptimizationScheduler
+
+np_config.enable_numpy_behavior()
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
+
+
+def optimize_model(
+    model,
+    model_attributes,
+    objective,
+    scheduler,
+    X_train,
+    y_train,
+    X_val,
+    y_val,
+    batch_size,
+    epochs,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    rtol,
+    callbacks=None,
+    ranking_metric='l1',
+    local=False,
+    verbose=False,
+    rewinding_epochs=1,
+    cutoff_bad_trials=1,
+    directory=TMP_DIRECTORY,
+    tuner='Bayesian',
+    knapsack_solver='CBC_MIP',
+    regularization_range=default_regularization_range,
+):
+    '''
+    Top-level function for optimizing a Keras model, given objectives
+
+    Args:
+    - model (keras.Model): Model to be optimized
+    - model_attributes (dict): Layer-wise model attributes,
+        obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+    - objective (hls4ml.optimization.objectives.ObjectiveEstimator):
+        Parameter, hardware or user-defined objective of optimization
+    - scheduler (hls4ml.optimization.schduler.OptimizationScheduler):
+        Sparsity scheduler, choose between constant, polynomial and binary
+    - X_train (np.array): Training inputs
+    - y_train (np.array): Training labels
+    - X_val (np.array): Validation inputs
+    - y_val (np.array): Validation labels
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+    - optimizer (keras.optimizers.Optimizer or equivalent-string description):
+        Optimizer used during training
+    - loss_fn (keras.losses.Loss or equivalent loss description):
+        Loss function used during training
+    - validation_metric (keras.metrics.Metric or equivalent loss description):
+        Validation metric, used as a baseline
+    - increasing (boolean): If the metric improves with increased values;
+        e.g. accuracy -> increasing = True, MSE -> increasing = False
+    - rtol (float): Relative tolerance;
+        pruning stops when pruned_validation_metric < (or >) rtol * baseline_validation_metric
+
+    Kwargs:
+    - callbacks (list of keras.callbacks.Callback) Currently not supported, developed in future versions
+    - ranking_metric (string): Metric used for rannking weights and structures;
+        currently supported l1, l2, saliency and Oracle
+    - local (boolean): Layer-wise or global pruning
+    - verbose (boolean): Display debug logs during model optimization
+    - rewinding_epochs (int): Number of epochs to retrain model without weight freezing,
+        allows regrowth of previously pruned weights
+    - cutoff_bad_trials (int): After how many bad trials (performance below threshold),
+        should model pruning / weight sharing stop
+    - directory (string): Directory to store temporary results
+    - tuner (str): Tuning alogorithm, choose between Bayesian, Hyperband and None
+    - knapsack_solver (str): Algorithm to solve Knapsack problem when optimizing;
+        default usually works well; for very large networks, greedy algorithm might be more suitable
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+
+    if not isinstance(scheduler, OptimizationScheduler):
+        raise Exception(
+            'Scheduler must be an instance of from hls4ml.optimization.scheduler.OptimizationScheduler'
+            'If you provided string description (e.g. \'constant\')'
+            'Please use an object instance (i.e. ConstantScheduler()).'
+            'For a full list of supported schedulers, refer to hls4ml.optimization.scheduler.'
+        )
+
+    if epochs <= rewinding_epochs:
+        raise Exception(
+            'Please increase the number of epochs. \
+                       The current epoch number is too small to perform effective pruning & weight rewinding'
+        )
+
+    if ranking_metric not in SUPPORTED_METRICS:
+        raise Exception('Unknown metric for ranking weights')
+
+    # Loss function needs to be converted to a function, string description cannot be used during custom training loop
+    if isinstance(loss_fn, str):
+        loss_fn = tf.keras.losses.get(loss_fn)
+
+    # Split data set into batches
+    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
+    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
+    validation_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
+    validation_dataset = validation_dataset.shuffle(buffer_size=1024).batch(batch_size)
+
+    # Evaluate baseline performance
+    # Use built-in function, and return as list - the metric is the second element (loss if first)
+    model.compile(optimizer, loss_fn, metrics=[validation_metric])
+    baseline_performance = model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+    if verbose:
+        print(f'Baseline performance on validation set: {baseline_performance}')
+
+    # Save best weights
+    # Always save weights to a file, to reduce memory utilization
+    if not os.path.isdir(directory):
+        os.mkdir(directory)
+    if not os.path.isdir(f'{directory}/optimization'):
+        os.mkdir(f'{directory}/optimization')
+    model.save_weights(f'{directory}/optimization/best_weights.h5')
+
+    # Identify optimizable layers, given the current objective
+    last_optimizable_layer = utils.get_last_layer_with_weights(model)
+    for i, layer in enumerate(model.layers):
+        if isinstance(layer, SUPPORTED_LAYERS):
+            optimizable, optimization_attributes = objective.is_layer_optimizable(model_attributes[layer.name])
+            model_attributes[layer.name].optimizable = optimizable
+            model_attributes[layer.name].optimization_attributes = optimization_attributes
+
+            # In the last layer, structured pruning can't be applied, as it removes output labels
+            # Weight sharing, as well as all other types of pruning (unstructured, block etc.) are applicable
+            if (
+                i >= last_optimizable_layer
+                and optimization_attributes.structure_type == SUPPORTED_STRUCTURES.STRUCTURED
+                and optimization_attributes.pruning
+            ):
+                model_attributes[layer.name].optimization_attributes.pruning = False
+                model_attributes[layer.name].optimizable = model_attributes[
+                    layer.name
+                ].optimization_attributes.weight_sharing
+        else:
+            model_attributes[layer.name].optimizable = False
+            model_attributes[layer.name].optimization_attributes = None
+
+    # Add regularization loss to optimizable layers
+    optimizable_model = build_optimizable_model(
+        model,
+        model_attributes,
+        optimizer,
+        loss_fn,
+        validation_metric,
+        increasing,
+        train_dataset,
+        validation_dataset,
+        batch_size,
+        epochs // 2,
+        verbose=verbose,
+        directory=directory,
+        tuner=tuner,
+        regularization_range=regularization_range,
+    )
+
+    # Create class for masked backprop (weight freezing)
+    masked_backprop = MaskedBackprop(optimizable_model, loss_fn, model_attributes)
+
+    # In certain cases, the model might underperform at the current sparsity level, but perform better at a higher sparsity
+    # Therefore, monitor the models performance over several sparsity levels and
+    # Only stop pruning after high loss over several trials
+    bad_trials = 0
+    sparsity_conditions = True
+    target_sparsity = scheduler.get_sparsity()
+
+    while sparsity_conditions:
+        # TODO - This might cause OOM issues on large models / data sets, since it is not done in batches
+        gradients = (
+            utils.get_model_gradients(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'gradients' else {}
+        )
+        hessians = (
+            utils.get_model_hessians(optimizable_model, loss_fn, X_train, y_train) if ranking_metric == 'saliency' else {}
+        )
+
+        # Mask weights
+        masks, offsets = get_model_masks(
+            optimizable_model,
+            model_attributes,
+            target_sparsity,
+            objective,
+            metric=ranking_metric,
+            local=local,
+            gradients=gradients,
+            hessians=hessians,
+            knapsack_solver=knapsack_solver,
+        )
+        for layer in optimizable_model.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+                layer_weights = layer.get_weights()
+                layer_weights[0] = np.multiply(layer_weights[0], masks[layer.name]) + offsets[layer.name]
+                layer.set_weights(layer_weights)
+
+        # Mask gradients
+        # Before training the model at the next sparsity level, reset internal states
+        # Furthemore, modern optimizers (e.g. Adam) accumulate gradients during backprop
+        # Therefore, even if the gradient for a weight is zero, it might be updated, due to previous gradients
+        # Avoid this by resetting the internal variables of an optimizer
+        optimizable_model.reset_metrics()
+        optimizable_model.reset_states()
+        for x in optimizable_model.optimizer.variables():
+            x.assign(tf.zeros_like(x))
+        masked_backprop.update_masks(masks)
+
+        # Train model with weight freezing [pruning]
+        if verbose:
+            print(f'Pruning with a target sparsity of {target_sparsity * 100.0}% [relative to objective]')
+        for epoch in range(epochs - rewinding_epochs):
+            start_time = time.time()
+            epoch_loss_avg = tf.keras.metrics.Mean()
+
+            # Masked backprop
+            for X, y in train_dataset:
+                loss_value = masked_backprop(tf.convert_to_tensor(X), tf.convert_to_tensor(y), target_sparsity)
+                epoch_loss_avg.update_state(loss_value)
+
+            # Evaluate on validation set and print epoch summary
+            if verbose:
+                val_res = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)
+                t = time.time() - start_time
+                avg_loss = round(epoch_loss_avg.result(), 3)
+                print(f'Epoch: {epoch + 1} - Time: {t}s - Average training loss: {avg_loss}')
+                print(f'Epoch: {epoch + 1} - learning_rate: {optimizable_model.optimizer.learning_rate.numpy()}')
+                print(f'Epoch: {epoch + 1} - Validation loss: {val_res[0]} - Performance on validation set: {val_res[1]}')
+
+        # Check if model works after pruning
+        pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+        if verbose:
+            print(f'Optimized model performance on validation set, after fine-tuning: {pruned_performance}')
+
+        if __compare__(pruned_performance, rtol * baseline_performance, not increasing):
+            bad_trials = 0
+            sparsity_conditions, target_sparsity = scheduler.update_step()
+            optimizable_model.save_weights(f'{directory}/optimization/best_weights.h5')
+        else:
+            bad_trials += 1
+            sparsity_conditions, target_sparsity = scheduler.repair_step()
+
+        # If the model performed poorly over several sparsity levels, stop optimization [maximum sparsity reached]
+        if bad_trials > cutoff_bad_trials:
+            break
+
+        # Train model without weight freezing [rewinding]
+        if verbose:
+            print(f'Starting weight rewinding for {rewinding_epochs} epochs')
+        optimizable_model.fit(
+            train_dataset,
+            validation_data=validation_dataset,
+            batch_size=batch_size,
+            epochs=rewinding_epochs,
+            callbacks=callbacks,
+            verbose=verbose,
+        )
+
+    # Load best weights
+    optimizable_model.load_weights(f'{directory}/optimization/best_weights.h5')
+
+    # Remove regularizers and save best model
+    optimizable_model = remove_custom_regularizers(optimizable_model)
+    optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
+
+    # In GPU FLOP Optimization, remove structures to achieve speed-up & fine-tune the smaller architecture
+    # TODO - Extend for Resource strategy in hls4ml FF optimisation
+    if objective.__name__ in ('GPUFLOPEstimator'):
+        optimizable_model = reduce_model(optimizable_model)
+        optimizable_model.compile(optimizer, loss_fn, metrics=[validation_metric])
+        optimizable_model.fit(
+            train_dataset,
+            validation_data=validation_dataset,
+            batch_size=batch_size,
+            epochs=int(1.5 * epochs),
+            callbacks=callbacks,
+        )
+
+    # Evaluate final optimized model [purely for debugging / informative purposes]
+    if verbose:
+        pruned_performance = optimizable_model.evaluate(validation_dataset, verbose=0, return_dict=False)[-1]
+        print(f'Optimized model performance on validation set: {pruned_performance}')
+
+    return optimizable_model
+
+
+class MaskedBackprop:
+    '''
+    A helper class to perform masked backprop (training with frozen weights)
+    The important function is __call__ as it masks gradients, based on frozen weights
+    While this function can exist without a class, taking masks as input would deplete memory
+    Since a new graph is created for every call, causing a large run-time
+    The trick is to set the masks, models etc. as class variables and then pass the sparsity
+    As the sparsity changes, a new graph of the function is created
+    '''
+
+    def __init__(self, model, loss_fn, attributes):
+        self.model = model
+        self.loss_fn = loss_fn
+        self.attributes = attributes
+        self.masks = {}
+
+    def update_masks(self, masks):
+        self.masks = masks
+
+    @tf.function
+    def __call__(self, X, y, s):
+        '''
+        Helper function performing backprop
+
+        Args:
+            - X (tf.Tensor): Input data
+            - y (tf.Tensor): Output data
+            - s (float): Sparsity
+
+        Return:
+            - loss (tf.Varilable): Model loss with input X and output y
+        '''
+        grads = []
+        with tf.GradientTape(persistent=True) as tape:
+            output = self.model(X, training=True)
+            loss = self.loss_fn(y, output)
+            loss += tf.add_n(self.model.losses)
+            for layer in self.model.layers:
+                if layer.trainable_weights:
+                    grad = tape.gradient(loss, layer.trainable_weights)
+                    if self.attributes[layer.name].optimizable:
+                        grad[0] = tf.multiply(grad[0], self.masks[layer.name])
+                    grads += grad
+        self.model.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
+        return loss
+
+
+def __compare__(x, y, leq=False):
+    '''
+    Helper function for comparing two values, x & y
+    Sometimes, we use the >= sign - e.g. pruned_accuracy >= tolerance * baseline_accuracy [ 0 <= tolerance <= 1]
+    Other times, use the <= sign - e.g. pruned_mse <= tolerance * baseline_mse [tolerance >= 1]
+    '''
+    if leq:
+        return x <= y
+    else:
+        return x >= y
diff --git a/hls4ml/optimization/keras/builder.py b/hls4ml/optimization/keras/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a3bec8d01f8090778f2e8e8fc8a11686faeaf92
--- /dev/null
+++ b/hls4ml/optimization/keras/builder.py
@@ -0,0 +1,252 @@
+import re
+
+import keras_tuner as kt
+import numpy as np
+import tensorflow as tf
+from qkeras import QConv2D, QDense
+from qkeras.utils import _add_supported_quantized_objects
+from tensorflow.keras.callbacks import EarlyStopping
+from tensorflow.keras.layers import Conv2D, Dense
+
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, TMP_DIRECTORY
+from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer
+
+co = {}
+_add_supported_quantized_objects(co)
+
+
+class HyperOptimizationModel(kt.HyperModel):
+    '''
+    Helper class for Keras Tuner
+
+    Args:
+        - model (keras.Model): Baseline model
+        - attributes (dict): Layer-wise dictionary of attributes
+        - optimizer (keras.optimizers.Optimizer or equvialent string description): Model optimizer
+        - loss_fn (keras.losses.Loss or equivalent string description): Model loss function
+        - validation_metric (keras.metrics.Metric or equivalent string description): Model validation metric
+        - regularization_range (list): List of suitable hyperparameters for weight decay
+    '''
+
+    def __init__(self, model, attributes, optimizer, loss_fn, validation_metric, regularization_range):
+        self.model = model
+        self.attributes = attributes
+        self.optimizer = optimizer
+        self.loss_fn = loss_fn
+        self.validation_metric = validation_metric
+        self.regularization_range = regularization_range
+
+    def build(self, hp):
+        model_to_prune = tf.keras.models.clone_model(self.model)
+        default_regularizaton = self.regularization_range[len(self.regularization_range) // 2]
+
+        # Make regularization loss a tunable hyperparameter
+        for layer in model_to_prune.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and self.attributes[layer.name].optimizable:
+                structure_type = self.attributes[layer.name].optimization_attributes.structure_type
+                block_shape = self.attributes[layer.name].optimization_attributes.block_shape
+                pattern_offset = self.attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = self.attributes[layer.name].optimization_attributes.consecutive_patterns
+
+                pruning = self.attributes[layer.name].optimization_attributes.pruning
+                weight_sharing = self.attributes[layer.name].optimization_attributes.weight_sharing
+
+                alpha = (
+                    hp.Choice(f'{layer.name}_alpha', values=self.regularization_range, default=default_regularizaton)
+                    if pruning
+                    else 0
+                )
+                beta = (
+                    hp.Choice(f'{layer.name}_beta', values=self.regularization_range, default=default_regularizaton)
+                    if weight_sharing
+                    else 0
+                )
+
+                if isinstance(layer, (Dense, QDense)) and self.attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = DenseRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        block_shape=block_shape,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
+                elif isinstance(layer, (Conv2D, QConv2D)) and self.attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = Conv2DRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
+
+        # Rebuild model graph
+        model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
+        model_to_prune.set_weights(self.model.get_weights())
+        model_to_prune.compile(optimizer=self.optimizer, loss=self.loss_fn, metrics=[self.validation_metric])
+
+        return model_to_prune
+
+
+default_regularization_range = np.logspace(-6, -2, num=16).tolist()
+
+
+def build_optimizable_model(
+    model,
+    attributes,
+    optimizer,
+    loss_fn,
+    validation_metric,
+    increasing,
+    train_dataset,
+    validation_dataset,
+    batch_size,
+    epochs,
+    verbose=False,
+    directory=TMP_DIRECTORY,
+    tuner='Bayesian',
+    regularization_range=default_regularization_range,
+):
+    '''
+    Function identifying optimizable layers and adding a regularization loss
+
+    Args:
+    - model (keras.Model): Model to be optimized
+    - attributes (dict): Layer-wise model attributes, obtained from hls4ml.optimization.get_attributes_from_keras_model(...)
+    - optimizer (keras.optimizers.Optimizer): Optimizer used during training
+    - loss_fn (keras.losses.Loss): Loss function used during training
+    - validation_metric (keras.metrics.Metric): Validation metric, used as a baseline
+    - train_dataset (tf.Dataset): Training inputs and labels, in the form of an iterable TF Dataset
+    - validation_dataset (tf.Dataset): Validation inputs and labels, in the form of an iterable TF Dataset
+    - batch_size (int): Batch size during training
+    - epochs (int): Maximum number of epochs to fine-tune model, in one iteration of pruning
+
+    Kwargs:
+    - verbose (bool): Whether to log tuner outputs to the console
+    - directory (string): Directory to store tuning results
+    - tuner (str): Tuning alogorithm, choose between Bayesian and Hyperband
+    - regularization_range (list): List of suitable hyperparameters for weight decay
+    - learning_rate_range (list): List of suitable hyperparameters for learning rate
+
+    Notes:
+    - In general, the regularization and learning rate ranges do not need to be provided,
+        as the implementation sets a generic enough range. if the user has an idea on the
+        possible range on hyperparameter ranges, the tuning will complete faster.
+    - The default tuner is Bayesian & when coupled with the correct ranges of hyperparameters,
+        it performs quite well, fast. However, older version of Keras Tuner had a crashing bug with it.
+    - In general, the directory does not need to be specified. However, if pruning several models simultaneously,
+        to avoid conflicting intermediate results, it is useful to specify directory.
+    '''
+    # User provided manual hyper-parameters for regularisation loss
+    # TODO - Maybe we could extend this to be hyper-parameters per layer? or layer-type?
+    # Currently, the same (manually-set) hyper-parameter is set for every layer
+    if tuner == 'Manual':
+        model_to_prune = tf.keras.models.clone_model(model)
+        for layer in model_to_prune.layers:
+            if isinstance(layer, SUPPORTED_LAYERS) and attributes[layer.name].optimizable:
+                structure_type = attributes[layer.name].optimization_attributes.structure_type
+                block_shape = attributes[layer.name].optimization_attributes.block_shape
+                pattern_offset = attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = attributes[layer.name].optimization_attributes.consecutive_patterns
+
+                pruning = attributes[layer.name].optimization_attributes.pruning
+                weight_sharing = attributes[layer.name].optimization_attributes.weight_sharing
+
+                alpha = regularization_range[0] if pruning else 0
+                beta = regularization_range[0] if weight_sharing else 0
+
+                if isinstance(layer, (Dense, QDense)) and attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = DenseRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        block_shape=block_shape,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
+                elif isinstance(layer, (Conv2D, QConv2D)) and attributes[layer.name].optimizable:
+                    layer.kernel_regularizer = Conv2DRegularizer(
+                        alpha,
+                        beta,
+                        norm=1,
+                        structure_type=structure_type,
+                        pattern_offset=pattern_offset,
+                        consecutive_patterns=consecutive_patterns,
+                    )
+
+        # Rebuild model graph
+        model_to_prune = tf.keras.models.model_from_json(model_to_prune.to_json(), custom_objects=co)
+        model_to_prune.set_weights(model.get_weights())
+        model_to_prune.compile(optimizer=optimizer, loss=loss_fn, metrics=[validation_metric])
+
+        return model_to_prune
+
+    # User opted for hyper-parameter tuning
+    else:
+        objective_direction = 'max' if increasing else 'min'
+        if isinstance(validation_metric, str):
+            objective_name = validation_metric
+        else:
+            objective_name = re.sub(r'(?<!^)(?=[A-Z])', '_', validation_metric.__class__.__name__).lower()
+        if tuner == 'Bayesian':
+            tuner = kt.BayesianOptimization(
+                hypermodel=HyperOptimizationModel(
+                    model, attributes, optimizer, loss_fn, validation_metric, regularization_range
+                ),
+                objective=kt.Objective(objective_name, objective_direction),
+                max_trials=10,
+                overwrite=True,
+                directory=directory + '/tuning',
+            )
+        elif tuner == 'Hyperband':
+            tuner = kt.Hyperband(
+                hypermodel=HyperOptimizationModel(
+                    model, attributes, optimizer, loss_fn, validation_metric, regularization_range
+                ),
+                objective=kt.Objective(objective_name, objective_direction),
+                max_epochs=epochs,
+                factor=3,
+                hyperband_iterations=1,
+                overwrite=True,
+                directory=directory + '/tuning',
+            )
+        else:
+            raise Exception('Unknown tuner; possible options are Bayesian and Hyperband')
+
+        if verbose:
+            tuner.search_space_summary()
+
+        tuner.search(
+            train_dataset,
+            epochs=epochs,
+            batch_size=batch_size,
+            validation_data=validation_dataset,
+            callbacks=[EarlyStopping(monitor='val_loss', patience=1)],
+        )
+
+        if verbose:
+            tuner.results_summary()
+
+        return tuner.get_best_models(num_models=1)[0]
+
+
+def remove_custom_regularizers(model):
+    '''
+    Helper function to remove custom regularizers (DenseRegularizer & Conv2DRegularizer)
+    This makes it possible to load the model in a different environment without hls4ml installed
+
+    Args:
+        - model (keras.Model): Baseline model
+    '''
+    weights = model.get_weights()
+    for layer in model.layers:
+        if hasattr(layer, 'kernel_regularizer'):
+            if isinstance(layer.kernel_regularizer, (DenseRegularizer, Conv2DRegularizer)):
+                layer.kernel_regularizer = None
+
+    model = tf.keras.models.model_from_json(model.to_json(), custom_objects=co)
+    model.set_weights(weights)
+    return model
diff --git a/hls4ml/optimization/keras/config.py b/hls4ml/optimization/keras/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ade5933d063859feb63013f59186d6544a7cc6
--- /dev/null
+++ b/hls4ml/optimization/keras/config.py
@@ -0,0 +1,26 @@
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense
+
+'''
+Optimizable layers in Keras / QKeras
+Any new layers need to be registered here first
+Additional logic in the source files may need to be written (e.g. recurrent layers should also optimize recurrent kernels)
+'''
+SUPPORTED_LAYERS = (Dense, Conv2D, QDense, QConv2D)
+
+
+'''
+Supported ranking metrics, for classifying redundant (groups of) weights
+
+1. l1 - groups of weights are ranked by their l1 norm
+2. l2 - groups of weights are ranked by their l2 norm
+3. oracle - abs(dL / dw * w), introduced by Molchanov et al. (2016)
+    Pruning Convolutional Neural Networks for Resource Efficient Inference
+4. saliency - (d^2L / dw^2 * w)^2, introduced by Lecun et al. (1989) Optimal Brain Damage
+'''
+SUPPORTED_METRICS = ('l1', 'l2', 'oracle', 'saliency')
+
+'''
+Temporary directory for storing best models, tuning results etc.
+'''
+TMP_DIRECTORY = 'hls4ml-optimization-keras'
diff --git a/hls4ml/optimization/keras/masking.py b/hls4ml/optimization/keras/masking.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf673b91549ae7adb45281763f5f13ee65306dab
--- /dev/null
+++ b/hls4ml/optimization/keras/masking.py
@@ -0,0 +1,794 @@
+import logging
+import sys
+
+import numpy as np
+import tensorflow as tf
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense
+
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.config import SUPPORTED_LAYERS, SUPPORTED_METRICS
+from hls4ml.optimization.knapsack import solve_knapsack
+
+
+def get_model_masks(
+    keras_model,
+    model_attributes,
+    sparsity,
+    objective,
+    metric='l1',
+    local=False,
+    gradients=None,
+    hessians=None,
+    knapsack_solver='CBC_MIP',
+):
+    '''
+    Function calculating a binary mask for all optimizable layers
+    Entries equal to one correspond to the weight being updated during the training
+    Entries equal to zero correspond to the weight being frozen during the training
+
+    Masking is such that:
+        * resource_utilization <= (1 - sparsity) * baseline_utilization OR
+        * resource_saving > sparsity * baseline_utilization [equivalent formulation]
+
+    Offsets are used for weight sharing - in the case of weight sharing, the mask is set to zero
+    Therefore, the weights will be frozen during training; however, they still need to be the mean of the group
+    Offsets represent the mean of each weight-shared group - therefore, it is important to have offsets only for
+    frozen weights; that is where the corresponding entry in the mask tensor is zero
+
+    If a layer supports both weight sharing and pruning, both the norm and variance of the group are calculated
+    And the smaller one is considered; so if the norm is smaller, the group will be considered for pruning
+    Otherise, the group will be considered for weight sharing.
+    Both the norm and variance are normalized, to avoid magnitude biases.
+
+    Args:
+        - keras_model (keras.model) - Model to be masked
+        - model_attributes (dict) - A layer-wise dictionary of LayerAttributes classes
+        - sparsity (float) - Desired sparsity, with respect to the objective
+        - objective (ObjectiveEstimator) - Objective to be minimized (e.g. DSP, FLOPs etc.)
+        - metric (string) - Weight ranking metric - l1, l2, Oracle, saliency
+        - local (boolean) - Equal layer-wise sparsity
+        - gradients (dict) - A layer-wise dictionary of weight gradients
+            (needed for Oracle ranking)
+        - hessians (dict) - A layer-wise dictionary of second gradients
+            (needed for saliency ranking)
+        - knapsack_solver (str) - Algorithm for solving Knapsack problem; recommended is to use default.
+            Unless dealing with highly dimensional problems, in which case greedy is better.
+
+    Return:
+        - masks (dict) - Layer-wise dictionary of binary tensors
+        - offsets (dict) - Layer-wise dictionary of offsets for every weight
+    '''
+
+    if metric not in SUPPORTED_METRICS:
+        raise Exception('Unknown metric for ranking weights')
+
+    if metric == 'oracle' and gradients is None:
+        raise Exception('Oracle ranking requires the gradient of the loss with respect to model weights')
+
+    if metric == 'saliency' and hessians is None:
+        raise Exception('Saliency ranking requires second order derivatives')
+
+    if local:
+        return __get_masks_local(
+            keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver
+        )
+    else:
+        return __get_masks_global(
+            keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver
+        )
+
+
+def __get_masks_local(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver):
+    '''
+    Function calculating a layer-wise binary mask for all optimizable layers
+    This function performs layer-wise masking, so all layers have the same sparsity (with respect to the objective)
+    '''
+    masks = {}
+    offsets = {}
+
+    for layer in keras_model.layers:
+        # Care needs to be taken if layer_savings = 0
+        # As long as the default attributes are used (from is_layer_optimizable(...)), this should never happen
+        # However, if the optimization attributes are manually changed,
+        # It would be possible to select the structure type such that savings = 0
+        # In this case, the goal is to keep resource utilization under a certain threshold;
+        # Savings are equivalent to resources per single group
+        # Therefore, in the knapsack solver, zero-saving would remain unmasked;
+        # As it has a "weight" of zero, it is always stored in the knapsack
+        # So not masking a group without saving is as expected;
+        # However, if solved through greedy knaspack an exception will be thrown (division by zero)
+        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+            layer_savings = objective.layer_savings(model_attributes[layer.name])
+            layer_resources = objective.layer_resources(model_attributes[layer.name])
+            target_resources = ((1 - sparsity) * np.array(layer_resources)).astype(int)
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+
+            # All the groups (structures, patterns, blocks) have the same resource utilisation in one layer
+            # So we can greedily prune the groups with the lowest "loss" (magnitude, saliency etc.)
+            # Greedily pruning the groups with the lowest loss is a special case of the Knapsack problem with equal weights
+            value = layer.get_weights()[0]
+            if metric == 'oracle':
+                value = np.abs(np.multiply(value, gradients[layer.name]))
+                norm = 1
+            elif metric == 'saliency':
+                value = np.multiply(np.square(value), hessians[layer.name])
+                norm = 1
+            elif metric == 'l1':
+                norm = 1
+            else:
+                norm = 2
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    logging.warn('Weight sharing not suitable for unstructured pruning. Ignoring....')
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    # Since no norm is taken, calculate absolute value [avoids removing large negative weights]
+                    value = np.abs(value)
+
+                    # Get all posible indices in the weight tensor
+                    indices = np.indices(value.shape).reshape(value.ndim, -1).T
+
+                    # Find weights with the lowest loss
+                    groups = []
+                    for i in indices:
+                        groups.append(__WeightGroups__(value[tuple(i)], layer_savings, tuple(i)))
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver,
+                    )
+
+                    # Selected weights are not masked
+                    mask = np.zeros(value.shape, dtype=value.dtype)
+                    for i in selected:
+                        mask[groups[i].layer_position] = 1
+
+                    # Offsets are always zero (weight sharing not applicable to unstructured)
+                    masks[layer.name] = mask
+                    offsets[layer.name] = np.zeros(value.shape, dtype=value.dtype)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                # Dense -> Masking neurons (columns)
+                if isinstance(layer, (Dense, QDense)):
+                    # If pruning enabled, find cost associated with pruning each neuron
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(value, axis=0, ord=norm)
+                    else:
+                        vals_norm = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(value, axis=0)
+                    else:
+                        vals_var = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    # Choose min(pruning, weight sharing)
+                    groups = []
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                    # Select neurons with the lowest loss
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver,
+                    )
+
+                    # Selected neurons are not masked
+                    mask = np.zeros(value.shape, value.dtype)
+                    for i in selected:
+                        mask[:, groups[i].layer_position] = 1
+
+                    # Masked neurons can either be pruned or quantized
+                    # If quantized, add the corresponding offset
+                    offset = np.zeros(value.shape, value.dtype)
+                    zeros = np.where(~np.all(mask, axis=0))[0]
+                    for i in zeros:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, i] = np.mean(layer.get_weights()[0][:, i])
+                    masks[layer.name] = mask
+                    offsets[layer.name] = offset
+
+                # Conv2D -> Masking filters (W x H x C)
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0, ord=norm)
+                    else:
+                        vals_norm = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0)
+                    else:
+                        vals_var = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    groups = []
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                        else:
+                            groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                    _, selected = solve_knapsack(
+                        np.array([g.value for g in groups]),
+                        np.array([g.resources for g in groups]).T,
+                        target_resources,
+                        implementation=knapsack_solver,
+                    )
+
+                    mask = np.zeros(value.shape, value.dtype)
+                    for i in selected:
+                        mask[:, :, :, groups[i].layer_position] = 1
+
+                    offset = np.zeros(value.shape, value.dtype)
+                    zeros = np.where(~np.all(mask, axis=(0, 1, 2)))[0]
+                    for i in zeros:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, :, :, i] = np.mean(layer.get_weights()[0][:, :, :, i])
+
+                    masks[layer.name] = mask
+                    offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+
+                if (np.prod(value.shape)) % pattern_offset != 0:
+                    raise Exception('Pattern offset needs to be a factor of matrix size')
+
+                if pattern_offset % consecutive_patterns != 0:
+                    raise Exception('Consecutive patterns need to be a factor of matrix size')
+
+                # Transpose, as done in hls4ml Resource strategy
+                if isinstance(layer, (Dense, QDense)):
+                    value = value.T
+                    transposed_shape = value.shape
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(value, axes=[3, 0, 1, 2])
+                    transposed_shape = value.shape
+
+                # Reshape weight matrix into [number_of_patterns, pattern_offset]
+                # Note, swapping the axis will mess up the weight order
+                # In the case of hls4ml, number_of_patterns is equivalent to reuse factor
+                # And, pattern_offset, is the number of multiplications done in parallel
+                number_of_patterns = np.prod(transposed_shape) // pattern_offset
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                # Group consecutive patterns (rows) into blocks and reshape
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                groups = []
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                # Select groups with highest importance
+                _, selected = solve_knapsack(
+                    np.array([g.value for g in groups]),
+                    np.array([g.resources for g in groups]).T,
+                    target_resources,
+                    implementation=knapsack_solver,
+                )
+
+                # Decode masked groups into transposed shape and set selected groups to one
+                mask = np.zeros((np.prod(transposed_shape),), value.dtype)
+                for i in selected:
+                    pos = i * number_of_patterns * consecutive_patterns
+                    for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                        mask[range(j, j + number_of_patterns)] = 1
+
+                        # Decode offset
+                not_selected = [i for i in range(len(groups)) if i not in selected]
+                offset = np.zeros((np.prod(transposed_shape),), value.dtype)
+                for i in not_selected:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[i, :])
+                        pos = i * number_of_patterns * consecutive_patterns
+                        for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                            offset[range(j, j + number_of_patterns)] = mean
+
+                            # Reshape into original shape and store result
+                if isinstance(layer, (Dense, QDense)):
+                    mask = np.reshape(mask, transposed_shape).T
+                    offset = np.reshape(offset, transposed_shape).T
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    mask = np.transpose(np.reshape(mask, transposed_shape), (1, 2, 3, 0))
+                    offset = np.transpose(np.reshape(offset, transposed_shape), (1, 2, 3, 0))
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                if len(value.shape) != 2:
+                    raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                if (value.shape[0] % block_shape[0]) != 0 or (value.shape[1] % block_shape[1] != 0):
+                    raise Exception('Block sizes need to be fators of weight matrix dimensions')
+
+                # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
+                # This method is commonly used to perform im2col,
+                # Docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+                total_blocks = (value.shape[0] * value.shape[1]) // (block_shape[0] * block_shape[1])
+                blocks_in_row = value.shape[1] // block_shape[1]
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(value, 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1]),
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                groups = []
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(__WeightGroups__(vals_norm[i], layer_savings, i, optimization_type='pruning'))
+                    else:
+                        groups.append(__WeightGroups__(vals_var[i], layer_savings, i, optimization_type='sharing'))
+
+                # Select groups with highest importance
+                _, selected = solve_knapsack(
+                    np.array([g.value for g in groups]),
+                    np.array([g.resources for g in groups]).T,
+                    target_resources,
+                    implementation=knapsack_solver,
+                )
+
+                # Decode position of masked weights and set selected weights to one
+                mask = np.zeros(value.shape, value.dtype)
+                for i in selected:
+                    row = block_shape[0] * (i // blocks_in_row)
+                    col = block_shape[1] * (i % blocks_in_row)
+                    cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                    rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                    zeros = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                    mask[zeros[:, 0], zeros[:, 1]] = 1
+
+                # Calculate offset
+                not_selected = [i for i in range(len(groups)) if i not in selected]
+                offset = np.zeros(value.shape, value.dtype)
+                for i in not_selected:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[i, :])
+                        row = block_shape[0] * (i // blocks_in_row)
+                        col = block_shape[1] * (i % blocks_in_row)
+                        cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                        rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                        pos = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                        offset[pos[:, 0], pos[:, 1]] = mean
+
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+    return masks, offsets
+
+
+def __get_masks_global(keras_model, model_attributes, sparsity, objective, metric, gradients, hessians, knapsack_solver):
+    '''
+    Function calculating a layer-wise binary mask for all optimizable layers
+    Global masking, with layers of different sparsity; masks are calculated by solving a Knapsack problem
+    Most of the logic remains similar to local masking; comments describing implementation are given in the function above
+    '''
+    groups = []
+    total_resources = []
+
+    # Iterate through all layers and create a list of all the optimizable groups (single weight, structure, pattern, block)
+    # Each entry contains the value associated with the group,
+    # Alongside the layer it belongs to and its location in the layer
+    # The values is normalised w.r.t to the to largest element in the group, to avoid bias towards large layers
+    # We also keep track of total model resources, with respect to the objective
+    # A detailed comment in the local masking function is given for
+    # Considerations on exception of layer savings and how to address them
+    for layer in keras_model.layers:
+        # Optimizable should be always enabled if either pruning or weight sharing are enabled
+        # However, if the objectives are implemented incorrectly,
+        # It is possible to have optimizatons enabled without any types of optimization (pruning, weight sharing) enabled
+        layer_optimizable = model_attributes[layer.name].optimizable and (
+            model_attributes[layer.name].optimization_attributes.weight_sharing
+            or model_attributes[layer.name].optimization_attributes.pruning
+        )
+        if isinstance(layer, SUPPORTED_LAYERS) and layer_optimizable:
+            value = layer.get_weights()[0]
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+            layer_savings = objective.layer_savings(model_attributes[layer.name])
+            total_resources.append(objective.layer_resources(model_attributes[layer.name]))
+
+            if metric == 'oracle':
+                value = np.abs(np.multiply(value, gradients[layer.name]))
+                norm = 1
+            elif metric == 'saliency':
+                value = np.multiply(np.square(value), hessians[layer.name])
+                norm = 1
+            elif metric == 'l1':
+                norm = 1
+            else:
+                norm = 2
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    logging.warn('Weight sharing not suitable for unstructured pruning. Ignoring....')
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    value = np.abs(value)
+                    value = value / np.max(value)
+                    indices = np.indices(value.shape).reshape(value.ndim, -1).T
+                    for i in indices:
+                        group = __WeightGroups__(
+                            value[tuple(i)], layer_savings, tuple(i), structure_type, layer.name, 'pruning'
+                        )
+                        groups.append(group)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if isinstance(layer, (Dense, QDense)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(value, axis=0, ord=norm)
+                        vals_norm = vals_norm / np.max(vals_norm)
+                    else:
+                        vals_norm = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(value, axis=0)
+                    else:
+                        vals_var = np.full((value.shape[1]), sys.float_info.max, dtype=value.dtype)
+
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                                )
+                            )
+                        else:
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                                )
+                            )
+
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    if model_attributes[layer.name].optimization_attributes.pruning:
+                        vals_norm = np.linalg.norm(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0, ord=norm)
+                        vals_norm = vals_norm / np.max(vals_norm)
+                    else:
+                        vals_norm = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                        vals_var = np.var(np.linalg.norm(value, axis=(0, 1), ord='fro'), axis=0)
+                    else:
+                        vals_var = np.full((value.shape[3]), sys.float_info.max, dtype=value.dtype)
+
+                    for i in range(vals_norm.shape[0]):
+                        if vals_norm[i] <= vals_var[i]:
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                                )
+                            )
+                        else:
+                            groups.append(
+                                __WeightGroups__(
+                                    vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                                )
+                            )
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+                if (np.prod(value.shape)) % pattern_offset != 0:
+                    raise Exception('Pattern offset needs to be a factor of matrix size')
+
+                if pattern_offset % consecutive_patterns != 0:
+                    raise Exception('Consecutive patterns need to be a factor of matrix size')
+
+                # Transpose, as done in hls4ml Resource strategy
+                if isinstance(layer, (Dense, QDense)):
+                    value = value.T
+                    transposed_shape = value.shape
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(value, axes=[3, 0, 1, 2])
+                    transposed_shape = value.shape
+
+                # Reshape weight matrix into [number_of_patterns, pattern_offset]
+                # Note, swapping the axis will mess up the weight order
+                # In the case of hls4ml, number_of_patterns is equivalent to reuse factor
+                # And, pattern_offset, is the number of multiplications done in parallel
+                number_of_patterns = np.prod(transposed_shape) // pattern_offset
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                # Group consecutive patterns (rows) into blocks and reshape
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
+                )
+
+                # If pruning enabled, find cost associated with pruning each neuron
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # If weight sharing enabled, find cost asociated with quantizing nerons to their mean
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0],), sys.float_info.max, dtype=value.dtype)
+
+                # Choose min(pruning, weight sharing)
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(
+                            __WeightGroups__(
+                                vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                            )
+                        )
+                    else:
+                        groups.append(
+                            __WeightGroups__(
+                                vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                            )
+                        )
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                if len(value.shape) != 2:
+                    raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                if (value.shape[0] % block_shape[0]) != 0 or (value.shape[1] % block_shape[1] != 0):
+                    raise Exception('Block sizes need to be fators of weight matrix dimensions')
+
+                total_blocks = (value.shape[0] * value.shape[1]) // (block_shape[0] * block_shape[1])
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(value, 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1]),
+                )
+
+                if model_attributes[layer.name].optimization_attributes.pruning:
+                    vals_norm = np.linalg.norm(blocks, axis=1, ord=norm)
+                    vals_norm = vals_norm / np.max(vals_norm)
+                else:
+                    vals_norm = np.full((blocks.shape[0]), sys.float_info.max, dtype=value.dtype)
+
+                if model_attributes[layer.name].optimization_attributes.weight_sharing:
+                    vals_var = np.var(blocks, axis=1)
+                else:
+                    vals_var = np.full((blocks.shape[0]), sys.float_info.max, dtype=value.dtype)
+
+                for i in range(vals_norm.shape[0]):
+                    if vals_norm[i] <= vals_var[i]:
+                        groups.append(
+                            __WeightGroups__(
+                                vals_norm[i], layer_savings, i, structure_type, layer.name, optimization_type='pruning'
+                            )
+                        )
+                    else:
+                        groups.append(
+                            __WeightGroups__(
+                                vals_var[i], layer_savings, i, structure_type, layer.name, optimization_type='sharing'
+                            )
+                        )
+
+    # The goal is to maximize network accuracy (values) subject to resorces (objective) staying under some threshold
+    # This is a Knapsack problem; several implementations are provided in the helper functions
+    # The selected values correspond to weight / groups being kept in the network; the rest are pruned / weight shared
+    total_resources = np.sum(np.array(total_resources), axis=0)
+    target_resources = ((1 - sparsity) * np.array(total_resources)).astype(int)
+    _, selected = solve_knapsack(
+        np.array([s.value for s in groups]),
+        np.array([s.resources for s in groups]).T,
+        target_resources,
+        implementation=knapsack_solver,
+    )
+    # Update masks and offsets
+    masks = {}
+    offsets = {}
+
+    for layer in keras_model.layers:
+        if isinstance(layer, SUPPORTED_LAYERS) and model_attributes[layer.name].optimizable:
+            structure_type = model_attributes[layer.name].optimization_attributes.structure_type
+            selected_layer = [i for i in selected if groups[i].layer_name == layer.name]
+            not_selected_layer = [
+                i for i in range(len(groups)) if groups[i].layer_name == layer.name and i not in selected_layer
+            ]
+
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    mask[groups[i].layer_position] = 1
+                masks[layer.name] = mask
+                offsets[layer.name] = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                offset = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                if isinstance(layer, (Dense, QDense)):
+                    for i in selected_layer:
+                        mask[:, groups[i].layer_position] = 1
+                    for i in not_selected_layer:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[:, groups[i].layer_position] = np.mean(layer.get_weights()[0][:, i])
+                if isinstance(layer, (Conv2D, QConv2D)):
+                    for i in selected_layer:
+                        mask[:, :, :, groups[i].layer_position] = 1
+                    for i in not_selected_layer:
+                        if groups[i].optimization_type == 'sharing':
+                            offset[
+                                :,
+                                :,
+                                :,
+                                groups[i].layer_position,
+                            ] = np.mean(layer.get_weights()[0][:, :, :, i])
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = model_attributes[layer.name].optimization_attributes.pattern_offset
+                consecutive_patterns = model_attributes[layer.name].optimization_attributes.consecutive_patterns
+                number_of_patterns = np.prod(model_attributes[layer.name].weight_shape) // pattern_offset
+
+                # Transpose shape, as done in hls4ml Resource strategy
+                # We need the weights to recalculate the block means
+                weight_shape = model_attributes[layer.name].weight_shape
+                if isinstance(layer, (Dense, QDense)):
+                    value = layer.get_weights()[0].T
+                    transposed_shape = (weight_shape[1], weight_shape[0])
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    value = np.transpose(layer.get_weights()[0], (3, 0, 1, 2))
+                    transposed_shape = (weight_shape[3], weight_shape[0], weight_shape[1], weight_shape[2])
+
+                # Decode masks
+                mask = np.zeros((np.prod(transposed_shape),), layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    pos = groups[i].layer_position * number_of_patterns * consecutive_patterns
+                    for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                        mask[range(j, j + number_of_patterns)] = 1
+
+                        # Decode offsets
+                offset = np.zeros((np.prod(transposed_shape),), layer.get_weights()[0].dtype)
+
+                # Re-calculate the blocks, they are needed to calculate block means
+                target_shape = (pattern_offset, number_of_patterns)
+                reshaped = np.reshape(value, target_shape)
+
+                total_blocks = pattern_offset // consecutive_patterns
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, consecutive_patterns, number_of_patterns, 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, -1),
+                )
+
+                for i in not_selected_layer:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[groups[i].layer_position, :])
+                        pos = groups[i].layer_position * number_of_patterns * consecutive_patterns
+                        for j in range(pos, pos + number_of_patterns * consecutive_patterns, number_of_patterns):
+                            offset[range(j, j + number_of_patterns)] = mean
+
+                            # Reshape into original shape and store result
+                if isinstance(layer, (Dense, QDense)):
+                    mask = np.reshape(mask, transposed_shape).T
+                    offset = np.reshape(offset, transposed_shape).T
+                elif isinstance(layer, (Conv2D, QConv2D)):
+                    mask = np.transpose(np.reshape(mask, transposed_shape), (1, 2, 3, 0))
+                    offset = np.transpose(np.reshape(offset, transposed_shape), (1, 2, 3, 0))
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+            if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                block_shape = model_attributes[layer.name].optimization_attributes.block_shape
+                total_blocks = np.prod(model_attributes[layer.name].weight_shape) // np.prod(block_shape)
+                blocks_in_row = model_attributes[layer.name].weight_shape[1] // block_shape[1]
+                blocks = np.reshape(
+                    tf.image.extract_patches(
+                        np.expand_dims(np.expand_dims(layer.get_weights()[0], 2), 0),
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, block_shape[0], block_shape[1], 1],
+                        [1, 1, 1, 1],
+                        'SAME',
+                    ).numpy(),
+                    (total_blocks, block_shape[0] * block_shape[1]),
+                )
+
+                mask = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in selected_layer:
+                    row = block_shape[0] * (groups[i].layer_position // blocks_in_row)
+                    col = block_shape[1] * (groups[i].layer_position % blocks_in_row)
+                    cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                    rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                    zeros = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                    mask[zeros[:, 0], zeros[:, 1]] = 1
+
+                offset = np.zeros(model_attributes[layer.name].weight_shape, layer.get_weights()[0].dtype)
+                for i in not_selected_layer:
+                    if groups[i].optimization_type == 'sharing':
+                        mean = np.mean(blocks[groups[i].layer_position, :])
+                        row = block_shape[0] * (groups[i].layer_position // blocks_in_row)
+                        col = block_shape[1] * (groups[i].layer_position % blocks_in_row)
+                        cols = np.linspace(col, col + block_shape[1], block_shape[1], endpoint=False, dtype=np.int32)
+                        rows = np.linspace(row, row + block_shape[0], block_shape[0], endpoint=False, dtype=np.int32)
+                        pos = np.array(np.meshgrid(rows, cols)).T.reshape(-1, 2)
+                        offset[pos[:, 0], pos[:, 1]] = mean
+
+                masks[layer.name] = mask
+                offsets[layer.name] = offset
+
+    return masks, offsets
+
+
+class __WeightGroups__:
+    '''
+    A helper class containing information about a group of weights
+    '''
+
+    def __init__(self, value, resources, layer_position, structure_type=None, layer_name=None, optimization_type=None):
+        self.value = value
+        self.resources = resources
+        self.layer_position = layer_position
+        self.structure_type = structure_type
+        self.layer_name = layer_name
+        self.optimization_type = optimization_type
diff --git a/hls4ml/optimization/keras/reduction.py b/hls4ml/optimization/keras/reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd4621ed76274fbaabc56a464e08955f3dcf912a
--- /dev/null
+++ b/hls4ml/optimization/keras/reduction.py
@@ -0,0 +1,63 @@
+import numpy as np
+from tensorflow.keras.layers import Conv2D, Dense
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.keras.utils import get_last_layer_with_weights
+
+'''
+Function for removing zero neurons & filters from a model and rewiring the model graph
+This function is built on top of Keras Surgeon available at: https://github.com/BenWhetton/keras-surgeon
+Keras Surgeon is no longer under active development and does not work for TensorFlow 2.3+ and QKeras
+The baseline version was forked and updated, available at: https://github.com/bo3z/keras-surgeon
+
+Args:
+    - model (keras.model): Input model
+
+Return:
+    - reduced (keras.model): Modified model, with redundant structures removed
+
+    '''
+
+
+def reduce_model(model):
+    try:
+        from kerassurgeon import Surgeon
+    except ModuleNotFoundError:
+        raise Exception(
+            'Keras Surgeon not installed. Unable to reduce model footprint '
+            'Please install up-to-date Keras Surgeon compatible wit TensorFlow 2.3+ and QKeras '
+            'Installation from git: https://github.com/bo3z/keras-surgeon'
+        )
+
+    # Initiate surgeon
+    surgeon = Surgeon(model)
+
+    # Iterate through layers and identify neurons (columns) and filters (tensors, W x H x C) to be removed
+    last_idx = get_last_layer_with_weights(model)
+    for idx, layer in enumerate(model.layers):
+        # Last layer with weights cannot be removed, as it maps to data set labels
+        if idx == last_idx:
+            break
+
+        # Currently supported Dense and Conv2D; these two can be combined in a single if-statement
+        # Keras Surgeon has a full range of support for Conv1D / Conv3D, reucurrent etc. - might extend in the future
+        if isinstance(layer, Dense):
+            weights = layer.get_weights()[0]
+            zeros = np.where(~weights.any(axis=0))[0].tolist()
+            surgeon.add_job('delete_channels', layer, channels=zeros)
+
+        elif isinstance(layer, Conv2D):
+            weights = layer.get_weights()[0]
+            zeros = np.where(~weights.reshape(-1, weights.shape[-1]).any(axis=0))[0].tolist()
+            surgeon.add_job('delete_channels', layer, channels=zeros)
+
+    # Reduce model
+    reduced = surgeon.operate()
+
+    # By default, Keras surgeon returns a Functional model
+    # If the original was a Sequential, convert back
+    is_sequential = model.__class__.__name__ == 'Sequential'
+    if is_sequential:
+        return Sequential(layers=reduced.layers)
+    else:
+        return reduced
diff --git a/hls4ml/optimization/keras/regularizers.py b/hls4ml/optimization/keras/regularizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd8a2f3656f604077b5066e2dab0d126286ea2ef
--- /dev/null
+++ b/hls4ml/optimization/keras/regularizers.py
@@ -0,0 +1,255 @@
+import numpy as np
+import tensorflow as tf
+
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+
+
+@tf.keras.utils.register_keras_serializable(name='DenseRegularizer')
+class DenseRegularizer(tf.keras.regularizers.Regularizer):
+    '''
+    A flexible regularizer for Dense layers, simultaneously penalizing high values and variance
+
+    Args:
+        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
+        - norm (int): Norm type (l1 or l2)
+        - structure_type (string): Type of regularisation - unstructured, structured, pattern, block
+        - block_shape (tuple): Block shape if structure_type == block
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - consecutive_patterns (int): How many consecutive patterns should be considered
+        - weights (tf.Variable): Two-dimensional layer weight tensor, dimensionality (M x N)
+
+    Return:
+        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+
+    Examples:
+        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = structured: neuron regularization
+            (group weights by row)
+        - structure_type = pattern: regularization on groups of every n-th weight
+            (e.g. grouping by reuse factor in hls4ml)
+        - structure_type = block: regularisation on blocks within weight matrix
+            (e.g. 4x4, 8x1 for certain SIMD processors)
+
+        - consecutive_patterns is commonly encountered with optimization of BRAM utilization -
+            e.g. while it is true that each DSP pattern consumes one DSP,
+            They likely use less than one BRAM block (e.g. if the BRAM width is 36 bit and weight width is 16)
+            In that case, we need to group several patterns together,
+            So the entire block of patterns can be removed, thus saving DSP and BRAM
+    '''
+
+    def __init__(
+        self,
+        alpha,
+        beta=0,
+        norm=1,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        block_shape=(1, 1),
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
+        if norm != 1 and norm != 2:
+            raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
+
+        if isinstance(structure_type, str):
+            structure_type = SUPPORTED_STRUCTURES(structure_type)
+
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown regularization type')
+
+        self.alpha = alpha
+        self.beta = beta
+        self.norm = norm
+        self.structure_type = structure_type
+        self.block_shape = block_shape
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+    @tf.function
+    def __call__(self, weights):
+        if self.structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(weights, ord=self.norm)
+            variance_penalty = self.beta * tf.math.reduce_variance(weights)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(tf.norm(weights, axis=0, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(weights, axis=0), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            # This is equivalent to penalising all the weights processed by the same DSP block in hls4ml.
+            # The matrix is transposed, according to Resource strategy and reshaped into (pattern_offset, pattern_number)
+            # Pattern offset corresponds to the number of patterns is equivalent to RF
+            if (np.prod(weights.shape)) % self.pattern_offset != 0:
+                print(np.prod(weights.shape), self.pattern_offset)
+                raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
+
+            if self.pattern_offset % self.consecutive_patterns != 0:
+                raise Exception(f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+
+            # Reshape weight matrix into [number_of_patterns, pattern_offset]
+            number_of_patterns = np.prod(weights.shape) // self.pattern_offset
+            target_shape = (self.pattern_offset, number_of_patterns)
+            reshaped = tf.reshape(tf.transpose(weights), target_shape)
+            # Group consecutive patterns (columns) into blocks and reshape
+            # Docs for the functions to extract blocks are below [block regularization]
+            total_blocks = self.pattern_offset // self.consecutive_patterns
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, 1, 1, 1],
+                    'SAME',
+                ),
+                (total_blocks, -1),
+            )
+
+            # Calculate penalty
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            if (weights.shape[0] % self.block_shape[0]) != 0 or (weights.shape[1] % self.block_shape[1] != 0):
+                raise Exception(f'{self.__class__.__name__}: block sizes need to be fators of weight matrix dimensions')
+
+            # TensorFlow has a built-in method for exctracting sub-tensors of given shape and stride
+            # This method is commonly used to perform im2col,
+            # Docs: https://www.tensorflow.org/api_docs/python/tf/image/extract_patches
+            total_blocks = (weights.shape[0] * weights.shape[1]) // (self.block_shape[0] * self.block_shape[1])
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(weights, 2), 0),
+                    [1, self.block_shape[0], self.block_shape[1], 1],
+                    [1, self.block_shape[0], self.block_shape[1], 1],
+                    [1, 1, 1, 1],
+                    'SAME',
+                ),
+                (total_blocks, self.block_shape[0] * self.block_shape[1]),
+            )
+
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+    def get_config(self):
+        return {
+            'alpha': self.alpha,
+            'beta': self.beta,
+            'norm': self.norm,
+            'structure_type': self.structure_type,
+            'block_shape': self.block_shape,
+            'pattern_offset': self.pattern_offset,
+            'consecutive_patterns': self.consecutive_patterns,
+        }
+
+
+@tf.keras.utils.register_keras_serializable(name='Conv2DRegularizer')
+class Conv2DRegularizer(tf.keras.regularizers.Regularizer):
+    '''
+    A flexible regulariser for Conv2D layers, simultaneously performing pruning and clustering
+
+    Args:
+        - alpha (float): Sparse penalty; a higher value pushes more weights towards zero
+        - beta (float): Variance penalty; a higer value reduces variance between a group of weights
+        - norm (int): Norm type (l1 or l2)
+        - structure_type (string): Type of regularisation - unstructured, structured, pattern
+        - pattern_offset (int): Length of each pattern if structure_type == pattern
+        - weights (tf.Variable): Four-dimensional layer weight tensor, dimensionality
+        (filter_width x filter_height x n_chan x n_filt)
+
+    Return:
+        - Regularizer penalty (tf.Variable): Penalty associated with layer weights
+
+    Example use cases:
+        - structure_type = unstructured: unstructured weight regularisation
+        - structure_type = structured: filter regularization
+            (group weights of dimensionality filt_width x filt_height x n_chan)
+        - structure_type = pattern: regularization on groups of every n-th weight in flattened array
+            (e.g. grouping by reuse factor in hls4ml)
+    '''
+
+    def __init__(
+        self,
+        alpha,
+        beta=0,
+        norm=1,
+        structure_type=SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        pattern_offset=1,
+        consecutive_patterns=1,
+    ):
+        if norm != 1 and norm != 2:
+            raise Exception(f'{self.__class__.__name__} currently supports l1- and l2-based regularization')
+
+        if isinstance(structure_type, str):
+            structure_type = SUPPORTED_STRUCTURES(structure_type)
+
+        if not isinstance(structure_type, SUPPORTED_STRUCTURES):
+            raise Exception(f'{self.__class__.__name__} unknown regularization type')
+
+        # Block pruning is only supported for Dense and QDense layers
+        if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            raise Exception('Block pruning is supported for 2-dimensional weight matrices')
+
+        self.alpha = alpha
+        self.beta = beta
+        self.norm = norm
+        self.structure_type = structure_type
+        self.pattern_offset = pattern_offset
+        self.consecutive_patterns = consecutive_patterns
+
+    @tf.function
+    def __call__(self, weights):
+        if len(weights.shape) != 4:
+            raise Exception(f'{self.__class__.__name__} regularizes Conv2D layers; weight matrix is not 4-dimensional')
+
+        if self.structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(weights, ord=self.norm)
+            variance_penalty = self.beta * tf.math.reduce_variance(weights)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            sparse_penalty = self.alpha * tf.norm(
+                tf.reduce_sum(tf.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=self.norm
+            )
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(weights, axis=(0, 1, 2)), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+        if self.structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            if (np.prod(weights.shape)) % self.pattern_offset != 0:
+                raise Exception(f'{self.__class__.__name__}: pattern offset needs to be a factor of matrix size')
+
+            if self.pattern_offset % self.consecutive_patterns != 0:
+                raise Exception(f'{self.__class__.__name__}: consecutive patterns need to be a factor of pattern offset')
+
+            number_of_patterns = np.prod(weights.shape) // self.pattern_offset
+            target_shape = (self.pattern_offset, number_of_patterns)
+            reshaped = tf.reshape(tf.transpose(weights, (3, 0, 1, 2)), target_shape)
+
+            total_blocks = self.pattern_offset // self.consecutive_patterns
+            blocks = tf.reshape(
+                tf.image.extract_patches(
+                    tf.expand_dims(tf.expand_dims(reshaped, 2), 0),
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, self.consecutive_patterns, number_of_patterns, 1],
+                    [1, 1, 1, 1],
+                    'SAME',
+                ),
+                (total_blocks, -1),
+            )
+
+            sparse_penalty = self.alpha * tf.norm(tf.norm(blocks, axis=1, ord=2), ord=self.norm)
+            variance_penalty = self.beta * tf.norm(tf.math.reduce_variance(blocks, axis=1), ord=self.norm)
+            return sparse_penalty + variance_penalty
+
+    def get_config(self):
+        return {
+            'alpha': self.alpha,
+            'beta': self.beta,
+            'norm': self.norm,
+            'structure_type': self.structure_type,
+            'pattern_offset': self.pattern_offset,
+            'consecutive_patterns': self.consecutive_patterns,
+        }
diff --git a/hls4ml/optimization/keras/utils.py b/hls4ml/optimization/keras/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaf34f41fc4213d6a2eac651c83cd7f5833deaf3
--- /dev/null
+++ b/hls4ml/optimization/keras/utils.py
@@ -0,0 +1,107 @@
+import numpy as np
+import tensorflow as tf
+
+
+@tf.function
+def get_model_gradients(model, loss_fn, X, y):
+    '''
+    Calculate model gradients with respect to weights
+
+    Args:
+        - model (keras.model): Input model
+        - loss_fn (keras.losses.Loss): Model loss function
+        - X (np.array): Input data
+        - y (np.array): Output data
+
+    Return:
+        - grads (dict): Per-layer gradients of loss with respect to weights
+    '''
+    grads = {}
+    # While persistent GradientTape slows down execution,
+    # Is faster than performing forward pass and non-persistent for every layer
+    with tf.GradientTape(persistent=True) as tape:
+        output = model(X, training=True)
+        loss_value = loss_fn(y, output)
+
+        for layer in model.layers:
+            if len(layer.trainable_weights) > 0:
+                grads[layer.name] = tape.gradient(loss_value, layer.kernel)
+
+    return grads
+
+
+@tf.function
+def get_model_hessians(model, loss_fn, X, y):
+    '''
+    Calculate the second derivatives of the loss with repsect to model weights
+    Note, only diagonal elements of the Hessian are computed
+
+    Args:
+        - model (keras.model): Input model
+        - loss_fn (keras.losses.Loss): Model loss function
+        - X (np.array): Input data
+        - y (np.array): Output data
+
+    Return:
+        - grads (dict): Per-layer second derivatives of loss with respect to weights
+    '''
+    grads = {}
+    with tf.GradientTape(persistent=True) as tape:
+        output = model(X, training=False)
+        loss_value = loss_fn(y, output)
+
+        for layer in model.layers:
+            if hasattr(layer, 'kernel'):
+                grads[layer.name] = tape.gradient(tape.gradient(loss_value, layer.kernel), layer.kernel)
+
+    return grads
+
+
+def get_model_sparsity(model):
+    '''
+    Calculate total and per-layer model sparsity
+
+    Args:
+        - model (keras.model): Model to be evaluated
+
+    Return:
+        - sparsity (float): Model sparsity, as a percentage of zero weights w.r.t to total number of model weights
+        - layers (dict): Key-value dictionary; each key is a layer name and the associated value is the layer's sparsity
+
+    TODO - Extend support for recurrent layers (reccurent_kernel)
+    '''
+
+    total_weights = 0
+    zero_weights = 0
+    layer_sparsity = {}
+
+    for layer in model.layers:
+        if hasattr(layer, 'kernel'):
+            weights = layer.get_weights()[0].flatten()
+            total_weights = total_weights + len(weights)
+            zero_weights = zero_weights + len(weights) - np.count_nonzero(weights)
+            layer_sparsity[layer.name] = 1.0 - np.count_nonzero(weights) / len(weights)
+
+    try:
+        return zero_weights / total_weights, layer_sparsity
+    except ZeroDivisionError:
+        return 0.0, layer_sparsity
+
+
+# TODO - Does this work for non-linear models (e.g. skip connections) ?
+def get_last_layer_with_weights(model):
+    '''
+    Finds the last layer with weights
+    The last layer with weights determined the output shape, so, pruning is sometimes not applicable to it
+    As an example, consider a network with 16 - 32 - 5 neurons - the last layer's neuron (5) cannot be removed
+    Since they map to the data labels
+    Args:
+        - model (keras.model): Input model
+
+    Return:
+        - idx (int): Index location of last layer with params
+    '''
+    for idx, layer in reversed(list(enumerate(model.layers))):
+        if hasattr(layer, 'kernel'):
+            return idx
+    return len(model.layers)
diff --git a/hls4ml/optimization/knapsack.py b/hls4ml/optimization/knapsack.py
new file mode 100644
index 0000000000000000000000000000000000000000..e198945b1b5c18f1fff711264b3638551d5a8fa8
--- /dev/null
+++ b/hls4ml/optimization/knapsack.py
@@ -0,0 +1,262 @@
+import sys
+import time
+
+import numpy as np
+
+
+def solve_knapsack(values, weights, capacity, implementation='CBC_MIP', **kwargs):
+    '''
+    A function for solving the Knapsack problem
+
+    Args:
+        - values (np.array, float): A one-dimensional array, where each entry is the value of an item
+        - weights (np.array, int): An matrix, each row represents the weights of every item, in a given knapsack
+        - capacity (np.array, int): A one-dimensional array, each entry is the maximum weights of a Knapsack
+        - implementation (string): Algorithm to solve Knapsack problem - dynamic programming, greedy, branch and bound
+
+    Kwargs:
+        - time_limit (float): Limit (in seconds) after which the CBC or Branch & Bound should
+            stop looking for a solution and return optimal so far
+        - scaling_factor (float): Scaling factor for floating points values in CBC or B&B
+
+    Return:
+        - optimal_value (float): The optimal values of elements in the knapsack
+        - selected_items (list): A list of indices, corresponding to the selected elements
+
+    Notes:
+        - The general formulation of the Knapsack problem for N items and M knapsacks is:
+                max v.T @ x
+                s.t. A @ x <= W
+                v ~ (N, 1) x ~ (N, 1) A ~ (M, N) W ~ (M, 1)
+                x_{i, j} = {0, 1} and <= is the generalized, element-wise inequlaity for vectors
+
+        - Supported implementations:
+            - Dynamic programming:
+                - Optimal solution
+                - Time complexity: O(nW)
+                - Suitable for single-dimensional constraints and a medium number of items, with integer weights
+            - Branch and bound:
+                - Optimal
+                - Solved using Google OR-Tools
+                - Suitable for multi-dimensional constraints and a large number of items
+            - Branch and bound:
+                - Solution sub-optimal, but often better than greeedy
+                - Solved using Google OR-Tools, with the CBC MIP Solver
+                - Suitable for multi-dimensional constraints and a very high number of items
+            - Greedy:
+                - Solution sub-optimal
+                - Time complexity: O(mn)
+                - Suitable for highly dimensional constraints or a very high number of items
+
+        - Most implementations require integer values of weights and capacities;
+            For pruning & weight sharing this is never a problem
+            In case non-integer weights and capacities are requires,
+            All of the values should be scaled by an appropriate scaling factor
+    '''
+    if implementation not in ('dynamic', 'greedy', 'branch_bound', 'CBC_MIP'):
+        raise Exception('Unknown algorithm for solving Knapsack')
+
+    if len(values.shape) != 1:
+        raise Exception(
+            'Current implementations of Knapsack optimization support single-objective problems. \
+                        Values must be one-dimensional'
+        )
+
+    if len(weights.shape) != 2:
+        raise Exception(
+            'Current implementation of Knapsack assumes weight vector is 2-dimensional,'
+            'to allow for multi-dimensional Knapsack problem. \
+            If solving a one-dimensional Knapsack problem, extend dimensions of weights to a one-row matrix'
+        )
+
+    if values.shape[0] != weights.shape[1]:
+        raise Exception('Uneven number of items and weights')
+
+    if not (np.all(values >= 0) and np.all(weights >= 0)):
+        raise Exception('Current implementation of Knapsack problem requires non-negative values and weights')
+
+    if not np.all(np.equal(np.mod(capacity, 1), 0)) or not np.all(np.equal(np.mod(weights, 1), 0)):
+        raise Exception('Current implementation of Knapsack problem requires integer weights and capacities')
+
+    print(f'Starting to solve Knapsack problem with {values.shape[0]} variables and a weight constraint of {capacity}')
+    start = time.time()
+
+    # Special case, empty list
+    if values.shape[0] == 0:
+        return 0, []
+
+    # Special case, the sum of all weights is less than al the capacity constraints
+    if np.all(np.sum(weights, axis=1) <= capacity):
+        return np.sum(values), list(range(0, values.shape[0]))
+
+    # Special case, all the item weights per knapsack are equal, so we can greedily select the ones with the highest value
+    if np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])]):
+        return __solve_knapsack_equal_weights(values, weights, capacity)
+
+    # General cases
+    if implementation == 'dynamic':
+        if weights.shape[0] == 1:
+            optimal_value, selected_items = __solve_1d_knapsack_dp(values, weights[0], capacity[0])
+        else:
+            raise Exception('Solving Knapsack with dynamic programming requires single-dimensional constraints')
+    elif implementation == 'branch_bound':
+        optimal_value, selected_items = __solve_knapsack_branch_and_bound(values, weights, capacity, **kwargs)
+    elif implementation == 'CBC_MIP':
+        optimal_value, selected_items = __solve_knapsack_cbc_mip(values, weights, capacity, **kwargs)
+    else:
+        optimal_value, selected_items = __solve_knapsack_greedy(values, weights, capacity)
+
+    print(f'Time taken to solve Knapsack {time.time() - start}s')
+    return optimal_value, selected_items
+
+
+def __solve_1d_knapsack_dp(values, weights, capacity):
+    '''
+    Helper function to solve the 1-dimensional Knapsack problem exactly through dynamic programming
+    The dynamic programming approach is only suitable for one-dimensional weight constraints
+    Furthermore, it has a high computational complexity and it is not suitable for highly-dimensional arrays
+    NOTE: The weights and corresponding weight constraint need to be integers;
+    If not, the they should be scaled and rounded beforehand
+    '''
+    assert len(weights.shape) == 1
+
+    # Build look-up table in bottom-up approach
+    N = values.shape[0]
+    K = [[0 for w in range(capacity + 1)] for i in range(N + 1)]
+    for i in range(1, N + 1):
+        for w in range(1, capacity + 1):
+            if weights[i - 1] <= w:
+                K[i][w] = max(values[i - 1] + K[i - 1][w - weights[i - 1]], K[i - 1][w])
+            else:
+                K[i][w] = K[i - 1][w]
+
+    # Reverse Knapsack to find selected groups
+    i = N
+    w = capacity
+    res = K[N][capacity]
+    selected = []
+    while i >= 0 and res > 0:
+        if res == K[i - 1][w]:
+            pass
+        else:
+            selected.append(i - 1)
+            res = res - values[i - 1]
+            w = w - weights[i - 1]
+        i = i - 1
+
+    return K[N][capacity], selected
+
+
+def __solve_knapsack_greedy(values, weights, capacity):
+    '''
+    Helper function that solves the n-dimensional Knapsack algorithm with a greedy algorithm
+    The greedy approach should only be used for problems with many items or highly dimensional weights
+    The solution can [and often will] be sub-optimal; otherwise, dynamic programming, branch & bound etc. should be used
+    '''
+
+    # For each item, calculate the value per weight ratio (this can be thought of as item efficiency)
+    # The weights are scaled for every dimension, to avoid inherent bias towards large weights in a single dimension
+    weights_rescaled = weights / np.max(weights, axis=1)[:, np.newaxis]
+    ratios = values / weights_rescaled.sum(axis=0)
+    indices = np.argsort(ratios)
+
+    # Greedily select item with the highest ratio (efficiency)
+    optimal = 0
+    selected = []
+    accum = np.zeros_like(capacity)
+    for i in reversed(indices):
+        if np.all((accum + weights[:, i]) <= capacity):
+            selected.append(i)
+            optimal += values[i]
+            accum += weights[:, i]
+        else:
+            break
+
+    # The greedy algorithm can be sub-optimal;
+    # However, selecting the above elements or the next element that could not fit into the knapsack
+    # Will lead to solution that is at most (1/2) of the optimal solution;
+    # Therefore, take whichever is higher and satisfies the constraints
+    if values[i] > optimal and np.all(weights[:, i]) <= capacity:
+        return values[i], [i]
+    else:
+        return optimal, selected
+
+
+def __solve_knapsack_branch_and_bound(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
+    '''
+    Helper function to solve Knapsack problem using Branch and Bound;
+    Implemented using Google OR-Tools [weights & capacities need to be integers]
+    The algorithm explores the search space (a tree of all the posible combinations, 2^N nodes),
+    But discards infeasible & sub-optimal solutions
+
+    Additional args:
+        - time_limit - Time limit in seconds
+            After which B&B search should stop and return a sub-optimal solution
+        - scaling_factor - Factor to scale floats in values arrays;
+            OR-Tools requires all values & weights to be integers;
+    '''
+    try:
+        from ortools.algorithms import pywrapknapsack_solver
+    except ModuleNotFoundError:
+        raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
+
+    solver = pywrapknapsack_solver.KnapsackSolver(
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER, 'BB'
+    )
+    solver.set_time_limit(time_limit)
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(), capacity.astype(int).tolist())
+    optimal = solver.Solve()
+    selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
+    return optimal / scaling_factor, selected
+
+
+def __solve_knapsack_cbc_mip(values, weights, capacity, time_limit=sys.float_info.max, scaling_factor=10e4):
+    '''
+    Helper function to solve Knapsack problem using the CBC MIP solver using Google OR-Tools
+
+    Additional args:
+        - time_limit - Time limit (seconds) after which CBC solver should stop and return a sub-optimal solution
+        - scaling_factor - Factor to scale floats in values arrays;
+            OR-Tools requires all values & weights to be integers;
+            So all of the values are scaled by a large number
+    '''
+    try:
+        from ortools.algorithms import pywrapknapsack_solver
+    except ModuleNotFoundError:
+        raise Exception('OR-Tools not found. Please insteal Google OR-Tools from pip.')
+
+    solver = pywrapknapsack_solver.KnapsackSolver(
+        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_CBC_MIP_SOLVER, 'CBC'
+    )
+    solver.set_time_limit(time_limit)
+    solver.Init((values * scaling_factor).astype(int).tolist(), weights.astype(int).tolist(), capacity.astype(int).tolist())
+    optimal = solver.Solve()
+    selected = [i for i in range(values.shape[0]) if solver.BestSolutionContains(i)]
+    return optimal / scaling_factor, selected
+
+
+def __solve_knapsack_equal_weights(values, weights, capacity):
+    '''
+    Helper function that solves the n-dimensional Knapsack algorithm with a greedy algorithm
+    The assumption is that all the items have the same weight; while this seems a bit artificial
+    It occurs often in pruning - e.g. in pattern pruning, each DSP block saves one DSP; however, as a counter-example
+    In structured pruning, each structure can save a different amount of FLOPs (Conv2D filter vs Dense neuron)
+    '''
+    assert np.all([weights[i, :] == weights[i, 0] for i in range(weights.shape[0])])
+
+    # Find items with the highest value
+    indices = np.argsort(values)
+
+    # Greedily select item with the highest ratio
+    optimal = 0
+    selected = []
+    accum = np.zeros_like(capacity)
+    for i in reversed(indices):
+        if np.all((accum + weights[:, i]) <= capacity):
+            selected.append(i)
+            optimal += values[i]
+            accum += weights[:, i]
+        else:
+            break
+
+    return optimal, selected
diff --git a/hls4ml/optimization/objectives/__init__.py b/hls4ml/optimization/objectives/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..05d916d85fdb9b9ac9151661756b34d872cefd8a
--- /dev/null
+++ b/hls4ml/optimization/objectives/__init__.py
@@ -0,0 +1,142 @@
+import logging
+from abc import ABC, abstractmethod
+
+import numpy as np
+
+from hls4ml.optimization.attributes import OptimizationAttributes
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+
+'''
+Pruning & weight sharing are formulated as an optimization problem, with the aim of minimising some metric
+Metrics can include: total number of weights, DSP utilization, latency, FLOPs etc.
+'''
+
+
+class ObjectiveEstimator(ABC):
+    '''
+    Abstract class with methods for estimating the utilization and savings of a certain layer, with respect to some objective
+    For each objective, an inherited class is written with the correct implementaton of the below methods
+    The objectives can be multi-dimensional, e.g. DSPs and BRAM
+    Care needs to be taken when optimizing several objectives, especially if conflicting
+    '''
+
+    @abstractmethod
+    def is_layer_optimizable(self, layer_attributes):
+        '''
+        For a given layer, checks whether optimizations make sense, with respect to the given objective(s)
+        Furthermore, it returns the type of optimization (structured, unstructured etc.)
+        Most suitable for minimising the objective(s).
+
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+
+        Return:
+            - optimizable (boolean) - can optimizations be applied to this layer
+            - optimization_attributes (hls4ml.optimiation.attributes.OptimizationAttributes) -
+                Most suitable approach for optimization
+
+        Examples:
+            - Metric = Total weights, Layer = Dense, shape = (4, 4) -> return True, unstructured
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<8, 0> -> return False
+                (Vivado doesn't use DSP when precision < 9)
+            - Metric = DSP, Layer = Dense, Precision = ap_fixed<16, 6> ->
+                return True, pattern structure, both pruning and weight sharing
+        '''
+        pass
+
+    @abstractmethod
+    def layer_resources(self, layer_attributes):
+        '''
+        For a given layer, how many units of the metric are used, given a generic weight matrix
+
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+
+        Return:
+            - resources (list, int) - total resources (w.r.t every dimension of the objective) used
+
+        Example: Metric = Total weights, Layer = Dense, shape = (4, 4) -> return [16] [regardless of layer sparsity]
+        '''
+        pass
+
+    @abstractmethod
+    def layer_savings(self, layer_attributes):
+        '''
+        For a given layer, how many units of the metric are saved, when optimizing one structure
+        The structure type, alongside its parameters (e.g. block shape) are stored in layer attributes
+        For best results, OptimizationAttributes in layer_attribtues should be obtained from is_layer_optimizable
+
+        Args:
+            - layer_attributes (hls4ml.optimiation.attributes.LayerAttributes)
+
+        Return:
+            - savings (list, int) - savings achieved (one for every dimenson of objective)
+                                    With OptimizationAttributes from layer_attributes
+
+        Example: Metric = Total weights, Layer = Dense, shape = (4, 4):
+            - structure_type == unstructured -> return [1]
+            - structure_type == structured -> return [4]
+        '''
+        pass
+
+
+'''
+A class containing objective estimation with the goal of minimizing
+The number of non-zero weights in a layer [corresponds to unstructured pruning]
+'''
+
+
+class ParameterEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        else:
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            return [np.prod(layer_attributes.weight_shape)]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+
+        if weight_sharing:
+            logging.warn(
+                'Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of parameters. \
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        # In this case, pruning = True and weight_sharing = False,
+        # So calculate savings incurred by removing a group of weights
+        if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+            return [1]
+        elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            if 'Dense' in layer_attributes.layer_type.__name__ or 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [np.prod(layer_attributes.weight_shape[:-1])]
+            else:
+                raise Exception('Unknown layer encountered when estimating parameter savings.')
+        elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+            number_of_patterns = (
+                np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+            )
+            return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
+        elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+            return [np.prod(layer_attributes.optimization_attributes.block_shape)]
diff --git a/hls4ml/optimization/objectives/gpu_objectives.py b/hls4ml/optimization/objectives/gpu_objectives.py
new file mode 100644
index 0000000000000000000000000000000000000000..8528a318398cb365b4a856e6c82e8c961846d13d
--- /dev/null
+++ b/hls4ml/optimization/objectives/gpu_objectives.py
@@ -0,0 +1,81 @@
+import logging
+
+import numpy as np
+
+from hls4ml.optimization.attributes import OptimizationAttributes
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.objectives import ObjectiveEstimator
+
+
+class GPUFLOPEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+        else:
+            return True, OptimizationAttributes(
+                structure_type=SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False
+            )
+
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            if 'Dense' in layer_attributes.layer_type.__name__:
+                return [2 * np.prod(layer_attributes.weight_shape) + layer_attributes.weight_shape[1]]
+            elif 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [
+                    2
+                    * np.prod(layer_attributes.weight_shape)
+                    * layer_attributes.output_shape[0]
+                    * layer_attributes.output_shape[1]
+                    + layer_attributes.weight_shape[3]
+                ]
+            else:
+                raise Exception('Unknown layer encountered when estimating FLOP utilization.')
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+
+        if weight_sharing:
+            logging.warn(
+                'Weight sharing does not decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease FLOPs. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        # TODO - The below formulas underestimate FLOP savings
+        # Removing a filter in a layer removes channels / neurons in subsequent layers
+        if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+            if 'Dense' in layer_attributes.layer_type.__name__:
+                return [2 * layer_attributes.weight_shape[0] + 1]
+            elif 'Conv2D' in layer_attributes.layer_type.__name__:
+                return [
+                    2
+                    * np.prod(layer_attributes.weight_shape[0:3])
+                    * layer_attributes.output_shape[0]
+                    * layer_attributes.output_shape[1]
+                    + 1
+                ]
+            else:
+                raise Exception('Unknown layer encountered when estimating FLOP savings.')
+        else:
+            logging.warn(
+                'FLOP savings occur with structured pruning. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
diff --git a/hls4ml/optimization/objectives/vivado_objectives.py b/hls4ml/optimization/objectives/vivado_objectives.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c0c33e09df8ab6e95920236b97e19c6a91d838
--- /dev/null
+++ b/hls4ml/optimization/objectives/vivado_objectives.py
@@ -0,0 +1,372 @@
+import logging
+import math
+
+import numpy as np
+
+from hls4ml.optimization.attributes import OptimizationAttributes
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.objectives import ObjectiveEstimator
+
+
+# Optimizes DSP utilisation for Vivado backend
+class VivadoDSPEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return False, None
+        else:
+            if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource':
+                return True, OptimizationAttributes(
+                    SUPPORTED_STRUCTURES.PATTERN,
+                    pruning=True,
+                    weight_sharing=False,
+                    pattern_offset=np.prod(layer_attributes.weight_shape)
+                    // layer_attributes.args['hls4ml_attributes'].reuse_factor,
+                    consecutive_patterns=1,
+                )
+            else:
+                return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return [0]
+        else:
+            # TOOD - Extend for parallelisation factor
+            return [np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape or layer_attributes.args['hls4ml_attributes'].weight_precision.width < 9:
+            return [0]
+
+        # TODO - Once we know how to implement constant coefficient multiplication via LUT, enable for weight sharing
+        pruning = layer_attributes.optimization_attributes.pruning
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of DSPs used. \
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [1]
+            else:
+                return [0]
+        else:
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                    return [1]
+                else:
+                    return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if (
+                    layer_attributes.args['hls4ml_attributes'].reuse_factor
+                    == layer_attributes.args['hls4ml_attributes'].n_in
+                ):
+                    return [1]
+                else:
+                    return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                pattern_offset = layer_attributes.optimization_attributes.pattern_offset
+                number_of_patterns = np.prod(layer_attributes.weight_shape) // pattern_offset
+
+                if number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor:
+                    return [layer_attributes.optimization_attributes.consecutive_patterns]
+                else:
+                    return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                logging.warn('hls4ml does not support block sparsity patterns...setting layer savings to zero')
+                return [0]
+
+
+# Optimizes BRAM and DSP for Vivado backend
+class VivadoMultiObjectiveEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+
+        if (
+            layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+            and layer_attributes.args['hls4ml_attributes'].reuse_factor > 1
+        ):
+            if 36 % layer_attributes.args['hls4ml_attributes'].weight_precision.width == 0:
+                consecutive_patterns = int(36 // layer_attributes.args['hls4ml_attributes'].weight_precision.width)
+            else:
+                consecutive_patterns = int(
+                    math.ceil(2 * 36 / layer_attributes.args['hls4ml_attributes'].weight_precision.width)
+                )
+
+            return True, OptimizationAttributes(
+                SUPPORTED_STRUCTURES.PATTERN,
+                pruning=True,
+                weight_sharing=False,
+                pattern_offset=int(
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor
+                ),
+                consecutive_patterns=consecutive_patterns,
+            )
+        else:
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        # TOOD - Extend for parallelisation factor
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            return [
+                int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                0,
+            ]
+        else:
+            # Resource strategy, RF == 1 is similar to Latency strategy (but slower)
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [
+                    int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                    0,
+                ]
+            else:
+                # For RF > 1, BRAM utilised by weights can be estimated by (bit_width * n_in * n_out) / (RF * 36)
+                return [
+                    int(np.prod(layer_attributes.weight_shape) // layer_attributes.args['hls4ml_attributes'].reuse_factor),
+                    int(
+                        math.ceil(
+                            np.prod(layer_attributes.weight_shape)
+                            * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                            / (layer_attributes.args['hls4ml_attributes'].reuse_factor * 36)
+                        )
+                    ),
+                ]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        # TODO - Once we know how to implement constant coefficient multiplication via LUT, enable for weight sharing
+        pruning = layer_attributes.optimization_attributes.pruning
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of DSPs used. \
+                It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+        if layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency':
+            if layer_attributes.args['hls4ml_attributes'].reuse_factor == 1:
+                return [1, 0]
+            else:
+                return [0, 0]
+        else:
+            if (
+                layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+                and layer_attributes.args['hls4ml_attributes'].reuse_factor == 1
+            ):
+                return [1, 0]
+            else:
+                if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                    pattern_offset = layer_attributes.optimization_attributes.pattern_offset
+                    consecutive_patterns = layer_attributes.optimization_attributes.consecutive_patterns
+                    weight_precision = layer_attributes.args['hls4ml_attributes'].weight_precision.width
+
+                    number_of_patterns = np.prod(layer_attributes.weight_shape) // pattern_offset
+                    saved_one_bram_block = (
+                        36 == consecutive_patterns * weight_precision and 36 % weight_precision == 0
+                    ) or (72 == consecutive_patterns * weight_precision)
+
+                    if (
+                        number_of_patterns == layer_attributes.args['hls4ml_attributes'].reuse_factor
+                        and saved_one_bram_block
+                    ):
+                        return [consecutive_patterns, 1]
+                    else:
+                        logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
+                        return [0, 0]
+                else:
+                    logging.warn('Support for multi-objective optimisation is not fully implemented yet....')
+                    return [0, 0]
+
+
+class VivadoFFEstimator(ObjectiveEstimator):
+    @classmethod
+    def is_layer_optimizable(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return False, None
+
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; skipping
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            logging.warn('FFs are at minimum utilization with io_stream and Resource strategy')
+            return False, None
+
+        # With io_stream in Latency, weight are stored in FFs, so unstructured pruning will benefit the most
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So structured pruning is the most suitable, it reduces the size out output before compile-time
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.STRUCTURED, pruning=True, weight_sharing=False)
+
+        # In io_parallel with Latency, weights and activation tensors are all stored in FFs,
+        # So it is equivalent to unstructured, high sparsity pruning
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return True, OptimizationAttributes(SUPPORTED_STRUCTURES.UNSTRUCTURED, pruning=True, weight_sharing=False)
+
+    # TODO - This method is inaccurate (accross all cases); in general, estimating FFs is hard,
+    # But as long as it is consistent(ly wrong), it should not matter for the pruning
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            return [0]
+
+        # With io_stream in Latency, weight are stored in FFs, so FF ~ number_of_weights x weight_precision
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return [
+                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+            ]
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So FF ~ number_of_outputs x weight_precision
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            return [
+                np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
+            ]
+
+        # In io_parallel with Latency, weights and latency are all stored in FFs,
+        # So it is equivalent to the sum of the above two cases
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            return [
+                np.prod(layer_attributes.weight_shape) * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                + np.prod(layer_attributes.output_shape) * layer_attributes.args['hls4ml_attributes'].output_precision.width
+            ]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+        pruning = layer_attributes.optimization_attributes.pruning
+        weight_sharing = layer_attributes.optimization_attributes.weight_sharing
+
+        if weight_sharing:
+            logging.warn(
+                'Weight sharing does not decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        if not pruning:
+            logging.warn(
+                'Pruning needs to be enabled to decrease the number of parameters. \
+                         It is recommened to use the default attributes, returned from is_layer_optimizable(...)'
+            )
+            return [0]
+
+        # Resource strategy and I/O type io_stream store both weights and activation tensors in BRAM; minimal FF utilization
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            return [0]
+
+        # With io_stream in Latency, weight are stored in FFs, so any type of pruning will help:
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_stream'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                return [
+                    layer_attributes.args['hls4ml_attributes'].n_in
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
+                return [
+                    number_of_patterns
+                    * layer_attributes.optimization_attributes.consecutive_patterns
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [
+                    np.prod(layer_attributes.optimization_attributes.block_shape)
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
+
+        # In io_parallel with Resource, weights are stored in BRAM but activation tensors in FFs,
+        # So only structured pruning helps
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'resource'
+        ):
+            if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].output_precision.width]
+            else:
+                return [0]
+
+        # In io_parallel with Latency, weights and latency are all stored in FFs, so any type of pruning helps
+        if (
+            layer_attributes.args['hls4ml_attributes'].io_type == 'io_parallel'
+            and layer_attributes.args['hls4ml_attributes'].strategy.lower() == 'latency'
+        ):
+            # This is a significant under-estimate, as some savings are incurred due to less intermediate results
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [layer_attributes.args['hls4ml_attributes'].weight_precision.width]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                return [
+                    layer_attributes.args['hls4ml_attributes'].n_in
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                    + layer_attributes.args['hls4ml_attributes'].output_precision.width
+                ]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
+                return [
+                    number_of_patterns
+                    * layer_attributes.optimization_attributes.consecutive_patterns
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [
+                    np.prod(layer_attributes.optimization_attributes.block_shape)
+                    * layer_attributes.args['hls4ml_attributes'].weight_precision.width
+                ]
diff --git a/hls4ml/optimization/scheduler.py b/hls4ml/optimization/scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..347427d5dd9f27511d4342f8f1303417360b5a50
--- /dev/null
+++ b/hls4ml/optimization/scheduler.py
@@ -0,0 +1,149 @@
+from abc import ABC, abstractmethod
+
+
+class OptimizationScheduler(ABC):
+    '''
+    Baseline class handling logic regarding target sparsity and its updates at every step
+    '''
+
+    def __init__(self, initial_sparsity=0, final_sparsity=1):
+        '''
+        intial_sparsity and final_sparsity are between 0.0 and 1.0, NOT 0% and 100%
+        '''
+        if initial_sparsity < 0 or initial_sparsity > 1:
+            raise Exception('intial_sparsity must be between 0.0 and 1.0')
+        if final_sparsity < 0 or final_sparsity > 1:
+            raise Exception('final_sparsity must be between 0.0 and 1.0')
+        self.sparsity = initial_sparsity
+        self.lower_bound = initial_sparsity
+        self.upper_bound = final_sparsity
+
+    @abstractmethod
+    def update_step(self):
+        '''
+        Increments the current sparsity, according to the rule, examples:
+            - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55
+            - BinaryScheduler, sparsity = 0.5, target = 1.0 -> sparsity = 0.75
+
+        Return:
+            - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
+            - sparsity (float) - Updated sparsity
+        '''
+        pass
+
+    @abstractmethod
+    def repair_step(self):
+        '''
+        Method used when the neural architecture does not meet satisfy performance requirement for a given sparsity
+        Then, the target sparsity is decreased according to the rule, examples:
+            - ConstantScheduler, sparsity = 0.5, increment = 0.05 -> sparsity = 0.55 [see ConstantScheduler for explanation]
+            - BinaryScheduler, sparsity = 0.75, target = 1.0, previous = 0.5 -> sparsity = (0.5 + 0.75) / 2 = 0.625
+
+        Return:
+            - updated (boolean) - Has the sparsity changed? If not, the optimization algorithm can stop
+            - sparsity (float) - Updated sparsity
+        '''
+        pass
+
+    def get_sparsity(self):
+        return self.sparsity
+
+
+class ConstantScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by a constant term, until
+        (i) sparsity target reached OR
+        (ii) optimization algorithm stops requesting state updates
+    '''
+
+    def __init__(self, initial_sparsity=0, final_sparsity=1.0, update_step=0.05):
+        self.increment = update_step
+        super().__init__(initial_sparsity, final_sparsity)
+
+    def update_step(self):
+        if self.sparsity + self.increment <= self.upper_bound:
+            self.sparsity += self.increment
+            return True, self.sparsity
+        else:
+            return False, self.sparsity
+
+    '''
+    In certain cases, a model might underperform at the current sparsity level,
+    But perform better at a higher sparsity
+    In this case, constant sparsity (since it increments by a small amount every time),
+    Will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is tracked and S
+    Stopped after high loss over several trials (see top level pruning/optimization function)
+
+    '''
+
+    def repair_step(self):
+        return self.update_step()
+
+
+class BinaryScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by binary halving the search space; constantly updates lower and upper bounds
+    In the update step, sparsity is incremented, as the midpoint between previous sparsity and target sparsity (upper bound)
+    In the repair step, sparsity is decrement, as the midpoint between between the lower bound and previous sparsity
+    '''
+
+    def __init__(self, initial_sparsity=0, final_sparsity=1.0, threshold=0.01):
+        self.threshold = threshold
+        super().__init__(initial_sparsity, final_sparsity)
+
+    def update_step(self):
+        if self.upper_bound - self.sparsity >= self.threshold:
+            self.lower_bound = self.sparsity
+            self.sparsity = 0.5 * (self.lower_bound + self.upper_bound)
+            return True, self.sparsity
+        else:
+            self.lower_bound = self.sparsity
+            return False, self.sparsity
+
+    def repair_step(self):
+        if self.sparsity - self.lower_bound >= self.threshold:
+            self.upper_bound = self.sparsity
+            self.sparsity = 0.5 * (self.lower_bound + self.upper_bound)
+            return True, self.sparsity
+        else:
+            self.upper_bound = self.sparsity
+            return False, self.sparsity
+
+
+class PolynomialScheduler(OptimizationScheduler):
+    '''
+    Sparsity updated by at a polynomial decay, until
+        (i) sparsity target reached OR
+        (ii) optimization algorithm stops requesting state updates
+    For more information, see Zhu & Gupta (2016) -
+        'To prune, or not to prune: exploring the efficacy of pruning for model compression'
+    Note, the implementation is slightly different,
+    Since TensorFlow Prune API depends on the total number of epochs and update frequency
+    '''
+
+    def __init__(self, maximum_steps, initial_sparsity=0, final_sparsity=1.0, decay_power=3):
+        self.decay_power = decay_power
+        self.current_step = 0
+        self.maximum_steps = maximum_steps
+        super().__init__(initial_sparsity, final_sparsity)
+
+    def update_step(self):
+        if self.current_step < self.maximum_steps:
+            self.current_step += 1
+            self.sparsity = self.upper_bound + (self.lower_bound - self.upper_bound) * (
+                (1 - self.current_step / self.maximum_steps) ** self.decay_power
+            )
+            return True, self.sparsity
+        else:
+            return False, self.sparsity
+
+    '''
+    In certain cases, a model might underperform at the current sparsity level, but perform better at a higher sparsity
+    In this case, polynomial sparsity, will simply jump to the next sparsity level
+    The model's performance over several sparsity levels optimization is tracked and
+    toped after high loss over several trials (see top level pruning/optimization function)
+    '''
+
+    def repair_step(self):
+        return self.update_step()
diff --git a/setup.cfg b/setup.cfg
index d442412c622a8ce452a0a200f411ff7d2d167491..24e78e4bd680493e7aaceb6d6825a1670e710818 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -40,6 +40,10 @@ pytest_randomly.random_seeder =
     hls4ml = hls4ml:reseed
 
 [options.extras_require]
+optimization =
+    keras-surgeon@git+https://github.com/fastmachinelearning/keras-surgeon.git
+    ortools==9.4.1874
+    packaging
 profiling =
     matplotlib
     pandas
diff --git a/test/pytest/ci-template.yml b/test/pytest/ci-template.yml
index bbe8df4944f892031bda1767545afb13cf0a8173..5477da933a3af62cefbb488be27d89831b9348f9 100644
--- a/test/pytest/ci-template.yml
+++ b/test/pytest/ci-template.yml
@@ -7,7 +7,7 @@
     - source ~/.bashrc
     - if [ $EXAMPLEMODEL == 1 ]; then git submodule init; git submodule update; fi
     - conda activate hls4ml-testing
-    - pip install .[testing,sr]
+    - pip install .[testing,sr,optimization]
   script:
     - cd test/pytest
     - pytest $PYTESTFILE -rA --cov-report xml --cov-report term --cov=hls4ml --junitxml=report.xml --randomly-seed=42 --randomly-dont-reorganize --randomly-dont-reset-seed
diff --git a/test/pytest/generate_ci_yaml.py b/test/pytest/generate_ci_yaml.py
index 6d816a7abbef94fd8fa629b8c1026e019de4fa18..46c1fa9e5b8c2f6e77798ef49dfcedd07f630ab7 100644
--- a/test/pytest/generate_ci_yaml.py
+++ b/test/pytest/generate_ci_yaml.py
@@ -32,5 +32,26 @@ for test in tests:
     else:
         yml.update(new_yml)
 
+# hls4ml Optimization API
+tests = glob.glob('test_optimization/test_*.py')
+for test in tests:
+    name = test.replace('test_optimization/', '').replace('test_', '').replace('.py', '')
+    new_yml = yaml.safe_load(template.format(name, f'test_optimization/test_{name}.py', int(uses_example_model(test))))
+    if yml is None:
+        yml = new_yml
+    else:
+        yml.update(new_yml)
+
+tests = glob.glob('test_optimization/test_keras/test_*.py')
+for test in tests:
+    name = test.replace('test_optimization/test_keras/', '').replace('test_', '').replace('.py', '')
+    new_yml = yaml.safe_load(
+        template.format(name, f'test_optimization/test_keras/test_{name}.py', int(uses_example_model(test)))
+    )
+    if yml is None:
+        yml = new_yml
+    else:
+        yml.update(new_yml)
+
 yamlfile = open('pytests.yml', 'w')
 yaml.safe_dump(yml, yamlfile)
diff --git a/test/pytest/test_optimization/test_attributes.py b/test/pytest/test_optimization/test_attributes.py
new file mode 100644
index 0000000000000000000000000000000000000000..2669321e09d5e3853ef44a0596029ec3c3407533
--- /dev/null
+++ b/test/pytest/test_optimization/test_attributes.py
@@ -0,0 +1,96 @@
+from tensorflow.keras.layers import Conv2D, Dense, Flatten, ReLU
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model_and_hls4ml_config
+from hls4ml.utils.config import config_from_keras_model
+
+
+def test_attributes():
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 3
+    conv_shape = (3, 3)
+    input_shape = (8, 8)
+    io_type = 'io_parallel'
+    strategy = 'Resource'
+
+    model = Sequential()
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
+    model.add(Flatten(name='flatten'))
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+    model.add(ReLU(name='relu'))
+
+    default_reuse_factor = 2
+    default_precision = 'ap_fixed<8, 0>'
+    cfg = config_from_keras_model(
+        model, granularity='name', default_precision=default_precision, default_reuse_factor=default_reuse_factor
+    )
+    cfg['IOType'] = io_type
+    cfg['Model']['Strategy'] = strategy
+    cfg['LayerName']['dense']['ReuseFactor'] = 1
+
+    # Verify correct information for every layer
+    model_attributes = get_attributes_from_keras_model_and_hls4ml_config(model, cfg)
+    assert len(model_attributes) == 4
+
+    # conv2d
+    assert model_attributes['conv2d'].name == 'conv2d'
+    assert model_attributes['conv2d'].layer_type.__name__ == 'Conv2D'
+    assert model_attributes['conv2d'].inbound_layers == []
+    assert model_attributes['conv2d'].weight_shape == (3, 3, 3, 6)
+    assert model_attributes['conv2d'].input_shape == (8, 8, 3)
+    assert model_attributes['conv2d'].output_shape == (8, 8, 6)
+    assert not model_attributes['conv2d'].optimizable
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].n_in == 9
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].n_out == 6
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].io_type == io_type
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].strategy == strategy
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].reuse_factor == default_reuse_factor
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].weight_precision.width == 8
+    assert model_attributes['conv2d'].args['hls4ml_attributes'].parallelization_factor == 1
+
+    # flatten
+    assert model_attributes['flatten'].name == 'flatten'
+    assert model_attributes['flatten'].layer_type.__name__ == 'Flatten'
+    assert model_attributes['flatten'].weight_shape == ()
+    assert model_attributes['flatten'].input_shape == (8, 8, 6)
+    assert model_attributes['flatten'].output_shape == (384,)
+    assert not model_attributes['flatten'].optimizable
+
+    # Flatten is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
+    assert 'hls4ml_attributes' not in model_attributes['flatten'].args
+
+    # dense
+    assert model_attributes['dense'].name == 'dense'
+    assert model_attributes['dense'].layer_type.__name__ == 'Dense'
+    assert model_attributes['dense'].weight_shape == (384, 16)
+    assert model_attributes['dense'].input_shape == (384,)
+    assert model_attributes['dense'].output_shape == (16,)
+    assert not model_attributes['dense'].optimizable
+    assert model_attributes['dense'].args['hls4ml_attributes'].n_in == 384
+    assert model_attributes['dense'].args['hls4ml_attributes'].n_out == 16
+    assert model_attributes['dense'].args['hls4ml_attributes'].io_type == io_type
+    assert model_attributes['dense'].args['hls4ml_attributes'].strategy == strategy
+    assert model_attributes['dense'].args['hls4ml_attributes'].reuse_factor == 1
+    assert model_attributes['dense'].args['hls4ml_attributes'].output_precision.width == 8
+    assert model_attributes['dense'].args['hls4ml_attributes'].parallelization_factor == 1
+
+    # relu
+    assert model_attributes['relu'].name == 'relu'
+    assert model_attributes['relu'].layer_type.__name__ == 'ReLU'
+    assert model_attributes['relu'].weight_shape == ()
+    assert model_attributes['relu'].input_shape == (16,)
+    assert model_attributes['relu'].output_shape == (16,)
+    assert not model_attributes['relu'].optimizable
+
+    # ReLU is not optimizable so hls4mlAttributes (n_in, n_out, reuse factor etc.) will not be stored for it
+    assert 'hls4ml_attributes' not in model_attributes['relu'].args
diff --git a/test/pytest/test_optimization/test_keras/test_masking.py b/test/pytest/test_optimization/test_keras/test_masking.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c5e60aca76bea4e6d003f247283aef0680ad19a
--- /dev/null
+++ b/test/pytest/test_optimization/test_keras/test_masking.py
@@ -0,0 +1,451 @@
+import numpy as np
+import pytest
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.objectives import ParameterEstimator
+
+'''
+In all the tests, an artifical network with one Dense/Conv2D layer and pre-determined weights is created
+Then, the tests assert zeros occur in the correct places, based on the masking structure (unstructured, block etc.)
+Furthermore, tests assert the masks are binary, so only zeros and ones occur
+Masking is such that:
+        * non_zero_params <= (1 - sparsity) * total_params OR
+        * zero_params > sparsity * total_params
+Since the targetted objective is ParameterEstimator, weight sharing is not suitable [does not decrease the number of weights]
+Therefore, all the test verify offsets are zero
+'''
+sparsity = 0.33
+local_masking = [True, False]
+dense_layers = [Dense, QDense]
+conv2d_layers = [Conv2D, QConv2D]
+
+
+# Create a Dense layer with artificial weights, so that (1, 1) and (2, 3) (matrix indexing) are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_unstructured(local_masking, dense):
+    weight_shape = (2, 3)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 2] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 2]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Dense layer with artificial weights, so that the 1st and 3rd column (neuron) are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_structured(local_masking, dense):
+    weight_shape = (3, 6)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][:, 0] = 1e-6
+    weights[0][:, 2] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm [0.33 * 6 = 1.98 -> next largest int is 2]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array(
+        [
+            [0, 0],
+            [1, 0],
+            [2, 0],  # First neuron
+            [0, 2],
+            [1, 2],
+            [2, 2],  # Third neuron
+        ],
+        dtype=np.int32,
+    )
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Dense layer with artificial weights, so that some patterns are pruned
+# Set pattern offset to 4, which is equivalent to RF = 3 [4 * 3 / 4]
+# In this case consecutive patterns are one, so pruning per DSP block
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_dense_masking_pattern(local_masking, dense):
+    weight_shape = (3, 4)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set 1st block low
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][2, 0] = 1e-6
+
+    # Set 3rd block low
+    weights[0][0, 2] = 1e-6
+    weights[0][1, 2] = 1e-6
+    weights[0][2, 2] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
+
+    # 33% sparsity - zero 4 from 12 weights, group by pattern [0.33 * 12 = 3.96] - so will select 2 patterns, 6 weights (>=)
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 0], [2, 0], [0, 2], [1, 2], [2, 2]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Dense layer with artificial weights, so that the 1st and 4th block are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+@pytest.mark.skip(
+    reason='Currently disabled as no benefits from block pruning are achieved for hls4ml.'
+)  # TODO - Enable when fully tested
+def test_dense_masking_block(local_masking, dense):
+    weight_shape = (4, 6)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set 1st block low
+    weights[0][0, 0] = 1e-6
+    weights[0][0, 1] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][1, 2] = 1e-6
+
+    # Set 4th block low
+    weights[0][2, 2] = 1e-6
+    weights[0][2, 3] = 1e-6
+    weights[0][3, 2] = 1e-6
+    weights[0][3, 3] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+    model_attributes['dense'].optimization_attributes.block_shape = (2, 2)
+
+    # 33% sparsity - zero 2 out of 6 blocks with lowest norm
+    # The first block is the smallest, the fourth block is set to zero
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [2, 3], [3, 2], [3, 3]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Conv2D layer with artificial weights and mask some small weights
+# Target sparsity is 0.33, so there should be <= (1 - 0.33) * 16 = 10.72 non-zero params
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_unstructured(local_masking, conv2d):
+    filt_width = 2
+    filt_height = 2
+    n_channels = 2
+    n_filters = 2
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0, 0, 0] = 1e-6
+    weights[0][1, 0, 0, 0] = 1e-6
+    weights[0][0, 1, 0, 0] = 1e-6
+    weights[0][0, 0, 1, 0] = 1e-6
+    weights[0][0, 0, 0, 1] = 1e-6
+    weights[0][1, 1, 1, 1] = 1e-6
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.UNSTRUCTURED
+
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [1, 1, 1, 1]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Conv2D layer with artificial weights, so that second and last filter are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_structured(local_masking, conv2d):
+    filt_width = 3
+    filt_height = 3
+    n_channels = 4
+    n_filters = 6
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+    weights[0][:, :, :, 1] = 1e-3
+    weights[0][:, :, :, 5] = 1e-3
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    # 33% sparsity - zero 2 out of 6 filters with lowest norm
+    # Generate all possible combinations of width and height pixels with channel using np.meshgrid()
+    # This represents all the positions for a single filter; then append filter position to the last columns
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    width_pixels = np.array(range(0, filt_width))
+    height_pixels = np.array(range(0, filt_height))
+    channels = np.array(range(0, n_channels))
+    combinations = np.array(np.meshgrid(width_pixels, height_pixels, channels)).T.reshape(-1, 3)
+    zeros = np.array(
+        np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 1), axis=1).tolist()
+        + np.append(combinations, np.full((filt_width * filt_height * n_channels, 1), 5), axis=1).tolist(),
+        dtype=np.int32,
+    )
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Create a Conv2D layer with artificial weights, so that the first and second pattern are pruned
+# Set pattern offset to 4, which is equivalent to RF = 2 [2 * 2 * 2 / 4]
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_masking_pattern(local_masking, conv2d):
+    filt_width = 2
+    filt_height = 1
+    n_channels = 2
+    n_filters = 2
+
+    model = Sequential()
+    model.add(conv2d(n_filters, input_shape=(8, 8, n_channels), kernel_size=(filt_width, filt_height), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    weights = model.layers[0].get_weights()
+
+    # Set the first DSP block to be approximately zero
+    weights[0][0, 0, 0, 0] = 1e-6
+    weights[0][0, 0, 1, 0] = 1e-6
+
+    # Set the third DSP block to be approximately zero
+    weights[0][0, 0, 0, 1] = 1e-6
+    weights[0][0, 0, 1, 1] = 1e-6
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['conv2d'].optimization_attributes.pattern_offset = 4
+
+    # 33% sparsity - zero out the two of the lowest groups
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    print(masks['conv2d'].shape)
+    print(weights[0].shape)
+    zeros = np.array([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 0, 1, 1]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['conv2d'] != 0), axis=1)
+
+    assert not np.any(offsets['conv2d'])
+    assert not np.any(masks['conv2d'][zeros[:, 0], zeros[:, 1], zeros[:, 2], zeros[:, 3]])
+    assert (filt_width * filt_height * n_channels * n_filters) == (zeros.shape[0] + nonzeros.shape[0])
+
+
+# Block pruning is only allowed for 2-dimensional matrices, so assert a correct exception is raised when pruning with Conv2D
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+def test_conv2d_block_masking_raises_exception(local_masking, conv2d):
+    model = Sequential()
+    model.add(conv2d(4, input_shape=(8, 8, 3), kernel_size=(3, 3), name='conv2d'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+
+    try:
+        get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    except Exception:
+        assert True
+        return
+    assert not True
+
+
+# Test edge cases: 0% and 100% sparsity
+# Test 50% sparsity with two layers
+@pytest.mark.parametrize('s', [0, 0.5, 1])
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize(
+    'type',
+    [
+        SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        SUPPORTED_STRUCTURES.STRUCTURED,
+        SUPPORTED_STRUCTURES.PATTERN,
+        SUPPORTED_STRUCTURES.BLOCK,
+    ],
+)
+def test_multi_layer_masking(s, local_masking, type):
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 4
+    conv_shape = (2, 2)  # Using (2, 2) instead of (3, 3) as it's an even number of weights
+    input_shape = (8, 8)
+
+    # Simple model, Conv2D weight shape (2, 2, 4, 6) and Dense weight shape (384, 16)
+    model = Sequential()
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
+    model.add(Flatten())
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+
+    # Make 'dense' and 'conv2d' optimizable
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = type
+    model_attributes['dense'].optimization_attributes.pattern_offset = 1024  # Equivalent to RF = 6 (384 * 16 / 1024)
+
+    model_attributes['conv2d'].optimizable = True
+    model_attributes['conv2d'].optimization_attributes.pruning = True
+    model_attributes['conv2d'].optimization_attributes.structure_type = (
+        type if type != SUPPORTED_STRUCTURES.BLOCK else SUPPORTED_STRUCTURES.UNSTRUCTURED
+    )
+    model_attributes['conv2d'].optimization_attributes.pattern_offset = 4  # Equivalent to RF = 4 (2 * 2 * 4 * 6 / 4)
+
+    masks, offsets = get_model_masks(model, model_attributes, s, ParameterEstimator, metric='l1', local=local_masking)
+    if s == 1:  # 100% sparsity - all masks are zero
+        print(np.count_nonzero(masks['dense'].flatten()))
+        assert not np.any(masks['dense'])
+        assert not np.any(masks['conv2d'])
+    elif s == 0.5:
+        conv2d_weights = conv_channels * conv_filters * np.prod(conv_shape)
+        dense_weights = dense_units * np.prod(input_shape) * conv_filters
+        if local_masking:
+            assert np.count_nonzero(masks['conv2d']) == int((1 - s) * conv2d_weights)
+            assert np.count_nonzero(masks['dense']) == int((1 - s) * dense_weights)
+        else:
+            # Less than or equal to, since Knapsack problem imposes a hard constrain on the active resources (ones)
+            assert np.count_nonzero(masks['conv2d']) + np.count_nonzero(masks['dense']) <= int(
+                (1 - s) * (conv2d_weights + dense_weights)
+            )
+    elif s == 0:  # 0% sparsity - all masks are one
+        assert np.all(masks['dense'])
+        assert np.all(masks['conv2d'])
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(offsets['conv2d'])
+
+
+# Create a Dense layer with artificial weights, so that some consecutive patterns are pruned
+# Set consecutive patterns to 2, so that the 1st block [1st and 2nd pattern] are pruned
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_consecutive_pattern_masking(local_masking, dense):
+    weight_shape = (3, 4)
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    model.add(Flatten())
+    model.add(Dense(1, name='out'))
+    weights = model.layers[0].get_weights()
+
+    weights[0] = np.arange(np.prod(weight_shape)).reshape(weight_shape)
+
+    # Set 1st and 2nd pattern low
+    weights[0][0, 0] = 1e-6
+    weights[0][1, 0] = 1e-6
+    weights[0][2, 0] = 1e-6
+    weights[0][0, 1] = 1e-4
+    weights[0][1, 1] = 1e-4
+    weights[0][2, 1] = 1e-4
+
+    # Set 4th pattern lower than second
+    # This pattern should still remain unmasked [even if it has a lower value than the 2nd pattern],
+    # As its neigbouring block has a larger value than the 2nd pattern
+    weights[0][0, 3] = 1e-6
+    weights[0][1, 3] = 1e-6
+    weights[0][2, 3] = 1e-6
+
+    print(weights)
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 2
+
+    # 33% sparsity - zero 4 out of 12 weight, group by pattern [0.33 * 12 = 3.96]
+    masks, offsets = get_model_masks(model, model_attributes, sparsity, ParameterEstimator, metric='l1', local=local_masking)
+    zeros = np.array([[0, 0], [1, 0], [2, 0], [0, 1], [1, 1], [2, 1]], dtype=np.int32)
+    nonzeros = np.stack(np.where(masks['dense'] != 0), axis=1)
+
+    assert not np.any(offsets['dense'])
+    assert not np.any(masks['dense'][zeros[:, 0], zeros[:, 1]])
+    assert (weight_shape[0] * weight_shape[1]) == (zeros.shape[0] + nonzeros.shape[0])
diff --git a/test/pytest/test_optimization/test_keras/test_reduction.py b/test/pytest/test_optimization/test_keras/test_reduction.py
new file mode 100644
index 0000000000000000000000000000000000000000..e07cbab656becec2c7558704da208d01bc929032
--- /dev/null
+++ b/test/pytest/test_optimization/test_keras/test_reduction.py
@@ -0,0 +1,144 @@
+import keras
+import pytest
+from packaging import version
+from qkeras import QActivation, QConv2D, QDense, quantized_bits
+from tensorflow.keras.layers import AveragePooling2D, BatchNormalization, Conv2D, Dense, Flatten, MaxPooling2D, ReLU, Softmax
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.keras.reduction import reduce_model
+from hls4ml.optimization.keras.utils import get_model_sparsity
+
+'''
+Set some neurons / filters to zero and verify that these are removed
+Even is some neurons (columns) in the output layer are zero, these should not be removed (to match data set labels)
+Test verify the above property, by setting some zeros in the last layer and verifying these remain in place
+'''
+
+
+@pytest.mark.skipif(
+    version.parse(keras.__version__) > version.parse('2.12.0'), reason='Keras Surgeon only works until Keras 2.12'
+)
+def test_keras_model_reduction():
+    model = Sequential()
+    model.add(Conv2D(8, (3, 3), input_shape=(64, 64, 1), name='conv2d_1', padding='same'))
+    model.add(MaxPooling2D())
+    model.add(BatchNormalization())
+    model.add(ReLU())
+    model.add(Conv2D(32, (5, 5), padding='same', name='conv2d_2'))
+    model.add(AveragePooling2D())
+    model.add(BatchNormalization())
+    model.add(ReLU())
+    model.add(Flatten())
+    model.add(Dense(32, input_shape=(16,), name='dense_1', activation='relu'))
+    model.add(BatchNormalization())
+    model.add(Dense(14, name='dense_2', activation='relu'))
+    model.add(BatchNormalization())
+    model.add(Dense(5, name='dense_3'))
+    model.add(Softmax())
+
+    indices = {
+        'conv2d_1': [2, 4, 7],
+        'conv2d_2': [0, 1, 2, 3, 4, 5],
+        'dense_1': [0, 5, 17, 28],
+        'dense_2': [1, 9, 4],
+        'dense_3': [3],
+    }
+    for layer in model.layers:
+        if isinstance(layer, Dense):
+            weights = layer.get_weights()
+            weights[0][:, indices[layer.name]] = 0
+            layer.set_weights(weights)
+        if isinstance(layer, Conv2D):
+            weights = layer.get_weights()
+            weights[0][:, :, :, indices[layer.name]] = 0
+            layer.set_weights(weights)
+
+    sparsity, _ = get_model_sparsity(model)
+    assert sparsity > 0
+
+    reduced = reduce_model(model)
+    assert reduced.get_layer('conv2d_1').get_weights()[0].shape == (3, 3, 1, 5)
+    assert reduced.get_layer('conv2d_2').get_weights()[0].shape == (5, 5, 5, 26)
+    assert reduced.get_layer('dense_1').get_weights()[0].shape == (6656, 28)
+    assert reduced.get_layer('dense_2').get_weights()[0].shape == (28, 11)
+    assert reduced.get_layer('dense_3').get_weights()[0].shape == (11, 5)
+
+    _, layer_sparsity = get_model_sparsity(reduced)
+    assert layer_sparsity['conv2d_1'] == 0
+    assert layer_sparsity['conv2d_2'] == 0
+    assert layer_sparsity['dense_1'] == 0
+    assert layer_sparsity['dense_2'] == 0
+    assert layer_sparsity['dense_3'] > 0
+
+
+@pytest.mark.skipif(
+    version.parse(keras.__version__) > version.parse('2.12.0'), reason='Keras Surgeon only works until Keras 2.12'
+)
+def test_qkeras_model_reduction():
+    bits = 8
+    activation = 'quantized_relu(4)'
+    quantizer = quantized_bits(bits, 0)
+
+    model = Sequential()
+    model.add(QConv2D(8, (3, 3), input_shape=(64, 64, 1), name='qconv2d_1', padding='same', kernel_quantizer=quantizer))
+    model.add(MaxPooling2D())
+    model.add(BatchNormalization())
+    model.add(QActivation(activation, name='qrelu_1'))
+    model.add(QConv2D(32, (5, 5), padding='same', name='qconv2d_2', kernel_quantizer=quantizer))
+    model.add(AveragePooling2D())
+    model.add(BatchNormalization())
+    model.add(QActivation(activation, name='qrelu_2'))
+    model.add(Flatten())
+    model.add(QDense(32, input_shape=(16,), name='qdense_1', kernel_quantizer=quantizer))
+    model.add(QActivation(activation, name='qrelu_3'))
+    model.add(BatchNormalization())
+    model.add(QDense(14, name='qdense_2', kernel_quantizer=quantizer))
+    model.add(QActivation(activation, name='qrelu_4'))
+    model.add(BatchNormalization())
+    model.add(QDense(5, name='qdense_3', kernel_quantizer=quantizer))
+    model.add(Softmax())
+
+    indices = {
+        'qconv2d_1': [2, 4, 7],
+        'qconv2d_2': [0, 1, 2, 3, 4, 5],
+        'qdense_1': [0, 5, 17, 28],
+        'qdense_2': [1, 9, 4],
+        'qdense_3': [3],
+    }
+    for layer in model.layers:
+        if isinstance(layer, QDense):
+            weights = layer.get_weights()
+            weights[0][:, indices[layer.name]] = 0
+            layer.set_weights(weights)
+        if isinstance(layer, QConv2D):
+            weights = layer.get_weights()
+            weights[0][:, :, :, indices[layer.name]] = 0
+            layer.set_weights(weights)
+
+    sparsity, _ = get_model_sparsity(model)
+    assert sparsity > 0
+
+    reduced = reduce_model(model)
+    assert reduced.get_layer('qconv2d_1').get_weights()[0].shape == (3, 3, 1, 5)
+    assert reduced.get_layer('qconv2d_2').get_weights()[0].shape == (5, 5, 5, 26)
+    assert reduced.get_layer('qdense_1').get_weights()[0].shape == (6656, 28)
+    assert reduced.get_layer('qdense_2').get_weights()[0].shape == (28, 11)
+    assert reduced.get_layer('qdense_3').get_weights()[0].shape == (11, 5)
+
+    _, layer_sparsity = get_model_sparsity(reduced)
+    assert layer_sparsity['qconv2d_1'] == 0
+    assert layer_sparsity['qconv2d_2'] == 0
+    assert layer_sparsity['qdense_1'] == 0
+    assert layer_sparsity['qdense_2'] == 0
+    assert layer_sparsity['qdense_3'] > 0
+
+    # Verify network surgery has no impact on quantization
+    assert isinstance(reduced.get_layer('qrelu_1'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_2'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_3'), QActivation)
+    assert isinstance(reduced.get_layer('qrelu_4'), QActivation)
+    assert reduced.get_layer('qconv2d_1').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qconv2d_2').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_1').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_2').kernel_quantizer['config']['bits'] == bits
+    assert reduced.get_layer('qdense_3').kernel_quantizer['config']['bits'] == bits
diff --git a/test/pytest/test_optimization/test_keras/test_regularizers.py b/test/pytest/test_optimization/test_keras/test_regularizers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fe518caae8e539bef232286f6a5603e17cddb57
--- /dev/null
+++ b/test/pytest/test_optimization/test_keras/test_regularizers.py
@@ -0,0 +1,176 @@
+import numpy as np
+import pytest
+import tensorflow as tf
+from qkeras import QConv2D, QDense
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.optimizers import Adam
+
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.builder import remove_custom_regularizers
+from hls4ml.optimization.keras.regularizers import Conv2DRegularizer, DenseRegularizer
+
+# Constants
+pattern_offset = 4
+consecutive_patterns = 2
+block_shape = (4, 4)
+
+dense_layers = [Dense, QDense]
+conv2d_layers = [Conv2D, QConv2D]
+
+
+# Sets the loss due to data to zero; train model only on regularization loss
+def zero_loss(y_true, y_pred):
+    return tf.reduce_mean(0 * tf.square(y_true - y_pred), axis=-1)
+
+
+# Helper function, calculates the group norm and variance for a single layer
+def get_norm_and_variance(weights, structure_type, layer='dense'):
+    if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+        norm = np.linalg.norm(weights.flatten(), ord=1)
+        var = np.var(weights)
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+        if layer == 'conv2d':
+            norm = np.linalg.norm(np.sum(np.linalg.norm(weights, axis=(0, 1), ord='fro'), axis=0), ord=1)
+            var = np.linalg.norm(np.var(weights, axis=(0, 1, 2)), ord=1)
+        else:
+            norm = np.linalg.norm(np.linalg.norm(weights, axis=0, ord=2), ord=1)
+            var = np.linalg.norm(np.var(weights, axis=0), ord=1)
+
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.PATTERN:
+        number_of_patterns = np.prod(weights.shape) // pattern_offset
+        target_shape = (number_of_patterns, pattern_offset)
+        reshaped = np.reshape(weights, target_shape)
+        total_blocks = pattern_offset // consecutive_patterns
+        blocks = np.reshape(
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(reshaped, 2), 0),
+                [1, number_of_patterns, consecutive_patterns, 1],
+                [1, number_of_patterns, consecutive_patterns, 1],
+                [1, 1, 1, 1],
+                'SAME',
+            ).numpy(),
+            (total_blocks, number_of_patterns * consecutive_patterns),
+        )
+        norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
+        var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
+        return norm, var
+
+    if structure_type == SUPPORTED_STRUCTURES.BLOCK:
+        total_blocks = (weights.shape[0] * weights.shape[1]) // (block_shape[0] * block_shape[1])
+        blocks = np.reshape(
+            tf.image.extract_patches(
+                np.expand_dims(np.expand_dims(weights, 2), 0),
+                [1, block_shape[0], block_shape[1], 1],
+                [1, block_shape[0], block_shape[1], 1],
+                [1, 1, 1, 1],
+                'SAME',
+            ).numpy(),
+            (total_blocks, block_shape[0] * block_shape[1]),
+        )
+
+        norm = np.linalg.norm(np.linalg.norm(blocks, axis=1, ord=2), ord=1)
+        var = np.linalg.norm(np.var(blocks, axis=1), ord=1)
+        return norm, var
+
+
+@pytest.mark.parametrize('dense', dense_layers)
+@pytest.mark.parametrize(
+    'structure_type',
+    [
+        SUPPORTED_STRUCTURES.UNSTRUCTURED,
+        SUPPORTED_STRUCTURES.STRUCTURED,
+        SUPPORTED_STRUCTURES.PATTERN,
+        SUPPORTED_STRUCTURES.BLOCK,
+    ],
+)
+def test_dense_regularizer(structure_type, dense):
+    epochs = 10
+    data_points = 10
+    input_shape = (32,)
+    output_shape = (16,)
+    X = np.random.rand(data_points, *input_shape)
+    y = np.random.rand(data_points, *output_shape)
+    w = np.random.rand(input_shape[0], output_shape[0])
+
+    # First, fit a model without regularization
+    model = Sequential()
+    model.add(dense(output_shape[0], input_shape=input_shape))
+    dense_weights = model.layers[0].get_weights()
+    dense_weights[0] = w
+    model.layers[0].set_weights(dense_weights)
+    model.compile(loss=zero_loss, optimizer=Adam(1.0))
+    model.fit(X, y, epochs=epochs)
+    norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
+
+    # Now, fit a model with strong regularization, starting with the same initial weights
+    dense_weights = model.layers[0].get_weights()
+    dense_weights[0] = w
+    model.layers[0].set_weights(dense_weights)
+    regularizer = DenseRegularizer(
+        alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset, block_shape=block_shape
+    )
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    model.compile(loss=zero_loss, optimizer=Adam(1.0))
+    model.fit(X, y, epochs=epochs)
+    reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type)
+
+    # Verify regularization decreased weight magnitude and variance
+    assert reg_norm < norm
+    assert reg_var < var
+
+
+@pytest.mark.parametrize('conv2d', conv2d_layers)
+@pytest.mark.parametrize(
+    'structure_type', [SUPPORTED_STRUCTURES.UNSTRUCTURED, SUPPORTED_STRUCTURES.STRUCTURED, SUPPORTED_STRUCTURES.PATTERN]
+)
+def test_conv2d_regularizer(structure_type, conv2d):
+    epochs = 10
+    data_points = 10
+    input_shape = (16, 16, 3)
+    num_filters = 4
+    X = np.random.rand(data_points, *input_shape)
+    y = np.random.rand(data_points, 1)
+
+    # First, fit a model without regularization
+    model = Sequential()
+    model.add(conv2d(num_filters, (3, 3), input_shape=input_shape))
+    model.add(Flatten())
+    model.add(Dense(1))
+    conv_weights = model.layers[0].get_weights()
+    model.compile(loss=zero_loss, optimizer=Adam())
+    model.fit(X, y, epochs=epochs, verbose=True)
+    norm, var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
+
+    # Now, fit a model with strong regularization, starting with the same initial weights
+    model.layers[0].set_weights(conv_weights)
+    regularizer = Conv2DRegularizer(alpha=0.5, beta=0.5, structure_type=structure_type, pattern_offset=pattern_offset)
+    model.layers[0].add_loss(lambda layer=model.layers[0]: regularizer(layer.kernel))
+    model.compile(loss=zero_loss, optimizer=Adam())
+    model.fit(X, y, epochs=epochs, verbose=True)
+    reg_norm, reg_var = get_norm_and_variance(model.layers[0].get_weights()[0], structure_type, 'conv2d')
+
+    # Verify regularization decreased weight magnitude and variance
+    assert reg_norm < norm
+    assert reg_var < var
+
+
+def test_removal_of_custom_regularizer():
+    model = Sequential()
+    model.add(Conv2D(8, (3, 3), input_shape=(16, 16, 3), kernel_regularizer=Conv2DRegularizer(1e-3)))
+    model.add(Flatten())
+    model.add(Dense(1, kernel_regularizer=DenseRegularizer(1e-3)))
+    weights = model.get_weights()
+
+    assert isinstance(model.layers[0].kernel_regularizer, Conv2DRegularizer)
+    assert isinstance(model.layers[2].kernel_regularizer, DenseRegularizer)
+
+    model = remove_custom_regularizers(model)
+
+    assert model.layers[0].kernel_regularizer is None
+    for i in range(len(weights)):
+        assert np.all(weights[i] == model.get_weights()[i])
diff --git a/test/pytest/test_optimization/test_keras/test_weight_sharing.py b/test/pytest/test_optimization/test_keras/test_weight_sharing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c274a84da83612d88f9059449cc12f37cdae63fe
--- /dev/null
+++ b/test/pytest/test_optimization/test_keras/test_weight_sharing.py
@@ -0,0 +1,155 @@
+import numpy as np
+import pytest
+from qkeras import QDense
+from tensorflow.keras.layers import Dense
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+from hls4ml.optimization.config import SUPPORTED_STRUCTURES
+from hls4ml.optimization.keras.masking import get_model_masks
+from hls4ml.optimization.objectives import ObjectiveEstimator
+
+# Similar tests in test_masking.py, weight sharing instead of pruning
+sparsity = 0.33
+local_masking = [True, False]
+dense_layers = [Dense, QDense]
+
+'''
+A mock objective class for weight sharing
+When a group of weights is quantized to the mean value, resource savings are equal to the number of weights quantized
+This is similar to ParameterEstimator, but instead of pruning, weight sharing is performed and
+No savings are incurred with unstructured type (unstructured weight sharing doesn't make sense)
+'''
+
+
+class MockWeightSharingEstimator(ObjectiveEstimator):
+    @classmethod
+    def layer_resources(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+        else:
+            return [np.prod(layer_attributes.weight_shape)]
+
+    @classmethod
+    def layer_savings(self, layer_attributes):
+        if not layer_attributes.weight_shape:
+            return [0]
+
+        structure_type = layer_attributes.optimization_attributes.structure_type
+
+        if layer_attributes.optimization_attributes.weight_sharing:
+            if structure_type == SUPPORTED_STRUCTURES.UNSTRUCTURED:
+                return [0]
+            elif structure_type == SUPPORTED_STRUCTURES.STRUCTURED:
+                if 'Dense' in layer_attributes.layer_type.__name__:
+                    return [layer_attributes.weight_shape[1]]
+            elif structure_type == SUPPORTED_STRUCTURES.PATTERN:
+                number_of_patterns = (
+                    np.prod(layer_attributes.weight_shape) // layer_attributes.optimization_attributes.pattern_offset
+                )
+                return [number_of_patterns * layer_attributes.optimization_attributes.consecutive_patterns]
+            elif structure_type == SUPPORTED_STRUCTURES.BLOCK:
+                return [np.prod(layer_attributes.optimization_attributes.block_shape)]
+        return [0]
+
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_structured(local_masking, dense):
+    weight_shape = (4, 3)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+
+    weights[0][:, 1] = 0.5
+    weights[0][0, 1] -= 1e-4
+    weights[0][2, 1] += 1e-4
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.STRUCTURED
+
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 1], [1, 1], [2, 1]], dtype=np.int32)
+
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
+
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_pattern(local_masking, dense):
+    weight_shape = (3, 4)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+    weights[0][0, 0] = 0.5 + 1e-4
+    weights[0][1, 0] = 0.5 - 1e-4
+    weights[0][2, 0] = 0.5
+
+    weights[0][0, 2] = 0.5 + 1e-4
+    weights[0][1, 2] = 0.5 - 1e-4
+    weights[0][2, 2] = 0.5
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.PATTERN
+    model_attributes['dense'].optimization_attributes.pattern_offset = 4
+    model_attributes['dense'].optimization_attributes.consecutive_patterns = 1
+
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 0], [1, 0], [2, 0], [0, 2], [1, 2], [2, 2]], dtype=np.int32)
+
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
+
+
+@pytest.mark.parametrize('local_masking', local_masking)
+@pytest.mark.parametrize('dense', dense_layers)
+def test_weight_sharing_block(local_masking, dense):
+    weight_shape = (4, 6)
+
+    model = Sequential()
+    model.add(dense(weight_shape[1], input_shape=(weight_shape[0],), name='dense'))
+    weights = model.layers[0].get_weights()
+
+    weights[0][0, 0] = 0.5 + 1e-3
+    weights[0][0, 1] = 0.5 + 1e-3
+    weights[0][1, 0] = 0.5 - 1e-3
+    weights[0][1, 1] = 0.5 - 1e-3
+
+    weights[0][2, 2] = 0.5 + 1e-3
+    weights[0][2, 3] = 0.5 + 1e-3
+    weights[0][3, 2] = 0.5 - 1e-3
+    weights[0][3, 3] = 0.5 - 1e-3
+
+    model.layers[0].set_weights(weights)
+
+    model_attributes = get_attributes_from_keras_model(model)
+    model_attributes['dense'].optimizable = True
+    model_attributes['dense'].optimization_attributes.pruning = False
+    model_attributes['dense'].optimization_attributes.weight_sharing = True
+    model_attributes['dense'].optimization_attributes.structure_type = SUPPORTED_STRUCTURES.BLOCK
+    model_attributes['dense'].optimization_attributes.block_shape = (2, 2)
+
+    masks, offsets = get_model_masks(
+        model, model_attributes, sparsity, MockWeightSharingEstimator, metric='l1', local=local_masking
+    )
+    frozen = np.array([[0, 0], [0, 1], [1, 0], [1, 1], [2, 2], [2, 3], [3, 2], [3, 3]], dtype=np.int32)
+
+    assert not np.any(masks['dense'][frozen[:, 0], frozen[:, 1]])
+    assert np.all(offsets['dense'][frozen[:, 0], frozen[:, 1]] == 0.5)
diff --git a/test/pytest/test_optimization/test_knapsack.py b/test/pytest/test_optimization/test_knapsack.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4145c00d03e799e4136fce47306f2b7313bdcdc
--- /dev/null
+++ b/test/pytest/test_optimization/test_knapsack.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pytest
+
+from hls4ml.optimization.knapsack import solve_knapsack
+
+
+# In the simple case below, both implementations give the optimal answer
+# In general, the greedy algorithm will not give the optimal solution
+@pytest.mark.parametrize('implementation', ['dynamic', 'greedy', 'branch_bound', 'CBC_MIP'])
+def test_knapsack_1d(implementation):
+    values = np.array([4, 5, 6, 8, 3])
+    weights = np.array([[2, 5, 3, 2, 5]])
+    capacity = np.array([8])
+
+    optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
+    assert optimal == 18
+    assert 0 in selected
+    assert 2 in selected
+    assert 3 in selected
+
+
+@pytest.mark.parametrize('implementation', ['greedy', 'branch_bound', 'CBC_MIP'])
+def test_multidimensional_knapsack(implementation):
+    values = np.array([10, 2, 6, 12, 3])
+    weights = np.array([[3, 1, 4, 5, 5], [3, 2, 4, 1, 2]])
+    capacity = np.array([8, 7])
+
+    optimal, selected = solve_knapsack(values, weights, capacity, implementation=implementation)
+    assert optimal == 22
+    assert 0 in selected
+    assert 3 in selected
+
+
+def test_knapsack_equal_weights():
+    values = np.array([10, 2, 6, 8, 3])
+    weights = np.array([[2, 2, 2, 2, 2], [3, 3, 3, 3, 3]])
+    capacity = np.array([7, 7])
+
+    optimal, selected = solve_knapsack(values, weights, capacity)
+    assert optimal == 18
+    assert 0 in selected
+    assert 3 in selected
+
+
+def test_knapsack_all_elements_fit():
+    values = np.array([10, 2, 6, 12, 3])
+    weights = np.array([[3, 1, 4, 5, 5], [3, 2, 4, 1, 2]])
+    capacity = np.array([19, 12])
+
+    optimal, selected = solve_knapsack(values, weights, capacity)
+    assert optimal == 33
+    assert selected == list(range(0, values.shape[0]))
diff --git a/test/pytest/test_optimization/test_objectives.py b/test/pytest/test_optimization/test_objectives.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d81befe603b0fe3be0874822935ca21f0bc566
--- /dev/null
+++ b/test/pytest/test_optimization/test_objectives.py
@@ -0,0 +1,57 @@
+import numpy as np
+from tensorflow.keras.layers import Conv2D, Dense, Flatten
+from tensorflow.keras.models import Sequential
+
+from hls4ml.optimization.attributes import get_attributes_from_keras_model
+from hls4ml.optimization.objectives import ParameterEstimator
+
+
+# Test attempts to verify one of the estimators (parameter) is correctly declared, the functions are static etc.
+def test_parameter_objective():
+    # Model parameters
+    dense_units = 16
+    conv_filters = 6
+    conv_channels = 3
+    conv_shape = (3, 3)
+    input_shape = (8, 8)
+
+    model = Sequential()
+    model.add(
+        Conv2D(
+            conv_filters,
+            input_shape=(*input_shape, conv_channels),
+            kernel_size=conv_shape,
+            name='conv2d',
+            padding='same',
+            kernel_initializer='ones',
+        )
+    )
+    model.add(Flatten(name='flatten'))
+    model.add(Dense(dense_units, name='dense', kernel_initializer='ones'))
+    model_attributes = get_attributes_from_keras_model(model)
+
+    # Identify optimizable layers and the suitable structure
+    for layer in model.layers:
+        optimizable, optimization_attributes = ParameterEstimator.is_layer_optimizable(model_attributes[layer.name])
+        model_attributes[layer.name].optimizable = optimizable
+        model_attributes[layer.name].optimization_attributes = optimization_attributes
+
+    # Verify conv2d and dense are optimizable, flatten is not
+    assert model_attributes['conv2d'].optimizable
+    assert not model_attributes['flatten'].optimizable
+    assert model_attributes['dense'].optimizable
+
+    # Verify layer resources (number of parameters) are correct
+    assert [conv_filters * conv_channels * np.prod(conv_shape)] == ParameterEstimator.layer_resources(
+        model_attributes['conv2d']
+    )
+    assert [0] == ParameterEstimator.layer_resources(model_attributes['flatten'])
+    assert [conv_filters * np.prod(input_shape) * dense_units] == ParameterEstimator.layer_resources(
+        model_attributes['dense']
+    )
+
+    # Verify layer savings are correct - is_layer_optimizable should have returned UNSTRUCTURED as the pruning type
+    # Since it wasn't overwritten, each pruning step saves one parameter
+    assert [1] == ParameterEstimator.layer_savings(model_attributes['conv2d'])
+    assert [0] == ParameterEstimator.layer_savings(model_attributes['flatten'])
+    assert [1] == ParameterEstimator.layer_savings(model_attributes['dense'])
diff --git a/test/pytest/test_optimization/test_scheduler.py b/test/pytest/test_optimization/test_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dc7642bf639ad9a54f643afdbaaabe1ff4f49eb
--- /dev/null
+++ b/test/pytest/test_optimization/test_scheduler.py
@@ -0,0 +1,80 @@
+import numpy as np  # Use np.testing.assert_allclose due to floating point rounding errors
+
+from hls4ml.optimization.scheduler import BinaryScheduler, ConstantScheduler, PolynomialScheduler
+
+
+def test_constant_scheduler():
+    initial_sparsity = 0.25
+    update_step = 0.10
+    target_sparsity = initial_sparsity + 2.5 * update_step
+
+    # Assert initial sparsity correct
+    scheduler = ConstantScheduler(initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, update_step=update_step)
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert update step is correct
+    np.testing.assert_allclose(scheduler.update_step(), (True, initial_sparsity + update_step))
+
+    # Assert repair step is correct
+    np.testing.assert_allclose(scheduler.repair_step(), (True, initial_sparsity + 2 * update_step))
+
+    # Assert cannot update again, since it would go over target sparsity
+    np.testing.assert_allclose(scheduler.update_step(), (False, initial_sparsity + 2 * update_step))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity + 2 * update_step)
+
+
+def test_binary_scheduler():
+    initial_sparsity = 0.25
+    target_sparsity = 0.5
+    threshold = 0.05
+
+    # Assert initial sparsity correct
+    scheduler = BinaryScheduler(initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, threshold=threshold)
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert 1st update step is correct
+    s1 = 0.5 * (initial_sparsity + target_sparsity)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s1))
+
+    # Assert 1st repair step is correct
+    s2 = 0.5 * (initial_sparsity + s1)
+    np.testing.assert_allclose(scheduler.repair_step(), (True, s2))
+
+    # Assert 2nd update step is correct
+    s3 = 0.5 * (s2 + s1)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s3))
+
+    # Assert 2nd repair step doest not take place, difference < threshold
+    np.testing.assert_allclose(scheduler.repair_step(), (False, s3))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), s3)
+
+
+def test_polynomial_scheduler():
+    decay_power = 2
+    maximum_steps = 2
+    initial_sparsity = 0.25
+    target_sparsity = 0.5
+
+    # Assert initial sparsity correct
+    scheduler = PolynomialScheduler(
+        maximum_steps, initial_sparsity=initial_sparsity, final_sparsity=target_sparsity, decay_power=decay_power
+    )
+    np.testing.assert_allclose(scheduler.get_sparsity(), initial_sparsity)
+
+    # Assert 1st update step is correct
+    s1 = target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 1 / maximum_steps) ** decay_power)
+    np.testing.assert_allclose(scheduler.update_step(), (True, s1))
+
+    # Assert 1st repair step is correct
+    s2 = target_sparsity + (initial_sparsity - target_sparsity) * ((1 - 2 / maximum_steps) ** decay_power)
+    np.testing.assert_allclose(scheduler.repair_step(), (True, s2))
+
+    # Assert 2nd update step does not occur, since current_step = maximum_steps
+    np.testing.assert_allclose(scheduler.update_step(), (False, s2))
+
+    # Assert final (achievable) sparsity is correct
+    np.testing.assert_allclose(scheduler.get_sparsity(), target_sparsity)