# Source code for ska_pst.stat.utility.hdf5_file_generator

# -*- coding: utf-8 -*-
#
# This file is part of the SKA PST project
#
# Distributed under the terms of the BSD 3-clause new license.
# See LICENSE for more info.
"""This module provides the ability to generate random data and turn into HDF5 file."""

from __future__ import annotations

import pathlib
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Generator, List

import h5py
import numpy as np
from ska_pst.stat.hdf5.consts import (
    FILE_FORMAT_VERSION_1_1_0,
    HDF5_FILE_FORMAT_VERSION,
    HDF5_HEADER,
    HDF5_HISTOGRAM_1D_FREQ_AVG,
    HDF5_HISTOGRAM_1D_FREQ_AVG_RFI_EXCISED,
    HDF5_HISTOGRAM_REBINNED_1D_FREQ_AVG,
    HDF5_HISTOGRAM_REBINNED_1D_FREQ_AVG_RFI_EXCISED,
    HDF5_HISTOGRAM_REBINNED_2D_FREQ_AVG,
    HDF5_HISTOGRAM_REBINNED_2D_FREQ_AVG_RFI_EXCISED,
    HDF5_MAX_SPECTRAL_POWER,
    HDF5_MAX_WEIGHTS,
    HDF5_MEAN_FREQUENCY_AVG,
    HDF5_MEAN_FREQUENCY_AVG_RFI_EXCISED,
    HDF5_MEAN_SPECTRAL_POWER,
    HDF5_MEAN_SPECTRUM,
    HDF5_MEAN_WEIGHTS,
    HDF5_MIN_WEIGHTS,
    HDF5_NUM_CLIPPED_SAMPLES,
    HDF5_NUM_CLIPPED_SAMPLES_RFI_EXCISED,
    HDF5_NUM_CLIPPED_SAMPLES_SPECTRUM,
    HDF5_SPECTROGRAM,
    HDF5_TIMESERIES,
    HDF5_TIMESERIES_RFI_EXCISED,
    HDF5_VARIANCE_FREQUENCY_AVG,
    HDF5_VARIANCE_FREQUENCY_AVG_RFI_EXCISED,
    HDF5_VARIANCE_SPECTRUM,
    Polarisation,
    TimeseriesDimension,
)
from ska_pst.stat.hdf5.model import StatisticsData, StatisticsMetadata, get_stat_file_format, string_dt

from ska_pst.stat import Statistics


[docs]@dataclass(kw_only=True)
class StatConfig:
    """
    A data class used as configuration for generating random data.

    :ivar npol: number of polarisations, default 2.
    :vartype npol: int
    :ivar ndim: number of dimensions in the data, default 2.
    :vartype ndim: int
    :ivar nchan: number of channels in the data, default 432.
    :vartype nchan: int
    :ivar nsamp: number of samples of each channel per heap, default 32.
    :vartype nsamp: int
    :ivar nheap: number of heaps of data to produce, default is 1.
    :vartype nheap: int
    :ivar nbit: the number of bits per data, this can only be 8 or 16.
    :vartype nbit: int
    :ivar nfreq_bins: requested number of frequency bins for spectrogram.
        This gets updated to be a factor of the number of channels.
    :vartype nfreq_bins: int
    :ivar ntime_bins: requested number of temporal bins for spectrogram
        and timeseries. This gets updated to be a factor of the total
        number of samples per channel.
    :vartype ntime_bins: int
    :ivar nrebin: number of bins to use for rebinned histograms
    :vartype nrebin: int
    :ivar sigma: number standard deviations to use to clip data. This
        is only used in the generator.
    :vartype sigma: float
    :ivar freq_mask: the frequency ranges to mask. (Currently not used)
    :vartype freq_mask: str
    :ivar frequency_mhz: the centre frequency for the data as a whole
    :vartype frequency_mhz: float
    :ivar bandwidth_mhz: the bandwidth of data
    :vartype bandwidth_mhz: float
    :ivar start_chan: the starting channel number.
    :vartype start_chan: int
    :ivar tsamp: the time, in microseconds, per sample
    :vartype tsamp: float
    :ivar os_factor: the oversampling factor
    :vartype os_factor: float
    """

    ndim: int = 2
    nchan: int = 432
    nsamp: int = 32
    nheap: int = 1
    nbit: int = 16
    nfreq_bins: int = 36
    ntime_bins: int = 4
    nrebin: int = 256
    sigma: float = 6.0
    freq_mask: str = ""
    frequency_mhz: float = 87.5
    bandwidth_mhz: float = 75.0
    start_chan: int = 0
    tsamp: float = 207.36
    os_factor: float = 4 / 3
    polarisations: List[Polarisation] = field(default_factory=lambda: list(Polarisation))

    def __post_init__(self: StatConfig) -> None:
        """Ensure configuration is valid."""
        assert self.nbit in [8, 16], "expected nbits to be either 8 or 16"

        self.dtype = np.int8 if self.nbit == 8 else np.int16

        def _recalc_nbins(num_items: int, req_bins: int) -> int:
            if num_items % req_bins == 0:
                return req_bins

            nbin_factor = max(num_items // req_bins, 1)
            while nbin_factor > 1:
                if num_items % nbin_factor == 0:
                    return num_items // nbin_factor

                nbin_factor -= 1

            return num_items

        self.nfreq_bins = _recalc_nbins(self.nchan, self.nfreq_bins)
        self.ntime_bins = _recalc_nbins(self.nheap * self.nsamp, self.ntime_bins)

    @property
    def npol(self: StatConfig) -> int:
        """Get the number of polarisations."""
        return len(self.polarisations)

    @property
    def scale(self: StatConfig) -> float:
        """Get scale of the Gaussian distribution."""
        return self.sigma / self.nbit_limit

    @property
    def nbit_limit(self: StatConfig) -> int:
        """Get the limit for current nbit."""
        return 2 ** (self.nbit - 1)

    @property
    def clipped_low(self: StatConfig) -> int:
        """Get the minimum value for the current nbit."""
        return -self.nbit_limit

    @property
    def clipped_high(self: StatConfig) -> int:
        """Get the maximum value for the current nbit."""
        return self.nbit_limit - 1

    @property
    def rfi_excised_channel_indexes(self: StatConfig) -> List[int]:
        """Get the indexes of the RFI excised channels."""
        return []

    @property
    def non_rfi_channel_indexes(self: StatConfig) -> List[int]:
        """Get the index of channels that are not RFI excised."""
        rfi_excised_channels = self.rfi_excised_channel_indexes
        return [c for c in range(self.nchan) if c not in rfi_excised_channels]

    @property
    def nbin(self: StatConfig) -> int:
        """Get the number of bins for histogram."""
        return 1 << self.nbit

    @property
    def rebin_offset(self: StatConfig) -> int:
        """Get the offset to apply when doing rebinning."""
        return self.nrebin // 2

    @property
    def rebin_max(self: StatConfig) -> int:
        """Get the maximum value after rebinning."""
        return self.nrebin - 1

    @property
    def total_samples_per_channel(self: StatConfig) -> int:
        """Get the total number of samples per channel."""
        return self.nheap * self.nsamp

    @property
    def tsamp_secs(self: StatConfig) -> float:
        """Get the TSAMP value in seconds."""
        return self.tsamp * 1e-6

    @property
    def total_sample_time(self: StatConfig) -> float:
        """Get the total sample time in seconds."""
        return self.tsamp_secs * self.total_samples_per_channel


class Hdf5FileGenerator:
    """Class used to generate a random HDF5 statistics file."""

    def __init__(
        self: Hdf5FileGenerator,
        file_path: pathlib.Path | str,
        eb_id: str,
        telescope: str,
        scan_id: int,
        beam_id: str,
        config: StatConfig,
        utc_start: str | None = None,
        version: str = FILE_FORMAT_VERSION_1_1_0,
    ) -> None:
        """
        Initialise the Hdf5FileGenerator.

        The parent directory of ``file_path`` is created if it does not exist.

        :param file_path: path to the file to create
        :type file_path: pathlib.Path | str
        :param eb_id: the execution block ID of the generated data file
        :type eb_id: str
        :param telescope: the telescope used for the generated data file
        :type telescope: str
        :param scan_id: the scan id for the generated data file
        :type scan_id: int
        :param beam_id: the beam id for the generated data file
        :type beam_id: str
        :param config: the configuration to use to generate the data file
        :type config: StatConfig
        :param utc_start: an ISO formatted string of the UTC time at the start of the scan.
        :type utc_start: str | None
        :param version: the file format version used to select the file format and
            written to the file, default is ``FILE_FORMAT_VERSION_1_1_0``.
        :type version: str
        """
        file_path = pathlib.Path(file_path)
        # exist_ok=True already tolerates an existing directory, so no prior exists() check is needed.
        file_path.parent.mkdir(parents=True, exist_ok=True)

        self._file_path = file_path
        # Keyword arguments forwarded to _calc_stats when generate() is called.
        self._params: dict = {
            "config": config,
            "eb_id": eb_id,
            "telescope": telescope,
            "scan_id": scan_id,
            "beam_id": beam_id,
            "utc_start": utc_start,
        }
        self._stats: Statistics | None = None
        self._version = version

    @property
    def stats(self: Hdf5FileGenerator) -> Statistics:
        """
        Get generated statistics.

        This will throw an :py:class:`AssertionError` if :py:meth:`generate`
        has not been called.
        """
        assert self._stats is not None, "Statistics has not been generated."
        return self._stats

    @property
    def stats_metadata(self: Hdf5FileGenerator) -> StatisticsMetadata:
        """
        Get metadata of generated statistics.

        This will throw an :py:class:`AssertionError` if :py:meth:`generate`
        has not been called.
        """
        assert self._stats is not None, "Statistics have not been generated."
        return self._stats.metadata

    def generate(self: Hdf5FileGenerator) -> None:
        """
        Generate a HDF5 file to use in a test.

        Any existing file at the configured path is removed first. The generated
        statistics are then available via :py:attr:`stats` and :py:attr:`stats_metadata`.
        """
        self._file_path.unlink(missing_ok=True)

        self._stats = _calc_stats(**self._params, version=self._version)

        file_format = get_stat_file_format(version=self._version)

        metadata = self._stats.metadata

        # The order of these values has to line up with the fields of file_format.header_dtype.
        header_data_params: tuple = (
            metadata.eb_id,
            metadata.telescope,
            metadata.scan_id,
            metadata.beam_id,
            metadata.utc_start,
            metadata.t_min,
            metadata.t_max,
            metadata.frequency_mhz,
            metadata.bandwidth_mhz,
            metadata.start_chan,
            metadata.npol,
            metadata.ndim,
            metadata.nchan,
            metadata.nchan_ds,
            metadata.ndat_ds,
            metadata.histogram_nbin,
            metadata.nrebin,
            metadata.channel_freq_mhz,
            metadata.frequency_bins,
            metadata.timeseries_bins,
            metadata.num_samples,
            metadata.num_samples_rfi_excised,
            metadata.num_samples_spectrum,
            metadata.num_invalid_packets,
        )

        # Optional header fields are appended only for file formats that declare them.
        if file_format.has_weights:
            header_data_params = (*header_data_params, metadata.num_weight_samples)

        if file_format.has_polarisations:
            header_data_params = (*header_data_params, metadata.polarisations)

        header_data = np.array(
            [header_data_params],
            dtype=file_format.header_dtype,
        )

        data = self._stats.data

        # (dataset key, array) pairs, listed in the order the datasets are created.
        datasets = [
            (HDF5_MEAN_FREQUENCY_AVG, data.mean_frequency_avg),
            (HDF5_MEAN_FREQUENCY_AVG_RFI_EXCISED, data.mean_frequency_avg_rfi_excised),
            (HDF5_VARIANCE_FREQUENCY_AVG, data.variance_frequency_avg),
            (HDF5_VARIANCE_FREQUENCY_AVG_RFI_EXCISED, data.variance_frequency_avg_rfi_excised),
            (HDF5_MEAN_SPECTRUM, data.mean_spectrum),
            (HDF5_VARIANCE_SPECTRUM, data.variance_spectrum),
            (HDF5_MEAN_SPECTRAL_POWER, data.mean_spectral_power),
            (HDF5_MAX_SPECTRAL_POWER, data.max_spectral_power),
            (HDF5_HISTOGRAM_1D_FREQ_AVG, data.histogram_1d_freq_avg),
            (HDF5_HISTOGRAM_1D_FREQ_AVG_RFI_EXCISED, data.histogram_1d_freq_avg_rfi_excised),
            (HDF5_HISTOGRAM_REBINNED_2D_FREQ_AVG, data.rebinned_histogram_2d_freq_avg),
            (
                HDF5_HISTOGRAM_REBINNED_2D_FREQ_AVG_RFI_EXCISED,
                data.rebinned_histogram_2d_freq_avg_rfi_excised,
            ),
            (HDF5_HISTOGRAM_REBINNED_1D_FREQ_AVG, data.rebinned_histogram_1d_freq_avg),
            (
                HDF5_HISTOGRAM_REBINNED_1D_FREQ_AVG_RFI_EXCISED,
                data.rebinned_histogram_1d_freq_avg_rfi_excised,
            ),
            (HDF5_NUM_CLIPPED_SAMPLES_SPECTRUM, data.num_clipped_samples_spectrum),
            (HDF5_NUM_CLIPPED_SAMPLES, data.num_clipped_samples),
            (HDF5_NUM_CLIPPED_SAMPLES_RFI_EXCISED, data.num_clipped_samples_rfi_excised),
            (HDF5_SPECTROGRAM, data.spectrogram),
            (HDF5_TIMESERIES, data.timeseries),
            (HDF5_TIMESERIES_RFI_EXCISED, data.timeseries_rfi_excised),
        ]

        if file_format.has_weights:
            datasets.extend(
                [
                    (HDF5_MIN_WEIGHTS, data.min_weights),
                    (HDF5_MAX_WEIGHTS, data.max_weights),
                    (HDF5_MEAN_WEIGHTS, data.mean_weights),
                ]
            )

        with h5py.File(self._file_path, "w") as f:
            file_format_ds = f.create_dataset(HDF5_FILE_FORMAT_VERSION, shape=(), dtype=string_dt)
            file_format_ds[()] = self._version

            header_ds = f.create_dataset(HDF5_HEADER, 1, dtype=file_format.header_dtype)
            header_ds[...] = header_data

            for key, array in datasets:
                self._create_data_set(f, key, array)

    def _create_data_set(
        self: Hdf5FileGenerator,
        file: h5py.File,
        key: str,
        data: np.ndarray,
    ) -> None:
        """Create a dataset named ``key`` with the shape and dtype of ``data`` and write ``data`` to it."""
        ds = file.create_dataset(key, data.shape, dtype=data.dtype)
        ds[...] = data


def simple_gaussian_generator(config: StatConfig) -> Generator[np.ndarray, None, None]:
    """
    Get a generator that can yield Gaussian distributed data based on config.

    Every item yielded is a freshly drawn array of shape
    ``(npol, ndim, nchan, total_samples_per_channel)``. The normally distributed
    values are divided by ``config.scale``, rounded to the nearest integer,
    limited to ``[config.clipped_low, config.clipped_high]`` and converted to
    ``config.dtype``. The generator never terminates.
    """
    lower, upper = config.clipped_low, config.clipped_high
    while True:
        shape = (config.npol, config.ndim, config.nchan, config.total_samples_per_channel)
        rounded = np.rint(np.random.randn(*shape) / config.scale)

        # Saturate at the representable limits before narrowing to the integer type.
        yield np.clip(rounded, lower, upper).astype(dtype=config.dtype)


def _calc_stats(
    *args: Any,
    config: StatConfig,
    eb_id: str,
    telescope: str,
    scan_id: int,
    beam_id: str,
    version: str,
    utc_start: str | None = None,
    **kwargs: Any,
) -> Statistics:
    """
    Calculate statistics from random data based on provided config.

    One array of random voltages is drawn from :py:func:`simple_gaussian_generator`
    and every statistic is computed from it with NumPy.

    :param args: ignored positional arguments.
    :param config: the configuration describing the data to generate.
    :param eb_id: the execution block ID to store in the metadata.
    :param telescope: the telescope name to store in the metadata.
    :param scan_id: the scan id to store in the metadata.
    :param beam_id: the beam id to store in the metadata.
    :param version: the file format version, used to look up the file format
        (e.g. whether weights are generated).
    :param utc_start: the UTC start time string. If ``None`` the current UTC time,
        formatted as ``%Y-%m-%d-%H:%M:%S``, is used.
    :param kwargs: ignored keyword arguments.
    :return: the generated metadata and data.
    """
    # Local import: the module only imports ``datetime`` from the datetime package.
    from datetime import timezone

    stat_file_format = get_stat_file_format(version=version)

    if utc_start is None:
        # Use an aware UTC clock; a naive datetime.now() would give local time.
        utc_start = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d-%H:%M:%S")

    non_rfi_channel_idx = config.non_rfi_channel_indexes

    num_samples_spectrum = (config.total_samples_per_channel * np.ones(shape=config.nchan)).astype(
        dtype=np.uint32
    )

    num_samples = np.sum(num_samples_spectrum, dtype=np.uint32)
    num_samples_rfi_excised = np.sum(num_samples_spectrum[non_rfi_channel_idx], dtype=np.uint32)
    num_invalid_packets = 0

    # need to calc channel centre frequencies
    low_freq = config.frequency_mhz - config.bandwidth_mhz / 2.0
    high_freq = low_freq + config.bandwidth_mhz

    channel_freq_mhz = np.linspace(low_freq, high_freq, num=config.nchan, endpoint=False, dtype=np.float64)
    # this gives us the start freq. Need to have channel centre freq. This offset is BW/nchan/2
    channel_freq_mhz += config.bandwidth_mhz / config.nchan / 2
    freq_bin_factor: int = config.nchan // config.nfreq_bins

    # need to calc freq bins (centre frequency of each spectrogram frequency bin)
    frequency_bins = np.linspace(low_freq, high_freq, num=config.nfreq_bins, endpoint=False, dtype=np.float64)
    frequency_bin_bw = config.bandwidth_mhz / config.nfreq_bins
    frequency_bins += frequency_bin_bw / 2

    # need to calc temporal bins (centre time, in seconds, of each temporal bin)
    total_sample_time = config.total_sample_time
    temporal_bin_secs = total_sample_time / config.ntime_bins
    timeseries_bins = (
        np.linspace(0.0, total_sample_time, num=config.ntime_bins, endpoint=False, dtype=np.float64)
        + temporal_bin_secs / 2.0
    )
    temporal_bin_factor = config.total_samples_per_channel // config.ntime_bins

    raw = next(simple_gaussian_generator(config=config))
    assert raw.shape == (config.npol, config.ndim, config.nchan, config.total_samples_per_channel)

    # Shape is passed positionally: the ``newshape`` keyword is deprecated in recent NumPy releases.
    flat_shape = (config.npol, config.ndim, -1)
    raw_flattened = np.reshape(raw, flat_shape)
    assert raw_flattened.shape == (config.npol, config.ndim, config.nchan * config.total_samples_per_channel)

    # Select the non RFI excised channels on the channel axis *before* flattening. Indexing the
    # flattened (channel x sample) axis with channel indexes would only pick the first few samples.
    raw_flattened_rfi_excised = np.reshape(raw[:, :, non_rfi_channel_idx, :], flat_shape)

    # Widen before adding the offset so that int8/int16 values near the limits do not overflow.
    raw_rebinned = np.clip(raw_flattened.astype(np.int32) + config.rebin_offset, 0, config.rebin_max)
    raw_rebinned_rfi_excised = np.clip(
        raw_flattened_rfi_excised.astype(np.int32) + config.rebin_offset, 0, config.rebin_max
    )

    scaled = config.scale * raw
    assert scaled.shape == (config.npol, config.ndim, config.nchan, config.total_samples_per_channel)

    mean_spectrum: np.ndarray = np.mean(scaled, axis=-1, dtype=np.float32)
    assert mean_spectrum.shape == (config.npol, config.ndim, config.nchan)

    mean_frequency_avg: np.ndarray = np.mean(mean_spectrum, axis=-1, dtype=np.float32)
    assert mean_frequency_avg.shape == (config.npol, config.ndim)

    mean_frequency_avg_rfi_excised: np.ndarray = np.mean(
        mean_spectrum[:, :, non_rfi_channel_idx], axis=-1, dtype=np.float32
    )
    assert mean_frequency_avg_rfi_excised.shape == (config.npol, config.ndim)

    variance_spectrum: np.ndarray = np.var(scaled, axis=-1, ddof=1, dtype=np.float32)
    assert variance_spectrum.shape == (config.npol, config.ndim, config.nchan)

    variance_frequency_avg: np.ndarray = np.var(scaled, axis=(-2, -1), ddof=1, dtype=np.float32)
    assert variance_frequency_avg.shape == (config.npol, config.ndim)

    variance_frequency_avg_rfi_excised: np.ndarray = np.var(
        scaled[:, :, non_rfi_channel_idx, :], axis=(-2, -1), ddof=1, dtype=np.float32
    )
    assert variance_frequency_avg_rfi_excised.shape == (config.npol, config.ndim)

    # power is the sum over the dimension axis of the squared scaled values
    power: np.ndarray = np.sum(scaled**2, axis=1, dtype=np.float32)
    assert power.shape == (config.npol, config.nchan, config.total_samples_per_channel)

    mean_spectral_power: np.ndarray = np.mean(power, axis=-1, dtype=np.float32)
    assert mean_spectral_power.shape == (config.npol, config.nchan)

    max_spectral_power: np.ndarray = np.max(power, axis=-1)
    assert max_spectral_power.shape == (config.npol, config.nchan)

    histogram_1d_freq_avg = np.zeros(shape=(config.npol, config.ndim, config.nbin), dtype=np.uint32)
    histogram_1d_freq_avg_rfi_excised = np.zeros_like(histogram_1d_freq_avg)
    rebinned_histogram_1d_freq_avg = np.zeros(
        shape=(config.npol, config.ndim, config.nrebin), dtype=np.uint32
    )
    rebinned_histogram_1d_freq_avg_rfi_excised = np.zeros_like(rebinned_histogram_1d_freq_avg)

    rebinned_histogram_2d_freq_avg = np.zeros(
        shape=(config.npol, config.nrebin, config.nrebin), dtype=np.uint32
    )
    rebinned_histogram_2d_freq_avg_rfi_excised = np.zeros_like(rebinned_histogram_2d_freq_avg)

    hist_range = (config.clipped_low, config.clipped_high)
    rebin_range = (0, config.nrebin - 1)

    for ipol in range(config.npol):
        for idim in range(config.ndim):
            histogram_1d_freq_avg[ipol, idim] = np.histogram(
                raw_flattened[ipol, idim],
                bins=config.nbin,
                range=hist_range,
            )[0]
            histogram_1d_freq_avg_rfi_excised[ipol, idim] = np.histogram(
                raw_flattened_rfi_excised[ipol, idim],
                bins=config.nbin,
                range=hist_range,
            )[0]
            rebinned_histogram_1d_freq_avg[ipol, idim] = np.histogram(
                raw_rebinned[ipol, idim], bins=config.nrebin, range=rebin_range
            )[0]
            rebinned_histogram_1d_freq_avg_rfi_excised[ipol, idim] = np.histogram(
                raw_rebinned_rfi_excised[ipol, idim],
                bins=config.nrebin,
                range=rebin_range,
            )[0]

        # The 2D histogram pairs dimension 0 with dimension 1, so it relies on ndim being at least 2.
        rebinned_histogram_2d_freq_avg[ipol] = np.histogram2d(
            raw_rebinned[ipol, 0],
            raw_rebinned[ipol, 1],
            bins=config.nrebin,
            range=[list(rebin_range), list(rebin_range)],
        )[0]
        rebinned_histogram_2d_freq_avg_rfi_excised[ipol] = np.histogram2d(
            raw_rebinned_rfi_excised[ipol, 0],
            raw_rebinned_rfi_excised[ipol, 1],
            bins=config.nrebin,
            range=[list(rebin_range), list(rebin_range)],
        )[0]

    # A sample counts as clipped when it sits on either limit of the nbit range.
    clipped_mask = (raw <= config.clipped_low) | (raw >= config.clipped_high)
    num_clipped_samples_spectrum = np.count_nonzero(clipped_mask, axis=-1).astype(np.uint32)
    assert num_clipped_samples_spectrum.shape == (config.npol, config.ndim, config.nchan)

    num_clipped_samples = np.sum(num_clipped_samples_spectrum, axis=-1).astype(np.uint32)
    assert num_clipped_samples.shape == (config.npol, config.ndim)

    num_clipped_samples_rfi_excised = np.sum(
        num_clipped_samples_spectrum[:, :, non_rfi_channel_idx], axis=-1
    ).astype(np.uint32)
    assert num_clipped_samples_rfi_excised.shape == (config.npol, config.ndim)

    spectrogram = np.zeros(shape=(config.npol, config.nfreq_bins, config.ntime_bins), dtype=np.float32)
    timeseries = np.zeros(shape=(config.npol, config.ntime_bins, 3), dtype=np.float32)
    timeseries_rfi_excised = np.zeros(shape=(config.npol, config.ntime_bins, 3), dtype=np.float32)

    for temporal_bin, isamp in enumerate(range(0, config.total_samples_per_channel, temporal_bin_factor)):
        for freq_bin, ichan in enumerate(range(0, config.nchan, freq_bin_factor)):
            # loop over the channel bins and get a slice
            spectrogram_power_slice = power[
                :, ichan : ichan + freq_bin_factor, isamp : isamp + temporal_bin_factor
            ]
            spectrogram[:, freq_bin, temporal_bin] = np.sum(spectrogram_power_slice, axis=(1, 2))

        # all channels over the current temporal bin
        timeseries_power_slice = power[:, :, isamp : isamp + temporal_bin_factor]
        timeseries[:, temporal_bin, TimeseriesDimension.MAX] = np.max(timeseries_power_slice, axis=(1, 2))
        timeseries[:, temporal_bin, TimeseriesDimension.MIN] = np.min(timeseries_power_slice, axis=(1, 2))
        timeseries[:, temporal_bin, TimeseriesDimension.MEAN] = np.mean(timeseries_power_slice, axis=(1, 2))

        # get the power for channels that aren't rfi excised
        timeseries_power_slice_rfi_excised = power[
            :, non_rfi_channel_idx, isamp : isamp + temporal_bin_factor
        ]
        timeseries_rfi_excised[:, temporal_bin, TimeseriesDimension.MAX] = np.max(
            timeseries_power_slice_rfi_excised, axis=(1, 2)
        )
        timeseries_rfi_excised[:, temporal_bin, TimeseriesDimension.MIN] = np.min(
            timeseries_power_slice_rfi_excised, axis=(1, 2)
        )
        timeseries_rfi_excised[:, temporal_bin, TimeseriesDimension.MEAN] = np.mean(
            timeseries_power_slice_rfi_excised, axis=(1, 2)
        )

    # Weights default to zeros and are only randomised for file formats that include them.
    min_weights = np.zeros(config.nchan, dtype=np.float32)
    max_weights = np.zeros(config.nchan, dtype=np.float32)
    mean_weights = np.zeros(config.nchan, dtype=np.float32)

    if stat_file_format.has_weights:
        # NOTE(review): the second axis uses num_samples, the sample count summed over all
        # channels - confirm this is the intended number of weights per channel.
        weights = (
            np.random.randint(0, 2**15 + 1, size=(config.nchan, num_samples), dtype=np.uint16) / 2**15
        ).astype(np.float32)

        min_weights = np.min(weights, axis=1)
        max_weights = np.max(weights, axis=1)
        mean_weights = np.mean(weights, axis=1)

    metadata = StatisticsMetadata(
        file_format_version=version,
        eb_id=eb_id,
        telescope=telescope,
        scan_id=scan_id,
        beam_id=beam_id,
        utc_start=utc_start,
        t_min=0,
        t_max=config.total_sample_time,
        frequency_mhz=config.frequency_mhz,
        bandwidth_mhz=config.bandwidth_mhz,
        start_chan=config.start_chan,
        npol=config.npol,
        ndim=config.ndim,
        nchan=config.nchan,
        nchan_ds=config.nfreq_bins,
        ndat_ds=config.ntime_bins,
        histogram_nbin=config.nbin,
        nrebin=config.nrebin,
        channel_freq_mhz=channel_freq_mhz,
        timeseries_bins=timeseries_bins,
        frequency_bins=frequency_bins,
        num_samples=num_samples,
        num_samples_rfi_excised=num_samples_rfi_excised,
        num_samples_spectrum=num_samples_spectrum,
        num_invalid_packets=num_invalid_packets,
        has_weights=stat_file_format.has_weights,
        polarisations=Polarisation.as_string(config.polarisations),
    )

    data = StatisticsData(
        mean_frequency_avg=mean_frequency_avg,
        mean_frequency_avg_rfi_excised=mean_frequency_avg_rfi_excised,
        variance_frequency_avg=variance_frequency_avg,
        variance_frequency_avg_rfi_excised=variance_frequency_avg_rfi_excised,
        mean_spectrum=mean_spectrum,
        variance_spectrum=variance_spectrum,
        mean_spectral_power=mean_spectral_power,
        max_spectral_power=max_spectral_power,
        histogram_1d_freq_avg=histogram_1d_freq_avg,
        histogram_1d_freq_avg_rfi_excised=histogram_1d_freq_avg_rfi_excised,
        rebinned_histogram_2d_freq_avg=rebinned_histogram_2d_freq_avg,
        rebinned_histogram_2d_freq_avg_rfi_excised=rebinned_histogram_2d_freq_avg_rfi_excised,
        rebinned_histogram_1d_freq_avg=rebinned_histogram_1d_freq_avg,
        rebinned_histogram_1d_freq_avg_rfi_excised=rebinned_histogram_1d_freq_avg_rfi_excised,
        num_clipped_samples_spectrum=num_clipped_samples_spectrum,
        num_clipped_samples=num_clipped_samples,
        num_clipped_samples_rfi_excised=num_clipped_samples_rfi_excised,
        spectrogram=spectrogram,
        timeseries=timeseries,
        timeseries_rfi_excised=timeseries_rfi_excised,
        min_weights=min_weights,
        max_weights=max_weights,
        mean_weights=mean_weights,
    )

    return Statistics(metadata=metadata, data=data)