Basic CML processing workflow

[1]:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import tqdm
import xarray as xr
import pandas as pd

import pycomlink as pycml

Have a quick look at the data of one CML

We store each CML as item of a list because we want to do the processing per CML.

In the plot, note that there are some outliers. These correspond to fill values, which we get rid of in the next step.

[3]:

cml_list = [cmls.isel(cml_id=i) for i in range(len(cmls.cml_id))]

[4]:

fig, axs = plt.subplots(2, 1, sharex=True, figsize=(12,5))
cml_list[0].tsl.plot.line(x='time', ax=axs[0])
cml_list[0].rsl.plot.line(x='time', ax=axs[1])
axs[0].set_xlabel('')
axs[1].set_title('');

../_images/notebooks_Basic_CML_processing_workflow_7_0.png

Set default and fill values to NaN and calculate TRSL

TRSL is the transmitted signal level minus the received signal level and represents the total path loss along the CML.

[5]:

for cml in cml_list:
    cml['tsl'] = cml.tsl.where(cml.tsl != 255.0)
    cml['rsl'] = cml.rsl.where(cml.rsl != -99.9)
    cml['trsl'] = cml.tsl - cml.rsl

We can now also see TSL and RSL withouth the fill values that we removed.

Also note that TRSL changes during weak attenuation events stem mostly from TSL changes because this CML has automatic transmission power control (ATPC) which tries to keep RSL constant as long as possible by increasing TSL when path-attenuation occurs.

[6]:

fig, axs = plt.subplots(3, 1, sharex=True, figsize=(12,7))
cml_list[0].tsl.plot.line(x='time', ax=axs[0])
cml_list[0].rsl.plot.line(x='time', ax=axs[1])
cml_list[0].trsl.plot.line(x='time', ax=axs[2])
axs[0].set_xlabel('')
axs[1].set_xlabel('')
axs[1].set_title('')
axs[2].set_title('');

../_images/notebooks_Basic_CML_processing_workflow_11_0.png

Interpolate short gaps in TRSL

Our CML data might have data gaps. These gaps can stem from outages of the data acuqisition or from blackout gaps. If they stem from a blackout gap, there are better methods than linear interpolation. See the notebook Blackout gap detection example for more details.

Here we stick to a simple linear interpolatoin of a maximum gap width of 5 minutes.

[7]:

for cml in cml_list:
    cml['trsl'] = cml.trsl.interpolate_na(dim='time', method='linear', max_gap='5min')

[8]:

cml = cml_list[0].copy() # make a copy here to not change the CML dataset in the list over which we want to iterate later
cml.trsl.plot.line(x='time', figsize=(12,3));

../_images/notebooks_Basic_CML_processing_workflow_14_0.png

Do simple wet-dry classification using a rolling standard deviation

This is the most basic way of doing a wet-dry classification. The crucial part here is to find the optimal threshold. To keep this example short, we use a standard value of 0.8 which is more or less suitable for most CML time series. But please note, that this threshold should be adapted to the noisiness of the individual CML time series, e.g. as in Graf et al. 2020 to get the good performance for CML rainfall estimation.

[9]:

threshold = 0.8

roll_std_dev = cml.trsl.rolling(time=60, center=True).std()
cml['wet'] = cml.trsl.rolling(time=60, center=True).std() > threshold

[10]:

fig, axs = plt.subplots(2, 1, figsize=(12,5), sharex=True)

roll_std_dev.plot.line(x='time', ax=axs[0])
axs[0].axhline(threshold, color='k', linestyle='--')

cml.trsl.plot.line(x='time', ax=axs[1]);

# Get start and end of dry event
wet_start = np.roll(cml.wet, -1) & ~cml.wet
wet_end = np.roll(cml.wet, 1) & ~cml.wet

# Plot shaded area for each wet event
for wet_start_i, wet_end_i in zip(
    wet_start.isel(channel_id=0).values.nonzero()[0],
    wet_end.isel(channel_id=0).values.nonzero()[0],
):
    axs[1].axvspan(cml.time.values[wet_start_i], cml.time.values[wet_end_i], color='b', alpha=0.1)

axs[1].set_title('');

../_images/notebooks_Basic_CML_processing_workflow_17_0.png

Determine baseline

[11]:

cml['baseline'] = pycml.processing.baseline.baseline_constant(trsl=cml.trsl, wet=cml.wet, n_average_last_dry=5)

fig, ax = plt.subplots(figsize=(12,3))

cml.trsl.sel(time=slice('2018-05-13', '2018-05-15')).plot.line(x='time', alpha=0.5)
plt.gca().set_prop_cycle(None)
cml.baseline.sel(time=slice('2018-05-13', '2018-05-15')).plot.line(x='time');
plt.gca().set_prop_cycle(None)
plt.ylabel('TRSL');

../_images/notebooks_Basic_CML_processing_workflow_19_0.png

Perform wet antenna estimation and compare to uncorrected baseline

[12]:

cml['waa'] = pycml.processing.wet_antenna.waa_schleiss_2013(
    rsl=cml.trsl,
    baseline=cml.baseline,
    wet=cml.wet,
    waa_max=2.2,
    delta_t=1,
    tau=15,
)

[13]:

fig, axs = plt.subplots(2, 1, figsize=(12,5), sharex=True)

plt.sca(axs[0])
cml.isel(channel_id=0).trsl.plot.line(x='time', alpha=0.5, label='TRSL')
plt.gca().set_prop_cycle(None)
cml.isel(channel_id=0).baseline.plot.line(x='time', linestyle=':', label='baseline without WAA');
plt.gca().set_prop_cycle(None)
(cml.baseline + cml.waa).isel(channel_id=0).plot.line(x='time', label='baseline with WAA');
plt.ylabel('TRSL (dB)')
axs[0].legend()

# estimate WAA and correct baseline
cml['A'] = cml.trsl - cml.baseline - cml.waa
cml['A'].values[cml.A < 0] = 0
cml['A_no_waa_correct'] = cml.trsl - cml.baseline
cml['A_no_waa_correct'].values[cml.A_no_waa_correct < 0] = 0

plt.sca(axs[1])
cml.A_no_waa_correct.isel(channel_id=0).plot.line(x='time', linestyle=':', label='without WAA');
plt.gca().set_prop_cycle(None)
cml.A.isel(channel_id=0).plot.line(x='time', label='with WAA');
plt.ylabel('path attenuation\nfrom rain (dB)');
axs[1].set_title('');
axs[1].legend()

axs[1].set_xlim(pd.to_datetime('2018-05-16 23:00:00'), pd.to_datetime('2018-05-17 06:00:00'));

../_images/notebooks_Basic_CML_processing_workflow_22_0.png

Calculate rain rate

[14]:

cml['R'] = pycml.processing.k_R_relation.calc_R_from_A(A=cml.A, L_km=float(cml.length), f_GHz=cml.frequency/1e9, pol=cml.polarization)

[15]:

fig, axs = plt.subplots(2, 1, figsize=(12,5), sharex=True)
cml.trsl.plot.line(x='time', ax=axs[0])
cml.R.plot.line(x='time', ax=axs[1])
axs[1].set_title('');

../_images/notebooks_Basic_CML_processing_workflow_25_0.png

Now do the whole processing for all CMLs

[16]:

for cml in tqdm.tqdm(cml_list):
    cml['wet'] = cml.trsl.rolling(time=60, center=True).std(skipna=False) > 0.8

    cml['wet_fraction'] = (cml.wet==1).sum() / len(cml.time)

    cml['baseline'] = pycml.processing.baseline.baseline_constant(
        trsl=cml.trsl,
        wet=cml.wet,
        n_average_last_dry=5,
    )
    cml['waa'] = pycml.processing.wet_antenna.waa_schleiss_2013(
        rsl=cml.trsl,
        baseline=cml.baseline,
        wet=cml.wet,
        waa_max=2.2,
        delta_t=1,
        tau=15,
    )
    cml['A'] = cml.trsl - cml.baseline - cml.waa

    # Note that we set A < 0 to 0 here, but it is not strictly required for
    # the next step, because calc_R_from_A sets all rainfall rates below
    # a certain threshold (default is 0.1) to 0. Some people might want to
    # keep A as it is to check later if there were negative numbers.
    cml['A'].values[cml.A < 0] = 0

    cml['R'] = pycml.processing.k_R_relation.calc_R_from_A(
        A=cml.A, L_km=float(cml.length), f_GHz=cml.frequency/1e9, pol=cml.polarization
   )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:11<00:00, 42.68it/s]

Find CMLs that show erratic behavior

Find the CMLs that have a very high ratio of wet/dry data. For the 10-day dataset that is used here and for central-European climate, a ratio higher than 0.3 can be considered very unlikely, i.e. it is unlikely that it was rainy for 30% of the one-minute data points within the 10-day period.

For the plots below, we use a higher threshold 0.8 to only plot a small number of CMLs, but find those that show very strong erratic behavior during dry periods.

[17]:

for cml in cml_list:
    if cml.wet_fraction > 0.8:
        cml.trsl.plot.line(x='time', figsize=(12,2))
        plt.title(f'cml_id: {cml.cml_id.values} wet_fraction: {cml.wet_fraction.values:0.2f}')
        plt.show()

../_images/notebooks_Basic_CML_processing_workflow_30_0.png

../_images/notebooks_Basic_CML_processing_workflow_30_1.png

../_images/notebooks_Basic_CML_processing_workflow_30_2.png

../_images/notebooks_Basic_CML_processing_workflow_30_3.png

Compare derived rain rates with reference data

As reference, path-averaged rain rates along the CMLs paths from RADKLIM-YW are provided. This data has a temporal resolution of 5 minutes. First, we compare one CML timeseries aggregated to five minutes individually against its reference timeseries. Then we resample all cml data und prepare a scatterplot between CML and reference data. Finally some metrics are claculated. (for simplicity only channel 1 is evaluated here)

[18]:

path_ref = xr.open_dataset(data_path + '/example_path_averaged_reference_data.nc')

[19]:

for i in [0, 12, 57]:
    # Plot reference rainfall amount (converted to 5-minute rainfall rate)
    (path_ref.sel(cml_id=cml_list[i].cml_id).rainfall_amount * 12).plot(
        label="RADKLIM_YW", color='C3', figsize=(12,3)
    )
    # Plot 5-minute mean rainfall rates from CMLs
    (cml_list[i].sel(channel_id="channel_1").R.resample(time="5min").mean()).plot(
        x="time", label="CML", color='C0'
    )

    plt.xlim(pd.to_datetime('2018-05-13'), pd.to_datetime('2018-05-15'))
    plt.ylabel('5-min rainfall rate (mm/h)')
    plt.legend();

../_images/notebooks_Basic_CML_processing_workflow_34_0.png

../_images/notebooks_Basic_CML_processing_workflow_34_1.png

../_images/notebooks_Basic_CML_processing_workflow_34_2.png

Aggregate the CML data to 5 minute rainfall sums and concat the indiviudal CML xarray.Datasets to one joint xarray.Dataset for the comparison with the reference.

[20]:

ds_cmls = xr.concat(cml_list, dim='cml_id')

[21]:

rainsum_5min = ds_cmls.sel(channel_id="channel_1").R.resample(time="5min").sum() / 60

fig, ax = plt.subplots(figsize=(6, 5))
hx = ax.hexbin(
    rainsum_5min.values.T.flatten(),
    path_ref.rainfall_amount.values.flatten(),
    mincnt=1,
    bins="log",
    gridsize=45,
    extent=(0, 10, 0, 10),
    cmap=plt.get_cmap('viridis', 10),
)
ax.plot([0, 10], [0, 10], color='gray', alpha=0.7)
ax.set_xlabel('CML 5-minute rainfall sum [kg]');
ax.set_ylabel('RADKLIM_YW 5-minute rainfall sum \npath-averaged along CML [kg]');
cbar = fig.colorbar(hx)
cbar.set_label('count')

../_images/notebooks_Basic_CML_processing_workflow_37_0.png

Calculate several performance metrics. rainfall_threshold_wet defines the threshold below which a timestep is considered as dry.

[22]:

metrics = pycml.validation.stats.calc_rain_error_performance_metrics(
    rainsum_5min.values.T.flatten(),
    path_ref.rainfall_amount.values.flatten(),
    rainfall_threshold_wet=0.1)

[print(f'{metric}={value}') for metric, value in zip(metrics._fields, metrics)];

pearson_correlation=0.6834012664635292
coefficient_of_variation=8.646148144620074
root_mean_square_error=0.06931875505595712
mean_absolute_error=0.011793011431390889
R_sum_reference=12630.237843307716
R_sum_predicted=24069.548867923764
R_mean_reference=0.007973670239267292
R_mean_predicted=0.015195489416887425
false_wet_rate=0.02548173384389654
missed_wet_rate=0.1581905678537055
false_wet_precipitation_rate=0.2328701841190468
missed_wet_precipitation_rate=0.25619966292398055
rainfall_threshold_wet=0.1
N_all_pairs=1584000
N_nan_pairs=7
N_nan_reference_only=0
N_nan_predicted_only=7