[1]:
# Remove input cells at runtime (nbsphinx)
import IPython.core.display as d
d.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

Energy estimation for classification

This notebook contains benchmarks for the protopipe pipeline regarding information from training data used for the training of the classification model.
It therefore uses the gamma-2 sample.

Only valid showers (meaning reconstructed with success) are considered.

[2]:
import os
from pathlib import Path

import tables
import pandas
import numpy as np
from scipy.stats import binned_statistic
import uproot
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

# font size
font_size = 16

# Set general font size
plt.rcParams['font.size'] = font_size
[3]:
def get_camera_names(inputPath = None, fileName = None):
    """Read the names of the cameras.

    Parameters
    ==========
    infile : str
        Full path of the input DL1 file.
    fileName : str
        Name of the input DL1 file.

    Returns
    =======
    camera_names : list(str)
        Table names as a list.
    """
    if (inputPath is None) or (fileName is None):
        print("ERROR: check input")
    h5file = tables.open_file(os.path.join(inputPath, fileName), mode='r')
    group = h5file.get_node("/")
    camera_names = [x.name for x in group._f_list_nodes()]
    h5file.close()
    return camera_names
[4]:
def load_reset_infile_protopipe(inputPath = None, fileName = None, camera_names=None):
    """(Re)load the file containing DL1(a) data and extract the data per telescope type.

    Parameters
    ==========
    infile : str
        Full path of the input DL1 file.
    fileName : str
        Name of the input DL1 file.

    Returns
    =======
    dataFrames : dict(pandas.DataFrame)
        Dictionary of tables per camera.
    """
    if (inputPath is None) or (fileName is None):
        print("ERROR: check input")
    if camera_names is None:
        print("ERROR: no cameras specified")
    # load DL1 images
    dataFrames = {camera : pandas.read_hdf(os.path.join(inputPath, fileName), f"/{camera}") for camera in camera_names}
    return dataFrames
[5]:
# Modify these variables according to your local setup outside of the Vagrant Box
parentDir = "/Users/michele/Applications/ctasoft/dirac" # path to 'shared_folder'
analysisName = "v0.4.0_dev1"

indir = os.path.join(parentDir,
                      "shared_folder/analyses",
                      analysisName,
                      "data/TRAINING/for_particle_classification")
infile = "TRAINING_classification_tail_gamma_merged.h5"

cameras = get_camera_names(inputPath = indir,
                                   fileName = infile)
data = load_reset_infile_protopipe(inputPath = indir,
                                   fileName = infile,
                                   camera_names=cameras)
[6]:
# select only successfully reconstructed showers
valid_showers = {}
for camera in cameras:
    valid_showers[camera] = data[camera][(data[camera]["is_valid"]==True)]
[7]:
# then merge the tables
for i, camera in enumerate(cameras):
    if i==0:
        all_valid_showers = valid_showers[camera]
    else:
        all_valid_showers = all_valid_showers.append(valid_showers[camera])
# Finally drop duplicate showers (stereo information is the same for each event ID)
unique_all_valid_showers = all_valid_showers.drop_duplicates(subset=['event_id'])
[8]:
indir_refData = "/Volumes/DataCEA_PERESANO/Data/CTA/ASWG/Prod3b/Release_2019/CTAMARS_reference_data/TRAINING/DL2" # path to CTAMARS ROOT file
mars_dl2b_energy_fileName = "CTA_4L15M_check_Erec.root"
path_mars_dl2b_energy = os.path.join(indir_refData, mars_dl2b_energy_fileName)
[9]:
with uproot.open(path_mars_dl2b_energy) as CTAMARS:
    CTAMARS_H = CTAMARS["Erec_over_E_vs_E"]
    CTAMARS_Eres = CTAMARS["Eres"]

Energy dispersion

[10]:
x = np.log10(unique_all_valid_showers["true_energy"].values)
y = unique_all_valid_showers["reco_energy"].values / unique_all_valid_showers["true_energy"].values

bin_edges_x = CTAMARS_H.member("fXaxis").edges()
bin_edges_y = CTAMARS_H.member("fYaxis").edges()

plt.figure(figsize=(10,7))

plt.hist2d(x, y, bins=[bin_edges_x, bin_edges_y], cmap="rainbow")

plt.grid(which="both", axis="both")
plt.colorbar(label='Counts')
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('reco / true')

None # to remove clutter by mpl objects
../../../_images/contribute_benchmarks_TRAINING_benchmarks_DL2_to_classification_17_0.png

Same, but with a Y-axis normalization to ensure that the integral probability of reconstructing a photon with a certain true energy at a certain reconstructed energy is 1.0

[15]:
x = np.log10(unique_all_valid_showers["true_energy"].values)
y = unique_all_valid_showers["reco_energy"].values / unique_all_valid_showers["true_energy"].values

bin_edges_x = CTAMARS_H.member("fXaxis").edges()
bin_edges_y = CTAMARS_H.member("fYaxis").edges()

plt.figure(figsize=(10,7))

h, _, _ = np.histogram2d(x, y, bins=[bin_edges_x, bin_edges_y])
# normalize y-axis so to get a max probability of 1 within 1 bin in true energy
h = h/np.sum(h, axis=1)[np.newaxis].T
# re-plot
plt.pcolormesh(bin_edges_x, bin_edges_y, h.T, cmap="rainbow")

plt.grid(which="both", axis="both")
plt.colorbar(label='Counts')
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('reco / true - 1')

None # to remove clutter by mpl objects
../../../_images/contribute_benchmarks_TRAINING_benchmarks_DL2_to_classification_19_0.png

Energy resolution and bias

Considering the distribution,

x = (E_reco / E_true) - 1

Energy bias as the mean of x, also in bins of true energy.We plot the bias as the mean of (Erec/Etrue-1), also in bins of true energy.

Energy resolution is here calculated in bins of true energy - as the 68%-quantile of the distribution of abs(x).

Note that by using this definition, any possible reconstruction bias is “absorbed” in the resolution.

  • as the same quantity, but bias-corrected as 68%-quantile of the distribution of abs(x - bias)

[72]:
CTAMARS_H.member("fXaxis").edges().shape
[72]:
(22,)
[42]:
reco = unique_all_valid_showers["reco_energy"].values
true = unique_all_valid_showers["true_energy"].values

bin_edges_x = CTAMARS_H.member("fXaxis").edges()
bin_edges_y = CTAMARS_H.member("fYaxis").edges()

plt.figure(figsize=(15,5))
plt.subplots_adjust(wspace = 0.4)

# RESOLUTION

plt.subplot(1,2,1)

# Set tick font size
ax = plt.gca()
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(font_size)

resolution = binned_statistic(np.log10(true),
                              reco/true - 1,
                              statistic = lambda x: np.percentile(np.abs(x), 68),
                              bins=bin_edges_x,)

corr_resolution = binned_statistic(np.log10(true),
                                  reco/true - 1,
                                  statistic = lambda x: np.percentile(np.abs(x-np.mean(x)), 68),
                                  bins=bin_edges_x)


plt.plot(CTAMARS_H.member("fXaxis").centers(), resolution[0], "bo", label="protopipe")
plt.plot(CTAMARS_H.member("fXaxis").centers(), corr_resolution[0], "ro", label="protopipe (bias corrected)")


plt.plot(CTAMARS_Eres.members["fX"],CTAMARS_Eres.members["fY"], 'o', ls="None", label="CTAMARS")

plt.hlines(0.0, plt.gca().get_xlim()[0], plt.gca().get_xlim()[1], ls="--", color="green")

plt.grid(which="both", axis="both")
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('abs(reco/true - 1)_68%')
plt.xlim(-2.1, 2.1)
plt.ylim(-0.5, 2)
plt.legend(loc="best")

# BIAS

plt.subplot(1,2,2)

# Set tick font size
ax = plt.gca()
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontsize(font_size)

bias = binned_statistic(np.log10(true),
                        reco/true - 1,
                        statistic="mean",
                        bins=bin_edges_x)


plt.plot(CTAMARS_H.member("fXaxis").centers(), bias[0], "bo")
plt.hlines(0.0, plt.gca().get_xlim()[0], plt.gca().get_xlim()[1], ls="--", color="green")

plt.grid(which="both", axis="both")
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('mean(reco/true - 1)')
plt.xlim(-2.1, 2.1)
plt.ylim(-0.5, 2)

None # to remove clutter by mpl objects
../../../_images/contribute_benchmarks_TRAINING_benchmarks_DL2_to_classification_23_0.png
Now we can superimpose these benchmarks to the normalized energy dispersion around 1.
The error bars correspond to the bias-corrected resolution.
[70]:
x = np.log10(unique_all_valid_showers["true_energy"].values)
y = unique_all_valid_showers["reco_energy"].values / unique_all_valid_showers["true_energy"].values - 1

bin_edges_x = CTAMARS_H.member("fXaxis").edges()
#bin_edges_y = CTAMARS_H.member("fYaxis").edges()
bin_edges_y = np.linspace(-1,3,300)

plt.figure(figsize=(10,7))

#plt.pcolormesh(bin_edges_x, bin_edges_y, h.T, cmap="rainbow")
h, _, _ = np.histogram2d(x, y, bins=[bin_edges_x, bin_edges_y])
# normalize y-axis so to get a max probability of 1 within 1 bin in true energy
h = h/np.sum(h, axis=1)[np.newaxis].T
# re-plot
plt.pcolormesh(bin_edges_x, bin_edges_y, h.T, cmap="rainbow")


corr_resolution = binned_statistic(np.log10(true),
                                  reco/true - 1,
                                  statistic = lambda x: np.percentile(np.abs(x-np.mean(x)), 68),
                                  bins=bin_edges_x)

bias = binned_statistic(np.log10(true),
                        reco/true -1,
                        statistic="mean",
                        bins=bin_edges_x)

plt.errorbar(x = CTAMARS_H.member("fXaxis").centers(),
             y = bias[0],
             xerr = np.diff(bin_edges_x)/2,
             yerr = corr_resolution[0],
             ls="none",
             fmt = "o",
            color="black")

plt.vlines(np.log10(0.03), plt.gca().get_ylim()[0], plt.gca().get_ylim()[1], ls="--", lw=2.5, color="magenta", label="30 GeV")

plt.grid(which="both", axis="both")
plt.colorbar(label='Counts')
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('reco / true - 1')
plt.legend()

None # to remove clutter by mpl objects
../../../_images/contribute_benchmarks_TRAINING_benchmarks_DL2_to_classification_25_0.png

Migration energy matrix

[14]:
x = np.log10(unique_all_valid_showers["true_energy"].values)
y = np.log10(unique_all_valid_showers["reco_energy"].values)

bin_edges = np.arange(-1.9, 2.3, 1/50)

plt.figure(figsize=(10,5))

plt.hist2d(x, y, bins=[bin_edges, bin_edges], norm=LogNorm())
plt.grid(which="both", axis="both")
plt.colorbar(label='Counts')
plt.xlabel('log10(true energy) [TeV]')
plt.ylabel('log10(reco energy) [TeV]')

plt.plot(bin_edges, bin_edges, color="red")

None # to remove clutter by mpl objects
../../../_images/contribute_benchmarks_TRAINING_benchmarks_DL2_to_classification_27_0.png
[ ]: