diff --git a/pysegcnn/core/graphics.py b/pysegcnn/core/graphics.py
index 7b410f11cba5530653ef3f51a4b9b412aefa0ae9..f6db0b838b2f3c48ede277cf6598d039901feed0 100644
--- a/pysegcnn/core/graphics.py
+++ b/pysegcnn/core/graphics.py
@@ -22,7 +22,6 @@ import logging
 
 # externals
 import numpy as np
-import pandas as pd
 import torch
 import matplotlib
 import matplotlib.pyplot as plt
@@ -34,7 +33,8 @@ from matplotlib.animation import ArtistAnimation
 from matplotlib import cm as colormap
 
 # locals
-from pysegcnn.core.utils import accuracy_function, check_filename_length
+from pysegcnn.core.utils import (accuracy_function, check_filename_length,
+                                 report2df)
 from pysegcnn.main.train_config import HERE
 
 # plot font size configuration
@@ -688,9 +688,11 @@ def plot_classification_report(report, labels, figsize=(10, 10), **kwargs):
 
     Parameters
     ----------
-    report : `dict`
+    report : `dict` or :py:class:`pandas.DataFrame`
         The dictionary returned by setting ``output_dict=True`` in
-        :py:func:`sklearn.metrics.classification_report`.
+        :py:func:`sklearn.metrics.classification_report` or the
+        :py:class:`pandas.DataFrame` returned by
+        :py:func:`pysegcnn.core.utils.report2df`.
     labels : `list` [`str`]
         Names of the classes.
     figsize : `tuple` [`int`], optional
@@ -704,30 +706,28 @@ def plot_classification_report(report, labels, figsize=(10, 10), **kwargs):
         An instance of :py:class:`matplotlib.figure.Figure`.
 
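+    Examples
+    --------
+    A minimal sketch, assuming ``report`` is the dictionary returned by
+    :py:func:`sklearn.metrics.classification_report` with ``output_dict=True``
+    for the hypothetical class names ``['water', 'land']``:
+
+    >>> fig = plot_classification_report(report, ['water', 'land'])
+    >>> fig.savefig('report.png', dpi=300, bbox_inches='tight')
+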
     """
-    # overall accuracy
-    overall_accuracy = report['accuracy']
-
-    # convert classification report to pandas DataFrame
-    report_df = pd.DataFrame(report).transpose()
-
-    # add errors of commission and omission
-    report_df.insert(loc=3, column='commission', value=1 - report_df.precision)
-    report_df.insert(loc=4, column='omission', value=1 - report_df.recall)
-
-    # create a DataFrame only consisting of the class-wise statistics
-    class_statistics = report_df.transpose()[labels].transpose()
-
-    # create a DataFrame only consisting of the average metrics
-    avg_metrics = report_df.transpose().drop(
-        columns=labels + ['accuracy']).transpose()
-    avg_metrics.support = 1
-
-    # convert support values to relative values
-    class_statistics.support = (class_statistics.support /
-                                class_statistics.support.sum())
-
-    # merge dataframes
-    metrics = class_statistics.append(avg_metrics)
+    # accept both the dictionary returned by sklearn and an already
+    # converted DataFrame
+    df = report
+    if isinstance(report, dict):
+        # convert to DataFrame
+        df = report2df(report, labels)
+
+    # the overall accuracy is broadcast across the 'accuracy' row when
+    # converting to a DataFrame: read it from the 'f1-score' column
+    overall_accuracy = df.loc['accuracy', 'f1-score']
+
+    # drop the 'accuracy' row, which is not a class-wise metric
+    metrics = df.drop(index='accuracy')
 
     # create a figure
     fig, ax = plt.subplots(1, 1, figsize=figsize)
diff --git a/pysegcnn/core/trainer.py b/pysegcnn/core/trainer.py
index a109d7a5999c86494f9e75cf407d032e0eadc7ab..cfdc4b4baf7546a05a9d1e95a54c4d31752fc2aa 100644
--- a/pysegcnn/core/trainer.py
+++ b/pysegcnn/core/trainer.py
@@ -30,6 +30,7 @@ from logging.config import dictConfig
 
 # externals
 import numpy as np
+import pandas as pd
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -42,7 +43,7 @@ from pysegcnn.core.dataset import SupportedDatasets
 from pysegcnn.core.transforms import Augment
 from pysegcnn.core.utils import (item_in_enum, accuracy_function,
                                  reconstruct_scene, check_filename_length,
-                                 array_replace)
+                                 array_replace, report2df)
 from pysegcnn.core.split import SupportedSplits
 from pysegcnn.core.models import (SupportedModels, SupportedOptimizers,
                                   Network)
@@ -2674,10 +2675,6 @@ class NetworkInference(BaseConfig):
                 y_pred = np.asarray([v['y_pred'].flatten() for _, v
                                      in output.items()]).flatten()
 
-                # predictions and ground truth of the entire target dataset
-                output['y_true'] = y_true
-                output['y_pred'] = y_pred
-
                 # calculate classification report from sklearn
                 report_name = self.report_path.joinpath(
                     self.report_name(state))
@@ -2688,7 +2685,9 @@
                     output_dict=True)
 
                 # store report in output dictionary
-                output['report'] = report
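+                # store the report as a DataFrame, so that the reports of the
+                # individual folds can be concatenated and averaged later on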
+                output['report'] = report2df(report, self.class_names)
 
                 # plot classification report
                 fig = plot_classification_report(report, self.class_names)
@@ -2727,30 +2724,26 @@
             kfold = base_name.parent.joinpath(
                 str(base_name.name).replace(fold_number, 'kfold'))
 
-            # predictions of the different models
-            y_true = np.asarray(
-                [output['y_true'] for output in inference.values()]).flatten()
-            y_pred = np.asarray(
-                [output['y_pred'] for output in inference.values()]).flatten()
-
-            # calculate classification over all different models
+            # aggregate classification reports across the different folds
             LOGGER.info('Aggregating statistics of models:')
             LOGGER.info(('\n ' + (len(__name__) + 1) * ' ').join(
                 ['{}'.format(mstate.name) for mstate in self.state_files]))
 
-            # calculate classification report from sklearn
-            report_name = self.report_path.joinpath(self.report_name(kfold))
-            LOGGER.info('Calculating classification report: {}'
-                        .format(report_name))
-            report = classification_report(y_true, y_pred,
-                                           target_names=self.class_names,
-                                           output_dict=True)
+            # iterate over the individual models
+            df = pd.DataFrame()
+            for output in inference.values():
+                # stack the classification reports of the individual folds
+                df = pd.concat((df, output['report']))
 
-            # save aggregated classification report
+            # compute k-fold average estimate of each metric across all models
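+            # (the per-fold reports share the same index, so grouping by the
+            # index yields the mean of each metric per class and average row)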
+            report = df.groupby(df.index, sort=False).mean()
             inference['report'] = report
 
             # plot classification report
             fig = plot_classification_report(report, self.class_names)
+            report_name = self.report_path.joinpath(self.report_name(kfold))
             fig.savefig(report_name, dpi=300, bbox_inches='tight')
 
             # check whether to compute the aggregated confusion matrix
diff --git a/pysegcnn/core/utils.py b/pysegcnn/core/utils.py
index eefc064f2e7231533e2a63bdcff12184ef3869de..1e18dea571a20f76fe741325ecdbc6ca991e19ef 100644
--- a/pysegcnn/core/utils.py
+++ b/pysegcnn/core/utils.py
@@ -2635,27 +2635,51 @@
                                           '_tmp{}'.format(path.suffix)))
 
 
-def report2latex(classification_report, filename=None):
-    """Convert :py:func:`sklearn.metrics.classification_reports` to Latex.
+def report2df(report, labels=None):
+    """Convert :py:func:`sklearn.metrics.classification_reports` to a
+    :py:class:`pandas.DataFrame`.
 
     Parameters
     ----------
-    classification_report : `dict`
+    report : `dict`
         The dictionary returned by
         :py:func:`sklearn.metrics.classification_report`.
-    filename : `str` or :py:class:`pathlib.Path` or `None`, optional
-        The object to write the Latex table to. If `None` the table is
-        returned as string.
+    labels : `list` [`str`], optional
+        List of class labels. If specified, the support column, i.e. the
+        number of samples of each class, is normalized to relative class
+        frequencies. The default is `None`.
+
+    Returns
+    -------
+    df : :py:class:`pandas.DataFrame`
+        The classification report as a :py:class:`pandas.DataFrame`.
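+
+    Examples
+    --------
+    A minimal sketch of the intended usage, assuming a hypothetical binary
+    problem with class names ``water`` and ``land``:
+
+    >>> from sklearn.metrics import classification_report
+    >>> y_true, y_pred = [0, 1, 1, 0], [0, 1, 0, 0]
+    >>> report = classification_report(y_true, y_pred,
+    ...                                target_names=['water', 'land'],
+    ...                                output_dict=True)
+    >>> df = report2df(report, labels=['water', 'land'])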
 
     """
-    # convert to pandas DataFrame and export to latex
-    df = pd.DataFrame(classification_report).transpose()
+    # convert to pandas DataFrame
+    df = pd.DataFrame(report).transpose()
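+    # (rows: the class names plus 'accuracy', 'macro avg' and 'weighted avg',
+    #  columns: 'precision', 'recall', 'f1-score' and 'support')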
 
-    # check if output filename exists
-    if filename is not None:
-        filename = pathlib.Path(filename)
-        if not filename.exists():
-            filename.parent.mkdir(exist_ok=True, parents=True)
+    # add errors of commission and omission
+    df.insert(loc=3, column='commission', value=1 - df.precision)
+    df.insert(loc=4, column='omission', value=1 - df.recall)
 
-    # export to latex
-    df.to_latex(buf=str(filename))
+    # normalize support values to relative values
+    if labels is not None:
+        df.support = df.support / df.loc[labels].support.sum()
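+        # the support of the average rows equals the total number of samples,
+        # hence their normalized support equals 1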
+
+    return df