Preparing to split transfer learning into a dedicated training script.

90a211a4 · Frisinghelli Daniel · 53373694 · 90a211a4 · 53373694
Commit 90a211a4 authored 4 years ago by Frisinghelli Daniel
--- a/pysegcnn/main/config.py
+++ b/pysegcnn/main/config.py
@@ -28,7 +28,7 @@ HERE = pathlib.Path(__file__).resolve().parent

 # path to the datasets on the current machine
 DRIVE_PATH = pathlib.Path('C:/Eurac/Projects/CCISNOW/_Datasets/')
-# DRIVE_PATH = pathlib.Path('/mnt/CEPH_PROJECTS/cci_snow/dfrisinghelli/_Datasets/')
+# DRIVE_PATH = pathlib.Path('/mnt/CEPH_PROJECTS/cci_snow/dfrisinghelli/_Datasets/')  # nopep8

 # name and paths to the datasets
 DATASETS = {'Sparcs': DRIVE_PATH.joinpath('Sparcs'),
@@ -47,6 +47,9 @@ BANDS = ['red', 'green', 'blue', 'nir', 'swir1', 'swir2']
 # tile size of a single sample
 TILE_SIZE = 128

+# number of folds for cross validation
+K_FOLDS = 2
+
 # the source dataset configuration dictionary
 src_ds_config = {

@@ -81,11 +84,6 @@ src_ds_config = {
    # 'pad': False,
    'pad': True,

-    # the random seed for the numpy random number generator
-    # ensures reproducibility of the training, validation and test data split
-    # used if split_mode='random' and split_mode='scene'
-    'seed': 0,
-
    # whether to sort the dataset in chronological order, useful for time
    # series data
    # 'sort': True,
@@ -156,11 +154,11 @@ trg_ds_config = {
    'bands': BANDS,
    'tile_size': TILE_SIZE,
    'pad': True,
-    'seed': 0,
    'sort': True,
    'transforms': [],
    'merge_labels': {'Cirrus': 'Cloud',
                     'Not_used': 'No_data'}
+
 }

 # the source dataset split configuration dictionary
@@ -172,64 +170,58 @@ src_split_config = {

    # the mode to split the dataset:
    #
-    #    - 'random': randomly split the scenes
-    #                for each scene, the tiles can be distributed among the
+    #    - 'tile':   for each scene, the tiles can be distributed among the
    #                training, validation and test set
    #
-    #    - 'scene':  randomly split the scenes
-    #                for each scene, all the tiles of the scene are included in
+    #    - 'scene':  for each scene, all the tiles of the scene are included in
    #                either the training set, the validation set or the test
    #                set, respectively
-    #
-    #    - 'date':   split the scenes of a dataset based on a date, useful for
-    #                time series data
-    #                scenes before date build the training set, scenes after
-    #                the date build the validation set, the test set is empty
-    # 'split_mode': 'date',
-    # 'split_mode': 'random',
+    # 'split_mode': 'tile',
    'split_mode': 'scene',

+    # the number of folds for cross validation
+    #
+    # k_folds = 1 : The model is trained with a single dataset split based on
+    #               'tvratio' and 'ttratio'
+    # k_folds > 1 : The model is trained via cross validation on k_folds splits
+    #               of the dataset
+    'k_folds': K_FOLDS,
+
+    # the random seed for the random number generator
+    # ensures reproducibility of the training, validation and test data split
+    'seed': 0,
+
+    # whether to shuffle the data before splitting
+    'shuffle': True,
+
+    # -------------------------------------------------------------------------
+    # IMPORTANT: these settings only apply if 'k_folds=1'
+    # -------------------------------------------------------------------------
+
    # (ttratio * 100) % of the dataset will be used for training and
    # validation
-    # used if split_mode='random' and split_mode='scene'
+    # used if 'k_folds=1'
    'ttratio': 1,

    # (ttratio * tvratio) * 100 % will be used for training
    # (1 - ttratio * tvratio) * 100 % will be used for validation
-    # used if split_mode='random' and split_mode='scene'
+    # used if 'k_folds=1'
    'tvratio': 0.8,

-    # the date to split the scenes
-    # format: 'yyyymmdd'
-    # scenes before date build the training set, scenes after the date build
-    # the validation set, the test set is empty
-    # used if split_mode='date'
-    'date': '',
-    'dateformat': '%Y%m%d',
-
-    # whether to drop samples (during training only) with a fraction of
-    # pixels equal to the constant padding value cval >= drop
-    # drop=1 means, do not use a sample if all pixels = cval
-    # drop=0.8 means, do not use a sample if 80% or more of the pixels are
-    #                 equal to cval
-    # drop=0.2 means, ...
-    # drop=0 means, do not drop any samples
-    'drop': 0,
-
-    }
+}

 # the target dataset split configuration dictionary
 trg_split_config = {

-    # 'split_mode': 'date',
-    # 'split_mode': 'random',
+    # 'split_mode': 'tile',
    'split_mode': 'scene',
+    'k_folds': K_FOLDS,
+    'seed': 0,
+    'shuffle': True,
    'ttratio': 1,
    'tvratio': 0.8,
-    'date': '',
-    'dateformat': '%Y%m%d',
-    'drop': 0,
-    }
+
+}

 # the model configuration dictionary
 model_config = {
@@ -302,8 +294,8 @@ tlda_config = {

    # whether to apply any sort of transfer learning
    # if transfer=False, the model is only trained on the source dataset
-    'transfer': True,
-    # 'transfer': False,
+    # 'transfer': True,
+    'transfer': False,

    # Supervised vs. Unsupervised ---------------------------------------------
    # -------------------------------------------------------------------------

--- a/pysegcnn/main/train.py
+++ b/pysegcnn/main/train.py
-"""Main script to train a model.
-
-Steps to launch a model run:
-
-    1. Configure the model run in :py:mod:`pysegcnn.main.config.py`
-        - configure the dataset(s): ``src_ds_config`` and ``trg_ds_config``
-        - configure the split(s)  : ``src_ds_config`` and ``trg_ds_config``
-        - configure the model     : ``model_config``
-    2. Save :py:mod:`pysegcnn.main.config.py`
-    3. In a terminal, navigate to the repository's root directory
-    4. Run
-
-    .. code-block:: bash
-
-        python pysegcnn/main/train.py
-
-
-License
-------
-
-    Copyright (c) 2020 Daniel Frisinghelli
-
-    This source code is licensed under the GNU General Public License v3.
-
-    See the LICENSE file in the repository's root directory.
-
-"""
-
-# !/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-# builtins
-from logging.config import dictConfig
-
-# locals
-from pysegcnn.core.trainer import (DatasetConfig, SplitConfig, ModelConfig,
-                                   TransferLearningConfig, StateConfig,
-                                   LogConfig, DomainAdaptationTrainer)
-from pysegcnn.main.config import (src_ds_config, src_split_config,
-                                  trg_ds_config, trg_split_config,
-                                  model_config, tlda_config)
-from pysegcnn.core.logging import log_conf
-
-
-if __name__ == '__main__':
-
-    # (i) instanciate the source domain configurations
-    src_dc = DatasetConfig(**src_ds_config)   # source domain dataset
-    src_sc = SplitConfig(**src_split_config)  # source domain dataset split
-
-    # (ii) instanciate the target domain configuration
-    trg_dc = DatasetConfig(**trg_ds_config)   # target domain dataset
-    trg_sc = SplitConfig(**trg_split_config)  # target domain dataset split
-
-    # (iii) instanciate the model configuration
-    net_mc = ModelConfig(**model_config)
-
-    # (iv) instanciate the transfer learning configuration
-    trn_sf = TransferLearningConfig(**tlda_config)
-
-    # (v) instanciate the model state file
-    net_sc = StateConfig(src_dc, src_sc, trg_dc, trg_sc, net_mc, trn_sf)
-    state_file = net_sc.init_state()
-
-    # (vi) instanciate logging configuration
-    net_lc = LogConfig(state_file)
-    dictConfig(log_conf(net_lc.log_file))
-
-    # (vii) instanciate the datasets to train the model on
-    src_ds = src_dc.init_dataset()
-    trg_ds = trg_dc.init_dataset()
-
-    # (viii) instanciate the training, validation and test datasets and
-    # dataloaders for the source domain
-    src_tra_ds, src_val_ds, src_tes_ds = src_sc.train_val_test_split(src_ds)
-    src_tra_dl, src_val_dl, src_tes_dl = src_sc.dataloaders(
-        src_tra_ds, src_val_ds, src_tes_ds, batch_size=net_mc.batch_size,
-        shuffle=True, drop_last=False)
-
-    # (ix) instanciate the training, validation and test datasets and
-    # dataloaders dor the target domain
-    trg_tra_ds, trg_val_ds, trg_tes_ds = trg_sc.train_val_test_split(trg_ds)
-    trg_tra_dl, trg_val_dl, trg_tes_dl = trg_sc.dataloaders(
-        trg_tra_ds, trg_val_ds, trg_tes_ds, batch_size=net_mc.batch_size,
-        shuffle=True, drop_last=False)
-
-    # (x) instanciate the model
-    if trn_sf.transfer and (trn_sf.supervised or trn_sf.uda_from_pretrained):
-        # check whether to load a pretrained model for (un)supervised transfer
-        # learning
-        net, optimizer, checkpoint = trn_sf.transfer_model(
-            trn_sf.pretrained_path,
-            nclasses=len(src_ds).labels,
-            optim_kwargs=net_mc.optim_kwargs,
-            freeze=trn_sf.freeze)
-    else:
-        # initialize model from scratch or from an existing model checkpoint
-        net, optimizer, checkpoint = net_mc.init_model(
-            len(src_ds.use_bands), len(src_ds.labels), state_file)
-
-    # (xi) instanciate the network trainer class
-    trainer = DomainAdaptationTrainer(
-        model=net,
-        optimizer=optimizer,
-        state_file=net.state_file,
-        src_train_dl=src_tra_dl,
-        src_valid_dl=src_val_dl,
-        src_test_dl=src_tes_dl,
-        epochs=net_mc.epochs,
-        nthreads=net_mc.nthreads,
-        early_stop=net_mc.early_stop,
-        mode=net_mc.mode,
-        delta=net_mc.delta,
-        patience=net_mc.patience,
-        checkpoint_state=checkpoint,
-        save=net_mc.save,
-        supervised=trn_sf.supervised,
-        trg_train_dl=trg_tra_dl,
-        trg_valid_dl=trg_val_dl,
-        trg_test_dl=trg_tes_dl,
-        uda_loss_function=trn_sf.uda_loss_function,
-        uda_lambda=trn_sf.uda_lambda,
-        uda_pos=trn_sf.uda_pos)
-
-    # (xi) train the model
-    # training_state = trainer.train()