Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
PySegCNN
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
earth_observation_public
PySegCNN
Commits
ad3d88eb
Commit
ad3d88eb
authored
4 years ago
by
Frisinghelli Daniel
Browse files
Options
Downloads
Patches
Plain Diff
Implemented cross-validation subsampling.
parent
b3b65b40
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
pysegcnn/core/split.py
+174
-572
174 additions, 572 deletions
pysegcnn/core/split.py
with
174 additions
and
572 deletions
pysegcnn/core/split.py
+
174
−
572
View file @
ad3d88eb
...
@@ -15,11 +15,11 @@ License
...
@@ -15,11 +15,11 @@ License
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
# builtins
# builtins
import
datetime
import
enum
import
enum
# externals
# externals
import
numpy
as
np
import
numpy
as
np
from
sklearn.model_selection
import
KFold
from
torch.utils.data.dataset
import
Subset
from
torch.utils.data.dataset
import
Subset
# the names of the subsets
# the names of the subsets
...
@@ -45,93 +45,50 @@ def _ds_len(ds, ratio):
...
@@ -45,93 +45,50 @@ def _ds_len(ds, ratio):
return
int
(
np
.
round
(
len
(
ds
)
*
ratio
))
return
int
(
np
.
round
(
len
(
ds
)
*
ratio
))
def
random_tile_split
(
ds
,
tvratio
,
ttratio
=
1
,
seed
=
0
):
def
pairwise_disjoint
(
sets
):
"""
Randomly split the tiles of a dataset.
"""
Check if ``sets`` are pairwise disjoint.
For each scene, the tiles of the scene can be distributed among the
training, validation and test set.
The parameters ``ttratio`` and ``tvratio`` control the size of the
training, validation and test datasets.
Test dataset size : ``(1 - ttratio) * len(ds)``
Sets are pairwise disjoint if the length of their union equals the sum of
Train dataset size : ``ttratio * tvratio * len(ds)``
their lengths.
Validation dataset size: ``ttratio * (1 - tvratio) * len(ds)``
Parameters
Parameters
----------
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
sets : `list` [:py:class:`collections.Sized`]
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
A list of sized objects.
tvratio : `float`
The ratio of training data to validation data, e.g. ``tvratio=0.8``
means 80% training, 20% validation.
ttratio : `float`, optional
The ratio of training and validation data to test data, e.g.
``ttratio=0.6`` means 60% for training and validation, 40% for
testing. The default is `1`.
seed : `int`, optional
The random seed for reproducibility. The default is `0`.
Raises
------
AssertionError
Raised if the splits are not pairwise disjoint.
Returns
Returns
-------
-------
subsets : `dict`
disjoint : `bool`
Subset dictionary with keys:
Whether the sets are pairwise disjoint.
``
'
train
'
``
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes (`dict`).
"""
"""
# set the random seed for reproducibility
union
=
set
().
union
(
*
sets
)
np
.
random
.
seed
(
seed
)
n
=
sum
(
len
(
u
)
for
u
in
sets
)
return
n
==
len
(
union
)
# randomly permute indices to access dataset
indices
=
np
.
random
.
permutation
(
len
(
ds
))
# length of the training and validation dataset
# number of samples: (ttratio * len(ds))
trav_len
=
_ds_len
(
indices
,
ttratio
)
trav_indices
=
indices
[:
trav_len
]
# length of the training dataset
# number of samples: (ttratio * tvratio * len(ds))
train_len
=
_ds_len
(
trav_indices
,
tvratio
)
train_ind
=
trav_indices
[:
train_len
]
# length of the validation dataset
# number of samples: (ttratio * (1- tvratio) * len(ds))
valid_ind
=
trav_indices
[
train_len
:]
# length of the test dataset
# number of samples: ((1 - ttratio) * len(ds))
test_ind
=
indices
[
trav_len
:]
# get the tiles of the scenes of each dataset
subsets
=
{}
for
name
,
dataset
in
enumerate
([
train_ind
,
valid_ind
,
test_ind
]):
# store the indices and corresponding tiles of the current subset to
def
index_dict
(
indices
):
# dictionary
"""
Generate the training, validation and test set index dictionary.
subsets
[
SUBSET_NAMES
[
name
]]
=
{
k
:
ds
.
scenes
[
k
]
for
k
in
dataset
}
# check if the splits are disjoint
Parameters
assert
pairwise_disjoint
([
s
.
keys
()
for
s
in
subsets
.
values
()])
----------
indices : `list` [:py:class:`numpy.ndarray`]
An ordered list composed of three :py:class:`numpy.ndarray` containing
the indices to the training, validation and test set.
return
subsets
Returns
-------
index_dict : `dict`
The index dictionary, where the keys are equal to ``SUBSET_NAMES`` and
the values are :py:class:`numpy.ndarray` containing the indices to the
training, validation and test set.
"""
return
{
k
:
v
for
k
,
v
in
zip
(
SUBSET_NAMES
,
indices
)}
def
random_scene_split
(
ds
,
tvratio
,
ttratio
=
1
,
seed
=
0
):
"""
Semi-randomly split the tiles of a dataset.
For each scene, all the tiles of the scene are included in either the
def
random_split
(
ds
,
tvratio
=
0.8
,
ttratio
=
1
,
seed
=
0
,
shuffle
=
True
):
training, validation
or
test set
, respectively
.
"""
Randomly split an iterable into
training, validation
and
test set.
The parameters ``ttratio`` and ``tvratio`` control the size of the
The parameters ``ttratio`` and ``tvratio`` control the size of the
training, validation and test datasets.
training, validation and test datasets.
...
@@ -142,17 +99,20 @@ def random_scene_split(ds, tvratio, ttratio=1, seed=0):
...
@@ -142,17 +99,20 @@ def random_scene_split(ds, tvratio, ttratio=1, seed=0):
Parameters
Parameters
----------
----------
ds : :py:class:`
pysegcnn.core.dataset.ImageDataset
`
ds : :py:class:`
collections.Sized
`
An
instance of :py:class:`pysegcnn.core.dataset.ImageDataset`
.
An
object with a :py:meth:`__len__` method
.
tvratio : `float`
tvratio : `float`
, optional
The ratio of training data to validation data, e.g. ``tvratio=0.8``
The ratio of training data to validation data, e.g. ``tvratio=0.8``
means 80% training, 20% validation.
means 80% training, 20% validation.
The default is `0.8`.
ttratio : `float`, optional
ttratio : `float`, optional
The ratio of training and validation data to test data, e.g.
The ratio of training and validation data to test data, e.g.
``ttratio=0.6`` means 60% for training and validation, 40% for
``ttratio=0.6`` means 60% for training and validation, 40% for
testing. The default is `1`.
testing. The default is `1`.
seed : `int`, optional
seed : `int`, optional
The random seed for reproducibility. The default is `0`.
The random seed for reproducibility. The default is `0`.
shuffle : `bool`, optional
Whether to shuffle the data before splitting into batches. The default
is `True`.
Raises
Raises
------
------
...
@@ -161,568 +121,210 @@ def random_scene_split(ds, tvratio, ttratio=1, seed=0):
...
@@ -161,568 +121,210 @@ def random_scene_split(ds, tvratio, ttratio=1, seed=0):
Returns
Returns
-------
-------
subsets : `dict`
indices : `list` [`dict`]
Subset dictionary with keys:
List of index dictionaries as composed by
``
'
train
'
``
:py:func:`pysegcnn.core.split.index_dict`.
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes (`dict`).
"""
"""
# set the random seed for reproducibility
# set the random seed for reproducibility
np
.
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
# get the names of the scenes and generate random permutation
# whether to shuffle the data before splitting
scene_ids
=
np
.
random
.
permutation
(
np
.
unique
([
s
[
'
id
'
]
for
s
in
ds
.
scenes
]))
indices
=
np
.
arange
(
len
(
ds
))
if
shuffle
:
# randomly permute indices to access the iterable
indices
=
np
.
random
.
permutation
(
indices
)
# the training and validation scenes
# the training and validation scenes
# number of samples: (ttratio * nscenes)
# number of samples: (ttratio * len(ds))
trav_len
=
_ds_len
(
scene_ids
,
ttratio
)
trav_len
=
_ds_len
(
ds
,
ttratio
)
trav_scenes
=
scene_ids
[:
trav_len
]
trav_ids
=
indices
[:
trav_len
]
# the training scenes
# number of samples: (ttratio * tvratio * nscenes)
train_len
=
_ds_len
(
trav_scenes
,
tvratio
)
train_scenes
=
trav_scenes
[:
train_len
]
# the validation scenes
# number of samples: (ttratio * (1- tvratio) * nscenes)
valid_scenes
=
trav_scenes
[
train_len
:]
# the test scenes
# number of samples:((1 - ttratio) * nscenes)
test_scenes
=
scene_ids
[
trav_len
:]
# get the tiles of the scenes of each dataset
# the training dataset indices
subsets
=
{}
# number of samples: (ttratio * tvratio * len(ds))
for
name
,
dataset
in
enumerate
([
train_scenes
,
valid_scenes
,
test_scenes
]):
train_len
=
_ds_len
(
trav_ids
,
tvratio
)
train_ids
=
trav_ids
[:
train_len
]
# store the indices and corresponding tiles of the current subset to
# the validation dataset indices
# dictionary
# number of samples: (ttratio * (1- tvratio) * len(ds))
subsets
[
SUBSET_NAMES
[
name
]]
=
{
k
:
v
for
k
,
v
in
enumerate
(
ds
.
scenes
)
valid_ids
=
trav_ids
[
train_len
:]
if
v
[
'
id
'
]
in
dataset
}
# check if the splits are disjoint
# the test dataset indices
assert
pairwise_disjoint
([
s
.
keys
()
for
s
in
subsets
.
values
()])
# number of samples:((1 - ttratio) * len(ds))
test_ids
=
trav_ids
[
trav_len
:]
return
subsets
# check whether the different datasets are pairwise disjoint
indices
=
index_dict
([
train_ids
,
valid_ids
,
test_ids
])
assert
pairwise_disjoint
(
indices
.
values
())
return
[
indices
]
def
date_scene_split
(
ds
,
date
,
dateformat
=
'
%Y%m%d
'
):
"""
Split the dataset based on a date.
Scenes before ``date`` build the training dataset, scenes after ``date``
def
kfold_split
(
ds
,
k_folds
=
5
,
seed
=
0
,
shuffle
=
True
):
the validation dataset. The test set is empty
.
"""
Randomly split an iterable into ``k_folds`` folds
.
Useful for time series data.
This function uses the cross validation index generator
:py:class:`sklearn.model_selection.KFold`.
Parameters
Parameters
----------
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
ds : :py:class:`collections.Sized`
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
An object with a :py:meth:`__len__` method.
date : `str`
k_folds: `int`, optional
A date in the format ``dateformat``.
The number of folds. Must be at least 2. The default is `5`.
dateformat : `str`, optional
seed : `int`, optional
The format of ``date``. ``dateformat`` is used by
The random seed for reproducibility. The default is `0`.
:py:func:`datetime.datetime.strptime` to parse ``date`` to a
shuffle : `bool`, optional
:py:class:`datetime.datetime` object. The default is `
'
%Y%m%d
'
`.
Whether to shuffle the data before splitting into batches. The default
is `True`.
Raises
Raises
------
------
AssertionError
AssertionError
Raised if the splits are not pairwise disjoint.
Raised if the (training, validation) folds are not pairwise disjoint.
Returns
-------
subsets : `dict`
Subset dictionary with keys:
``
'
train
'
``
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes (`dict`).
"""
"""
# convert date to datetime object
# set the random seed for reproducibility
date
=
datetime
.
datetime
.
strptime
(
date
,
dateformat
)
np
.
random
.
seed
(
seed
)
# the training, validation and test scenes
train_scenes
=
{
i
:
s
for
i
,
s
in
enumerate
(
ds
.
scenes
)
if
s
[
'
date
'
]
<=
date
}
valid_scenes
=
{
i
:
s
for
i
,
s
in
enumerate
(
ds
.
scenes
)
if
s
[
'
date
'
]
>
date
}
test_scenes
=
{}
# build the training and test datasets
subsets
=
{}
for
name
,
scenes
in
enumerate
([
train_scenes
,
valid_scenes
,
test_scenes
]):
# store the indices and corresponding tiles of the current subset to
# dictionary
subsets
[
SUBSET_NAMES
[
name
]]
=
scenes
# check if the splits are disjoint
assert
pairwise_disjoint
([
s
.
keys
()
for
s
in
subsets
.
values
()])
return
subsets
def
pairwise_disjoint
(
sets
):
"""
Check if ``sets`` are pairwise disjoint.
Sets are pairwise disjoint if the length of their union equals the sum of
their lengths.
Parameters
----------
sets : `list` [:py:class:`collections.Sized`]
A list of sized objects.
Returns
-------
disjoint : `bool`
Whether the sets are pairwise disjoint.
"""
union
=
set
().
union
(
*
sets
)
n
=
sum
(
len
(
u
)
for
u
in
sets
)
return
n
==
len
(
union
)
class
CustomSubset
(
Subset
):
"""
Generic custom subset inheriting :py:class:`torch.utils.data.Subset`.
.. important::
The training,
validation
a
nd
test datasets should be subclasses of
# cross
validation
i
nd
ex generator from scikit-learn
:py:class:`pysegcnn.core.split.CustomSubset`.
kf
=
KFold
(
k_folds
,
random_state
=
seed
,
shuffle
=
shuffle
)
See :py:class:`pysegcnn.core.split.RandomTileSplit` for an example
# generate the indices of the different folds
implementing the :py:class:`pysegcnn.core.split.RandomSubset` subset
folds
=
[]
class.
for
i
,
(
train
,
valid
)
in
enumerate
(
kf
.
split
(
ds
)):
folds
.
append
(
index_dict
([
train
,
valid
,
np
.
array
([])]))
assert
pairwise_disjoint
(
folds
[
i
].
values
())
return
folds
Attributes
----------
dataset : :py:class:`pysegcnn.core.dataset.ImageDataset`
The dataset to split into subsets.
split_mode : `str`
The mode to split the dataset.
indices : `list` [`int`]
List of indices to access the dataset.
name : `str`
Name of the subset.
scenes : `list` [`dict`]
List of the subset tiles.
ids : `list` or :py:class:`numpy.ndarray`
Container of the scene identifiers.
"""
class
RandomSplit
(
object
):
"""
Base class for random splits of a `torch.utils.data.Dataset`.
"""
def
__init__
(
self
,
ds
,
split_mode
,
indices
,
name
,
scenes
,
scene_ids
):
def
__init__
(
self
,
ds
,
k_folds
,
seed
=
0
,
shuffle
=
True
,
tvratio
=
0.8
,
"""
Initialize.
ttratio
=
1
):
"""
Randomly split a dataset into training, validation and test set.
Parameters
Parameters
----------
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
ds : :py:class:`collections.Sized`
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
An object with a :py:meth:`__len__` method.
split_mode : `str`
k_folds: `int`
The mode to split the dataset.
The number of folds.
indices : `list` [`int`]
seed : `int`, optional
List of indices to access ``ds``. ``indices`` must be pairwise
The random seed for reproducibility. The default is `0`.
disjoint for each subset derived from the same dataset ``ds``.
shuffle : `bool`, optional
name : `str`
Whether to shuffle the data before splitting into batches. The
Name of the subset.
default is `True`.
scenes : `list` [`dict`]
tvratio : `float`, optional
List of the subset tiles.
The ratio of training data to validation data, e.g. ``tvratio=0.8``
scene_ids : `list` or :py:class:`numpy.ndarray`
means 80% training, 20% validation. The default is `0.8`. Used if
Container of the scene identifiers.
``k_folds=1``.
ttratio : `float`, optional
"""
The ratio of training and validation data to test data, e.g.
super
().
__init__
(
dataset
=
ds
,
indices
=
indices
)
``ttratio=0.6`` means 60% for training and validation, 40% for
testing. The default is `1`. Used if ``k_folds=1``.
# the mode to split the dataset
self
.
split_mode
=
split_mode
# the name of the subset
self
.
name
=
name
# the scene in the subset
self
.
scenes
=
scenes
# the names of the scenes
self
.
ids
=
scene_ids
def
__repr__
(
self
):
"""
Representation string.
Returns
-------
fs : `str`
The representation string.
"""
"""
fs
=
'
- {}: {:d} tiles ({:.2f}%), mode = {}
'
.
format
(
self
.
name
,
len
(
self
.
scenes
),
100
*
len
(
self
.
scenes
)
/
len
(
self
.
dataset
),
self
.
split_mode
)
return
fs
class
SceneSubset
(
CustomSubset
):
"""
A custom subset for dataset splits where the scenes are preserved.
"""
def
__init__
(
self
,
ds
,
split_mode
,
indices
,
name
,
scenes
,
scene_ids
):
super
().
__init__
(
ds
,
split_mode
,
indices
,
name
,
scenes
,
scene_ids
)
class
RandomSubset
(
CustomSubset
):
"""
A custom subset for random dataset splits.
"""
def
__init__
(
self
,
ds
,
split_mode
,
indices
,
name
,
scenes
,
scene_ids
):
super
().
__init__
(
ds
,
split_mode
,
indices
,
name
,
scenes
,
scene_ids
)
class
Split
(
object
):
"""
Generic class handling how ``ds`` is split.
Each dataset should be split by a subclass of
:py:class:`pysegcnn.core.split.Split`, by calling the
:py:meth:`pysegcnn.core.split.Split.split` method.
.. important::
The :py:meth:`~pysegcnn.core.split.Split.subsets` and
# instance attributes
:py:meth:`~pysegcnn.core.split.Split.subset_type` methods have to be
implemented when inheriting :py:class:`pysegcnn.core.split.Split`.
Furthermore, a class attribute ``split_mode`` (`str`) has to be
defined and added to :py:class:`pysegcnn.core.split.SupportedSplits`.
See :py:class:`pysegcnn.core.split.RandomTileSplit` for an example.
Attributes
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
The dataset to split into training, validation and test set.
"""
def
__init__
(
self
,
ds
):
"""
Initialize.
Parameters
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
"""
# the dataset to split
self
.
ds
=
ds
self
.
ds
=
ds
self
.
k_folds
=
k_folds
self
.
seed
=
seed
self
.
shuffle
=
shuffle
def
split
(
self
):
# instance attributes: training/validation/test split ratios
"""
Split dataset into training, validation and test set.
# used if k_folds <= 1 (single-fold split)
self
.
tvratio
=
tvratio
:py:meth:`~pysegcnn.core.split.Split.split` works only if
self
.
ttratio
=
ttratio
:py:meth:`~pysegcnn.core.split.Split.subsets` and
:py:meth:`~pysegcnn.core.split.Split.subset_type` are implemented.
"""
# build the subsets
ds_split
=
[]
for
name
,
sub
in
self
.
subsets
().
items
():
# the scene identifiers of the current subset: preserve the order
# of the scene identifiers
ids
,
idx
=
np
.
unique
([
s
[
'
id
'
]
for
s
in
sub
.
values
()],
return_index
=
True
)
ids
=
ids
[
np
.
argsort
(
idx
)]
# build the subset
sbst
=
self
.
subset_type
()(
self
.
ds
,
self
.
split_mode
,
list
(
sub
.
keys
()),
name
,
list
(
sub
.
values
()),
ids
)
ds_split
.
append
(
sbst
)
return
ds_split
def
subsets
(
self
):
"""
Define training, validation and test sets.
Wrapper method for
def
generate_splits
(
self
):
:py:func:`pysegcnn.core.split.Split.random_tile_split`,
:py:func:`pysegcnn.core.split.Split.random_scene_split` or
:py:func:`pysegcnn.core.split.Split.date_scene_split`.
Raises
# check whether to generate a single or multiple folds
------
if
self
.
k_folds
>
1
:
NotImplementedError
# k-fold split
Raised if :py:class:`pysegcnn.core.split.Split` is not inherited.
indices
=
kfold_split
(
self
.
indices_to_split
,
self
.
k_folds
,
self
.
seed
,
self
.
shuffle
)
else
:
# single-fold split
indices
=
random_split
(
self
.
indices_to_split
,
self
.
tvratio
,
self
.
ttratio
,
self
.
seed
,
self
.
shuffle
)
Returns
return
indices
-------
None.
"""
@property
def
indices_to_split
(
self
):
raise
NotImplementedError
raise
NotImplementedError
def
subset_type
(
self
):
@property
"""
Define the type of each subset.
def
indices
(
self
):
Wrapper method for :py:class:`pysegcnn.core.split.RandomSubset` or
:py:class:`pysegcnn.core.split.SceneSubset`.
Raises
------
NotImplementedError
Raised if :py:class:`pysegcnn.core.split.Split` is not inherited.
Returns
-------
None.
"""
raise
NotImplementedError
raise
NotImplementedError
def
split
(
self
):
class
DateSplit
(
Split
):
# initialize training, validation and test subsets
"""
Split a dataset based on a date.
subsets
=
[]
.. important::
Scenes before ``date`` build the training dataset, scenes after
``date`` the validation dataset. The test set is empty.
Useful for time series data.
Class wrapper for :py:func:`pysegcnn.core.split.date_scene_split`.
Attributes
----------
split_mode : `str`
The mode to split the dataset, i.e. `
'
date
'
`.
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
The dataset to split into training, validation and test set.
date : `str`
The date used to split the dataset.
dateformat : `str`
The format of ``date``.
"""
# the split mode
split_mode
=
'
date
'
def
__init__
(
self
,
ds
,
date
,
dateformat
):
"""
Initialize.
Parameters
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
date : `str`
A date in the format ``dateformat``.
dateformat : `str`
The format of ``date``. ``dateformat`` is used by
:py:func:`datetime.datetime.strptime` to parse ``date`` to a
:py:class:`datetime.datetime` object.
"""
super
().
__init__
(
ds
)
# the date to split the dataset
# before: training set
# after : validation set
self
.
date
=
date
# the format of the date
self
.
dateformat
=
dateformat
def
subsets
(
self
):
"""
Wrap :py:func:`pysegcnn.core.split.Split.date_scene_split`.
Returns
-------
subsets : `dict`
Subset dictionary with keys:
``
'
train
'
``
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes, empty (`dict`).
"""
return
date_scene_split
(
self
.
ds
,
self
.
date
,
self
.
dateformat
)
def
subset_type
(
self
):
"""
Wrap :py:class:`pysegcnn.core.split.SceneSubset`.
Returns
-------
SceneSubset : :py:class:`pysegcnn.core.split.SceneSubset`
The subset type.
"""
return
SceneSubset
class
RandomSplit
(
Split
):
"""
Generic class for random dataset splits.
"""
def
__init__
(
self
,
ds
,
ttratio
,
tvratio
,
seed
):
"""
Initialize.
Parameters
----------
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
An instance of :py:class:`pysegcnn.core.dataset.ImageDataset`.
tvratio : `float`
The ratio of training data to validation data, e.g.
``tvratio=0.8`` means 80% training, 20% validation.
ttratio : `float`
The ratio of training and validation data to test data, e.g.
``ttratio=0.6`` means 60% for training and validation, 40% for
testing.
seed : `int`
The random seed used to generate the split. Useful for
reproducibility.
"""
super
().
__init__
(
ds
)
# the training, validation and test set ratios
# the training, validation and test indices
self
.
ttratio
=
ttratio
for
folds
in
self
.
indices
:
self
.
tvratio
=
tvratio
subsets
.
append
(
index_dict
([
Subset
(
self
.
ds
,
ids
)
for
ids
in
folds
.
values
()]))
# the random seed: useful for reproducibility
return
subsets
self
.
seed
=
seed
class
RandomTileSplit
(
RandomSplit
):
class
RandomTileSplit
(
RandomSplit
):
"""
Randomly split the dataset.
"""
Split a :py:class:`pysegcnn.core.dataset.ImageDataset` into tiles.
"""
.. important::
def
__init__
(
self
,
ds
,
k_folds
,
seed
=
0
,
shuffle
=
True
,
tvratio
=
0.8
,
ttratio
=
1
):
# initialize super class
super
().
__init__
(
ds
,
k_folds
,
seed
,
shuffle
,
tvratio
,
ttratio
)
For each scene, the tiles of the scene can be distributed among the
@property
training, validation and test set.
def
indices_to_split
(
self
):
return
np
.
arange
(
len
(
self
.
ds
))
Class wrapper for :py:func:`pysegcnn.core.split.random_tile_split`.
@property
def
indices
(
self
):
Attributes
return
self
.
generate_splits
()
----------
split_mode : `str`
The mode to split the dataset, i.e. `
'
random
'
`.
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
The dataset to split into training, validation and test set.
tvratio : `float`
The ratio of training data to validation data.
ttratio : `float`
The ratio of training and validation data to test data.
seed : `int`
The random seed used to generate the split.
"""
# the split mode
split_mode
=
'
random
'
def
__init__
(
self
,
ds
,
ttratio
,
tvratio
,
seed
):
super
().
__init__
(
ds
,
ttratio
,
tvratio
,
seed
)
def
subsets
(
self
):
"""
Wrap :py:func:`pysegcnn.core.split.Split.random_tile_split`.
Returns
-------
subsets : `dict`
Subset dictionary with keys:
``
'
train
'
``
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes (`dict`).
"""
return
random_tile_split
(
self
.
ds
,
self
.
tvratio
,
self
.
ttratio
,
self
.
seed
)
def
subset_type
(
self
):
"""
Wrap :py:class:`pysegcnn.core.split.RandomSubset`.
Returns
-------
SceneSubset : :py:class:`pysegcnn.core.split.RandomSubset`
The subset type.
"""
return
RandomSubset
class
RandomSceneSplit
(
RandomSplit
):
class
RandomSceneSplit
(
RandomSplit
):
"""
Semi-randomly split the dataset.
"""
Split a :py:class:`pysegcnn.core.dataset.ImageDataset` into scenes.
"""
.. important::
For each scene, all the tiles of the scene are included in either the
training, validation or test set, respectively.
Class wrapper for :py:func:`pysegcnn.core.split.random_scene_split`.
Attributes
----------
split_mode : `str`
The mode to split the dataset, i.e. `
'
scene
'
`.
ds : :py:class:`pysegcnn.core.dataset.ImageDataset`
The dataset to split into training, validation and test set.
tvratio : `float`
The ratio of training data to validation data.
ttratio : `float`
The ratio of training and validation data to test data.
seed : `int`
The random seed used to generate the split.
"""
def
__init__
(
self
,
ds
,
k_folds
,
seed
=
0
,
shuffle
=
True
,
tvratio
=
0.8
,
ttratio
=
1
):
# the split mode
# initialize super class
split_mode
=
'
scene
'
super
().
__init__
(
ds
,
k_folds
,
seed
,
shuffle
,
tvratio
,
ttratio
)
def
__init__
(
self
,
ds
,
ttratio
,
tvratio
,
seed
):
super
().
__init__
(
ds
,
ttratio
,
tvratio
,
seed
)
def
subsets
(
self
):
# the scene identifier of each entry in the dataset
"""
Wrap :py:func:`pysegcnn.core.split.Split.random_scene_split`.
self
.
scenes
=
np
.
array
([
v
[
'
scene
'
]
for
v
in
self
.
ds
.
scenes
])
Returns
@property
-------
def
indices_to_split
(
self
):
subsets : `dict`
return
np
.
unique
(
self
.
scenes
)
Subset dictionary with keys:
``
'
train
'
``
The training scenes (`dict`).
``
'
valid
'
``
The validation scenes (`dict`).
``
'
test
'
``
The test scenes (`dict`).
"""
@property
return
random_scene_split
(
self
.
ds
,
self
.
tvratio
,
self
.
ttratio
,
def
indices
(
self
):
self
.
seed
)
# indices of the different scene identifiers
indices
=
self
.
generate_splits
()
def
subset_type
(
self
):
"""
Wrap :py:class:`pysegcnn.core.split.SceneSubset`.
Returns
# iterate over the different folds
-------
scene_indices
=
[]
SceneSubset : :py:class:`pysegcnn.core.split.SceneSubset`
for
folds
in
indices
:
The subset type.
# iterate over the training, validation and test set
subset
=
{}
for
name
,
ids
in
folds
.
items
():
subset
[
name
]
=
np
.
where
(
np
.
isin
(
self
.
scenes
,
ids
))[
0
]
scene_indices
.
append
(
subset
)
"""
return
scene_indices
return
SceneSubset
class
SupportedSplits
(
enum
.
Enum
):
class
SupportedSplits
(
enum
.
Enum
):
"""
Names and corresponding classes of the implemented split modes.
"""
"""
Names and corresponding classes of the implemented split modes.
"""
random
=
RandomTileSplit
tile
=
RandomTileSplit
scene
=
RandomSceneSplit
scene
=
RandomSceneSplit
date
=
DateSplit
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment