Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
clustering
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
URS
JustNature
clustering
Commits
950524b1
Commit
950524b1
authored
2 years ago
by
Pietro Zambelli
Browse files
Options
Downloads
Patches
Plain Diff
Add Hopkins statistics function
parent
7a3ff421
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
hopkins.py
+89
-0
89 additions, 0 deletions
hopkins.py
justclust/paths/bolzano.py
+0
-0
0 additions, 0 deletions
justclust/paths/bolzano.py
with
89 additions
and
0 deletions
hopkins.py
0 → 100644
+
89
−
0
View file @
950524b1
from
typing
import
Union
import
numpy
as
np
import
pandas
as
pd
from
sklearn.neighbors
import
BallTree
def hopkins(data_frame: Union[np.ndarray, pd.DataFrame], sampling_size: int) -> float:
    """Compute the Hopkins statistic to assess the clusterability of a dataset.

    ``sampling_size`` observations are drawn from the data and the distance
    from each of them to its nearest neighbour in the data is summed (``x``).
    The same number of points is then drawn uniformly inside the per-column
    min/max range of the data and the distance from each of them to the
    nearest observation is summed (``y``). The statistic is ``x / (x + y)``.

    The score lies between 0 and 1: a score around 0.5 expresses no
    clusterability and a score tending to 0 expresses a high cluster tendency.

    Parameters
    ----------
    data_frame : numpy.ndarray or pandas.DataFrame
        Observations, one per row. An ndarray is wrapped in a DataFrame.
    sampling_size : int
        Number of observations (and of uniform points) to draw.

    Returns
    -------
    float
        The Hopkins statistic ``x / (x + y)``.

    Raises
    ------
    Exception
        If both distance sums are zero, so the statistic is undefined.
        The sampling helper also raises when ``sampling_size`` exceeds the
        number of rows.

    Examples
    --------
    The result depends on random draws, so the value varies between runs.

    >>> from sklearn import datasets
    >>> X = datasets.load_iris().data
    >>> hopkins(X, 150)  # doctest: +SKIP
    0.16
    """
    # isinstance (instead of an exact type comparison) also converts ndarray
    # subclasses, which would otherwise reach the helpers unconverted.
    if isinstance(data_frame, np.ndarray):
        data_frame = pd.DataFrame(data_frame)

    # Keep this order: the real sample is drawn before the uniform points so a
    # seeded random state produces the same sequence of draws as before.
    data_frame_sample = sample_observation_from_dataset(data_frame, sampling_size)
    sample_distances = get_distance_sample_to_nearest_neighbours(
        data_frame, data_frame_sample
    )

    uniform_points = simulate_df_with_same_variation(data_frame, sampling_size)
    uniform_distances = get_nearest_sample(data_frame, uniform_points)

    # np.sum collapses the distances to a scalar whatever their shape, so the
    # result no longer relies on indexing a length-1 array.
    x = float(np.sum(sample_distances))
    y = float(np.sum(uniform_distances))

    denominator = x + y
    if denominator == 0:
        raise Exception("The denominator of the hopkins statistics is null")

    return x / denominator
def get_nearest_sample(df: pd.DataFrame, uniformly_selected_observations: pd.DataFrame):
    """Return the distance from each uniformly drawn point to its nearest row of ``df``.

    A ``BallTree`` is built on ``df`` and queried with
    ``uniformly_selected_observations`` for a single neighbour (``k=1``).
    The distance array returned by the query is handed back unchanged.
    """
    neighbour_index = BallTree(df, leaf_size=2)
    # query() yields a (distances, indices) pair; only the distances are used.
    nearest_distances = neighbour_index.query(uniformly_selected_observations, k=1)[0]
    return nearest_distances
def simulate_df_with_same_variation(df: pd.DataFrame, sampling_size: int) -> pd.DataFrame:
    """Draw points uniformly inside the per-column value range of ``df``.

    For every column of ``df``, ``sampling_size`` values are drawn with
    ``np.random.uniform`` between that column's minimum and maximum. The
    draws are assembled column-wise into a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference data whose per-column min/max bound the draws. Any number of
        columns (at least one) and any column labels are accepted.
    sampling_size : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        ``sampling_size`` rows and one column per column of ``df``, with
        default integer column labels.

    Raises
    ------
    ValueError
        If ``df`` has no columns (nothing to stack).
    """
    # Work on positional arrays: looking bounds up by the labels 0 and 1 broke
    # on single-column frames and on frames with non-default column labels.
    lower_bounds = df.min().to_numpy()
    upper_bounds = df.max().to_numpy()

    # One draw per column, in column order, so a seeded global random state
    # yields the same values as drawing the columns one after another.
    simulated_columns = [
        np.random.uniform(low, high, sampling_size)
        for low, high in zip(lower_bounds, upper_bounds)
    ]

    # A single stack replaces re-stacking the growing array once per column.
    return pd.DataFrame(np.column_stack(simulated_columns))
def get_distance_sample_to_nearest_neighbours(df: pd.DataFrame, data_frame_sample):
    """Return the second-nearest-neighbour distance in ``df`` for each sampled row.

    A ``BallTree`` built on ``df`` is queried with ``data_frame_sample`` for
    two neighbours (``k=2``) and only the second distance column is returned.
    """
    distances = BallTree(df, leaf_size=2).query(data_frame_sample, k=2)[0]
    # NOTE(review): the sample is presumably drawn from ``df`` itself, so the
    # first column would be each row's zero distance to itself; confirm
    # against the caller.
    return distances[:, 1]
def sample_observation_from_dataset(df, sampling_size: int):
    """Draw ``sampling_size`` rows from ``df`` using ``df.sample``.

    Raises
    ------
    Exception
        If ``sampling_size`` is larger than the number of rows of ``df``.
    """
    available_rows = df.shape[0]
    if available_rows < sampling_size:
        raise Exception("The number of sample of sample is bigger than the shape of D")

    return df.sample(n=sampling_size)
This diff is collapsed.
Click to expand it.
paths
.py
→
justclust/paths/bolzano
.py
+
0
−
0
View file @
950524b1
File moved
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment