Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Hamming Distance KNN similarity metric for long property #193

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

/**
 * Similarity metric for {@code long} values derived from the Hamming distance
 * (https://en.wikipedia.org/wiki/Hamming_distance) between their bit patterns.
 * The distance is mapped linearly onto the [0.0, 1.0] range and inverted so that
 * a larger result means "more similar".
 */
public final class HammingDistance {
    private HammingDistance() {}

    /**
     * Scores two {@code long} values by how many of their bits agree.
     *
     * @param left  first value
     * @param right second value
     * @return 1.0 when both values are bit-identical, 0.0 when every bit differs
     */
    public static double longMetric(long left, long right) {
        // A set bit in the XOR marks a position where the two operands disagree.
        long disagreeingBits = left ^ right;
        return toSimilarity(Long.bitCount(disagreeingBits));
    }

    /**
     * Unity-based normalization, y = (x - min) / (max - min), see
     * https://stats.stackexchange.com/a/70807 for example.
     * Here min = 0 (a bit count cannot be negative) and max = {@link Long#SIZE},
     * the number of bits in a Java long.
     *
     * The normalized distance is subtracted from 1.0 so that 1.0 means
     * most similar and 0.0 means least similar.
     */
    private static double toSimilarity(long distance) {
        double normalizedDistance = distance / (double) Long.SIZE;
        return 1.0 - normalizedDistance;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,20 @@

final class LongPropertySimilarityComputer implements SimilarityComputer {
private final NodePropertyValues nodePropertyValues;
private final LongPropertySimilarityMetric metric;

LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues) {
LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues, LongPropertySimilarityMetric metric) {
if (nodePropertyValues.valueType() != ValueType.LONG) {
throw new IllegalArgumentException("The property is not of type LONG");
}
this.nodePropertyValues = nodePropertyValues;
this.metric = metric;
}

@Override
public double similarity(long firstNodeId, long secondNodeId) {
var left = nodePropertyValues.longValue(firstNodeId);
var right = nodePropertyValues.longValue(secondNodeId);
var abs = Math.abs(left - right);
if (abs == Long.MIN_VALUE) {
abs = Long.MAX_VALUE;
}
return 1.0 / (1.0 + abs);
return metric.compute(left, right);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;
/**
 * Strategy for scoring the similarity of two {@code long} property values.
 * It has a single abstract method so that implementations can be supplied as
 * method references or lambdas.
 */
@FunctionalInterface
interface LongPropertySimilarityMetric {
    /**
     * Computes the similarity score for a pair of property values.
     *
     * @param left  property value of the first node
     * @param right property value of the second node
     * @return the similarity score for the pair
     */
    double compute(long left, long right);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

/**
 * Similarity metric for {@code long} values defined as
 * {@code 1 / (1 + |left - right|)}: identical values score 1.0 and the score
 * shrinks towards 0.0 as the values move apart.
 */
public final class NormalizedAbsoluteDifference {
    private NormalizedAbsoluteDifference() {}

    /**
     * Scores two {@code long} values by their absolute difference.
     *
     * The difference is computed without signed overflow, so pairs that are more
     * than {@code Long.MAX_VALUE} apart (for example {@code Long.MAX_VALUE} and
     * {@code Long.MIN_VALUE}) yield a score close to 0.0 instead of a wrapped,
     * misleadingly high one.
     *
     * @param left  first value
     * @param right second value
     * @return a score in (0.0, 1.0], with 1.0 for equal inputs
     */
    public static double longMetric(long left, long right) {
        // larger - smaller is exact modulo 2^64, so the raw bits hold the true
        // magnitude when interpreted as an unsigned 64-bit number.
        long magnitude = Math.max(left, right) - Math.min(left, right);

        // A negative sign bit means the magnitude is >= 2^63. Halve it with an
        // unsigned shift before converting and scale back up; the dropped low bit
        // is far below double precision at this size.
        double distance = magnitude >= 0
            ? (double) magnitude
            : (double) (magnitude >>> 1) * 2.0;

        return 1.0 / (1.0 + distance);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ static SimilarityComputer ofProperty(
) {
switch (properties.valueType()) {
case LONG:
return ofLongProperty(properties);
return ofLongProperty(
name,
properties,
defaultSimilarityMetric
);
case DOUBLE:
return ofDoubleProperty(properties);
case DOUBLE_ARRAY:
Expand Down Expand Up @@ -107,8 +111,15 @@ static SimilarityComputer ofDoubleProperty(NodePropertyValues nodePropertyValues
return new DoublePropertySimilarityComputer(nodePropertyValues);
}

static SimilarityComputer ofLongProperty(NodePropertyValues nodePropertyValues) {
return new LongPropertySimilarityComputer(nodePropertyValues);
/**
 * Builds the similarity computer for a long-valued node property.
 *
 * @param name       property name, passed on when reporting an unsupported metric
 * @param properties the node property values to compare
 * @param metric     the requested metric; HAMMING_DISTANCE and
 *                   NORMALIZED_ABSOLUTE_DIFFERENCE are accepted
 * @return a computer that applies the selected metric to pairs of property values
 */
static SimilarityComputer ofLongProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {
    // Pick the scoring function first; any other metric is rejected before a computer is built.
    final LongPropertySimilarityMetric scoringFunction;
    switch (metric) {
        case HAMMING_DISTANCE:
            scoringFunction = HammingDistance::longMetric;
            break;
        case NORMALIZED_ABSOLUTE_DIFFERENCE:
            scoringFunction = NormalizedAbsoluteDifference::longMetric;
            break;
        default:
            throw unsupportedSimilarityMetric(name, properties.valueType(), metric);
    }
    return new LongPropertySimilarityComputer(properties, scoringFunction);
}

static SimilarityComputer ofFloatArrayProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import static org.neo4j.gds.utils.StringFormatting.toUpperCaseWithLocale;

public enum SimilarityMetric {
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON, LONG_PROPERTY_METRIC, DOUBLE_PROPERTY_METRIC, DEFAULT;
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON,
NORMALIZED_ABSOLUTE_DIFFERENCE, DOUBLE_PROPERTY_METRIC,
HAMMING_DISTANCE, DEFAULT;

public static SimilarityMetric parse(String value) {
return SimilarityMetric.valueOf(toUpperCaseWithLocale(value));
Expand All @@ -34,7 +36,7 @@ public static SimilarityMetric parse(String value) {
public static SimilarityMetric defaultMetricForType(ValueType valueType) {
switch (valueType) {
case LONG:
return LONG_PROPERTY_METRIC;
return NORMALIZED_ABSOLUTE_DIFFERENCE;
case DOUBLE:
return DOUBLE_PROPERTY_METRIC;
case DOUBLE_ARRAY:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public long size() {
idMap,
"myProperty",
nodeProperties,
SimilarityMetric.LONG_PROPERTY_METRIC
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
);

var random = new SplittableRandom();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public long size() {
idMap,
"myProperty",
nodeProperties,
SimilarityMetric.LONG_PROPERTY_METRIC
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
);

var random = new SplittableRandom();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Unit tests for {@link HammingDistance#longMetric(long, long)}, covering both
 * ends of the similarity range as well as an intermediate value.
 */
class HammingDistanceTest {
    @Test
    void shouldReturnFullCorrelationWhenArgsAreIdentical() {
        double similarity = HammingDistance.longMetric(12345L, 12345L);

        assertEquals(1.0, similarity);
    }

    @Test
    void shouldReturnCorrectCorrelation() {
        // 12345 ^ 54321 has 5 bits set, so the expected value is 1 - 5/64.
        double similarity = HammingDistance.longMetric(12345L, 54321L);

        assertEquals(0.921875, similarity);
    }

    @Test
    void shouldReturnZeroWhenAllBitsDiffer() {
        // 0L is all zeros and -1L is all ones: every one of the 64 bits differs.
        double similarity = HammingDistance.longMetric(0L, -1L);

        assertEquals(0.0, similarity);
    }

    @Test
    void shouldBeSymmetric() {
        double forward = HammingDistance.longMetric(12345L, 54321L);
        double backward = HammingDistance.longMetric(54321L, 12345L);

        assertEquals(forward, backward);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.gds.similarity.knn.metrics;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Unit tests for {@link NormalizedAbsoluteDifference#longMetric(long, long)}.
 */
class NormalizedAbsoluteDifferenceTest {
    @Test
    void shouldComputeNormalizedAbsoluteDifference() {
        double similarity = NormalizedAbsoluteDifference.longMetric(1L, 2L);

        assertEquals(0.5, similarity);
    }

    @Test
    void shouldReturnOneForIdenticalValues() {
        double similarity = NormalizedAbsoluteDifference.longMetric(42L, 42L);

        assertEquals(1.0, similarity);
    }

    @Test
    void shouldBeSymmetric() {
        double forward = NormalizedAbsoluteDifference.longMetric(-3L, 7L);
        double backward = NormalizedAbsoluteDifference.longMetric(7L, -3L);

        assertEquals(forward, backward);
    }

    @Test
    void shouldHandleDifferenceOfLongMinValue() {
        // |Long.MIN_VALUE - 0| is not representable as a positive long; the metric
        // must still produce the score for a distance of (approximately) 2^63.
        double similarity = NormalizedAbsoluteDifference.longMetric(Long.MIN_VALUE, 0L);

        assertEquals(1.0 / (1.0 + Long.MAX_VALUE), similarity);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,22 @@ void doublePropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentV
}

@Property
void longPropertySimilarityReturns1ForEqualValues(@ForAll @Positive long id) {
void longPropertySimilarityReturns1ForEqualValues(
@ForAll @Positive long id,
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
) {
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
var sim = SimilarityComputer.ofLongProperty(props);
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
assertThat(sim.similarity(id, id)).isEqualTo(1.0);
}

@Property
void longPropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentValues") LongLongPair ids) {
void longPropertySimilarityReturnsValuesBetween0And1(
@ForAll @From("differentValues") LongLongPair ids,
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
) {
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
var sim = SimilarityComputer.ofLongProperty(props);
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
assertThat(sim.similarity(ids.getOne(), ids.getTwo())).isStrictlyBetween(0.0, 1.0);
}

Expand Down Expand Up @@ -305,6 +311,11 @@ final Arbitrary<LongLongPair> differentValues() {
.map(n2 -> PrimitiveTuples.pair((long) n1, (long) n2)));
}

// Supplies the similarity metrics exercised by the long-property tests.
@Provide("longMetrics")
final Arbitrary<SimilarityMetric> longMetrics() {
    return Arbitraries.of(
        SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE,
        SimilarityMetric.HAMMING_DISTANCE
    );
}

@Provide("longArrayMetrics")
final Arbitrary<SimilarityMetric> longArrayMetrics() {
return Arbitraries.of(SimilarityMetric.JACCARD, SimilarityMetric.OVERLAP);
Expand Down