Skip to content

Commit

Permalink
Update link references of ownership from nexB to aboutcode-org
Browse files Browse the repository at this point in the history
Signed-off-by: Chin Yeung Li <[email protected]>
  • Loading branch information
chinyeungli committed Aug 7, 2024
1 parent 9af4202 commit 5ccab02
Show file tree
Hide file tree
Showing 12 changed files with 67 additions and 40 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changelog
=========

v5.1.1
------

*2024-08-07* -- Update link references of ownership from nexB to aboutcode-org.


v5.1.0
------

Expand Down
2 changes: 1 addition & 1 deletion NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Copyright (c) nexB Inc. and others.
# SPDX-License-Identifier: Apache-2.0
#
# Visit https://aboutcode.org and https://github.com/nexB/matchcode-toolkit for support and download.
# Visit https://aboutcode.org and https://github.com/aboutcode-org/matchcode-toolkit for support and download.
# ScanCode is a trademark of nexB Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ MatchCode toolkit
=================
MatchCode toolkit is a Python library that provides the file and directory
fingerprinting functionality for `ScanCode toolkit
<https://github.com/nexB/scancode-toolkit>`_ and `ScanCode.io
<https://github.com/nexB/scancode.io>`_ by implementing the HaloHash algorithm
<https://github.com/aboutcode-org/scancode-toolkit>`_ and `ScanCode.io
<https://github.com/aboutcode-org/scancode.io>`_ by implementing the HaloHash algorithm
and using it in ScanCode toolkit and ScanCode.io plugins and pipelines.


Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ license = Apache-2.0
description = matchcode-toolkit
long_description = file:README.rst
long_description_content_type = text/x-rst
url = https://github.com/nexB/matchcode-toolkit
url = https://github.com/aboutcode-org/matchcode-toolkit

author = nexB. Inc. and others
author_email = [email protected]
Expand Down
2 changes: 1 addition & 1 deletion src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand Down
2 changes: 1 addition & 1 deletion src/matchcode_toolkit/halohash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand Down
4 changes: 2 additions & 2 deletions src/matchcode_toolkit/pipelines/fingerprint_codebase.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
Expand All @@ -18,7 +18,7 @@
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import matchcode
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
Expand All @@ -18,7 +18,7 @@
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines.scan_single_package import ScanSinglePackage
from scanpipe.pipes import matchcode
Expand Down
4 changes: 2 additions & 2 deletions src/matchcode_toolkit/plugin_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://github.com/aboutcode-org/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand Down Expand Up @@ -40,7 +40,7 @@ class FingerprintScanner(ScanPlugin):

def is_enabled(self, fingerprint, **kwargs):
return fingerprint

def get_scanner(self, **kwargs):
return get_file_fingerprint_hashes

Expand Down
32 changes: 21 additions & 11 deletions tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand Down Expand Up @@ -31,7 +31,8 @@ def __init__(self, path='', size=0, sha1=''):


class TestFingerprintingFunctions(FileBasedTesting):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles/fingerprinting')
test_data_dir = os.path.join(os.path.dirname(
__file__), 'testfiles/fingerprinting')

def test__create_directory_fingerprint(self):
test_input = [
Expand All @@ -48,10 +49,12 @@ def test__create_directory_fingerprint(self):

def test_split_fingerprint(self):
directory_fingerprint = '0000000410d24471969646cb5402032288493126'
indexed_elements_count, bah128 = split_fingerprint(directory_fingerprint)
indexed_elements_count, bah128 = split_fingerprint(
directory_fingerprint)

expected_indexed_elements_count = 4
self.assertEqual(expected_indexed_elements_count, indexed_elements_count)
self.assertEqual(expected_indexed_elements_count,
indexed_elements_count)

expected_bah128 = '10d24471969646cb5402032288493126'
self.assertEqual(expected_bah128, bah128)
Expand Down Expand Up @@ -81,7 +84,8 @@ def test_create_structure_fingerprint(self):
Resource(path='package/index.js', size=608),
Resource(path='package/package.json', size=677),
]
fingerprint = create_structure_fingerprint(test_top_resource, test_child_resources)
fingerprint = create_structure_fingerprint(
test_top_resource, test_child_resources)
expected_fingerprint = '00000003ce72f4308a1bc1afb0fb47ed590b5c53'
self.assertEqual(expected_fingerprint, fingerprint)

Expand Down Expand Up @@ -134,8 +138,10 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):
result2 = get_file_fingerprint_hashes(test_file2)
result1 = result1.get('halo1')
result2 = result2.get('halo1')
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(
result1)
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(
result2)

expected_result1_indexed_elements_count = 6395
expected_result2_indexed_elements_count = 6388
Expand All @@ -147,7 +153,8 @@ def test_get_file_fingerprint_hashes_one_line_removed(self):
assert result1_fingerprint == expected_result1_fingerprint
assert result2_fingerprint == expected_result2_fingerprint

assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 2
assert byte_hamming_distance(
result1_fingerprint, result2_fingerprint) == 2

def test_get_file_fingerprint_hashes_one_line_added(self):
test_file1 = self.get_test_loc('inflate.c')
Expand All @@ -156,8 +163,10 @@ def test_get_file_fingerprint_hashes_one_line_added(self):
result2 = get_file_fingerprint_hashes(test_file2)
result1 = result1.get('halo1')
result2 = result2.get('halo1')
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(result1)
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(result2)
result1_indexed_elements_count, result1_fingerprint = split_fingerprint(
result1)
result2_indexed_elements_count, result2_fingerprint = split_fingerprint(
result2)

expected_result1_indexed_elements_count = 6395
expected_result2_indexed_elements_count = 6398
Expand All @@ -169,4 +178,5 @@ def test_get_file_fingerprint_hashes_one_line_added(self):
assert result1_fingerprint == expected_result1_fingerprint
assert result2_fingerprint == expected_result2_fingerprint

assert byte_hamming_distance(result1_fingerprint, result2_fingerprint) == 3
assert byte_hamming_distance(
result1_fingerprint, result2_fingerprint) == 3
42 changes: 27 additions & 15 deletions tests/test_halohash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand Down Expand Up @@ -85,7 +85,8 @@ def calculate_mean_and_standard_deviation(hamming_distances):
number_of_hamming_distances = len(hamming_distances)

# 1: Find the mean.
mean_hamming_distance = sum(hamming_distances) / number_of_hamming_distances
mean_hamming_distance = sum(hamming_distances) / \
number_of_hamming_distances

# 2: For each data point, find the square of its distance to the mean, then sum the values.
s0 = sum(
Expand All @@ -103,7 +104,8 @@ def calculate_mean_and_standard_deviation(hamming_distances):


class TestHalohash(FileBasedTesting):
test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles/halohash')
test_data_dir = os.path.join(
os.path.dirname(__file__), 'testfiles/halohash')

def setUp(self):
words_loc = self.get_test_loc('words.txt')
Expand All @@ -128,13 +130,16 @@ def test_halohash_random_delete(self, regen=False):
modified_content
)
number_of_elements = len(modified_content)
hamming_distance_by_number_of_elements[number_of_elements].append(hamming_distance)
modified_content.pop(random.randint(0, len(modified_content) - 1))
hamming_distance_by_number_of_elements[number_of_elements].append(
hamming_distance)
modified_content.pop(random.randint(
0, len(modified_content) - 1))

# Take mean and standard deviation
results = []
for number_of_elements, hamming_distances in hamming_distance_by_number_of_elements.items():
mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(hamming_distances)
mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(
hamming_distances)
results.append(
{
'number of hashed elements': int(number_of_elements),
Expand All @@ -143,8 +148,10 @@ def test_halohash_random_delete(self, regen=False):
}
)

expected_results_loc = self.get_test_loc(f'{number_of_words}-delete-expected-results.csv')
check_results(results, expected_results_loc, ['number of hashed elements', 'mean hamming distance', 'standard deviation'], regen=regen)
expected_results_loc = self.get_test_loc(
f'{number_of_words}-delete-expected-results.csv')
check_results(results, expected_results_loc, [
'number of hashed elements', 'mean hamming distance', 'standard deviation'], regen=regen)

def test_halohash_random_replace(self, regen=False):
for number_of_words in [500,]:
Expand All @@ -165,9 +172,11 @@ def test_halohash_random_replace(self, regen=False):
original_hash,
modified_content
)
hamming_distance_by_number_of_words_replaced[words_replaced].append(hamming_distance)
hamming_distance_by_number_of_words_replaced[words_replaced].append(
hamming_distance)

modified_content.pop(random.randint(0, len(modified_content) - 1))
modified_content.pop(random.randint(
0, len(modified_content) - 1))
new_word = (
subprocess.run(
['shuf', '-n', '1', '/usr/share/dict/american-english'],
Expand All @@ -178,14 +187,15 @@ def test_halohash_random_replace(self, regen=False):
.strip()
.replace('"', '')
)
modified_content[random.randint(0, len(modified_content) - 1)] = bytes(new_word, 'utf-8')
modified_content[random.randint(
0, len(modified_content) - 1)] = bytes(new_word, 'utf-8')
words_replaced += 1


# Take mean and standard deviation
results = []
for words_replaced, hamming_distances in hamming_distance_by_number_of_words_replaced.items():
mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(hamming_distances)
mean_hamming_distance, standard_deviation = calculate_mean_and_standard_deviation(
hamming_distances)
results.append(
{
'words replaced': int(words_replaced),
Expand All @@ -194,5 +204,7 @@ def test_halohash_random_replace(self, regen=False):
}
)

expected_results_loc = self.get_test_loc(f'{number_of_words}-replaced-expected-results.csv')
check_results(results, expected_results_loc, ['words replaced', 'mean hamming distance', 'standard deviation'], regen=regen)
expected_results_loc = self.get_test_loc(
f'{number_of_words}-replaced-expected-results.csv')
check_results(results, expected_results_loc, [
'words replaced', 'mean hamming distance', 'standard deviation'], regen=regen)
3 changes: 1 addition & 2 deletions tests/test_plugin_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://github.com/aboutcode-org/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

Expand All @@ -17,7 +17,6 @@
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes



"""
These tests spawn a new process as if launched from the command line.
"""
Expand Down

0 comments on commit 5ccab02

Please sign in to comment.