From ae9aa741d00ecc2e86a14c86d14327978a717734 Mon Sep 17 00:00:00 2001 From: Sergey Chernyshev Date: Sun, 25 Aug 2024 20:43:17 -0400 Subject: [PATCH] Created a more deterministic clusterizer --- package-lock.json | 23 --- package.json | 2 - src/clusterizers/bysize.js | 22 --- src/clusterizers/filetree.js | 243 ---------------------------- src/clusterizers/orderedFileTree.js | 76 +++++++++ src/clusterizers/tools/tree.js | 193 ++++++++++++++++++++++ src/gameConfig.js | 2 +- src/repo.js | 2 + 8 files changed, 272 insertions(+), 291 deletions(-) delete mode 100644 src/clusterizers/bysize.js delete mode 100644 src/clusterizers/filetree.js create mode 100644 src/clusterizers/orderedFileTree.js create mode 100644 src/clusterizers/tools/tree.js diff --git a/package-lock.json b/package-lock.json index 6c42b54..84e83dc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,6 @@ "version": "1.2.1", "license": "MIT", "dependencies": { - "kmeansjs": "^0.0.3", - "node-kmeans": "^1.1.9", "simple-git": "^3.25.0" }, "devDependencies": { @@ -249,11 +247,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/kmeansjs": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/kmeansjs/-/kmeansjs-0.0.3.tgz", - "integrity": "sha512-rZQ/xf2v6Mocngyu8flXXX90KAWS6aYtc0ZAVdpUFq5EHzKpkDMQqQqvQF98hGdCAH+ZUE6p58RkTwLRJElIYw==" - }, "node_modules/meow": { "version": "12.1.1", "resolved": "https://registry.npmjs.org/meow/-/meow-12.1.1.tgz", @@ -271,17 +264,6 @@ "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" }, - "node_modules/node-kmeans": { - "version": "1.1.9", - "resolved": "https://registry.npmjs.org/node-kmeans/-/node-kmeans-1.1.9.tgz", - "integrity": "sha512-qZiuD4ab4cvsSWQSccWzLb4j1cHlm6wjGiWrbhCPZweVzpfcWBQ2CMqK4YBtJJ3OhK27NkZ1PKiLlZ52Z///cQ==", - "dependencies": { - "underscore": "^1.9.1" - }, - "engines": { - "node": ">= v0.6.0" - } - }, "node_modules/open": { "version": "10.1.0", "resolved": "https://registry.npmjs.org/open/-/open-10.1.0.tgz", @@ -493,11 +475,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/underscore": { - "version": "1.13.7", - "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", - "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==" - }, "node_modules/unique-string": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/unique-string/-/unique-string-3.0.0.tgz", diff --git a/package.json b/package.json index 0592155..89c78d0 100644 --- a/package.json +++ b/package.json @@ -21,8 +21,6 @@ "open-cli": "^8.0.0" }, "dependencies": { - "kmeansjs": "^0.0.3", - "node-kmeans": "^1.1.9", "simple-git": "^3.25.0" } } diff --git a/src/clusterizers/bysize.js b/src/clusterizers/bysize.js deleted file mode 100644 index 8bf2e6f..0000000 --- a/src/clusterizers/bysize.js +++ /dev/null @@ -1,22 +0,0 @@ -import kmeans from "kmeansjs"; - -async function clusterize(files, number_of_blocks) { - const vector = files.map((file) => [file, file.Bytes, file.Lines]); - - const clusters = await new Promise((resolve, reject) => { - const k = - vector.length > number_of_blocks ? number_of_blocks : vector.length; - - kmeans(vector, k, function (err, res) { - if (err) { - reject(err); - } else { - resolve(res); - } - }); - }); - - return clusters.filter((cluster) => Array.isArray(cluster)); -} - -export default clusterize; diff --git a/src/clusterizers/filetree.js b/src/clusterizers/filetree.js deleted file mode 100644 index af03de8..0000000 --- a/src/clusterizers/filetree.js +++ /dev/null @@ -1,243 +0,0 @@ -import path from "node:path"; -import kmeans from "node-kmeans"; -import fs from "node:fs"; - -// Function to add a file path to the tree -function addPathToTree(tree, file) { - const parts = file.Location.split(path.sep); - let currentLevel = tree; - - parts.forEach((part, index) => { - if (!currentLevel[part]) { - if (index === parts.length - 1) { - // It's a file - currentLevel[part] = { file }; - } else { - // It's a directory - currentLevel[part] = { children: {} }; - } - } - currentLevel = currentLevel[part].children || currentLevel[part]; - }); -} - -// Function to build a tree from a list of filenames -function buildTree(files) { - const root = {}; - files.forEach((filepath) => addPathToTree(root, filepath)); - return { children: root, siblingIndex: 0 }; -} - -// Function to calculate the closest higher order of magnitude -function getNextOrderOfMagnitude(num) { - return num <= 0 ? 0 : Math.pow(10, Math.ceil(Math.log10(num))); -} - -function calculateCountsAndIndices(node) { - if (node.children) { - const siblings = Object.keys(node.children); - - let magnitudes = 0; - siblings.forEach((key, index) => { - const child = node.children[key]; - child.siblingIndex = index; // Index in the sibling list - const childMagnitude = calculateCountsAndIndices(child); - - magnitudes += childMagnitude; - }); - - node.magnitude = getNextOrderOfMagnitude(magnitudes); - - return node.magnitude; - } else { - return 1; - } -} - -function calculateFileScores(node, parentScore = 0) { - node.score = - parentScore + - (node.magnitude > 1 ? node.magnitude : 1) * - (node.siblingIndex ? node.siblingIndex : 0); - - if (node.children) { - const siblings = Object.keys(node.children); - - siblings.forEach((key) => { - const child = node.children[key]; - calculateFileScores(child, node.score); - }); - } -} - -function getScoredFiles(node, files = []) { - if (node.children) { - const siblings = Object.keys(node.children); - - siblings.forEach((key) => { - const child = node.children[key]; - getScoredFiles(child, files); - }); - } else { - files.push(node); - } - - return files; -} - -const NODE_RADIUS = 3; -const LEVEL_HEIGHT = 20; -const PADDING = 50; -const XSCALE = 5; - -function createNodeSVGElement(name, node, level, elements) { - const color = node.file ? "green" : "black"; - - const title = node.file ? node.file.Location : name; - - const x = PADDING + node.score * XSCALE; - const y = - PADDING + - (node.file ? level * LEVEL_HEIGHT - NODE_RADIUS * 2 : level * LEVEL_HEIGHT); - - const branchBottom = y; - const branchTop = PADDING + (level - 1) * LEVEL_HEIGHT; - - // node circle - elements.push( - ` - ${title} - ` - ); - - // vertical branch line - elements.push( - ` - ` - ); - - let maxX = x; - let maxY = y; - - if (node.children) { - const siblings = Object.keys(node.children); - - siblings.forEach((key) => { - const child = node.children[key]; - const { x: maxChildX, y: maxChildY } = createNodeSVGElement( - key, - child, - level + 1, - elements - ); - - if (maxChildX > maxX) { - maxX = maxChildX; - } - - if (maxChildY > maxY) { - maxY = maxChildY; - } - }); - - // horizontal branch line - elements.push(``); - } - - return { x: maxX, y: maxY }; -} - -function visualizeTree(tree, filename) { - const elements = []; - const { x: maxX, y: maxY } = createNodeSVGElement("root", tree, 0, elements); - - const SVG = ` - - - - ${elements.join("\n")} - - - - `; - - fs.writeFileSync(filename, SVG); -} - -function saveTree(tree, filename) { - fs.writeFileSync(filename, JSON.stringify(tree, null, 2)); -} - -export default async function clusterize(files, number_of_blocks) { - const tree = buildTree(files); - calculateCountsAndIndices(tree); - calculateFileScores(tree); - - saveTree(tree, "tree.json"); - visualizeTree(tree, "tree.html"); - - // console.log(JSON.stringify(tree, null, 2)); - // process.exit(0); - - const scoredFiles = getScoredFiles(tree); - - // console.log(JSON.stringify(scoredFiles, null, 2)); - - const vector = scoredFiles.map(({ score }) => [score]); - - // console.log(JSON.stringify(vector, null, 2)); - - const scoreClusters = await new Promise((resolve, reject) => { - const k = - vector.length > number_of_blocks ? number_of_blocks : vector.length; - - kmeans.clusterize(vector, { k }, function (err, res) { - if (err) { - reject(err); - } else { - resolve(res); - } - }); - }); - - // console.log(JSON.stringify(scoreClusters, null, 2)); - // process.exit(0); - - const fileClusters = scoreClusters.map((cluster) => - cluster.clusterInd.map((index) => [ - scoredFiles[index].file, - scoredFiles[index].score, - ]) - ); - - // console.log(JSON.stringify(fileClusters, null, 2)); - // process.exit(0); - - const validClusters = fileClusters.filter((cluster) => cluster.length > 1); - - return validClusters; -} diff --git a/src/clusterizers/orderedFileTree.js b/src/clusterizers/orderedFileTree.js new file mode 100644 index 0000000..2dac8a0 --- /dev/null +++ b/src/clusterizers/orderedFileTree.js @@ -0,0 +1,76 @@ +import fs from "fs"; +import { buildTree, saveTree, visualizeTree } from "./tools/tree.js"; + +function calculateSizes(node) { + if (node.children) { + let folderSize = 0; + + node.children.forEach((child) => { + folderSize += calculateSizes(child); + }); + + node.size = folderSize; + } else { + node.size = node.file.Bytes; + } + + return node.size; +} + +function getFiles(node, files = []) { + if (node.children) { + node.children.forEach((child) => { + getFiles(child, files); + }); + } else { + files.push(node); + } + + return files; +} + +function labelClusters(node, totalClusters, clusterIndex = 0) { + if (node.children) { + node.children.forEach((child) => { + if (child.file) { + child.clusterIndex = clusterIndex; + if (clusterIndex < totalClusters) { + clusterIndex++; + } + } + }); + + node.children.forEach((child) => { + labelClusters(child, totalClusters, clusterIndex); + }); + } +} + +export default async function clusterize(files, numberOfClusters) { + if (files.length < numberOfClusters) { + numberOfClusters = files.length; + } + + const tree = buildTree(files); + + calculateSizes(tree); + labelClusters(tree, numberOfClusters); + + const clusterizedFiles = getFiles(tree); + + const fileClusters = clusterizedFiles.reduce((clusters, fileEntry) => { + let cluster = clusters[fileEntry.clusterIndex]; + if (!cluster) { + cluster = []; + clusters[fileEntry.clusterIndex] = cluster; + } + + cluster.push([fileEntry.file]); + + return clusters; + }, []); + + const validClusters = fileClusters.filter((cluster) => cluster.length >= 1); + + return validClusters; +} diff --git a/src/clusterizers/tools/tree.js b/src/clusterizers/tools/tree.js new file mode 100644 index 0000000..3e64590 --- /dev/null +++ b/src/clusterizers/tools/tree.js @@ -0,0 +1,193 @@ +import GitHubLanguages from "../../GitHubLanguages.js"; +import fs from "node:fs"; +import path from "node:path"; + +const NODE_RADIUS = 3; +const LEVEL_HEIGHT = 20; +const PADDING = 50; +const XSCALE = 5; + +const allColors = Object.values(GitHubLanguages) + .map((lang) => lang.color) + .filter((color) => color); + +let fileNumber = 0; + +function createNodeSVGElement(name, node, level, elements) { + const color = + node.clusterIndex !== undefined + ? allColors[node.clusterIndex % allColors.length] + : "black"; + + const title = node.file ? node.file.Location : name; + + const x = PADDING + fileNumber * XSCALE; + + if (node.file) { + fileNumber++; + } + + const y = + PADDING + + (node.file ? level * LEVEL_HEIGHT - NODE_RADIUS * 2 : level * LEVEL_HEIGHT); + + const branchBottom = y; + const branchTop = PADDING + (level - 1) * LEVEL_HEIGHT; + + // vertical branch line + elements.push( + ` + ` + ); + + let maxX = x; + let maxY = y; + + if (node.children) { + // file circle + elements.push( + ` + ${title} + ` + ); + const siblings = Object.keys(node.children); + + siblings.forEach((key) => { + const child = node.children[key]; + const { x: maxChildX, y: maxChildY } = createNodeSVGElement( + key, + child, + level + 1, + elements + ); + + if (maxChildX > maxX) { + maxX = maxChildX; + } + + if (maxChildY > maxY) { + maxY = maxChildY; + } + }); + + // horizontal branch line + elements.push(``); + } else { + // file circle + elements.push( + ` + ${title} + ` + ); + } + + return { x: maxX, y: maxY }; +} + +export function visualizeTree(tree, filename) { + const elements = []; + const { x: maxX, y: maxY } = createNodeSVGElement("root", tree, 0, elements); + + const SVG = ` + + + + ${elements.join("\n")} + + + + `; + + fs.writeFileSync(filename, SVG); +} + +export function saveTree(tree, filename) { + fs.writeFileSync(filename, JSON.stringify(tree, null, 2)); +} + +// Function to add a file path to the tree +function addPathToTree(tree, file) { + const parts = file.Location.split(path.sep); + let currentLevel = tree; + + parts.forEach((part, index) => { + if (!currentLevel[part]) { + if (index === parts.length - 1) { + // It's a file + currentLevel[part] = { file }; + } else { + // It's a directory + currentLevel[part] = { children: {} }; + } + } + currentLevel = currentLevel[part].children || currentLevel[part]; + }); +} + +function orderChildren(node) { + if (node.children) { + const childrenSortedByName = Object.keys(node.children) + .sort() + .map((key) => { + const child = node.children[key]; + child.name = key; + + return child; + }); + + childrenSortedByName.forEach((child) => { + orderChildren(child); + }); + + node.children = childrenSortedByName; + } +} + +// Function to build a tree from a list of filenames +export function buildTree(files) { + const root = {}; + + files.forEach((file) => { + addPathToTree(root, file); + }); + + const tree = { children: root }; + + orderChildren(tree); + + return tree; +} diff --git a/src/gameConfig.js b/src/gameConfig.js index dcdb643..d56b344 100644 --- a/src/gameConfig.js +++ b/src/gameConfig.js @@ -3,7 +3,7 @@ import fs from "fs"; import { pathToFileURL } from "url"; import novaTerraPrime from "./tiles/novaTerraPrime.js"; -import clusterize from "./clusterizers/bysize.js"; +import clusterize from "./clusterizers/orderedFileTree.js"; export const defaultGameConfig = { tileSet: novaTerraPrime, diff --git a/src/repo.js b/src/repo.js index efabe08..5eecda8 100644 --- a/src/repo.js +++ b/src/repo.js @@ -9,6 +9,8 @@ export async function processRepo(gameConfig, SCC, folder) { const SCCResult = spawnSync(SCC, [ folder, "--by-file", + "--no-cocomo", + "--no-complexity", "--format=json", `--output=${tempStatsFile}`, ]);