Skip to content

Commit

Permalink
Improve performance of approximated harmonic centrality.
Browse files Browse the repository at this point in the history
Runtime drops from ~2-3 days to ~16 hours on the production page graph with ~3.2 billion nodes.
  • Loading branch information
mikkeldenker committed Feb 28, 2024
1 parent 5d4183c commit f1f8394
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
27 changes: 16 additions & 11 deletions crates/core/src/webgraph/centrality/approx_harmonic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

use std::{path::Path, sync::Mutex};
use std::path::Path;

use dashmap::DashMap;
use indicatif::ParallelProgressIterator;
use rayon::prelude::*;

Expand Down Expand Up @@ -44,31 +45,35 @@ impl ApproxHarmonic {

tracing::info!("sampling {} nodes", num_samples);

let sampled = graph.random_nodes(num_samples);
let sampled = graph.random_nodes_with_outgoing(num_samples);

let res = Mutex::new(Self {
inner: RocksDbStore::open(output),
});
let centralities: DashMap<NodeID, f32> = DashMap::new();

let norm = num_nodes as f64 / (num_samples as f64 * (num_nodes as f64 - 1.0));
let norm = num_nodes as f32 / (num_samples as f32 * (num_nodes as f32 - 1.0));

sampled.into_par_iter().progress().for_each(|source| {
let dists = graph.raw_distances_with_max(source, 5);

let res = res.lock().unwrap();
for (target, dist) in dists {
if dist == 0 {
continue;
}

let dist = dist as f64;
let dist = dist as f32;

let old = res.inner.get(&target).unwrap_or(0.0);
res.inner.insert(target, old + ((1.0 / dist) * norm));
*centralities.entry(target).or_default() += (1.0 / dist) * norm;
}
});

res.into_inner().unwrap()
let res = Self {
inner: RocksDbStore::open(output),
};

for (node, centrality) in centralities {
res.inner.insert(node, centrality as f64);
}

res
}

pub fn get(&self, node: &NodeID) -> Option<f64> {
Expand Down
10 changes: 8 additions & 2 deletions crates/core/src/webgraph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use std::path::Path;
use std::sync::Arc;
use std::{cmp, fs};

use itertools::Itertools;
use rand::seq::SliceRandom;
use rayon::prelude::*;
use url::Url;
Expand Down Expand Up @@ -864,9 +865,14 @@ impl Webgraph {
self.id2node.keys()
}

// NOTE(review): this span is a rendered diff without +/- markers, so two versions of the
// function are interleaved. Judging by the commit's addition/deletion counts, the first
// signature and the `self.nodes()` line are the removed code; everything else is the
// replacement.
//
// Both versions collect at most `num` node ids, shuffle them, and return them.
pub fn random_nodes(&self, num: usize) -> Vec<NodeID> {
pub fn random_nodes_with_outgoing(&self, num: usize) -> Vec<NodeID> {
let mut rng = rand::thread_rng();
// Removed version: the first `num` ids yielded by `self.nodes()`.
let mut nodes = self.nodes().take(num).collect::<Vec<_>>();
// Replacement: the first `num` distinct `from` endpoints yielded by `self.edges()`,
// i.e. only nodes that appear as the source of at least one edge.
let mut nodes = self
.edges()
.map(|e| e.from)
// The same `from` id can occur on many edges; keep each id once.
.unique()
.take(num)
.collect::<Vec<_>>();
// NOTE(review): `take(num)` has already run, so the shuffle only randomizes the order of
// the selected ids; which ids are selected depends on the iteration order of `edges()`.
nodes.shuffle(&mut rng);
// At most `num` ids were collected above, so this `take(num)` keeps all of them.
nodes.into_iter().take(num).collect()
}
Expand Down

0 comments on commit f1f8394

Please sign in to comment.