From 0681588b2ae09387a33fc02a439db12de24fb435 Mon Sep 17 00:00:00 2001 From: Mikkel Denker Date: Wed, 20 Nov 2024 10:19:07 +0100 Subject: [PATCH] ugc rel tag and filter during centrality calculation --- .../entrypoint/ampc/approximated_harmonic_centrality/mapper.rs | 1 + crates/core/src/webgraph/centrality/harmonic.rs | 1 + crates/core/src/webpage/html/links.rs | 2 ++ 3 files changed, 4 insertions(+) diff --git a/crates/core/src/entrypoint/ampc/approximated_harmonic_centrality/mapper.rs b/crates/core/src/entrypoint/ampc/approximated_harmonic_centrality/mapper.rs index 4cb420e2..8def6de7 100644 --- a/crates/core/src/entrypoint/ampc/approximated_harmonic_centrality/mapper.rs +++ b/crates/core/src/entrypoint/ampc/approximated_harmonic_centrality/mapper.rs @@ -41,6 +41,7 @@ pub static SKIPPED_REL: std::sync::LazyLock<RelFlags> = std::sync::LazyLock::new | RelFlags::SEARCH | RelFlags::LINK_TAG | RelFlags::SCRIPT_TAG + | RelFlags::UGC }); #[derive(Debug, Clone, bincode::Decode, bincode::Encode)] diff --git a/crates/core/src/webgraph/centrality/harmonic.rs b/crates/core/src/webgraph/centrality/harmonic.rs index fdcab2ae..8bb5721a 100644 --- a/crates/core/src/webgraph/centrality/harmonic.rs +++ b/crates/core/src/webgraph/centrality/harmonic.rs @@ -45,6 +45,7 @@ pub static SKIPPED_REL: std::sync::LazyLock<RelFlags> = std::sync::LazyLock::new | RelFlags::LINK_TAG | RelFlags::SCRIPT_TAG | RelFlags::SAME_ICANN_DOMAIN + | RelFlags::UGC }); type Counter = BTreeMap>; diff --git a/crates/core/src/webpage/html/links.rs b/crates/core/src/webpage/html/links.rs index c1dc47ab..09361a60 100644 --- a/crates/core/src/webpage/html/links.rs +++ b/crates/core/src/webpage/html/links.rs @@ -82,6 +82,7 @@ impl RelFlags { "tag" => res |= RelFlags::TAG, "terms-of-service" => res |= RelFlags::TERMS_OF_SERVICE, "sponsored" => res |= RelFlags::SPONSORED, + "ugc" => res |= RelFlags::UGC, _ => {} } } @@ -135,6 +136,7 @@ bitflags! 
{ const SCRIPT_TAG = 1 << 19; const META_TAG = 1 << 20; const SAME_ICANN_DOMAIN = 1 << 21; + const UGC = 1 << 22; } }