Commit

Task to purge archives for deleted websites and segments (matomo-org#14317)

* Purge archives for deleted sites and segments

* Purge archives for deleted sites and segments

* Add new purgeOrphanedArchives task to expected list

* Fix build

* PR improvements

* Fix consistency of method names

* Fix typo

* Unit tests for getSegmentHashesByIdSite

* PR changes

* add note on how to test the command

* minor tweak to make sure no injections are possible
Kate Butler authored and tsteur committed May 3, 2019
1 parent 8f1f8ec commit b000144
Showing 7 changed files with 450 additions and 8 deletions.
73 changes: 73 additions & 0 deletions core/Archive/ArchivePurger.php
@@ -9,11 +9,13 @@
namespace Piwik\Archive;

use Piwik\ArchiveProcessor\Rules;
use Piwik\Common;
use Piwik\Config;
use Piwik\Container\StaticContainer;
use Piwik\DataAccess\ArchiveTableCreator;
use Piwik\DataAccess\Model;
use Piwik\Date;
use Piwik\Db;
use Piwik\Piwik;
use Psr\Log\LoggerInterface;
use Psr\Log\LogLevel;
@@ -154,6 +156,77 @@ public function purgeOutdatedArchives(Date $dateStart)
return $deletedRowCount;
}

public function purgeDeletedSiteArchives(Date $dateStart)
{
$idArchivesToDelete = $this->getDeletedSiteArchiveIds($dateStart);

return $this->purge($idArchivesToDelete, $dateStart, 'deleted sites');
}

/**
* @param Date $dateStart
* @param array $segmentHashesByIdSite List of valid segment hashes, indexed by site ID
* @return int
*/
public function purgeDeletedSegmentArchives(Date $dateStart, array $segmentHashesByIdSite)
{
$idArchivesToDelete = $this->getDeletedSegmentArchiveIds($dateStart, $segmentHashesByIdSite);

return $this->purge($idArchivesToDelete, $dateStart, 'deleted segments');
}
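
A minimal usage sketch (illustrative hash values; in the scheduled task further down the hashes come from Tasks::getSegmentHashesByIdSite() and the purger is obtained from the container):

$segmentHashesByIdSite = array(
    0 => array('be90051048558489e1d62f4245a6dc65'),  // hash of a segment enabled for all websites
    2 => array('cffd4336c22c6782211f853495076b1a'),  // hash of a segment enabled only for idsite 2
);
$purger = StaticContainer::get(ArchivePurger::class);
$purger->purgeDeletedSegmentArchives(Date::factory('2019-04-15'), $segmentHashesByIdSite);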

/**
* Purge all numeric and blob archives with the given IDs from the database.
* @param array $idArchivesToDelete
* @param Date $dateStart
* @param string $reason
* @return int
*/
protected function purge(array $idArchivesToDelete, Date $dateStart, $reason)
{
$deletedRowCount = 0;
if (!empty($idArchivesToDelete)) {
$deletedRowCount = $this->deleteArchiveIds($dateStart, $idArchivesToDelete);

$this->logger->info(
"Deleted {count} rows in archive tables (numeric + blob) for {reason} for {date}.",
array(
'count' => $deletedRowCount,
'date' => $dateStart,
'reason' => $reason
)
);

$this->logger->debug("[Deleted IDs: {deletedIds}]", array(
'deletedIds' => implode(',', $idArchivesToDelete)
));
} else {
$this->logger->debug(
"No archives for {reason} found in archive numeric table for {date}.",
array('date' => $dateStart, 'reason' => $reason)
);
}

return $deletedRowCount;
}

protected function getDeletedSiteArchiveIds(Date $date)
{
$archiveTable = ArchiveTableCreator::getNumericTable($date);
return $this->model->getArchiveIdsForDeletedSites(
$archiveTable,
$this->getOldestTemporaryArchiveToKeepThreshold()
);
}

protected function getDeletedSegmentArchiveIds(Date $date, array $segmentHashesByIdSite)
{
$archiveTable = ArchiveTableCreator::getNumericTable($date);
return $this->model->getArchiveIdsForDeletedSegments(
$archiveTable, $segmentHashesByIdSite, $this->getOldestTemporaryArchiveToKeepThreshold()
);
}

protected function getOutdatedArchiveIds(Date $date, $purgeArchivesOlderThan)
{
$archiveTable = ArchiveTableCreator::getNumericTable($date);
69 changes: 69 additions & 0 deletions core/DataAccess/Model.php
@@ -333,6 +333,75 @@ public function getSitesWithInvalidatedArchive($numericTable)
return $result;
}

/**
* Get a list of IDs of archives that don't have any matching rows in the site table. Excludes temporary archives
* that may still be in use, as specified by the $oldestToKeep passed in.
* @param string $archiveTableName
* @param string $oldestToKeep Datetime string
* @return array of IDs
*/
public function getArchiveIdsForDeletedSites($archiveTableName, $oldestToKeep)
{
$sql = "SELECT DISTINCT idarchive FROM " . $archiveTableName . " a "
. " LEFT JOIN " . Common::prefixTable('site') . " s USING (idsite)"
. " WHERE s.idsite IS NULL"
. " AND ts_archived < ?";

$rows = Db::fetchAll($sql, array($oldestToKeep));

return array_column($rows, 'idarchive');
}

/**
* Get a list of IDs of archives with segments that no longer exist in the DB. Excludes temporary archives that
* may still be in use, as specified by the $oldestToKeep passed in.
* @param string $archiveTableName
* @param array $segmentHashesById Whitelist of hashes of existing segments, indexed by site ID
* @param string $oldestToKeep Datetime string
* @return array of IDs
*/
public function getArchiveIdsForDeletedSegments($archiveTableName, array $segmentHashesById, $oldestToKeep)
{
$validSegmentClauses = [];

foreach ($segmentHashesById as $idSite => $segments) {
// segments are md5 hashes and as such not a problem re SQL injection. For performance reasons we don't use
// bound parameters for this query
foreach ($segments as $segment) {
if (!ctype_xdigit($segment)) {
throw new Exception($segment . ' expected to be an md5 hash');
}
}

// Special case as idsite=0 means the segments are not site-specific
if ($idSite === 0) {
foreach ($segments as $segmentHash) {
$validSegmentClauses[] = '(name LIKE "done' . $segmentHash . '%")';
}
continue;
}

$idSite = (int)$idSite;

// Vanilla case - segments that are valid for a single site only
$sql = '(idsite = ' . $idSite . ' AND (';
$sql .= 'name LIKE "done' . implode('%" OR name LIKE "done', $segments) . '%"';
$sql .= '))';
$validSegmentClauses[] = $sql;
}

$isValidSegmentSql = implode(' OR ', $validSegmentClauses);

$sql = 'SELECT idarchive FROM ' . $archiveTableName
. ' WHERE name LIKE "done%" AND name != "done"'
. ' AND ts_archived < ?'
. ' AND NOT (' . $isValidSegmentSql . ')';

$rows = Db::fetchAll($sql, array($oldestToKeep));

return array_map(function($row) { return $row['idarchive']; }, $rows);
}
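
A rough sketch of the query this builds (hypothetical table name and hashes): given one all-sites segment hash and one hash restricted to idsite 2, only $oldestToKeep is bound as a parameter, everything else is inlined:

SELECT idarchive FROM archive_numeric_2019_04
 WHERE name LIKE "done%" AND name != "done"
   AND ts_archived < ?
   AND NOT ((name LIKE "donebe90051048558489e1d62f4245a6dc65%")
        OR (idsite = 2 AND (name LIKE "donecffd4336c22c6782211f853495076b1a%")))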

/**
* Returns the SQL condition used to find successfully completed archives that
* this instance is querying for.
10 changes: 7 additions & 3 deletions core/Segment.php
@@ -307,9 +307,13 @@ public function getHash()
if (empty($this->string)) {
return '';
}
// normalize the string as browsers may send slightly different payloads for the same archive
$normalizedSegmentString = urldecode($this->string);
return md5($normalizedSegmentString);
return self::getSegmentHash($this->string);
}

public static function getSegmentHash($definition)
{
// urldecode to normalize the string, as browsers may send slightly different payloads for the same archive
return md5(urldecode($definition));
}
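
A quick sketch of why the urldecode() normalisation matters (hypothetical definitions): a URL-encoded and a plain form of the same definition produce the same hash:

$a = Segment::getSegmentHash('countryCode==nz');
$b = Segment::getSegmentHash('countryCode%3D%3Dnz');
// $a === $b; both urldecode to 'countryCode==nz' before being md5'd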

/**
51 changes: 51 additions & 0 deletions plugins/CoreAdminHome/Tasks.php
@@ -11,6 +11,7 @@
use Piwik\API\Request;
use Piwik\ArchiveProcessor\Rules;
use Piwik\Archive\ArchivePurger;
use Piwik\Common;
use Piwik\Config;
use Piwik\Container\StaticContainer;
use Piwik\DataAccess\ArchiveTableCreator;
@@ -24,6 +25,7 @@
use Piwik\Plugins\CoreAdminHome\Tasks\ArchivesToPurgeDistributedList;
use Piwik\Plugins\SitesManager\SitesManager;
use Piwik\Scheduler\Schedule\SpecificTime;
use Piwik\Segment;
use Piwik\Settings\Storage\Backend\MeasurableSettingsTable;
use Piwik\Tracker\Failures;
use Piwik\Site;
@@ -64,6 +66,8 @@ public function schedule()
// general data purge on invalidated archive records, executed daily
$this->daily('purgeInvalidatedArchives', null, self::LOW_PRIORITY);

$this->weekly('purgeOrphanedArchives', null, self::NORMAL_PRIORITY);

// lowest priority since tables should be optimized after they are modified
$this->monthly('optimizeArchiveTable', null, self::LOWEST_PRIORITY);

@@ -258,6 +262,53 @@ public function updateSpammerBlacklist()
Option::set(ReferrerSpamFilter::OPTION_STORAGE_NAME, serialize($list));
}

/**
* To test, execute the following command:
* `./console core:run-scheduled-tasks "Piwik\Plugins\CoreAdminHome\Tasks.purgeOrphanedArchives"`
*
* @throws \Exception
*/
public function purgeOrphanedArchives()
{
$segmentHashesByIdSite = $this->getSegmentHashesByIdSite();
$archiveTables = ArchiveTableCreator::getTablesArchivesInstalled('numeric');

$datesPurged = array();
foreach ($archiveTables as $table) {
$date = ArchiveTableCreator::getDateFromTableName($table);
list($year, $month) = explode('_', $date);

$dateObj = Date::factory("$year-$month-15");

$this->archivePurger->purgeDeletedSiteArchives($dateObj);
$this->archivePurger->purgeDeletedSegmentArchives($dateObj, $segmentHashesByIdSite);

$datesPurged[$date] = true;
}
}
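
To make the date handling above concrete (hypothetical table name; the 15th is just an arbitrary day guaranteed to fall inside the month the table covers):

$table = 'archive_numeric_2019_04';                          // illustrative monthly archive table
$date  = ArchiveTableCreator::getDateFromTableName($table);  // '2019_04'
list($year, $month) = explode('_', $date);
$dateObj = Date::factory("$year-$month-15");                 // Date for 2019-04-15, mid-month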

/**
* Get a list of all segment hashes that currently exist, indexed by idSite.
* @return array
*/
public function getSegmentHashesByIdSite()
{
// Get a list of hashes of all segments that exist now
$sql = "SELECT DISTINCT definition, enable_only_idsite FROM " . Common::prefixTable('segment')
. " WHERE deleted = 0";
$rows = Db::fetchAll($sql);
$segmentHashes = array();
foreach ($rows as $row) {
$idSite = (int)$row['enable_only_idsite'];
if (! isset($segmentHashes[$idSite])) {
$segmentHashes[$idSite] = array();
}
$segmentHashes[$idSite][] = Segment::getSegmentHash($row['definition']);
}

return $segmentHashes;
}
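
To relate this to the archive rows being purged (hypothetical stored definition): each definition is hashed the same way Segment::getSegmentHash() hashes it, and that hash is what Model::getArchiveIdsForDeletedSegments() matches after the 'done' prefix in archive row names:

$definition = 'countryCode==nz';                     // illustrative segment definition
$hash       = Segment::getSegmentHash($definition);  // md5 of the urldecoded definition
// archive rows belonging to this segment are named 'done<hash>...'; they are kept
// only while the hash appears in the whitelist returned by this method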

/**
* we should only purge outdated & custom range archives if we know cron archiving has just run,
* or if browser triggered archiving is enabled. if cron archiving has run, then we know the latest
101 changes: 101 additions & 0 deletions plugins/CoreAdminHome/tests/Integration/TasksTest.php
@@ -17,6 +17,9 @@
use Piwik\Plugins\CoreAdminHome\Emails\TrackingFailuresEmail;
use Piwik\Plugins\CoreAdminHome\Tasks;
use Piwik\Plugins\CoreAdminHome\Tasks\ArchivesToPurgeDistributedList;
use Piwik\Plugins\CustomDimensions\CustomDimensions;
use Piwik\Plugins\CustomDimensions\Dao\Configuration;
use Piwik\Plugins\SegmentEditor\Model;
use Piwik\Scheduler\Task;
use Piwik\Tests\Fixtures\RawArchiveDataWithTempAndInvalidated;
use Piwik\Tests\Framework\Fixture;
@@ -131,6 +134,7 @@ public function test_schedule_addsRightAmountOfTasks()
$expected = [
'purgeOutdatedArchives.',
'purgeInvalidatedArchives.',
'purgeOrphanedArchives.',
'optimizeArchiveTable.',
'cleanupTrackingFailures.',
'notifyTrackingFailures.',
@@ -215,6 +219,103 @@ public function test_notifyTrackingFailures_sendsMailWhenThereAreTrackingFailure
$this->assertEquals(2, $mail->getNumFailures());
}

public function test_getSegmentHashesByIdSite_emptyWhenNoSegments()
{
$segmentsByIdSite = $this->tasks->getSegmentHashesByIdSite();
$this->assertEquals(array(), $segmentsByIdSite);
}

public function test_getSegmentHashesByIdSite_allWebsiteAndSiteSpecificSegments()
{
$model = new Model();
$model->createSegment(array(
'name' => 'Test Segment 1',
'definition' => 'continentCode==eur',
'enable_only_idsite' => 0,
'deleted' => 0
));
$model->createSegment(array(
'name' => 'Test Segment 2',
'definition' => 'countryCode==nz',
'enable_only_idsite' => 0,
'deleted' => 0
));
$model->createSegment(array(
'name' => 'Test Segment 3',
'definition' => 'countryCode==au',
'enable_only_idsite' => 2,
'deleted' => 0
));

$segmentsByIdSite = $this->tasks->getSegmentHashesByIdSite();
$expected = array(
0 => array('be90051048558489e1d62f4245a6dc65', 'b92fbb3009b32cf632965802de2fb760'),
2 => array('cffd4336c22c6782211f853495076b1a')
);
$this->assertEquals($expected, $segmentsByIdSite);
}

public function test_getSegmentHashesByIdSite_invalidSegment()
{
$model = new Model();
$model->createSegment(array(
'name' => 'Test Segment 4',
'definition' => 'countryCode=nz', //The single "=" is invalid - we should generate a hash anyway
'enable_only_idsite' => 0,
'deleted' => 0
));
$model->createSegment(array(
'name' => 'Test Segment 5',
'definition' => 'countryCode==au',
'enable_only_idsite' => 0,
'deleted' => 0
));

$expected = array(
0 => array('5ffe7e116fae7576c047b1fb811584a5', 'cffd4336c22c6782211f853495076b1a'),
);

$segmentsByIdSite = $this->tasks->getSegmentHashesByIdSite();
$this->assertEquals($expected, $segmentsByIdSite);
}

public function test_getSegmentHashesByIdSite_siteSpecificCustomDimension()
{
// Insert a custom dimension for idsite = 1
$configuration = new Configuration();
$configuration->configureNewDimension(
1,
'mydimension',
CustomDimensions::SCOPE_VISIT,
1,
1,
array(),
true
);

$model = new Model();
$model->createSegment(array(
'name' => 'Test Segment 6',
'definition' => 'mydimension==red',
'enable_only_idsite' => 1,
'deleted' => 0
));
$model->createSegment(array(
'name' => 'Test Segment 7',
'definition' => 'countryCode==au',
'enable_only_idsite' => 2,
'deleted' => 0
));

$expected = array(
1 => array('240d2a84a309debd26bdbaa8eb3d363c'),
2 => array('cffd4336c22c6782211f853495076b1a')
);

$segmentsByIdSite = $this->tasks->getSegmentHashesByIdSite();
$this->assertEquals($expected, $segmentsByIdSite);
}

/**
* @param Date[] $dates
*/