Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions tests/model/cycle_data/test_perfherder_cycling.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,53 @@ def test_irrelevant_repos_data_removal(
).exists()


def test_irrelevant_data_removal_cycles_past_empty_repo(
monkeypatch,
test_repository,
try_repository,
push_stored,
test_perf_signature,
taskcluster_notify_mock,
):
test_repository.name = f"{test_repository.name}-test"
test_repository.save()

expired = datetime.now() - timedelta(days=181)
push = Push.objects.first()

PerformanceDatum.objects.create(
repository=test_repository,
push=push,
job=None,
signature=test_perf_signature,
push_timestamp=datetime.now() - timedelta(days=1),
value=1.0,
)
PerformanceDatum.objects.create(
repository=try_repository,
push=push,
job=None,
signature=test_perf_signature,
push_timestamp=expired,
value=1.0,
)

monkeypatch.setattr(
IrrelevantDataRemoval,
"_IrrelevantDataRemoval__irrelevant_repos",
[try_repository.id, test_repository.id],
raising=False,
)

call_command("cycle_data", "from:perfherder")
assert PerformanceDatum.objects.filter(repository=test_repository).exists()
assert not PerformanceDatum.objects.filter(
push_timestamp__lte=expired,
repository=try_repository,
).exists()
assert PerformanceDatum.objects.count() == 1


def test_signature_remover(
test_perf_signature,
test_perf_signature_2,
Expand Down
44 changes: 33 additions & 11 deletions treeherder/model/data_cycling/removal_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
from abc import ABC, abstractmethod
from datetime import datetime, timedelta
from itertools import cycle

from django.db.backends.utils import CursorWrapper

Expand Down Expand Up @@ -218,7 +217,7 @@ def __init__(self, chunk_size: int, days: int = None):

self._manager = PerformanceDatum.objects
self.__irrelevant_repos = None
self.__circular_repos = None
self.__target_repository = None

@property
def max_timestamp(self):
Expand All @@ -235,36 +234,59 @@ def irrelevant_repositories(self):
return self.__irrelevant_repos

@property
def irrelevant_repo(self):
if self.__circular_repos is None:
self.__circular_repos = cycle(self.irrelevant_repositories)
return next(self.__circular_repos)
def target_repository(self):
if self.__target_repository is None:
_ = self.irrelevant_repositories
self.__lookup_new_repository()
return self.__target_repository

@property
def name(self) -> str:
return "irrelevant data removal strategy"

def remove(self, using: CursorWrapper):
while True:
try:
self.__attempt_remove(using)
deleted_rows = using.rowcount
if deleted_rows > 0:
break # deletion was successful
self.__lookup_new_repository()
except LookupError as ex:
logger.debug(
f"Could not target any (new) irrelevant repository to delete data from. {ex}"
)
break

def __attempt_remove(self, using: CursorWrapper):
chunk_size = self._find_ideal_chunk_size()
deleted, _ = PerformanceDatum.objects.filter(
id__in=PerformanceDatum.objects.filter(
repository_id=self.irrelevant_repo, push_timestamp__lte=self._max_timestamp
repository_id=self.target_repository, push_timestamp__lte=self._max_timestamp
).values_list("id")[:chunk_size]
).delete()
using.rowcount = deleted

def __lookup_new_repository(self):
if not self.__irrelevant_repos:
raise LookupError("Exhausted all irrelevant repositories.")
self.__target_repository = self.__irrelevant_repos.pop()

def _find_ideal_chunk_size(self) -> int:
max_id_of_non_expired_row = (
self._manager.filter(push_timestamp__gt=self._max_timestamp)
.filter(repository_id__in=self.irrelevant_repositories)
.order_by("-id")[0]
.id
.filter(repository_id=self.target_repository)
.order_by("-id")
.values_list("id", flat=True)
.first()
)
if max_id_of_non_expired_row is None:
return self._chunk_size
older_perf_data_rows = (
self._manager.filter(
push_timestamp__lte=self._max_timestamp, id__lte=max_id_of_non_expired_row
)
.filter(repository_id__in=self.irrelevant_repositories)
.filter(repository_id=self.target_repository)
.order_by("id")[: self._chunk_size]
)
return len(older_perf_data_rows) or self._chunk_size
Expand Down