diff --git a/.gitignore b/.gitignore
index 9f9debd85c..3edee3bc5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,10 +25,14 @@ var/
 
 # Ignore editor / IDE related data
 .vscode/
+.gemini/
 
 # IntelliJ IDE, except project config
 .idea/
 /*.iml
+.junie/
+.aiassistant/
+.aiignore
 
 # ignore future updates to run configuration
 .run/devserver.run.xml
diff --git a/contentcuration/contentcuration/management/commands/fix_missing_import_sources.py b/contentcuration/contentcuration/management/commands/fix_missing_import_sources.py
new file mode 100644
index 0000000000..6f40cb569a
--- /dev/null
+++ b/contentcuration/contentcuration/management/commands/fix_missing_import_sources.py
@@ -0,0 +1,193 @@
+import csv
+import logging
+import time
+
+from django.core.management.base import BaseCommand
+from django.db.models import Exists
+from django.db.models import FilteredRelation
+from django.db.models import OuterRef
+from django.db.models import Q
+from django.db.models.expressions import F
+from django_cte import With
+
+from contentcuration.models import Channel
+from contentcuration.models import ContentNode
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """
+    Audits nodes that have imported content from public channels and whether the imported content
+    has a missing source node.
+
+    TODO: this does not yet FIX them
+    """
+
+    def handle(self, *args, **options):
+        """
+        Write a CSV report of nodes in private channels whose public import source is missing.
+
+        The report is written to `fix_missing_import_sources.csv`, relative to the current
+        working directory, and summary counts are logged once all channels are processed.
+        """
+        # monotonic clock, so the logged duration is unaffected by system clock adjustments
+        start = time.monotonic()
+
+        public_cte = self.get_public_cte()
+
+        # preliminary filter on channels to those private and non-deleted, which have content
+        # lft=1 is always true for root nodes, so rght>2 means it actually has children
+        private_channels_cte = With(
+            Channel.objects.filter(
+                public=False,
+                deleted=False,
+            )
+            .annotate(
+                non_empty_main_tree=FilteredRelation(
+                    "main_tree", condition=Q(main_tree__rght__gt=2)
+                ),
+            )
+            .annotate(
+                tree_id=F("non_empty_main_tree__tree_id"),
+            )
+            .values("id", "name", "tree_id"),
+            name="dest_channel_cte",
+        )
+
+        # reduce the list of private channels to those that have an imported node
+        # from a public channel
+        destination_channels = (
+            private_channels_cte.queryset()
+            .with_cte(public_cte)
+            .with_cte(private_channels_cte)
+            .filter(
+                Exists(
+                    public_cte.join(
+                        ContentNode.objects.filter(
+                            tree_id=OuterRef("tree_id"),
+                        ),
+                        original_channel_id=public_cte.col.id,
+                    )
+                )
+            )
+            .values("id", "name", "tree_id")
+            .order_by("id")
+        )
+
+        logger.info("=== Iterating over private destination channels. ===")
+        channel_count = 0
+        total_node_count = 0
+
+        # explicit UTF-8, since channel names and node titles may contain characters that the
+        # platform's default encoding cannot represent
+        with open(
+            "fix_missing_import_sources.csv", "w", newline="", encoding="utf-8"
+        ) as csv_file:
+            csv_writer = csv.DictWriter(
+                csv_file,
+                fieldnames=[
+                    "channel_id",
+                    "channel_name",
+                    "contentnode_id",
+                    "contentnode_title",
+                    "public_channel_id",
+                    "public_channel_name",
+                    "public_channel_deleted",
+                ],
+            )
+            csv_writer.writeheader()
+
+            for channel in destination_channels.iterator():
+                node_count = self.handle_channel(csv_writer, channel)
+
+                if node_count > 0:
+                    total_node_count += node_count
+                    channel_count += 1
+
+        logger.info("=== Done iterating over private destination channels. ===")
+        logger.info(f"Found {total_node_count} nodes across {channel_count} channels.")
+        logger.info(f"Finished in {time.monotonic() - start}")
+
+    def get_public_cte(self) -> With:
+        """
+        Build a CTE of all public channels, including deleted ones, with their main tree ID.
+
+        :return: A `With` named `public_cte` exposing `id`, `name`, `deleted` and `tree_id`
+        """
+        return With(
+            Channel.objects.filter(public=True)
+            .annotate(
+                tree_id=F("main_tree__tree_id"),
+            )
+            .values("id", "name", "deleted", "tree_id"),
+            name="public_cte",
+        )
+
+    def handle_channel(self, csv_writer: csv.DictWriter, channel: dict) -> int:
+        """
+        Write one CSV row for each node in the channel whose import source is missing.
+
+        A source counts as missing when the public channel it was imported from is deleted, or
+        when no node in that public channel's main tree matches the original source node ID.
+
+        :param csv_writer: The writer that receives one row per affected node
+        :param channel: A dict with the destination channel's `id`, `name` and `tree_id`
+        :return: The number of rows written for this channel
+        """
+        public_cte = self.get_public_cte()
+        channel_id = channel["id"]
+        channel_name = channel["name"]
+        tree_id = channel["tree_id"]
+
+        missing_source_nodes = (
+            public_cte.join(
+                ContentNode.objects.filter(tree_id=tree_id),
+                original_channel_id=public_cte.col.id,
+            )
+            .with_cte(public_cte)
+            .annotate(
+                public_channel_id=public_cte.col.id,
+                public_channel_name=public_cte.col.name,
+                public_channel_deleted=public_cte.col.deleted,
+            )
+            .filter(
+                Q(public_channel_deleted=True)
+                | ~Exists(
+                    ContentNode.objects.filter(
+                        tree_id=public_cte.col.tree_id,
+                        node_id=OuterRef("original_source_node_id"),
+                    )
+                )
+            )
+            .values(
+                "public_channel_id",
+                "public_channel_name",
+                "public_channel_deleted",
+                contentnode_id=F("id"),
+                contentnode_title=F("title"),
+            )
+        )
+
+        # TODO: this will be replaced with logic to correct the missing source nodes
+        # Stream the results once and count rows as they are written, rather than running the
+        # query a second time just to get a count
+        node_count = 0
+        for node_dict in missing_source_nodes.iterator():
+            # a fresh dict per row, so no mutable state is shared between writerow() calls
+            csv_writer.writerow(
+                {
+                    "channel_id": channel_id,
+                    "channel_name": channel_name,
+                    **node_dict,
+                }
+            )
+            node_count += 1
+
+        if node_count > 0:
+            logger.info(
+                f"{channel_id}:{channel_name}\t{node_count} node(s) with missing source nodes."
+            )
+
+        return node_count
diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py
index a3f15770cd..272b9ff054 100644
--- a/contentcuration/contentcuration/models.py
+++ b/contentcuration/contentcuration/models.py
@@ -47,6 +47,7 @@
 from django.utils import timezone
 from django.utils.translation import gettext as _
 from django_cte import CTEManager
+from django_cte import CTEQuerySet
 from django_cte import With
 from le_utils import proquint
 from le_utils.constants import content_kinds
@@ -837,7 +838,7 @@ def exists(self, *filters):
         return Exists(self.queryset().filter(*filters).values("user_id"))
 
 
-class ChannelModelQuerySet(models.QuerySet):
+class ChannelModelQuerySet(CTEQuerySet):
     def create(self, **kwargs):
         """
         Create a new object with the given kwargs, saving it to the database
@@ -863,6 +864,12 @@ def update_or_create(self, defaults=None, **kwargs):
         return super().update_or_create(defaults, **kwargs)
 
 
+class ChannelModelManager(models.Manager.from_queryset(ChannelModelQuerySet)):
+    """Custom Channel models manager with CTE support"""
+
+    pass
+
+
 class Channel(models.Model):
     """ Permissions come from association with organizations """
 
@@ -994,7 +1001,7 @@ class Channel(models.Model):
         ]
     )
 
-    objects = ChannelModelQuerySet.as_manager()
+    objects = ChannelModelManager()
 
     @classmethod
     def get_editable(cls, user, channel_id):
diff --git a/contentcuration/contentcuration/tests/management/__init__.py b/contentcuration/contentcuration/tests/management/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contentcuration/contentcuration/tests/management/commands/__init__.py b/contentcuration/contentcuration/tests/management/commands/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/contentcuration/contentcuration/tests/management/commands/test_fix_missing_import_sources.py b/contentcuration/contentcuration/tests/management/commands/test_fix_missing_import_sources.py
new file mode 100644
index 0000000000..e624313ff8
--- /dev/null
+++ b/contentcuration/contentcuration/tests/management/commands/test_fix_missing_import_sources.py
@@ -0,0 +1,104 @@
+from unittest.mock import mock_open
+from unittest.mock import patch
+
+from django.core.management import call_command
+
+from contentcuration.tests import testdata
+from contentcuration.tests.base import StudioTestCase
+
+
+class CommandTestCase(StudioTestCase):
+    """Test suite for the fix_missing_import_sources management command"""
+
+    def setUp(self):
+        """Patch file/CSV output and copy a node from a public channel into a private one"""
+        open_patcher = patch(
+            "contentcuration.management.commands.fix_missing_import_sources.open",
+            mock_open(),
+        )
+        self.mock_open = open_patcher.start()
+        self.mock_file = self.mock_open.return_value
+        self.mock_file.__enter__.return_value = self.mock_file
+        self.addCleanup(open_patcher.stop)
+
+        csv_writer_patcher = patch(
+            "contentcuration.management.commands.fix_missing_import_sources.csv.DictWriter"
+        )
+        self.mock_csv_writer = csv_writer_patcher.start()
+        self.mock_csv_writer_instance = self.mock_csv_writer.return_value
+        self.addCleanup(csv_writer_patcher.stop)
+
+        self.public_channel = testdata.channel("Public Channel")
+        self.public_channel.public = True
+        self.public_channel.save()
+
+        self.private_channel = testdata.channel("Private Channel")
+
+        # see tree.json for this file
+        self.original_node = (
+            self.public_channel.main_tree.get_descendants()
+            .filter(node_id="00000000000000000000000000000003")
+            .first()
+        )
+        self.copied_node = self.original_node.copy_to(
+            target=self.private_channel.main_tree
+        )
+
+    def test_handle__opens_csv_file(self):
+        """The CSV is opened and its header written, with no rows when nothing is missing"""
+        call_command("fix_missing_import_sources")
+
+        self.mock_open.assert_called_once_with(
+            "fix_missing_import_sources.csv", "w", newline="", encoding="utf-8"
+        )
+
+        self.mock_csv_writer.assert_called_once_with(
+            self.mock_file,
+            fieldnames=[
+                "channel_id",
+                "channel_name",
+                "contentnode_id",
+                "contentnode_title",
+                "public_channel_id",
+                "public_channel_name",
+                "public_channel_deleted",
+            ],
+        )
+
+        self.mock_csv_writer_instance.writeheader.assert_called_once()
+        self.mock_csv_writer_instance.writerow.assert_not_called()
+
+    def test_handle__finds_missing(self):
+        """A copied node is reported once its source node has been deleted"""
+        self.original_node.delete()
+        call_command("fix_missing_import_sources")
+
+        self.mock_csv_writer_instance.writerow.assert_called_once_with(
+            {
+                "channel_id": self.private_channel.id,
+                "channel_name": self.private_channel.name,
+                "contentnode_id": self.copied_node.id,
+                "contentnode_title": self.copied_node.title,
+                "public_channel_id": self.public_channel.id,
+                "public_channel_name": self.public_channel.name,
+                "public_channel_deleted": False,
+            }
+        )
+
+    def test_handle__finds_for_deleted_channel(self):
+        """A copied node is reported once its source channel has been marked deleted"""
+        self.public_channel.deleted = True
+        self.public_channel.save(actor_id=testdata.user().id)
+        call_command("fix_missing_import_sources")
+
+        self.mock_csv_writer_instance.writerow.assert_called_once_with(
+            {
+                "channel_id": self.private_channel.id,
+                "channel_name": self.private_channel.name,
+                "contentnode_id": self.copied_node.id,
+                "contentnode_title": self.copied_node.title,
+                "public_channel_id": self.public_channel.id,
+                "public_channel_name": self.public_channel.name,
+                "public_channel_deleted": True,
+            }
+        )