diff --git a/docs/docs/administration/CLI_tasks/database.md b/docs/docs/administration/CLI_tasks/database.md index 580c9d32b..f92cd1a52 100644 --- a/docs/docs/administration/CLI_tasks/database.md +++ b/docs/docs/administration/CLI_tasks/database.md @@ -48,10 +48,15 @@ This will prune remote posts older than 90 days (configurable with [`config :ple ### Options -- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also wont delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity). +- `--keep-followed <mode>` - If set to `posts`, all posts and boosts of users with local follows will be kept. + If set to `full`, it will additionally keep any posts such users interacted with; this requires `--keep-threads`. + By default this is set to `none` and followed users are not treated specially. +- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won’t delete posts when at least one of the posts in the thread has seen recent activity or is kept due to `--keep-followed`. - `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote. - `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task. - `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... 
They can significantly help reduce the database size. +- `--prune-pinned` - Also prune pinned posts; keeping pinned posts does not suffice to protect their threads from pruning, even when using `--keep-threads`. + Note, if using this option and pinned posts are pruned, they and their threads will just be refetched on the next user update. Therefore it usually doesn't bring much gain while incurring a heavy fetch load after pruning. - `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. ## Prune orphaned activities from the database diff --git a/lib/mix/tasks/pleroma/database.ex b/lib/mix/tasks/pleroma/database.ex index 0a09a1c4a..c8b6c2329 100644 --- a/lib/mix/tasks/pleroma/database.ex +++ b/lib/mix/tasks/pleroma/database.ex @@ -20,6 +20,10 @@ defmodule Mix.Tasks.Pleroma.Database do @shortdoc "A collection of database related tasks" @moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md") + defp maybe_concat(str, condition, appendix) do + if condition, do: str <> appendix, else: str + end + defp maybe_limit(query, limit_cnt) do if is_number(limit_cnt) and limit_cnt > 0 do limit(query, [], ^limit_cnt) @@ -116,6 +120,149 @@ def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do del_single + del_array end + defp query_pinned_object_apids() do + Pleroma.User + |> select([u], %{ap_id: fragment("jsonb_object_keys(?)", u.pinned_objects)}) + end + + defp query_pinned_object_ids() do + # If this additional level of subquery is omitted and we directly supply AP ids + # to the final query, it appears to overexert PostgreSQL(17)'s planner leading + # to a very inefficient query with enormous memory and time consumption. + # By supplying database IDs it ends up quite cheap however. 
+ Object + |> where([o], fragment("?->>'id' IN ?", o.data, subquery(query_pinned_object_apids()))) + |> select([o], o.id) + end + + defp query_followed_remote_user_apids() do + Pleroma.FollowingRelationship + |> join(:inner, [rel], ufing in User, on: rel.following_id == ufing.id) + |> join(:inner, [rel], ufer in User, on: rel.follower_id == ufer.id) + |> where([rel], rel.state == :follow_accept) + |> where([_rel, ufing, ufer], ufer.local and not ufing.local) + |> select([_rel, ufing], %{ap_id: ufing.ap_id}) + end + + defp parse_keep_followed_arg(options) do + case Keyword.get(options, :keep_followed) do + "full" -> :full + "posts" -> :posts + "none" -> false + nil -> false + _ -> raise "Invalid argument for keep_followed! Must be 'full', 'posts' or 'none'" + end + end + + defp maybe_restrict_followed_activities(query, options) do + case Keyword.get(options, :keep_followed) do + :full -> + having( + query, + [a], + fragment( + "bool_and(?->>'actor' NOT IN ?)", + a.data, + subquery(query_followed_remote_user_apids()) + ) + ) + + :posts -> + having( + query, + [a], + not fragment( + "bool_or(?->>'actor' IN ? AND ?->>'type' = ANY('{Create,Announce}'))", + a.data, + subquery(query_followed_remote_user_apids()), + a.data + ) + ) + + _ -> + query + end + end + + defp deletable_objects_keeping_threads(time_deadline, limit_cnt, options) do + # We want to delete objects from threads where + # 1. the newest post is still old + # 2. none of the activities is local + # 3. none of the activities is bookmarked + # 4. optionally none of the posts is non-public + deletable_context = + if Keyword.get(options, :keep_non_public) do + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + |> having( + [a], + not fragment( + # Posts (checked on Create Activity) is non-public + "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? 
->> 'type' = 'Create')", + a.data, + ^Pleroma.Constants.as_public(), + a.data, + ^Pleroma.Constants.as_public(), + a.data + ) + ) + else + Pleroma.Activity + |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) + |> group_by([a], fragment("? ->> 'context'::text", a.data)) + end + |> having([a], max(a.updated_at) < ^time_deadline) + |> having([a], not fragment("bool_or(?)", a.local)) + |> having([_, b], fragment("max(?::text) is null", b.id)) + |> maybe_restrict_followed_activities(options) + |> maybe_limit(limit_cnt) + |> select([a], fragment("? ->> 'context'::text", a.data)) + + Pleroma.Object + |> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)) + end + + defp deletable_objects_breaking_threads(time_deadline, limit_cnt, options) do + deletable = + if Keyword.get(options, :keep_non_public) do + Pleroma.Object + |> where( + [o], + fragment( + "?->'to' \\? ? OR ?->'cc' \\? ?", + o.data, + ^Pleroma.Constants.as_public(), + o.data, + ^Pleroma.Constants.as_public() + ) + ) + else + Pleroma.Object + end + |> where([o], o.updated_at < ^time_deadline) + |> where( + [o], + fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) + ) + |> then(fn q -> + if Keyword.get(options, :keep_followed) do + where( + q, + [o], + fragment("?->>'actor'", o.data) not in subquery(query_followed_remote_user_apids()) + ) + else + q + end + end) + |> maybe_limit(limit_cnt) + |> select([o], o.id) + + Pleroma.Object + |> where([o], o.id in subquery(deletable)) + end + def run(["remove_embedded_objects" | args]) do {options, [], []} = OptionParser.parse( @@ -173,16 +320,9 @@ def run(["prune_orphaned_activities" | args]) do {limit, options} = Keyword.pop(options, :limit, 0) - log_message = "Pruning orphaned activities" - - log_message = - if limit > 0 do - log_message <> ", limiting deletion to #{limit} rows" - else - log_message - end - - Logger.info(log_message) + "Pruning orphaned activities" + |> 
maybe_concat(limit > 0, ", limiting deletion to #{limit} rows") + |> Logger.info() deleted = prune_orphaned_activities(limit, options) @@ -195,13 +335,22 @@ def run(["prune_objects" | args]) do args, strict: [ vacuum: :boolean, + keep_followed: :string, keep_threads: :boolean, keep_non_public: :boolean, prune_orphaned_activities: :boolean, + prune_pinned: :boolean, limit: :integer ] ) + kf = parse_keep_followed_arg(options) + options = Keyword.put(options, :keep_followed, kf) + + if kf == :full and not Keyword.get(options, :keep_threads, false) do + raise "keep_followed=full only works in conjunction with keep_threads!" + end + start_pleroma() deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) @@ -209,111 +358,35 @@ def run(["prune_objects" | args]) do limit_cnt = Keyword.get(options, :limit, 0) - log_message = "Pruning objects older than #{deadline} days" - - log_message = - if Keyword.get(options, :keep_non_public) do - log_message <> ", keeping non public posts" - else - log_message - end - - log_message = - if Keyword.get(options, :keep_threads) do - log_message <> ", keeping threads intact" - else - log_message - end - - log_message = - if Keyword.get(options, :prune_orphaned_activities) do - log_message <> ", pruning orphaned activities" - else - log_message - end - - log_message = - if Keyword.get(options, :vacuum) do - log_message <> - ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)" - else - log_message - end - - log_message = - if limit_cnt > 0 do - log_message <> ", limiting to #{limit_cnt} rows" - else - log_message - end - - Logger.info(log_message) + "Pruning objects older than #{deadline} days" + |> maybe_concat(Keyword.get(options, :keep_non_public), ", keeping non public posts") + |> maybe_concat(Keyword.get(options, :keep_threads), ", keeping threads intact") + |> maybe_concat(kf, ", keeping #{kf} activities of followed users") + |> maybe_concat(Keyword.get(options, :prune_pinned), ", pruning pinned 
posts") + |> maybe_concat( + Keyword.get(options, :prune_orphaned_activities), + ", pruning orphaned activities" + ) + |> maybe_concat( + Keyword.get(options, :vacuum), + ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)" + ) + |> maybe_concat(limit_cnt > 0, ", limiting to #{limit_cnt} rows") + |> Logger.info() {del_obj, _} = if Keyword.get(options, :keep_threads) do - # We want to delete objects from threads where - # 1. the newest post is still old - # 2. none of the activities is local - # 3. none of the activities is bookmarked - # 4. optionally none of the posts is non-public - deletable_context = - if Keyword.get(options, :keep_non_public) do - Pleroma.Activity - |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) - |> group_by([a], fragment("? ->> 'context'::text", a.data)) - |> having( - [a], - not fragment( - # Posts (checked on Create Activity) is non-public - "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')", - a.data, - ^Pleroma.Constants.as_public(), - a.data, - ^Pleroma.Constants.as_public(), - a.data - ) - ) - else - Pleroma.Activity - |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id) - |> group_by([a], fragment("? ->> 'context'::text", a.data)) - end - |> having([a], max(a.updated_at) < ^time_deadline) - |> having([a], not fragment("bool_or(?)", a.local)) - |> having([_, b], fragment("max(?::text) is null", b.id)) - |> maybe_limit(limit_cnt) - |> select([a], fragment("? ->> 'context'::text", a.data)) - - Pleroma.Object - |> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)) + deletable_objects_keeping_threads(time_deadline, limit_cnt, options) else - deletable = - if Keyword.get(options, :keep_non_public) do - Pleroma.Object - |> where( - [o], - fragment( - "?->'to' \\? ? OR ?->'cc' \\? 
?", - o.data, - ^Pleroma.Constants.as_public(), - o.data, - ^Pleroma.Constants.as_public() - ) - ) - else - Pleroma.Object - end - |> where([o], o.updated_at < ^time_deadline) - |> where( - [o], - fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host()) - ) - |> maybe_limit(limit_cnt) - |> select([o], o.id) - - Pleroma.Object - |> where([o], o.id in subquery(deletable)) + deletable_objects_breaking_threads(time_deadline, limit_cnt, options) end + |> then(fn q -> + if Keyword.get(options, :prune_pinned) do + q + else + where(q, [o], o.id not in subquery(query_pinned_object_ids())) + end + end) |> Repo.delete_all(timeout: :infinity) Logger.info("Deleted #{del_obj} objects...") diff --git a/test/mix/tasks/pleroma/database_test.exs b/test/mix/tasks/pleroma/database_test.exs index 4f97a978a..b1de10c9b 100644 --- a/test/mix/tasks/pleroma/database_test.exs +++ b/test/mix/tasks/pleroma/database_test.exs @@ -88,6 +88,74 @@ test "it prunes old objects from the database", %{old_insert_date: old_insert_da refute Object.get_by_id(note_remote_non_public_id) end + test "it retains pinned posts by default", %{old_insert_date: old_insert_date} do + insert(:note) + + pin_user = insert(:user, local: false) + + %{id: note_remote_pinned_id, data: note_remote_pinned_data} = + :note + |> insert(user: pin_user) + |> Ecto.Changeset.change(%{updated_at: old_insert_date}) + |> Repo.update!() + + User.add_pinned_object_id(pin_user, note_remote_pinned_data["id"]) + + note_remote_non_public = + %{id: note_remote_non_public_id, data: note_remote_non_public_data} = + :note + |> insert() + + note_remote_non_public + |> Ecto.Changeset.change(%{ + updated_at: old_insert_date, + data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + assert length(Repo.all(Object)) == 3 + + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + + assert length(Repo.all(Object)) == 2 + assert Object.get_by_id(note_remote_pinned_id) + refute 
Object.get_by_id(note_remote_non_public_id) + end + + test "it prunes pinned posts with --prune-pinned", %{old_insert_date: old_insert_date} do + insert(:note) + + pin_user = insert(:user, local: false) + + %{id: note_remote_pinned_id, data: note_remote_pinned_data} = + :note + |> insert(user: pin_user) + |> Ecto.Changeset.change(%{updated_at: old_insert_date}) + |> Repo.update!() + + User.add_pinned_object_id(pin_user, note_remote_pinned_data["id"]) + + note_remote_non_public = + %{id: note_remote_non_public_id, data: note_remote_non_public_data} = + :note + |> insert() + + note_remote_non_public + |> Ecto.Changeset.change(%{ + updated_at: old_insert_date, + data: note_remote_non_public_data |> update_in(["to"], fn _ -> [] end) + }) + |> Repo.update!() + + assert length(Repo.all(Object)) == 3 + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-pinned"]) + + assert length(Repo.all(Object)) == 1 + refute Object.get_by_id(note_remote_pinned_id) + refute Object.get_by_id(note_remote_non_public_id) + end + test "it cleans up bookmarks", %{old_insert_date: old_insert_date} do user = insert(:user) {:ok, old_object_activity} = CommonAPI.post(user, %{status: "yadayada"}) @@ -351,6 +419,85 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts" assert length(Repo.all(Object)) == 1 end + defp prepare_keep_followed_test(old_insert_date) do + remote_user = insert(:user, local: false) + local_user = insert(:user, local: true) + third_party = insert(:user, local: false) + + CommonAPI.follow(local_user, remote_user) + CommonAPI.accept_follow_request(local_user, remote_user) + + assert :follow_accept == Pleroma.FollowingRelationship.get(local_user, remote_user).state + + {:ok, old_remote_post_activity} = + CommonAPI.post(remote_user, %{status: "some thing", local: false}) + + old_remote_post_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + old_remote_post_activity.object + |> 
Ecto.Changeset.change(%{updated_at: old_insert_date}) + |> Repo.update!() + + {:ok, old_liked_post_activity} = + CommonAPI.post(third_party, %{status: "boo!", local: false}) + + {:ok, old_like_activity} = CommonAPI.favorite(remote_user, old_liked_post_activity.id) + + old_liked_post_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + old_liked_post_activity.object + |> Ecto.Changeset.change(%{updated_at: old_insert_date}) + |> Repo.update!() + + old_like_activity + |> Ecto.Changeset.change(%{local: false, updated_at: old_insert_date}) + |> Repo.update!() + + assert length(Repo.all(Object)) == 2 + + {old_remote_post_activity.object.id, old_liked_post_activity.object.id} + end + + test "by default does not keep posts of followed users", %{ + old_insert_date: old_insert_date + } do + _ = prepare_keep_followed_test(old_insert_date) + Mix.Tasks.Pleroma.Database.run(["prune_objects"]) + assert length(Repo.all(Object)) == 0 + end + + test "with the --keep-followed posts option it keeps old posts of followed users", %{ + old_insert_date: old_insert_date + } do + {old_remote_post_id, old_liked_post_id} = + prepare_keep_followed_test(old_insert_date) + + Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-followed", "posts"]) + + assert length(Repo.all(Object)) == 1 + assert Object.get_by_id(old_remote_post_id) + refute Object.get_by_id(old_liked_post_id) + end + + test "with the --keep-followed full option it keeps old posts liked by a followed user", %{ + old_insert_date: old_insert_date + } do + _ = prepare_keep_followed_test(old_insert_date) + + Mix.Tasks.Pleroma.Database.run([ + "prune_objects", + "--keep-followed", + "full", + "--keep-threads" + ]) + + assert length(Repo.all(Object)) == 2 + end + test "We don't have unexpected tables which may contain objects that are referenced by activities" do # We can delete orphaned activities. 
For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we