Merge pull request 'dbprune: add --keep-followed and don't prune pinned posts by default' (#897) from Oneric/akkoma:mix-prune_newopts into develop

Reviewed-on: https://akkoma.dev/AkkomaGang/akkoma/pulls/897
This commit is contained in:
Oneric 2025-05-09 21:31:36 +00:00
commit 7e5a5db63d
3 changed files with 336 additions and 111 deletions

View file

@ -48,10 +48,15 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
### Options ### Options
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also wont delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity). - `--keep-followed <mode>` - If set to `posts` all posts and boosts of users with local follows will be kept.
If set to `full` it will additionally keep any posts such users interacted with; this requires `--keep-threads`.
By default this is set to `none` and followed users are not treated specially.
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in the thread has seen recent activity or is kept due to `--keep-followed`.
- `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote. - `--keep-non-public` - Keep non-public posts like DM's and followers-only, even if they are remote.
- `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task. - `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task.
- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size. - `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size.
- `--prune-pinned` - Also prune pinned posts; keeping pinned posts does not suffice to protect their threads from pruning, even when using `--keep-threads`.
Note, if using this option and pinned posts are pruned, they and their threads will just be refetched on the next user update. Therefore it usually doesn't bring much gain while incurring a heavy fetch load after pruning.
- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning. - `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning.
## Prune orphaned activities from the database ## Prune orphaned activities from the database

View file

@ -20,6 +20,10 @@ defmodule Mix.Tasks.Pleroma.Database do
@shortdoc "A collection of database related tasks" @shortdoc "A collection of database related tasks"
@moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md") @moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md")
# Returns `str` with `appendix` appended when `condition` is truthy;
# for a `nil` or `false` condition `str` is returned untouched.
defp maybe_concat(str, condition, _appendix) when condition in [nil, false], do: str
defp maybe_concat(str, _condition, appendix), do: str <> appendix
defp maybe_limit(query, limit_cnt) do defp maybe_limit(query, limit_cnt) do
if is_number(limit_cnt) and limit_cnt > 0 do if is_number(limit_cnt) and limit_cnt > 0 do
limit(query, [], ^limit_cnt) limit(query, [], ^limit_cnt)
@ -116,6 +120,149 @@ def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do
del_single + del_array del_single + del_array
end end
# Builds (without running) a query that yields one `%{ap_id: key}` row for
# every key found in the `pinned_objects` JSONB column of any user.
defp query_pinned_object_apids() do
  select(Pleroma.User, [u], %{ap_id: fragment("jsonb_object_keys(?)", u.pinned_objects)})
end
# Builds (without running) a query selecting the database ids of all objects
# whose `data->>'id'` appears among the pinned-object keys of any user.
defp query_pinned_object_ids() do
  # This extra subquery level is deliberate: when the AP ids are supplied
  # directly to the final query, PostgreSQL(17)'s planner appears to get
  # overexerted and produces a very inefficient plan with enormous memory and
  # time consumption. Handing it database ids instead ends up quite cheap.
  pinned_apids = query_pinned_object_apids()

  Object
  |> select([o], o.id)
  |> where([o], fragment("?->>'id' IN ?", o.data, subquery(pinned_apids)))
end
# Builds (without running) a query yielding `%{ap_id: ...}` for every non-local
# user that is the target of an accepted follow by a local user.
defp query_followed_remote_user_apids() do
  Pleroma.FollowingRelationship
  |> join(:inner, [relationship], followed in User,
    on: relationship.following_id == followed.id
  )
  |> join(:inner, [relationship], follower in User,
    on: relationship.follower_id == follower.id
  )
  # Only accepted follows count
  |> where([relationship], relationship.state == :follow_accept)
  # The follower must be local, the followed account must not be
  |> where([_relationship, followed, follower], follower.local and not followed.local)
  |> select([_relationship, followed], %{ap_id: followed.ap_id})
end
# Translates the raw `:keep_followed` option string into the internal mode:
# "full" => :full, "posts" => :posts, "none" or an absent option => false.
# Raises a RuntimeError for any other value.
defp parse_keep_followed_arg(options) do
  mode = Keyword.get(options, :keep_followed)

  case mode do
    "posts" -> :posts
    "full" -> :full
    disabled when disabled in [nil, "none"] -> false
    _invalid -> raise "Invalid argument for keep_followed! Must be 'full', 'posts' or 'none'"
  end
end
# Adds a HAVING clause to a grouped activity query depending on the already
# parsed `:keep_followed` mode found in `options`:
#
#   * `:full`  - a group only passes if none of its activities has an actor
#                among the followed remote users
#   * `:posts` - a group only passes if it has no Create or Announce activity
#                whose actor is among the followed remote users
#   * anything else - the query is returned unchanged
defp maybe_restrict_followed_activities(query, options) do
  mode = Keyword.get(options, :keep_followed)

  case mode do
    :posts ->
      having(
        query,
        [activity],
        not fragment(
          "bool_or(?->>'actor' IN ? AND ?->>'type' = ANY('{Create,Announce}'))",
          activity.data,
          subquery(query_followed_remote_user_apids()),
          activity.data
        )
      )

    :full ->
      having(
        query,
        [activity],
        fragment(
          "bool_and(?->>'actor' NOT IN ?)",
          activity.data,
          subquery(query_followed_remote_user_apids())
        )
      )

    _disabled ->
      query
  end
end
# Builds (without running) the query over objects that may be deleted while
# keeping threads intact. Activities are grouped by their context and a
# context only qualifies for deletion when
# 1. its most recently updated activity is older than `time_deadline`
# 2. none of its activities is local
# 3. none of its activities is bookmarked
# 4. optionally (`:keep_non_public`) none of its posts is non-public
# 5. it passes the `:keep_followed` restriction, if one is set
#
# `limit_cnt` is applied to the context query, i.e. before objects are looked up.
defp deletable_objects_keeping_threads(time_deadline, limit_cnt, options) do
  activities_by_context =
    Pleroma.Activity
    |> join(:left, [activity], bookmark in Pleroma.Bookmark,
      on: activity.id == bookmark.activity_id
    )
    |> group_by([activity], fragment("? ->> 'context'::text", activity.data))

  visibility_checked =
    if Keyword.get(options, :keep_non_public) do
      having(
        activities_by_context,
        [activity],
        not fragment(
          # True if any post (checked via its Create activity) is non-public
          "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')",
          activity.data,
          ^Pleroma.Constants.as_public(),
          activity.data,
          ^Pleroma.Constants.as_public(),
          activity.data
        )
      )
    else
      activities_by_context
    end

  deletable_context =
    visibility_checked
    |> having([activity], max(activity.updated_at) < ^time_deadline)
    |> having([activity], not fragment("bool_or(?)", activity.local))
    |> having([_, bookmark], fragment("max(?::text) is null", bookmark.id))
    |> maybe_restrict_followed_activities(options)
    |> maybe_limit(limit_cnt)
    |> select([activity], fragment("? ->> 'context'::text", activity.data))

  where(
    Pleroma.Object,
    [o],
    fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)
  )
end
# Builds (without running) the query over objects that may be deleted without
# regard for thread integrity. An object qualifies when
# 1. optionally (`:keep_non_public`) it is addressed to the public collection
# 2. it was last updated before `time_deadline`
# 3. the host part of its actor differs from this instance's endpoint host
# 4. optionally (`:keep_followed`) its actor is not a followed remote user
#
# `limit_cnt` is applied to the inner id query.
defp deletable_objects_breaking_threads(time_deadline, limit_cnt, options) do
  visibility_checked =
    if Keyword.get(options, :keep_non_public) do
      where(
        Pleroma.Object,
        [o],
        fragment(
          "?->'to' \\? ? OR ?->'cc' \\? ?",
          o.data,
          ^Pleroma.Constants.as_public(),
          o.data,
          ^Pleroma.Constants.as_public()
        )
      )
    else
      Pleroma.Object
    end

  old_remote =
    visibility_checked
    |> where([o], o.updated_at < ^time_deadline)
    |> where(
      [o],
      fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
    )

  follow_checked =
    if Keyword.get(options, :keep_followed) do
      where(
        old_remote,
        [o],
        fragment("?->>'actor'", o.data) not in subquery(query_followed_remote_user_apids())
      )
    else
      old_remote
    end

  deletable =
    follow_checked
    |> maybe_limit(limit_cnt)
    |> select([o], o.id)

  where(Pleroma.Object, [o], o.id in subquery(deletable))
end
def run(["remove_embedded_objects" | args]) do def run(["remove_embedded_objects" | args]) do
{options, [], []} = {options, [], []} =
OptionParser.parse( OptionParser.parse(
@ -173,16 +320,9 @@ def run(["prune_orphaned_activities" | args]) do
{limit, options} = Keyword.pop(options, :limit, 0) {limit, options} = Keyword.pop(options, :limit, 0)
log_message = "Pruning orphaned activities" "Pruning orphaned activities"
|> maybe_concat(limit > 0, ", limiting deletion to #{limit} rows")
log_message = |> Logger.info()
if limit > 0 do
log_message <> ", limiting deletion to #{limit} rows"
else
log_message
end
Logger.info(log_message)
deleted = prune_orphaned_activities(limit, options) deleted = prune_orphaned_activities(limit, options)
@ -195,13 +335,22 @@ def run(["prune_objects" | args]) do
args, args,
strict: [ strict: [
vacuum: :boolean, vacuum: :boolean,
keep_followed: :string,
keep_threads: :boolean, keep_threads: :boolean,
keep_non_public: :boolean, keep_non_public: :boolean,
prune_orphaned_activities: :boolean, prune_orphaned_activities: :boolean,
prune_pinned: :boolean,
limit: :integer limit: :integer
] ]
) )
kf = parse_keep_followed_arg(options)
options = Keyword.put(options, :keep_followed, kf)
if kf == :full and not Keyword.get(options, :keep_threads) do
raise "keep_followed=full only works in conjunction with keep_thread!"
end
start_pleroma() start_pleroma()
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days]) deadline = Pleroma.Config.get([:instance, :remote_post_retention_days])
@ -209,111 +358,35 @@ def run(["prune_objects" | args]) do
limit_cnt = Keyword.get(options, :limit, 0) limit_cnt = Keyword.get(options, :limit, 0)
log_message = "Pruning objects older than #{deadline} days" "Pruning objects older than #{deadline} days"
|> maybe_concat(Keyword.get(options, :keep_non_public), ", keeping non public posts")
log_message = |> maybe_concat(Keyword.get(options, :keep_threads), ", keeping threads intact")
if Keyword.get(options, :keep_non_public) do |> maybe_concat(kf, ", keeping #{kf} activities of followed users")
log_message <> ", keeping non public posts" |> maybe_concat(Keyword.get(options, :prune_pinned), ", pruning pinned posts")
else |> maybe_concat(
log_message Keyword.get(options, :prune_orphaned_activities),
end ", pruning orphaned activities"
)
log_message = |> maybe_concat(
if Keyword.get(options, :keep_threads) do Keyword.get(options, :vacuum),
log_message <> ", keeping threads intact" ", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
else )
log_message |> maybe_concat(limit_cnt > 0, ", limiting to #{limit_cnt} rows")
end |> Logger.info()
log_message =
if Keyword.get(options, :prune_orphaned_activities) do
log_message <> ", pruning orphaned activities"
else
log_message
end
log_message =
if Keyword.get(options, :vacuum) do
log_message <>
", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
else
log_message
end
log_message =
if limit_cnt > 0 do
log_message <> ", limiting to #{limit_cnt} rows"
else
log_message
end
Logger.info(log_message)
{del_obj, _} = {del_obj, _} =
if Keyword.get(options, :keep_threads) do if Keyword.get(options, :keep_threads) do
# We want to delete objects from threads where deletable_objects_keeping_threads(time_deadline, limit_cnt, options)
# 1. the newest post is still old
# 2. none of the activities is local
# 3. none of the activities is bookmarked
# 4. optionally none of the posts is non-public
deletable_context =
if Keyword.get(options, :keep_non_public) do
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
|> having(
[a],
not fragment(
# Posts (checked on Create Activity) is non-public
"bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')",
a.data,
^Pleroma.Constants.as_public(),
a.data,
^Pleroma.Constants.as_public(),
a.data
)
)
else
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
end
|> having([a], max(a.updated_at) < ^time_deadline)
|> having([a], not fragment("bool_or(?)", a.local))
|> having([_, b], fragment("max(?::text) is null", b.id))
|> maybe_limit(limit_cnt)
|> select([a], fragment("? ->> 'context'::text", a.data))
Pleroma.Object
|> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context))
else else
deletable = deletable_objects_breaking_threads(time_deadline, limit_cnt, options)
if Keyword.get(options, :keep_non_public) do
Pleroma.Object
|> where(
[o],
fragment(
"?->'to' \\? ? OR ?->'cc' \\? ?",
o.data,
^Pleroma.Constants.as_public(),
o.data,
^Pleroma.Constants.as_public()
)
)
else
Pleroma.Object
end
|> where([o], o.updated_at < ^time_deadline)
|> where(
[o],
fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
)
|> maybe_limit(limit_cnt)
|> select([o], o.id)
Pleroma.Object
|> where([o], o.id in subquery(deletable))
end end
|> then(fn q ->
if Keyword.get(options, :prune_pinned) do
q
else
where(q, [o], o.id not in subquery(query_pinned_object_ids()))
end
end)
|> Repo.delete_all(timeout: :infinity) |> Repo.delete_all(timeout: :infinity)
Logger.info("Deleted #{del_obj} objects...") Logger.info("Deleted #{del_obj} objects...")

View file

@ -88,6 +88,74 @@ test "it prunes old objects from the database", %{old_insert_date: old_insert_da
refute Object.get_by_id(note_remote_non_public_id) refute Object.get_by_id(note_remote_non_public_id)
end end
test "it retains pinned posts by default", %{old_insert_date: old_insert_date} do
  # A note that is not back-dated; it is expected to survive pruning.
  insert(:note)

  pinning_user = insert(:user, local: false)
  pinned_note = insert(:note, user: pinning_user)

  # Back-date the note and pin it for its author.
  %{id: pinned_id, data: pinned_data} =
    pinned_note
    |> Ecto.Changeset.change(%{updated_at: old_insert_date})
    |> Repo.update!()

  User.add_pinned_object_id(pinning_user, pinned_data["id"])

  # A back-dated, unpinned note with an emptied "to" field; expected to be pruned.
  %{id: unpinned_id, data: unpinned_data} = unpinned_note = insert(:note)

  unpinned_note
  |> Ecto.Changeset.change(%{
    updated_at: old_insert_date,
    data: Map.put(unpinned_data, "to", [])
  })
  |> Repo.update!()

  assert length(Repo.all(Object)) == 3

  Mix.Tasks.Pleroma.Database.run(["prune_objects"])

  assert length(Repo.all(Object)) == 2
  assert Object.get_by_id(pinned_id)
  refute Object.get_by_id(unpinned_id)
end
test "it prunes pinned posts with --prune-pinned", %{old_insert_date: old_insert_date} do
  # A note that is not back-dated; it is expected to survive pruning.
  insert(:note)

  pinning_user = insert(:user, local: false)
  pinned_note = insert(:note, user: pinning_user)

  # Back-date the note and pin it for its author.
  %{id: pinned_id, data: pinned_data} =
    pinned_note
    |> Ecto.Changeset.change(%{updated_at: old_insert_date})
    |> Repo.update!()

  User.add_pinned_object_id(pinning_user, pinned_data["id"])

  # A back-dated, unpinned note with an emptied "to" field.
  %{id: unpinned_id, data: unpinned_data} = unpinned_note = insert(:note)

  unpinned_note
  |> Ecto.Changeset.change(%{
    updated_at: old_insert_date,
    data: Map.put(unpinned_data, "to", [])
  })
  |> Repo.update!()

  assert length(Repo.all(Object)) == 3

  Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-pinned"])

  # With --prune-pinned both back-dated notes are expected to be gone.
  assert length(Repo.all(Object)) == 1
  refute Object.get_by_id(pinned_id)
  refute Object.get_by_id(unpinned_id)
end
test "it cleans up bookmarks", %{old_insert_date: old_insert_date} do test "it cleans up bookmarks", %{old_insert_date: old_insert_date} do
user = insert(:user) user = insert(:user)
{:ok, old_object_activity} = CommonAPI.post(user, %{status: "yadayada"}) {:ok, old_object_activity} = CommonAPI.post(user, %{status: "yadayada"})
@ -351,6 +419,85 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts"
assert length(Repo.all(Object)) == 1 assert length(Repo.all(Object)) == 1
end end
# Shared fixture for the --keep-followed tests. Sets up a remote user followed
# by a local user plus an unrelated remote user, then creates and back-dates
#   * a post authored by the followed remote user
#   * a post authored by the unrelated remote user, favourited by the followed one
# Returns `{followed_users_post_object_id, favourited_post_object_id}`.
defp prepare_keep_followed_test(old_insert_date) do
  followed_remote = insert(:user, local: false)
  local_follower = insert(:user, local: true)
  unrelated_remote = insert(:user, local: false)

  # Applies `changes` plus the old timestamp to a record and persists it.
  backdate = fn record, changes ->
    record
    |> Ecto.Changeset.change(Map.put(changes, :updated_at, old_insert_date))
    |> Repo.update!()
  end

  CommonAPI.follow(local_follower, followed_remote)
  CommonAPI.accept_follow_request(local_follower, followed_remote)

  relationship = Pleroma.FollowingRelationship.get(local_follower, followed_remote)
  assert relationship.state == :follow_accept

  {:ok, followed_post} = CommonAPI.post(followed_remote, %{status: "some thing", local: false})

  backdate.(followed_post, %{local: false})
  backdate.(followed_post.object, %{})

  {:ok, liked_post} = CommonAPI.post(unrelated_remote, %{status: "boo!", local: false})
  {:ok, like} = CommonAPI.favorite(followed_remote, liked_post.id)

  backdate.(liked_post, %{local: false})
  backdate.(liked_post.object, %{})
  backdate.(like, %{local: false})

  assert length(Repo.all(Object)) == 2

  {followed_post.object.id, liked_post.object.id}
end
test "by default does not keep posts of followed users", %{
  old_insert_date: old_insert_date
} do
  # The fixture's object ids are not needed here; everything should be pruned.
  prepare_keep_followed_test(old_insert_date)

  Mix.Tasks.Pleroma.Database.run(["prune_objects"])

  assert Repo.all(Object) == []
end
test "with the --keep-followed posts option it keeps old posts of followed users", %{
  old_insert_date: old_insert_date
} do
  {followed_post_id, liked_post_id} = prepare_keep_followed_test(old_insert_date)

  Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-followed", "posts"])

  # Only the followed user's own post is expected to remain; the post they
  # merely favourited is expected to be pruned.
  remaining = Repo.all(Object)
  assert length(remaining) == 1
  assert Object.get_by_id(followed_post_id)
  refute Object.get_by_id(liked_post_id)
end
test "with the --keep-followed full option it keeps old posts liked by a followed user", %{
  old_insert_date: old_insert_date
} do
  prepare_keep_followed_test(old_insert_date)

  # "full" is combined with --keep-threads here
  args = ["prune_objects", "--keep-followed", "full", "--keep-threads"]
  Mix.Tasks.Pleroma.Database.run(args)

  # Both fixture objects are expected to be kept.
  remaining = Repo.all(Object)
  assert length(remaining) == 2
end
test "We don't have unexpected tables which may contain objects that are referenced by activities" do test "We don't have unexpected tables which may contain objects that are referenced by activities" do
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table. # We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we # If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we