Merge pull request 'dbprune: add --keep-followed and don't prune pinned posts by default' (#897) from Oneric/akkoma:mix-prune_newopts into develop

Reviewed-on: https://akkoma.dev/AkkomaGang/akkoma/pulls/897
This commit is contained in:
Oneric 2025-05-09 21:31:36 +00:00
commit 7e5a5db63d
3 changed files with 336 additions and 111 deletions

View file

@ -48,10 +48,15 @@ This will prune remote posts older than 90 days (configurable with [`config :ple
### Options
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in that thread is kept (e.g. because one of the posts has seen recent activity).
- `--keep-followed <mode>` - If set to `posts` all posts and boosts of users with local follows will be kept.
If set to `full` it will additionally keep any posts such users interacted with; this requires `--keep-threads`.
By default this is set to `none` and followed users are not treated specially.
- `--keep-threads` - Don't prune posts when they are part of a thread where at least one post has seen local interaction (e.g. one of the posts is a local post, or is favourited by a local user, or has been repeated by a local user...). It also won't delete posts when at least one of the posts in the thread has seen recent activity or is kept due to `--keep-followed`.
- `--keep-non-public` - Keep non-public posts like DMs and followers-only posts, even if they are remote.
- `--limit` - limits how many remote posts get pruned. This limit does **not** apply to any of the follow up jobs. If wanting to keep the database load in check it is thus advisable to run the standalone `prune_orphaned_activities` task with a limit afterwards instead of passing `--prune-orphaned-activities` to this task.
- `--prune-orphaned-activities` - Also prune orphaned activities afterwards. Activities are things like Like, Create, Announce, Flag (aka reports)... They can significantly help reduce the database size.
- `--prune-pinned` - Also prune pinned posts; keeping pinned posts does not suffice to protect their threads from pruning, even when using `--keep-threads`.
Note, if using this option and pinned posts are pruned, they and their threads will just be refetched on the next user update. Therefore it usually doesn't bring much gain while incurring a heavy fetch load after pruning.
- `--vacuum` - Run `VACUUM FULL` after the objects are pruned. This should not be used on a regular basis, but is useful if your instance has been running for a long time before pruning.
## Prune orphaned activities from the database

View file

@ -20,6 +20,10 @@ defmodule Mix.Tasks.Pleroma.Database do
@shortdoc "A collection of database related tasks"
@moduledoc File.read!("docs/docs/administration/CLI_tasks/database.md")
# Returns `str` with `appendix` appended when `condition` is truthy,
# and `str` unchanged when `condition` is `nil` or `false`.
defp maybe_concat(str, condition, appendix) do
  case condition do
    falsy when falsy in [nil, false] -> str
    _truthy -> str <> appendix
  end
end
defp maybe_limit(query, limit_cnt) do
if is_number(limit_cnt) and limit_cnt > 0 do
limit(query, [], ^limit_cnt)
@ -116,6 +120,149 @@ def prune_orphaned_activities(limit \\ 0, opts \\ []) when is_number(limit) do
del_single + del_array
end
# Query over all users yielding the keys of each user's `pinned_objects` JSONB map,
# one row per key, exposed under the `ap_id` field.
defp query_pinned_object_apids() do
  select(
    Pleroma.User,
    [user],
    %{ap_id: fragment("jsonb_object_keys(?)", user.pinned_objects)}
  )
end
# Query yielding the database ids of all objects whose `data->>'id'` is among the
# keys returned by `query_pinned_object_apids/0`.
defp query_pinned_object_ids() do
  # This extra level of subquery is deliberate: if it is omitted and AP ids are
  # supplied directly to the final query, it appears to overexert PostgreSQL(17)'s
  # planner, leading to a very inefficient query with enormous memory and time
  # consumption. Supplying database IDs instead ends up quite cheap.
  pinned_apids = query_pinned_object_apids()

  Object
  |> where([obj], fragment("?->>'id' IN ?", obj.data, subquery(pinned_apids)))
  |> select([obj], obj.id)
end
# Query yielding the AP ids (field `ap_id`) of non-local users that are followed by
# a local user through a relationship in the `:follow_accept` state.
defp query_followed_remote_user_apids() do
  Pleroma.FollowingRelationship
  |> join(:inner, [rel], followed in User, on: rel.following_id == followed.id)
  |> join(:inner, [rel], follower in User, on: rel.follower_id == follower.id)
  |> where([rel], rel.state == :follow_accept)
  |> where([_rel, followed, follower], follower.local and not followed.local)
  |> select([_rel, followed], %{ap_id: followed.ap_id})
end
# Translates the raw `:keep_followed` option string into its internal form:
# "full" -> :full, "posts" -> :posts, "none" or an absent/nil value -> false.
# Raises a RuntimeError for any other value.
defp parse_keep_followed_arg(options) do
  case Keyword.fetch(options, :keep_followed) do
    {:ok, "full"} ->
      :full

    {:ok, "posts"} ->
      :posts

    {:ok, mode} when mode in ["none", nil] ->
      false

    :error ->
      false

    {:ok, _invalid} ->
      raise "Invalid argument for keep_followed! Must be 'full', 'posts' or 'none'"
  end
end
# Optionally adds a HAVING restriction to an activity query, depending on the
# `:keep_followed` entry of `options`:
#   * `:full`  - every activity in the group must have an actor that is not among
#                the users from `query_followed_remote_user_apids/0`
#   * `:posts` - the group must not contain a Create or Announce activity whose
#                actor is among those users
#   * otherwise the query is returned untouched
# NOTE(review): presumably `query` must already be grouped, as done by the caller
# in this file — confirm before reusing elsewhere.
defp maybe_restrict_followed_activities(query, options) do
  keep_mode = Keyword.get(options, :keep_followed)

  cond do
    keep_mode == :full ->
      followed_apids = query_followed_remote_user_apids()

      having(
        query,
        [activity],
        fragment(
          "bool_and(?->>'actor' NOT IN ?)",
          activity.data,
          subquery(followed_apids)
        )
      )

    keep_mode == :posts ->
      followed_apids = query_followed_remote_user_apids()

      having(
        query,
        [activity],
        not fragment(
          "bool_or(?->>'actor' IN ? AND ?->>'type' = ANY('{Create,Announce}'))",
          activity.data,
          subquery(followed_apids),
          activity.data
        )
      )

    true ->
      query
  end
end
# Builds the query of objects to delete while keeping threads intact.
#
# Activities are grouped by their context; a context is deletable when
#   1. the newest activity in it is older than `time_deadline`
#   2. none of its activities is local
#   3. none of its activities is bookmarked
#   4. with `:keep_non_public`, none of its Create activities is non-public
#   5. it passes the `:keep_followed` restriction, if any
# `limit_cnt` limits the number of contexts, not the number of objects.
# Returns a query for all objects belonging to a deletable context.
defp deletable_objects_keeping_threads(time_deadline, limit_cnt, options) do
  grouped_by_context =
    Pleroma.Activity
    |> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
    |> group_by([a], fragment("? ->> 'context'::text", a.data))

  publicity_checked =
    if Keyword.get(options, :keep_non_public) do
      having(
        grouped_by_context,
        [a],
        not fragment(
          # True if any post (checked on its Create activity) is non-public
          "bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')",
          a.data,
          ^Pleroma.Constants.as_public(),
          a.data,
          ^Pleroma.Constants.as_public(),
          a.data
        )
      )
    else
      grouped_by_context
    end

  deletable_context =
    publicity_checked
    |> having([a], max(a.updated_at) < ^time_deadline)
    |> having([a], not fragment("bool_or(?)", a.local))
    |> having([_, b], fragment("max(?::text) is null", b.id))
    |> maybe_restrict_followed_activities(options)
    |> maybe_limit(limit_cnt)
    |> select([a], fragment("? ->> 'context'::text", a.data))

  where(
    Pleroma.Object,
    [o],
    fragment("? ->> 'context'::text", o.data) in subquery(deletable_context)
  )
end
# Builds the query of objects to delete when threads need not stay intact.
#
# An object is deletable when it was last updated before `time_deadline` and the
# host part of its actor differs from this instance's endpoint host. Additionally:
#   * with `:keep_non_public`, only objects addressed to the public constant in
#     `to` or `cc` are considered
#   * with a truthy `:keep_followed`, objects whose actor is among the users from
#     `query_followed_remote_user_apids/0` are excluded
# `limit_cnt` limits the number of selected object ids.
defp deletable_objects_breaking_threads(time_deadline, limit_cnt, options) do
  candidates =
    if Keyword.get(options, :keep_non_public) do
      where(
        Pleroma.Object,
        [o],
        fragment(
          "?->'to' \\? ? OR ?->'cc' \\? ?",
          o.data,
          ^Pleroma.Constants.as_public(),
          o.data,
          ^Pleroma.Constants.as_public()
        )
      )
    else
      Pleroma.Object
    end

  old_remote =
    candidates
    |> where([o], o.updated_at < ^time_deadline)
    |> where(
      [o],
      fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
    )

  unprotected =
    if Keyword.get(options, :keep_followed) do
      where(
        old_remote,
        [o],
        fragment("?->>'actor'", o.data) not in subquery(query_followed_remote_user_apids())
      )
    else
      old_remote
    end

  deletable =
    unprotected
    |> maybe_limit(limit_cnt)
    |> select([o], o.id)

  where(Pleroma.Object, [o], o.id in subquery(deletable))
end
def run(["remove_embedded_objects" | args]) do
{options, [], []} =
OptionParser.parse(
@ -173,16 +320,9 @@ def run(["prune_orphaned_activities" | args]) do
{limit, options} = Keyword.pop(options, :limit, 0)
log_message = "Pruning orphaned activities"
log_message =
if limit > 0 do
log_message <> ", limiting deletion to #{limit} rows"
else
log_message
end
Logger.info(log_message)
"Pruning orphaned activities"
|> maybe_concat(limit > 0, ", limiting deletion to #{limit} rows")
|> Logger.info()
deleted = prune_orphaned_activities(limit, options)
@ -195,13 +335,22 @@ def run(["prune_objects" | args]) do
args,
strict: [
vacuum: :boolean,
keep_followed: :string,
keep_threads: :boolean,
keep_non_public: :boolean,
prune_orphaned_activities: :boolean,
prune_pinned: :boolean,
limit: :integer
]
)
kf = parse_keep_followed_arg(options)
options = Keyword.put(options, :keep_followed, kf)
if kf == :full and not Keyword.get(options, :keep_threads) do
raise "keep_followed=full only works in conjunction with keep_thread!"
end
start_pleroma()
deadline = Pleroma.Config.get([:instance, :remote_post_retention_days])
@ -209,111 +358,35 @@ def run(["prune_objects" | args]) do
limit_cnt = Keyword.get(options, :limit, 0)
log_message = "Pruning objects older than #{deadline} days"
log_message =
if Keyword.get(options, :keep_non_public) do
log_message <> ", keeping non public posts"
else
log_message
end
log_message =
if Keyword.get(options, :keep_threads) do
log_message <> ", keeping threads intact"
else
log_message
end
log_message =
if Keyword.get(options, :prune_orphaned_activities) do
log_message <> ", pruning orphaned activities"
else
log_message
end
log_message =
if Keyword.get(options, :vacuum) do
log_message <>
"Pruning objects older than #{deadline} days"
|> maybe_concat(Keyword.get(options, :keep_non_public), ", keeping non public posts")
|> maybe_concat(Keyword.get(options, :keep_threads), ", keeping threads intact")
|> maybe_concat(kf, ", keeping #{kf} activities of followed users")
|> maybe_concat(Keyword.get(options, :prune_pinned), ", pruning pinned posts")
|> maybe_concat(
Keyword.get(options, :prune_orphaned_activities),
", pruning orphaned activities"
)
|> maybe_concat(
Keyword.get(options, :vacuum),
", doing a full vacuum (you shouldn't do this as a recurring maintanance task)"
else
log_message
end
log_message =
if limit_cnt > 0 do
log_message <> ", limiting to #{limit_cnt} rows"
else
log_message
end
Logger.info(log_message)
)
|> maybe_concat(limit_cnt > 0, ", limiting to #{limit_cnt} rows")
|> Logger.info()
{del_obj, _} =
if Keyword.get(options, :keep_threads) do
# We want to delete objects from threads where
# 1. the newest post is still old
# 2. none of the activities is local
# 3. none of the activities is bookmarked
# 4. optionally none of the posts is non-public
deletable_context =
if Keyword.get(options, :keep_non_public) do
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
|> having(
[a],
not fragment(
# Posts (checked on Create Activity) is non-public
"bool_or((not(?->'to' \\? ? OR ?->'cc' \\? ?)) and ? ->> 'type' = 'Create')",
a.data,
^Pleroma.Constants.as_public(),
a.data,
^Pleroma.Constants.as_public(),
a.data
)
)
deletable_objects_keeping_threads(time_deadline, limit_cnt, options)
else
Pleroma.Activity
|> join(:left, [a], b in Pleroma.Bookmark, on: a.id == b.activity_id)
|> group_by([a], fragment("? ->> 'context'::text", a.data))
deletable_objects_breaking_threads(time_deadline, limit_cnt, options)
end
|> having([a], max(a.updated_at) < ^time_deadline)
|> having([a], not fragment("bool_or(?)", a.local))
|> having([_, b], fragment("max(?::text) is null", b.id))
|> maybe_limit(limit_cnt)
|> select([a], fragment("? ->> 'context'::text", a.data))
Pleroma.Object
|> where([o], fragment("? ->> 'context'::text", o.data) in subquery(deletable_context))
|> then(fn q ->
if Keyword.get(options, :prune_pinned) do
q
else
deletable =
if Keyword.get(options, :keep_non_public) do
Pleroma.Object
|> where(
[o],
fragment(
"?->'to' \\? ? OR ?->'cc' \\? ?",
o.data,
^Pleroma.Constants.as_public(),
o.data,
^Pleroma.Constants.as_public()
)
)
else
Pleroma.Object
end
|> where([o], o.updated_at < ^time_deadline)
|> where(
[o],
fragment("split_part(?->>'actor', '/', 3) != ?", o.data, ^Pleroma.Web.Endpoint.host())
)
|> maybe_limit(limit_cnt)
|> select([o], o.id)
Pleroma.Object
|> where([o], o.id in subquery(deletable))
where(q, [o], o.id not in subquery(query_pinned_object_ids()))
end
end)
|> Repo.delete_all(timeout: :infinity)
Logger.info("Deleted #{del_obj} objects...")

View file

@ -88,6 +88,74 @@ test "it prunes old objects from the database", %{old_insert_date: old_insert_da
refute Object.get_by_id(note_remote_non_public_id)
end
test "it retains pinned posts by default", %{old_insert_date: old_insert_date} do
  # A note left at its insertion timestamp; it is expected to survive the prune
  insert(:note)

  # An old note pinned by its remote author
  pinning_user = insert(:user, local: false)

  pinned_note =
    insert(:note, user: pinning_user)
    |> Ecto.Changeset.change(%{updated_at: old_insert_date})
    |> Repo.update!()

  User.add_pinned_object_id(pinning_user, pinned_note.data["id"])

  # An old, unpinned note with emptied "to" addressing
  unpinned_note = insert(:note)

  unpinned_note
  |> Ecto.Changeset.change(%{
    updated_at: old_insert_date,
    data: Map.put(unpinned_note.data, "to", [])
  })
  |> Repo.update!()

  assert length(Repo.all(Object)) == 3

  Mix.Tasks.Pleroma.Database.run(["prune_objects"])

  # Only the unpinned old note is gone
  assert length(Repo.all(Object)) == 2
  assert Object.get_by_id(pinned_note.id)
  refute Object.get_by_id(unpinned_note.id)
end
test "it prunes pinned posts with --prune-pinned", %{old_insert_date: old_insert_date} do
  # A note left at its insertion timestamp; it is expected to survive the prune
  insert(:note)

  # An old note pinned by its remote author
  pinning_user = insert(:user, local: false)

  pinned_note =
    insert(:note, user: pinning_user)
    |> Ecto.Changeset.change(%{updated_at: old_insert_date})
    |> Repo.update!()

  User.add_pinned_object_id(pinning_user, pinned_note.data["id"])

  # An old, unpinned note with emptied "to" addressing
  unpinned_note = insert(:note)

  unpinned_note
  |> Ecto.Changeset.change(%{
    updated_at: old_insert_date,
    data: Map.put(unpinned_note.data, "to", [])
  })
  |> Repo.update!()

  assert length(Repo.all(Object)) == 3

  Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-pinned"])

  # Both old notes are gone, pinned or not
  assert length(Repo.all(Object)) == 1
  refute Object.get_by_id(pinned_note.id)
  refute Object.get_by_id(unpinned_note.id)
end
test "it cleans up bookmarks", %{old_insert_date: old_insert_date} do
user = insert(:user)
{:ok, old_object_activity} = CommonAPI.post(user, %{status: "yadayada"})
@ -351,6 +419,85 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts"
assert length(Repo.all(Object)) == 1
end
# Shared fixture for the --keep-followed tests.
#
# Creates a remote user followed (and accepted) by a local user, an old post by that
# remote user, and an old post by an unrelated remote user which the followed user
# favourited. All involved activities and objects are marked non-local and backdated
# to `old_insert_date`.
# Returns `{followed_user_post_object_id, liked_post_object_id}`.
defp prepare_keep_followed_test(old_insert_date) do
  remote_user = insert(:user, local: false)
  local_user = insert(:user, local: true)
  third_party = insert(:user, local: false)

  # Applies `changes` plus the old `updated_at` timestamp to a stored record
  backdate = fn record, changes ->
    record
    |> Ecto.Changeset.change(Map.put(changes, :updated_at, old_insert_date))
    |> Repo.update!()
  end

  CommonAPI.follow(local_user, remote_user)
  CommonAPI.accept_follow_request(local_user, remote_user)
  assert :follow_accept == Pleroma.FollowingRelationship.get(local_user, remote_user).state

  {:ok, followed_post} = CommonAPI.post(remote_user, %{status: "some thing", local: false})
  backdate.(followed_post, %{local: false})
  backdate.(followed_post.object, %{})

  {:ok, liked_post} = CommonAPI.post(third_party, %{status: "boo!", local: false})
  {:ok, like} = CommonAPI.favorite(remote_user, liked_post.id)
  backdate.(liked_post, %{local: false})
  backdate.(liked_post.object, %{})
  backdate.(like, %{local: false})

  assert length(Repo.all(Object)) == 2

  {followed_post.object.id, liked_post.object.id}
end
test "by default does not keep posts of followed users", %{
  old_insert_date: old_insert_date
} do
  prepare_keep_followed_test(old_insert_date)

  Mix.Tasks.Pleroma.Database.run(["prune_objects"])

  # Neither the followed user's post nor the post they liked survives
  assert Repo.all(Object) == []
end
test "with the --keep-followed posts option it keeps old posts of followed users", %{
  old_insert_date: old_insert_date
} do
  {followed_post_id, liked_post_id} = prepare_keep_followed_test(old_insert_date)

  Mix.Tasks.Pleroma.Database.run(["prune_objects", "--keep-followed", "posts"])

  # Only the followed user's own post remains; the post they merely liked is pruned
  remaining = Repo.all(Object)
  assert length(remaining) == 1
  assert Object.get_by_id(followed_post_id)
  refute Object.get_by_id(liked_post_id)
end
test "with the --keep-followed full option it keeps old posts liked by a followed user", %{
  old_insert_date: old_insert_date
} do
  prepare_keep_followed_test(old_insert_date)

  # `full` mode is only accepted together with --keep-threads
  task_args = ["prune_objects", "--keep-followed", "full", "--keep-threads"]
  Mix.Tasks.Pleroma.Database.run(task_args)

  # Both the followed user's post and the post they liked remain
  assert length(Repo.all(Object)) == 2
end
test "We don't have unexpected tables which may contain objects that are referenced by activities" do
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we