Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 11 additions & 77 deletions apps/health/lib/health/checkers/run_queue.ex
Original file line number Diff line number Diff line change
@@ -1,90 +1,24 @@
defmodule Health.Checkers.RunQueue do
@moduledoc """
Health check which makes sure the Erlang [Run
Queue](http://erlang.org/doc/man/erlang.html#statistics-1) is reasonably
low.
Health check for monitoring the Erlang [Run
Queue](http://erlang.org/doc/man/erlang.html#statistics-1).
This check always returns healthy as we don't want to kill tasks based on the run queue length.
Instead it logs the maximum run queue length across all schedulers for monitoring purposes.
Comment on lines +6 to +7
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (non-blocking): if this isn't going to be used as a "real" health check anymore, does it make sense to maybe split this out and make it its own GenServer living somewhere in the supervision tree? The only issue is that I'm not sure where exactly that "somewhere" would be, especially in the context of the API umbrella app setup.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, but didn't feel like it was worth the overhead. I can easily be convinced otherwise. The rest of the metrics we collect come from a reporter built into ehmon. We don't currently have another way of surfacing metrics, so it would mean building a way of scheduling this to run on some interval, which just seemed like more code than it was worth for this one metric.

"""
require Logger

def current do
[run_queue: queue_size()]
[run_queue: max_queue_length()]
end

@doc """
Reports whether the run queue is currently within the allowed limit.

The result of comparing `queue_size/0` with `max_run_queue_length/0` is
passed, together with the current `Logger` level, to `log_processes/2`
before being returned.
"""
def healthy? do
  current_size = queue_size()
  within_limit? = current_size <= max_run_queue_length()

  # The return value of log_processes/2 (:logged | :ignored) is not needed here.
  _ = log_processes(within_limit?, Logger.level())

  within_limit?
end

# Largest run queue length that `healthy?/0` still treats as healthy.
defp max_run_queue_length do
  100
end

# Reads the `:run_queue` statistic from the Erlang VM.
defp queue_size, do: :erlang.statistics(:run_queue)

@doc """
Dumps one log line per process when the health check has failed.

When the first argument is `false` and `level` is `:info` or `:debug`, a
separate process is spawned that writes every line from `log_lines/0` with
`Logger.info/1`; `:logged` is returned without waiting for that process.
Every other combination of arguments returns `:ignored` and logs nothing.
"""
def log_processes(false, level) when level in [:info, :debug] do
  # Logging happens in its own process so the caller is not held up by it.
  spawn(fn ->
    Enum.each(log_lines(), fn line ->
      _ = Logger.info(line)
    end)
  end)

  :logged
end

def log_processes(_healthy?, _level), do: :ignored

@doc """
Builds one log line for every pid returned by `Process.list/0`.

Each line contains the inspected pid, an `lt` value and the fields produced
by `log_info/1`. `lt` is a single `System.monotonic_time/0` reading taken
before the processes are walked, so all lines of one call share it.
"""
def log_lines do
  # lt is short for log time
  logged_at = System.monotonic_time()

  Enum.map(Process.list(), fn pid ->
    "process_info pid=#{inspect(pid)} lt=#{logged_at} #{log_info(pid)}"
  end)
end

@doc """
Returns the log fields for `pid` as iodata.

Requests a fixed set of items from `Process.info/2` and hands the result to
`log_info_iodata/1`, which renders it (a `nil` result is rendered as
`status=dead`).
"""
def log_info(pid) do
  wanted_items =
    ~w(current_function initial_call status message_queue_len priority total_heap_size heap_size stack_size reductions dictionary registered_name memory)a

  pid
  |> Process.info(wanted_items)
  |> log_info_iodata()
end

# Renders the result of `Process.info/2` as iodata.
#
# * `nil` is rendered as `status=dead`.
# * A list is rendered as a sequence of `key=value ` entries. If the
#   `:dictionary` item holds a `:"$initial_call"` entry, that value replaces
#   the `:initial_call` item. The dictionary itself is never emitted.
defp log_info_iodata(nil), do: ["status=dead"]

defp log_info_iodata(info) when is_list(info) do
  dictionary = info[:dictionary]
  recorded_initial_call = dictionary[:"$initial_call"]

  fields =
    if recorded_initial_call do
      Keyword.put(info, :initial_call, recorded_initial_call)
    else
      info
    end

  for {key, value} <- Keyword.delete(fields, :dictionary) do
    [Atom.to_string(key), "=", pid_log(value), " "]
  end
end

# Formats a `{module, function, arity}` tuple as quoted iodata of the form
# `"Mod.fun/arity"`.
#
# The arity must be a non-negative integer. `a >= 0` on its own is not a
# sufficient guard: under Erlang term ordering every non-number compares
# greater than any number, so a tuple such as `{:m, :f, :x}` would match and
# then crash in `Integer.to_string/1`. A float arity would crash the same way.
# `is_integer(a)` keeps such tuples out of this clause.
defp pid_log({m, f, a}) when is_atom(m) and is_atom(f) and is_integer(a) and a >= 0 do
  [?", Atom.to_string(m), ?., Atom.to_string(f), ?/, Integer.to_string(a), ?"]
end

defp pid_log(atom) when is_atom(atom) do
Atom.to_string(atom)
max_length = max_queue_length()
_ = Logger.info("run_queue_check max_run_queue_length=#{max_length}")
true
end

defp pid_log(other) do
inspect(other)
defp max_queue_length do
Enum.max(:erlang.statistics(:run_queue_lengths))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick (non-blocking): this could be a good candidate for the pipe operator (`|>`)

end
end
50 changes: 8 additions & 42 deletions apps/health/test/health/checkers/run_queue_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,17 @@ defmodule Health.Checkers.RunQueueTest do
use ExUnit.Case
import Health.Checkers.RunQueue

describe "log_processes/2" do
  test "logs if we're not healthy and the log level is low enough" do
    # Both levels at which info messages are emitted must trigger the dump.
    for level <- [:info, :debug] do
      assert log_processes(false, level) == :logged
    end
  end

  test "does nothing when we're healthy" do
    result = log_processes(true, :info)
    assert result == :ignored
  end

  test "does nothing when the log level is high" do
    result = log_processes(false, :warning)
    assert result == :ignored
  end
end

describe "log_lines/0" do
test "one line per alive process" do
lines = log_lines()
assert length(lines) >= length(Process.list())
describe "healthy?/0" do
test "always returns true" do
assert healthy?() == true
end
end

describe "log_info/1" do
test "logs information about the process" do
binary = IO.iodata_to_binary(log_info(self()))
assert binary =~ ~s(current_function="Elixir.Process.info/2")
assert binary =~ ~s(initial_call="erlang.apply/2")
assert binary =~ ~s(message_queue_len=0)
assert binary =~ ~s(status=running)
end

test "overrides initial call if present in process dictionary" do
# GenServers set this
{:ok, pid} = Agent.start_link(fn -> :ok end)
binary = IO.iodata_to_binary(log_info(pid))

assert binary =~
~s(initial_call="Elixir.Health.Checkers.RunQueueTest.-test log_info/1 overrides initial call if present in process dictionary/1-fun-0-/0")
end

test "logs a dead process" do
{:ok, pid} = Agent.start_link(fn -> :ok end)
Agent.stop(pid)
binary = IO.iodata_to_binary(log_info(pid))
assert binary =~ ~s(status=dead)
describe "current/0" do
test "returns the current run queue size" do
[run_queue: size] = current()
assert is_integer(size)
assert size >= 0
end
end
end
Loading