mbta · Whoops · Jul 8, 2025 · Jul 7, 2025 · lemald · Jul 7, 2025
@@ -1,90 +1,24 @@
 defmodule Health.Checkers.RunQueue do
   @moduledoc """
-  Health check which makes sure the Erlang [Run
-  Queue](http://erlang.org/doc/man/erlang.html#statistics-1) is reasonably
-  low.
+  Health check for monitoring the Erlang [Run
+  Queue](http://erlang.org/doc/man/erlang.html#statistics-1).
+
+  This check always reports healthy, because we don't want tasks killed based on run queue length.
+  Instead, it logs the maximum of `:erlang.statistics(:run_queue_lengths)` for monitoring purposes.
   """
   require Logger
 
   def current do
-    [run_queue: queue_size()]
+    [run_queue: max_queue_length()]
   end
 
   def healthy? do
-    h? = queue_size() <= max_run_queue_length()
-
-    _ = log_processes(h?, Logger.level())
-
-    h?
-  end
-
-  defp max_run_queue_length, do: 100
-
-  defp queue_size do
-    :erlang.statistics(:run_queue)
-  end
-
-  def log_processes(false, level) when level in [:info, :debug] do
-    spawn(fn ->
-      for line <- log_lines() do
-        _ = Logger.info(line)
-      end
-    end)
-
-    :logged
-  end
-
-  def log_processes(_, _) do
-    :ignored
-  end
-
-  def log_lines do
-    start_time = System.monotonic_time()
-
-    for pid <- Process.list() do
-      # lt short for log time
-      "process_info pid=#{inspect(pid)} lt=#{start_time} #{log_info(pid)}"
-    end
-  end
-
-  def log_info(pid) do
-    info =
-      Process.info(
-        pid,
-        ~w(current_function initial_call status message_queue_len priority total_heap_size heap_size stack_size reductions dictionary registered_name memory)a
-      )
-
-    log_info_iodata(info)
-  end
-
-  defp log_info_iodata(info) when is_list(info) do
-    info =
-      if initial_call = info[:dictionary][:"$initial_call"] do
-        Keyword.put(info, :initial_call, initial_call)
-      else
-        info
-      end
-
-    info = Keyword.delete(info, :dictionary)
-
-    for {k, v} <- info do
-      [Atom.to_string(k), "=", pid_log(v), " "]
-    end
-  end
-
-  defp log_info_iodata(nil) do
-    ["status=dead"]
-  end
-
-  defp pid_log({m, f, a}) when is_atom(m) and is_atom(f) and a >= 0 do
-    [?", Atom.to_string(m), ?., Atom.to_string(f), ?/, Integer.to_string(a), ?"]
-  end
-
-  defp pid_log(atom) when is_atom(atom) do
-    Atom.to_string(atom)
+    max_length = max_queue_length()
+    _ = Logger.info("run_queue_check max_run_queue_length=#{max_length}")
+    true
   end
 
-  defp pid_log(other) do
-    inspect(other)
+  defp max_queue_length do
+    Enum.max(:erlang.statistics(:run_queue_lengths))
   end
 end
@@ -2,51 +2,17 @@ defmodule Health.Checkers.RunQueueTest do
   use ExUnit.Case
   import Health.Checkers.RunQueue
 
-  describe "log_processes/2" do
-    test "logs if we're not healthy and the log level is low enough" do
-      assert log_processes(false, :info) == :logged
-      assert log_processes(false, :debug) == :logged
-    end
-
-    test "does nothing when we're healthy" do
-      assert log_processes(true, :info) == :ignored
-    end
-
-    test "does nothing when the log level is high" do
-      assert log_processes(false, :warning) == :ignored
-    end
-  end
-
-  describe "log_lines/0" do
-    test "one line per alive process" do
-      lines = log_lines()
-      assert length(lines) >= length(Process.list())
+  describe "healthy?/0" do
+    test "always returns true" do
+      assert healthy?() == true
     end
   end
 
-  describe "log_info/1" do
-    test "logs information about the process" do
-      binary = IO.iodata_to_binary(log_info(self()))
-      assert binary =~ ~s(current_function="Elixir.Process.info/2")
-      assert binary =~ ~s(initial_call="erlang.apply/2")
-      assert binary =~ ~s(message_queue_len=0)
-      assert binary =~ ~s(status=running)
-    end
-
-    test "overrides initial call if present in process dictionary" do
-      # GenServers set this
-      {:ok, pid} = Agent.start_link(fn -> :ok end)
-      binary = IO.iodata_to_binary(log_info(pid))
-
-      assert binary =~
-               ~s(initial_call="Elixir.Health.Checkers.RunQueueTest.-test log_info/1 overrides initial call if present in process dictionary/1-fun-0-/0")
-    end
-
-    test "logs a dead process" do
-      {:ok, pid} = Agent.start_link(fn -> :ok end)
-      Agent.stop(pid)
-      binary = IO.iodata_to_binary(log_info(pid))
-      assert binary =~ ~s(status=dead)
+  describe "current/0" do
+    test "returns the current run queue size" do
+      [run_queue: size] = current()
+      assert is_integer(size)
+      assert size >= 0
     end
   end
 end