
Commit 44b9b12

improve partition cache
1 parent b09bcdc commit 44b9b12

8 files changed: +763 -34 lines changed


CHANGELOG.md (+1)

@@ -1,6 +1,7 @@
 # Rdkafka Changelog

 ## 0.21.1 (Unreleased)
+- [Enhancement] Replace TTL-based partition count cache with a global cache that reuses `librdkafka` statistics data when possible.
 - [Enhancement] Support producing and consuming of headers with multiple values (KIP-82).
 - [Enhancement] Allow native Kafka customization poll time.
 - [Enhancement] Roll out experimental jruby support.

lib/rdkafka.rb (+1)

@@ -42,6 +42,7 @@
 require "rdkafka/error"
 require "rdkafka/metadata"
 require "rdkafka/native_kafka"
+require "rdkafka/producer/partitions_count_cache"
 require "rdkafka/producer"
 require "rdkafka/producer/delivery_handle"
 require "rdkafka/producer/delivery_report"

lib/rdkafka/bindings.rb (+25 -1)

@@ -26,6 +26,8 @@ def self.lib_extension
     RD_KAFKA_OFFSET_STORED = -1000
     RD_KAFKA_OFFSET_INVALID = -1001

+    EMPTY_HASH = {}.freeze
+
     class SizePtr < FFI::Struct
       layout :value, :size_t
     end
@@ -201,9 +203,31 @@ class NativeErrorDesc < FFI::Struct
     StatsCallback = FFI::Function.new(
       :int, [:pointer, :string, :int, :pointer]
     ) do |_client_ptr, json, _json_len, _opaque|
-      # Pass the stats hash to callback in config
       if Rdkafka::Config.statistics_callback
         stats = JSON.parse(json)
+
+        # If the user requested statistics callbacks, we can use the statistics data to get the
+        # partitions count for each topic when this data is published. That way we do not have
+        # to query this information when the user is using `partition_key`. This takes around
+        # 0.02ms every statistics interval period (most likely every 5 seconds) and saves us
+        # from making any queries to the cluster for the partition count.
+        #
+        # One edge case is if the user sets `statistics.interval.ms` much higher than the
+        # default partition count refresh TTL (30 seconds). This is taken care of, as the lack
+        # of reporting to the partitions cache will cause the cache to expire and trigger a
+        # blocking refresh.
+        #
+        # If the user sets `topic.metadata.refresh.interval.ms` too high, this is on the user.
+        #
+        # Since this cache is shared, having a few consumers and/or producers in one process
+        # will automatically improve the querying times even with low refresh times.
+        (stats['topics'] || EMPTY_HASH).each do |topic_name, details|
+          partitions_count = details['partitions'].keys.reject { |k| k == '-1' }.size
+
+          next unless partitions_count.positive?
+
+          Rdkafka::Producer.partitions_count_cache.set(topic_name, partitions_count)
+        end
+
         Rdkafka::Config.statistics_callback.call(stats)
       end

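For orientation, a minimal sketch of how this path is typically enabled from the application side. The broker address, interval, and topic are illustrative assumptions; the only requirements implied by the callback above are that `statistics.interval.ms` is non-zero and that some `Rdkafka::Config.statistics_callback` is set.

require "rdkafka"

# Any callback (even a no-op) is enough: librdkafka starts emitting statistics,
# and the StatsCallback above reuses them to warm the shared partitions count cache.
Rdkafka::Config.statistics_callback = ->(_stats) {}

config = Rdkafka::Config.new(
  "bootstrap.servers" => "localhost:9092",  # illustrative broker address
  "statistics.interval.ms" => 5_000         # emit statistics every 5 seconds
)
producer = config.producer

# Subsequent `partition_key` lookups can now be served from
# Rdkafka::Producer.partitions_count_cache without a blocking metadata query.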

lib/rdkafka/producer.rb (+35 -30)

@@ -6,13 +6,31 @@ class Producer
     include Helpers::Time
     include Helpers::OAuth

-    # Cache partitions count for 30 seconds
-    PARTITIONS_COUNT_TTL = 30
-
     # Empty hash used as a default
     EMPTY_HASH = {}.freeze

-    private_constant :PARTITIONS_COUNT_TTL, :EMPTY_HASH
+    # @private
+    @@partitions_count_cache = PartitionsCountCache.new
+
+    # Global (process-wide) partitions cache. We use it to store the number of partitions per
+    # topic, either from the librdkafka statistics (if enabled) or via direct inline calls every
+    # now and then. Since the partitions count can only grow and should be the same for all
+    # consumers and producers, we can use a global cache as long as we ensure that updates only
+    # move up.
+    #
+    # @note It is critical to remember that not all users may have statistics callbacks enabled,
+    #   hence we should not assume that this cache is always updated from the stats.
+    #
+    # @return [Rdkafka::Producer::PartitionsCountCache]
+    def self.partitions_count_cache
+      @@partitions_count_cache
+    end
+
+    # @param partitions_count_cache [Rdkafka::Producer::PartitionsCountCache]
+    def self.partitions_count_cache=(partitions_count_cache)
+      @@partitions_count_cache = partitions_count_cache
+    end
+
+    private_constant :EMPTY_HASH

     # Raised when there was a critical issue when invoking rd_kafka_topic_new
     # This is a temporary solution until https://github.com/karafka/rdkafka-ruby/issues/451 is
@@ -43,25 +61,6 @@ def initialize(native_kafka, partitioner_name)

       # Makes sure that native kafka gets closed before it gets GCed by Ruby
       ObjectSpace.define_finalizer(self, native_kafka.finalizer)
-
-      @_partitions_count_cache = Hash.new do |cache, topic|
-        topic_metadata = nil
-
-        @native_kafka.with_inner do |inner|
-          topic_metadata = ::Rdkafka::Metadata.new(inner, topic).topics&.first
-        end
-
-        partition_count = topic_metadata ? topic_metadata[:partition_count] : -1
-
-        # This approach caches the failure to fetch only for 1 second. This will make sure that
-        # we do not cache the failure for too long but also "buys" us a bit of time in case there
-        # would be issues in the cluster, so we won't overload it with consecutive requests
-        cache[topic] = if partition_count.positive?
-          [monotonic_now, partition_count]
-        else
-          [monotonic_now - PARTITIONS_COUNT_TTL + 5, partition_count]
-        end
-      end
     end

     # Sets alternative set of configuration details that can be set per topic
@@ -222,18 +221,24 @@ def purge
     # @note If 'allow.auto.create.topics' is set to true in the broker, the topic will be
     #   auto-created after returning nil.
     #
-    # @note We cache the partition count for a given topic for given time.
+    # @note We cache the partition count for a given topic for a given time. If statistics are
+    #   enabled for any producer or consumer, they will take precedence over per-instance
+    #   fetching.
+    #
     #   This prevents us, in case someone uses `partition_key`, from querying for the count with
-    #   each message. Instead we query once every 30 seconds at most if we have a valid partition
-    #   count or every 5 seconds in case we were not able to obtain number of partitions
+    #   each message. Instead we query at most once every 30 seconds if we have a valid
+    #   partition count, or every 5 seconds in case we were not able to obtain the number of
+    #   partitions.
     def partition_count(topic)
       closed_producer_check(__method__)

-      @_partitions_count_cache.delete_if do |_, cached|
-        monotonic_now - cached.first > PARTITIONS_COUNT_TTL
-      end
+      self.class.partitions_count_cache.get(topic) do
+        topic_metadata = nil
+
+        @native_kafka.with_inner do |inner|
+          topic_metadata = ::Rdkafka::Metadata.new(inner, topic).topics&.first
+        end

-      @_partitions_count_cache[topic].last
+        topic_metadata ? topic_metadata[:partition_count] : -1
+      end
     end

     # Produces a message to a Kafka topic. The message is added to rdkafka's queue, call {DeliveryHandle#wait wait} on the returned delivery handle to make sure it is delivered.
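As a usage sketch (topic name and broker address are made up), the class-level cache means all producers in a process share partition counts, and the cache object itself can be swapped, for example in tests:

producer = Rdkafka::Config.new("bootstrap.servers" => "localhost:9092").producer

producer.partition_count("example_topic") # first call: blocking metadata fetch
producer.partition_count("example_topic") # within the TTL: served from the shared cache

# Replace the global cache, e.g. with a shorter TTL for test suites
Rdkafka::Producer.partitions_count_cache = Rdkafka::Producer::PartitionsCountCache.new(1)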
lib/rdkafka/producer/partitions_count_cache.rb (+216, new file)

@@ -0,0 +1,216 @@
# frozen_string_literal: true

module Rdkafka
  class Producer
    # Caching mechanism for Kafka topic partition counts to avoid frequent cluster queries
    #
    # This cache is designed to optimize the process of obtaining partition counts for topics.
    # It uses several strategies to minimize Kafka cluster queries:
    #
    # @note Design considerations:
    #
    # 1. Statistics-based updates
    #    When statistics callbacks are enabled (via `statistics.interval.ms`), we leverage
    #    this data to proactively update the partition counts cache. This approach costs
    #    approximately 0.02ms of processing time during each statistics interval (typically
    #    every 5 seconds) but eliminates the need for explicit blocking metadata queries.
    #
    # 2. Edge case handling
    #    If a user configures `statistics.interval.ms` much higher than the default cache TTL
    #    (30 seconds), the cache will still function correctly. When statistics updates don't
    #    occur frequently enough, the cache entries will expire naturally, triggering a
    #    blocking refresh when needed.
    #
    # 3. User configuration awareness
    #    The cache respects user-defined settings. If `topic.metadata.refresh.interval.ms` is
    #    set very high, the responsibility for potentially stale data falls on the user. This
    #    is an explicit design choice to honor user configuration preferences and align with
    #    librdkafka settings.
    #
    # 4. Process-wide efficiency
    #    Since this cache is shared across all Rdkafka producers and consumers within a process,
    #    having multiple clients improves overall efficiency. Each client contributes to keeping
    #    the cache updated, benefiting all other clients.
    #
    # 5. Thread-safety approach
    #    The implementation uses fine-grained locking with per-topic mutexes to minimize
    #    contention in multi-threaded environments while ensuring data consistency.
    #
    # 6. Topic recreation handling
    #    If a topic is deleted and recreated with fewer partitions, the cache will continue to
    #    report the higher count until either the TTL expires or the process is restarted. This
    #    design choice simplifies the implementation while relying on librdkafka's error handling
    #    for edge cases. In production environments, topic recreation with different partition
    #    counts is typically accompanied by application restarts to handle structural changes.
    #    This also aligns with the previous cache implementation.
    class PartitionsCountCache
      include Helpers::Time

      # Default time-to-live for cached partition counts in seconds
      #
      # @note This default was chosen to balance freshness of metadata with performance
      #   optimization. Most Kafka cluster topology changes are planned operations, making 30
      #   seconds a reasonable compromise.
      DEFAULT_TTL = 30

      # Creates a new partition count cache
      #
      # @param ttl [Integer] Time-to-live in seconds for cached values
      def initialize(ttl = DEFAULT_TTL)
        @counts = {}
        @mutex_hash = {}
        # Used only for @mutex_hash access to ensure thread-safety when creating new mutexes
        @mutex_for_hash = Mutex.new
        @ttl = ttl
      end

      # Reads partition count for a topic with automatic refresh when expired
      #
      # This method will return the cached partition count if available and not expired.
      # If the value is expired or not available, it will execute the provided block
      # to fetch the current value from Kafka.
      #
      # @param topic [String] Kafka topic name
      # @yield Block that returns the current partition count when cache needs refreshing
      # @yieldreturn [Integer] Current partition count retrieved from Kafka
      # @return [Integer] Partition count for the topic
      #
      # @note The implementation prioritizes read performance over write consistency
      #   since partition counts typically only increase during normal operation.
      def get(topic)
        current_info = @counts[topic]

        if current_info.nil? || expired?(current_info[0])
          new_count = yield

          if current_info.nil?
            # No existing data, create a new entry with mutex
            set(topic, new_count)

            return new_count
          else
            current_count = current_info[1]

            if new_count > current_count
              # Higher value needs mutex to update both timestamp and count
              set(topic, new_count)

              return new_count
            else
              # Same or lower value, just update timestamp without mutex
              refresh_timestamp(topic)

              return current_count
            end
          end
        end

        current_info[1]
      end

      # Update partition count for a topic when needed
      #
      # This method updates the partition count for a topic in the cache.
      # It uses a mutex to ensure thread-safety during updates.
      #
      # @param topic [String] Kafka topic name
      # @param new_count [Integer] New partition count value
      #
      # @note We prioritize higher partition counts and only accept them when using
      #   a mutex to ensure consistency. This design decision is based on the fact that
      #   partition counts in Kafka only increase during normal operation.
      def set(topic, new_count)
        # First check outside mutex to avoid unnecessary locking
        current_info = @counts[topic]

        # For lower values, we don't update count but might need to refresh timestamp
        if current_info && new_count < current_info[1]
          refresh_timestamp(topic)

          return
        end

        # Only lock the specific topic mutex
        mutex_for(topic).synchronize do
          # Check again inside the lock as another thread might have updated
          current_info = @counts[topic]

          if current_info.nil?
            # Create new entry
            @counts[topic] = [monotonic_now, new_count]
          else
            current_count = current_info[1]

            if new_count > current_count
              # Update to higher count value
              current_info[0] = monotonic_now
              current_info[1] = new_count
            else
              # Same or lower count, update timestamp only
              current_info[0] = monotonic_now
            end
          end
        end
      end

      # @return [Hash] hash mapping topic names to their [timestamp, partition count] pairs
      def to_h
        @counts
      end

      private

      # Get or create a mutex for a specific topic
      #
      # This method ensures that each topic has its own mutex,
      # allowing operations on different topics to proceed in parallel.
      #
      # @param topic [String] Kafka topic name
      # @return [Mutex] Mutex for the specified topic
      #
      # @note We use a separate mutex (@mutex_for_hash) to protect the creation
      #   of new topic mutexes. This pattern allows fine-grained locking while
      #   maintaining thread-safety.
      def mutex_for(topic)
        mutex = @mutex_hash[topic]

        return mutex if mutex

        # Use a separate mutex to protect the creation of new topic mutexes
        @mutex_for_hash.synchronize do
          # Check again in case another thread created it
          @mutex_hash[topic] ||= Mutex.new
        end

        @mutex_hash[topic]
      end

      # Update the timestamp without acquiring the mutex
      #
      # This is an optimization that allows refreshing the TTL of existing entries
      # without the overhead of mutex acquisition.
      #
      # @param topic [String] Kafka topic name
      #
      # @note This method is safe for refreshing existing data regardless of count
      #   because it only updates the timestamp, which doesn't affect the correctness
      #   of concurrent operations.
      def refresh_timestamp(topic)
        current_info = @counts[topic]

        return unless current_info

        # Update the timestamp in-place
        current_info[0] = monotonic_now
      end

      # Check if a timestamp has expired based on the TTL
      #
      # @param timestamp [Float] Monotonic timestamp to check
      # @return [Boolean] true if expired, false otherwise
      def expired?(timestamp)
        monotonic_now - timestamp > @ttl
      end
    end
  end
end
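To illustrate the "counts only move up" behavior described above, a small standalone sketch (topic name and counts are made up):

cache = Rdkafka::Producer::PartitionsCountCache.new(30)

# Miss: the block runs (this is where Producer#partition_count does its metadata fetch)
cache.get("orders") { 6 }                   #=> 6

# Fresh hit within the TTL: the block is not executed again
cache.get("orders") { raise "not called" }  #=> 6

# Statistics callbacks push counts in via #set; lower values only refresh the timestamp
cache.set("orders", 12)
cache.set("orders", 3)
cache.get("orders") { raise "not called" }  #=> 12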
