@@ -27,11 +27,14 @@ package event
2727import (
2828 "context"
2929 "fmt"
30+ "sync"
3031 "time"
3132
33+ "github.com/AliceO2Group/Control/common/ecsmetrics"
3234 "github.com/AliceO2Group/Control/common/event/topic"
3335 "github.com/AliceO2Group/Control/common/logger"
3436 "github.com/AliceO2Group/Control/common/logger/infologger"
37+ "github.com/AliceO2Group/Control/common/monitoring"
3538 pb "github.com/AliceO2Group/Control/common/protos"
3639 "github.com/segmentio/kafka-go"
3740 "github.com/sirupsen/logrus"
@@ -53,9 +56,26 @@ func (*DummyWriter) WriteEvent(interface{}) {}
// WriteEventWithTimestamp discards the event and timestamp; DummyWriter is a no-op Writer implementation.
func (*DummyWriter) WriteEventWithTimestamp(interface{}, time.Time) {}

// Close is a no-op; DummyWriter holds no resources to release.
func (*DummyWriter) Close() {}
5558
// KafkaWriter converts events from events.proto into Kafka messages and writes them.
//
// It is built with two workers:
//
//	#1 (batchingLoop) receives kafka.Message values sent from any goroutine on a
//	   buffered channel and pushes them into a FifoBuffer.
//	#2 (writingLoop) pops batches of messages from the FifoBuffer and sends them to Kafka.
//
// The reason for this setup over setting Async: true in kafka.Writer is the ability
// to have some error handling of failed messages. Moreover, if a single worker both
// gathered messages from the channel and sent them directly to Kafka, the whole core
// would block when a large burst of messages arrives at once. Splitting the work in
// two lets the buffer grow without blocking producers while the second worker does
// all the sending, so messages can be gathered from any number of goroutines without
// blocking or losing them. Another benefit is batching messages instead of writing
// them one by one.
type KafkaWriter struct {
	*kafka.Writer
	// toBatchMessagesChan carries messages from producer goroutines to the batching worker.
	toBatchMessagesChan chan kafka.Message
	// messageBuffer decouples message intake from the actual Kafka writes.
	messageBuffer FifoBuffer[kafka.Message]
	// writeFunction performs the actual write.
	// NOTE: this indirection exists so the writer can be tested without setting up Kafka.
	writeFunction func([]kafka.Message)
	// runningWorkers tracks the two worker goroutines during shutdown (see Close).
	runningWorkers sync.WaitGroup
	// batchingDoneCh signals the writing worker that the batching worker has exited.
	batchingDoneCh chan struct{}
}
6080
6181func NewWriterWithTopic (topic topic.Topic ) * KafkaWriter {
@@ -66,16 +86,27 @@ func NewWriterWithTopic(topic topic.Topic) *KafkaWriter {
6686 Balancer : & kafka.Hash {},
6787 AllowAutoTopicCreation : true ,
6888 },
69- toWriteChan : make (chan kafka.Message , 1000 ),
89+ toBatchMessagesChan : make (chan kafka.Message , 100 ),
90+ messageBuffer : NewFifoBuffer [kafka.Message ](),
91+ runningWorkers : sync.WaitGroup {},
92+ batchingDoneCh : make (chan struct {}, 1 ),
93+ }
94+
95+ writer .writeFunction = func (messages []kafka.Message ) {
96+ writer .WriteMessages (context .Background (), messages ... )
7097 }
7198
7299 go writer .writingLoop ()
100+ go writer .batchingLoop ()
101+
73102 return writer
74103}
75104
// Close shuts down both worker goroutines and then closes the underlying
// kafka.Writer. Calling Close on a nil receiver is a no-op.
func (w *KafkaWriter) Close() {
	if w != nil {
		// Register the two workers before triggering shutdown; each worker
		// calls Done() exactly once as it exits.
		// NOTE(review): Add is called here rather than where the goroutines
		// are started; this is safe only if Close is called at most once —
		// a second Close would also panic on the already-closed channel.
		w.runningWorkers.Add(2)
		// Closing the intake channel makes batchingLoop finish, which in
		// turn signals writingLoop to stop.
		close(w.toBatchMessagesChan)
		w.runningWorkers.Wait()
		w.Writer.Close()
	}
}
@@ -86,17 +117,36 @@ func (w *KafkaWriter) WriteEvent(e interface{}) {
86117 }
87118}
88119
89- // TODO: we can optimise this to write multiple message at once
90120func (w * KafkaWriter ) writingLoop () {
91- for message := range w .toWriteChan {
92- err := w .WriteMessages (context .Background (), message )
93- if err != nil {
94- log .WithField ("level" , infologger .IL_Support ).
95- Errorf ("failed to write async kafka message: %w" , err )
121+ for {
122+ select {
123+ case <- w .batchingDoneCh :
124+ w .runningWorkers .Done ()
125+ return
126+ default :
127+ messagesToSend := w .messageBuffer .PopMultiple (100 )
128+ if len (messagesToSend ) == 0 {
129+ continue
130+ }
131+ w .writeFunction (messagesToSend )
132+
133+ metric := ecsmetrics .NewMetric ("kafka" )
134+ metric .AddTag ("topic" , w .Topic )
135+ metric .AddValue ("sentmessages" , len (messagesToSend ))
136+ monitoring .Send (metric )
96137 }
97138 }
98139}
99140
// batchingLoop is worker #1: it moves messages from the intake channel into
// the FIFO buffer. It exits when toBatchMessagesChan is closed (see Close)
// and then tells the writing worker to stop.
func (w *KafkaWriter) batchingLoop() {
	for message := range w.toBatchMessagesChan {
		w.messageBuffer.Push(message)
	}
	// Signal shutdown first (batchingDoneCh is buffered with capacity 1, so
	// this send cannot block), then wake the writing worker in case it is
	// blocked inside PopMultiple. Any messages still buffered at this point
	// are left to writingLoop's shutdown path.
	// NOTE(review): correctness of this ordering depends on FifoBuffer's
	// ReleaseGoroutines semantics — confirm a released PopMultiple returns
	// rather than blocking again.
	w.batchingDoneCh <- struct{}{}
	w.messageBuffer.ReleaseGoroutines()
	w.runningWorkers.Done()
}
149+
// HasEnvID is implemented by events that carry an environment identifier
// (presumably used to derive the Kafka message key — see extractAndConvertEnvID).
type HasEnvID interface {
	GetEnvironmentId() string
}
@@ -109,6 +159,7 @@ func extractAndConvertEnvID[T HasEnvID](object T) []byte {
109159 return nil
110160}
111161
162+ // TODO: write tests covering the conversion of all of these message types
112163func internalEventToKafkaEvent (internalEvent interface {}, timestamp time.Time ) (kafkaEvent * pb.Event , key []byte , err error ) {
113164 kafkaEvent = & pb.Event {
114165 Timestamp : timestamp .UnixMilli (),
@@ -188,9 +239,5 @@ func (w *KafkaWriter) WriteEventWithTimestamp(e interface{}, timestamp time.Time
188239 return
189240 }
190241
191- select {
192- case w .toWriteChan <- message :
193- default :
194- log .Warnf ("Writer of kafka topic [%s] cannot write because channel is full, discarding a message" , w .Writer .Topic )
195- }
242+ w .toBatchMessagesChan <- message
196243}
0 commit comments