x-pack/filebeat/input/gcs: add support for CSV decoding

The test file txn.csv.gz was obtained from https://netskopepartnerlogfilebucket.s3.amazonaws.com/txn-1722875066329034-fe10b6a23cc643c4b282e6190de2352d.csv.gz
elastic · Sep 25, 2024 · 1197c09 · 1197c09
1 parent 4380b9d
commit 1197c09
Show file tree

Hide file tree

Showing 13 changed files with 1,169 additions and 20 deletions.
diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
@@ -306,6 +306,8 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff]
 - Add `use_kubeadm` config option for filebeat (both filbeat.input and autodiscovery) in order to toggle kubeadm-config api requests {pull}40301[40301]
 - Make HTTP library function inclusion non-conditional in CEL input. {pull}40912[40912]
 - Add support for Crowdstrike streaming API to the streaming input. {issue}40264[40264] {pull}40838[40838]
+- Add CSV decoding capacity to gcs input {pull}40979[40979]
+
 
 *Auditbeat*
 

diff --git a/x-pack/filebeat/input/gcs/config.go b/x-pack/filebeat/input/gcs/config.go
@@ -16,6 +16,7 @@ import (
 	"golang.org/x/oauth2/google"
 
 	"github.com/elastic/beats/v7/libbeat/common/match"
+	"github.com/elastic/beats/v7/libbeat/reader/parser"
 )
 
 // MaxWorkers, Poll, PollInterval, BucketTimeOut, ParseJSON, FileSelectors, TimeStampEpoch & ExpandEventListFromField
@@ -41,6 +42,8 @@ type config struct {
 	Buckets []bucket `config:"buckets" validate:"required"`
 	// FileSelectors - Defines a list of regex patterns that can be used to filter out objects from the bucket.
 	FileSelectors []fileSelectorConfig `config:"file_selectors"`
+	// ReaderConfig is the default parser and decoder configuration.
+	ReaderConfig readerConfig `config:",inline"`
 	// TimeStampEpoch - Defines the epoch time in seconds, which is used to filter out objects that are older than the specified timestamp.
 	TimeStampEpoch *int64 `config:"timestamp_epoch"`
 	// ExpandEventListFromField - Defines the field name that will be used to expand the event into separate events.
@@ -58,6 +61,7 @@ type bucket struct {
 	PollInterval             *time.Duration       `config:"poll_interval,omitempty"`
 	ParseJSON                *bool                `config:"parse_json,omitempty"`
 	FileSelectors            []fileSelectorConfig `config:"file_selectors"`
+	ReaderConfig             readerConfig         `config:",inline"`
 	TimeStampEpoch           *int64               `config:"timestamp_epoch"`
 	ExpandEventListFromField string               `config:"expand_event_list_from_field"`
 }
@@ -68,6 +72,12 @@ type fileSelectorConfig struct {
 	// TODO: Add support for reader config in future
 }
 
+// readerConfig defines the options for reading the content of an GCS object.
+type readerConfig struct {
+	Parsers  parser.Config `config:",inline"`
+	Decoding decoderConfig `config:"decoding"`
+}
+
 type authConfig struct {
 	CredentialsJSON *jsonCredentialsConfig `config:"credentials_json,omitempty"`
 	CredentialsFile *fileCredentialsConfig `config:"credentials_file,omitempty"`

diff --git a/x-pack/filebeat/input/gcs/decoding.go b/x-pack/filebeat/input/gcs/decoding.go
@@ -0,0 +1,47 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package gcs
+
+import (
+	"fmt"
+	"io"
+)
+
+// decoder is an interface for decoding data from an io.Reader.
+type decoder interface {
+	// decode reads and decodes data from an io reader based on the codec type.
+	// It returns the decoded data and an error if the data cannot be decoded.
+	decode() ([]byte, error)
+	// next advances the decoder to the next data item and returns true if there is more data to be decoded.
+	next() bool
+	// close closes the decoder and releases any resources associated with it.
+	// It returns an error if the decoder cannot be closed.
+
+	// more returns whether there are more records to read.
+	more() bool
+
+	close() error
+}
+
+// valueDecoder is a decoder that can decode directly to a JSON serialisable value.
+type valueDecoder interface {
+	decoder
+
+	decodeValue() ([]byte, map[string]any, error)
+}
+
+// newDecoder creates a new decoder based on the codec type.
+// It returns a decoder type and an error if the codec type is not supported.
+// If the reader config codec option is not set, it returns a nil decoder and nil error.
+func newDecoder(cfg decoderConfig, r io.Reader) (decoder, error) {
+	switch {
+	case cfg.Codec == nil:
+		return nil, nil
+	case cfg.Codec.CSV != nil:
+		return newCSVDecoder(cfg, r)
+	default:
+		return nil, fmt.Errorf("unsupported config value: %v", cfg)
+	}
+}
diff --git a/x-pack/filebeat/input/gcs/decoding_config.go b/x-pack/filebeat/input/gcs/decoding_config.go
@@ -0,0 +1,54 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package gcs
+
+import (
+	"fmt"
+	"unicode/utf8"
+)
+
+// decoderConfig contains the configuration options for instantiating a decoder.
+type decoderConfig struct {
+	Codec *codecConfig `config:"codec"`
+}
+
+// codecConfig contains the configuration options for different codecs used by a decoder.
+type codecConfig struct {
+	CSV *csvCodecConfig `config:"csv"`
+}
+
+// csvCodecConfig contains the configuration options for the CSV codec.
+type csvCodecConfig struct {
+	Enabled bool `config:"enabled"`
+
+	// Fields is the set of field names. If it is present
+	// it is used to specify the object names of returned
+	// values and the FieldsPerRecord field in the csv.Reader.
+	// Otherwise, names are obtained from the first
+	// line of the CSV data.
+	Fields []string `config:"fields_names"`
+
+	// The fields below have the same meaning as the
+	// fields of the same name in csv.Reader.
+	Comma            *configRune `config:"comma"`
+	Comment          configRune  `config:"comment"`
+	LazyQuotes       bool        `config:"lazy_quotes"`
+	TrimLeadingSpace bool        `config:"trim_leading_space"`
+}
+
+type configRune rune
+
+func (r *configRune) Unpack(s string) error {
+	if s == "" {
+		return nil
+	}
+	n := utf8.RuneCountInString(s)
+	if n != 1 {
+		return fmt.Errorf("single character option given more than one character: %q", s)
+	}
+	_r, _ := utf8.DecodeRuneInString(s)
+	*r = configRune(_r)
+	return nil
+}
diff --git a/x-pack/filebeat/input/gcs/decoding_csv.go b/x-pack/filebeat/input/gcs/decoding_csv.go
@@ -0,0 +1,139 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package gcs
+
+import (
+	"bytes"
+	"encoding/csv"
+	"fmt"
+	"io"
+	"slices"
+)
+
+// csvDecoder is a decoder for CSV data.
+type csvDecoder struct {
+	r *csv.Reader
+
+	header  []string
+	current []string
+	coming  []string
+
+	err error
+}
+
+// newParquetDecoder creates a new CSV decoder.
+func newCSVDecoder(config decoderConfig, r io.Reader) (decoder, error) {
+	d := csvDecoder{r: csv.NewReader(r)}
+	d.r.ReuseRecord = true
+	if config.Codec.CSV.Comma != nil {
+		d.r.Comma = rune(*config.Codec.CSV.Comma)
+	}
+	d.r.Comment = rune(config.Codec.CSV.Comment)
+	d.r.LazyQuotes = config.Codec.CSV.LazyQuotes
+	d.r.TrimLeadingSpace = config.Codec.CSV.TrimLeadingSpace
+	if len(config.Codec.CSV.Fields) != 0 {
+		d.r.FieldsPerRecord = len(config.Codec.CSV.Fields)
+		d.header = config.Codec.CSV.Fields
+	} else {
+		h, err := d.r.Read()
+		if err != nil {
+			return nil, err
+		}
+		d.header = slices.Clone(h)
+	}
+	var err error
+	d.coming, err = d.r.Read()
+	if err != nil {
+		return nil, err
+	}
+	d.current = make([]string, 0, len(d.header))
+	return &d, nil
+}
+
+func (d *csvDecoder) more() bool { return len(d.coming) == len(d.header) }
+
+// next advances the decoder to the next data item and returns true if
+// there is more data to be decoded.
+func (d *csvDecoder) next() bool {
+	if !d.more() && d.err != nil {
+		return false
+	}
+	d.current = d.current[:len(d.header)]
+	copy(d.current, d.coming)
+	d.coming, d.err = d.r.Read()
+	if d.err == io.EOF {
+		d.coming = nil
+	}
+	return true
+}
+
+// decode returns the JSON encoded value of the current CSV line. next must
+// have been called before any calls to decode.
+func (d *csvDecoder) decode() ([]byte, error) {
+	err := d.check()
+	if err != nil {
+		return nil, err
+	}
+	var buf bytes.Buffer
+	buf.WriteByte('{')
+	for i, n := range d.header {
+		if i != 0 {
+			buf.WriteByte(',')
+		}
+		buf.WriteByte('"')
+		buf.WriteString(n)
+		buf.WriteString(`":"`)
+		buf.WriteString(d.current[i])
+		buf.WriteByte('"')
+	}
+	buf.WriteByte('}')
+	d.current = d.current[:0]
+	return buf.Bytes(), nil
+}
+
+// decodeValue returns the value of the current CSV line interpreted as
+// an object with fields based on the header held by the receiver. next must
+// have been called before any calls to decode.
+func (d *csvDecoder) decodeValue() ([]byte, map[string]any, error) {
+	err := d.check()
+	if err != nil {
+		return nil, nil, err
+	}
+	m := make(map[string]any, len(d.header))
+	for i, n := range d.header {
+		m[n] = d.current[i]
+	}
+	d.current = d.current[:0]
+	b, err := d.decode()
+	if err != nil {
+		return nil, nil, err
+	}
+	return b, m, nil
+}
+
+func (d *csvDecoder) check() error {
+	if d.err != nil {
+		if d.err == io.EOF && d.coming == nil {
+			return nil
+		}
+		return d.err
+	}
+	if len(d.current) == 0 {
+		return fmt.Errorf("decode called before next")
+	}
+	// By the time we are here, current must be the same
+	// length as header; if it was not read, it would be
+	// zero, but if it was, it must match by the contract
+	// of the csv.Reader.
+	return nil
+}
+
+// close closes the parquet decoder and releases the resources.
+func (d *csvDecoder) close() error {
+	if d.err == io.EOF {
+		return nil
+	}
+	return d.err
+}