feat: Alerts should not wake up idle ClickHouse service #5787
base: main
@@ -2,8 +2,10 @@ package clickhouse

 import (
     "context"
+    "encoding/json"
     "errors"
     "fmt"
+    "net/http"
     "strings"
     "time"

@@ -503,7 +505,55 @@ func (c *connection) renameTable(ctx context.Context, oldName, newName, onCluste
 }

 func (c *connection) MayBeScaledToZero(ctx context.Context) bool {
-    return c.config.CanScaleToZero
+    if c.config.APIKeyID == "" {
+        // no API key provided; fall back to the configured value
+        return c.config.CanScaleToZero
+    }
+
+    c.statusCheckMutex.Lock()
+    defer c.statusCheckMutex.Unlock()
+    // check if the status is cached
+    if !c.statusCheckedAt.IsZero() && time.Since(c.statusCheckedAt) <= time.Minute*10 {
+        return c.scaledToZero
+    }

Review comment on lines +515 to +518: I think caching for 10 minutes is too long; it makes it too likely to lead to a feedback loop that keeps the service alive.
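One way to address that concern (a hypothetical sketch, not part of this PR) is to shorten the window and make it configurable. The option name and default below are assumptions:

```go
package clickhouse // sketch only

import "time"

// statusCacheTTL computes how long a scaled-to-zero status check may be reused.
// "configuredTTL" is an assumed, not-yet-existing setting; zero means "use default".
func statusCacheTTL(configuredTTL time.Duration) time.Duration {
	if configuredTTL > 0 {
		return configuredTTL
	}
	// A short default (1 minute) limits how long a stale "awake"/"idle" status is
	// reused, reducing the chance of a feedback loop that keeps the service warm.
	return time.Minute
}
```

The cache check would then compare time.Since(c.statusCheckedAt) against this TTL instead of the hard-coded 10 minutes.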
+
+    ctx, cancel := context.WithTimeout(ctx, time.Second*10)
+    defer cancel()
+    req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("https://api.clickhouse.cloud/v1/organizations/%s/services/%s", c.config.OrganizationID, c.config.ServiceID), http.NoBody)
+    if err != nil {
+        c.logger.Warn("failed to create clickhouse cloud API request", zap.Error(err))
+        return c.config.CanScaleToZero
+    }
+    req.SetBasicAuth(c.config.APIKeyID, c.config.APIKeySecret)
+
+    resp, err := c.cloudAPI.Do(req)
+    if err != nil {
+        c.logger.Warn("failed to get clickhouse cloud API response", zap.Error(err))
+        return c.config.CanScaleToZero
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        c.logger.Warn("failed to get clickhouse cloud API response", zap.Int("status_code", resp.StatusCode))
+        return c.config.CanScaleToZero
+    }
+
+    // parse response
+    var response struct {
+        Result struct {
+            State string `json:"state"`
+        } `json:"result"`
+    }
+    err = json.NewDecoder(resp.Body).Decode(&response)
+    if err != nil {
+        c.logger.Warn("failed to decode clickhouse cloud API response", zap.Error(err))
+        return c.config.CanScaleToZero
+    }
+    scaledToZero := strings.EqualFold(response.Result.State, "idle")
+    // also cache the result
+    c.scaledToZero = scaledToZero
+    c.statusCheckedAt = time.Now()
+    return scaledToZero
+}

 // acquireMetaConn gets a connection from the pool for "meta" queries like information schema (i.e. fast queries).
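For reference, the decoder above only looks at result.state in the Cloud API payload. Below is a minimal standalone sketch of the same parsing and idle check; the sample payload is invented for illustration (real responses carry many more fields):

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

func main() {
	// Sample payload for illustration only; shaped like the envelope the PR decodes.
	body := `{"result": {"id": "svc-123", "state": "idle"}}`

	var response struct {
		Result struct {
			State string `json:"state"`
		} `json:"result"`
	}
	if err := json.Unmarshal([]byte(body), &response); err != nil {
		panic(err)
	}

	// Mirrors the check in MayBeScaledToZero: any case variant of "idle" counts.
	fmt.Println(strings.EqualFold(response.Result.State, "idle")) // true
}
```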
@@ -167,14 +167,15 @@ func (r *AlertReconciler) Reconcile(ctx context.Context, n *runtimev1.ResourceNa
     // Evaluate the trigger time of the alert. If triggered by schedule, we use the "clean" scheduled time.
     // Note: Correction for watermarks and intervals is done in checkAlert.
     var triggerTime time.Time
-    if scheduleTrigger && !adhocTrigger && !specHashTrigger && !refsTrigger {
+    onlyScheduledTrigger := scheduleTrigger && !adhocTrigger && !specHashTrigger && !refsTrigger
+    if onlyScheduledTrigger {
         triggerTime = a.State.NextRunOn.AsTime()
     } else {
         triggerTime = time.Now()
     }

     // Run alert queries and send notifications
-    executeErr := r.executeAll(ctx, self, a, triggerTime, adhocTrigger)
+    executeErr := r.executeAll(ctx, self, a, triggerTime, adhocTrigger, onlyScheduledTrigger)

     // If we were cancelled, exit without updating any other trigger-related state.
     // NOTE: We don't set Retrigger here because we'll leave re-scheduling to whatever cancelled the reconciler.

@@ -376,7 +377,20 @@ func (r *AlertReconciler) setTriggerFalse(ctx context.Context, n *runtimev1.Reso

 // executeAll runs queries and (maybe) sends notifications for the alert. It also adds entries to a.State.ExecutionHistory.
 // By default, an alert is checked once for the current watermark, but if a.Spec.IntervalsIsoDuration is set, it will be checked *for each* interval that has elapsed since the previous execution watermark.
-func (r *AlertReconciler) executeAll(ctx context.Context, self *runtimev1.Resource, a *runtimev1.Alert, triggerTime time.Time, adhocTrigger bool) error {
+func (r *AlertReconciler) executeAll(ctx context.Context, self *runtimev1.Resource, a *runtimev1.Alert, triggerTime time.Time, adhocTrigger, onlyScheduledTrigger bool) error {
+    // Skip if OLAP is in idle state and alerts are configured to skip
+    if onlyScheduledTrigger {
+        err := r.validateOLAPState(ctx, self)
+        if err != nil {
+            skipErr := &skipError{}
+            if !errors.As(err, skipErr) {
+                return err
+            }
+            r.C.Logger.Info("Skipped alert check", zap.String("name", self.Meta.Name.Name), zap.String("reason", skipErr.reason))
+            return nil
+        }
+    }

Review comment on lines +381 to +392: I don't think this is satisfactory because it means a scheduled alert may never get checked. Ideally we would want to find out when it was last alive and check the alert if it has been alive since the last alert check. Also see the use of …
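A rough sketch of the reviewer's suggestion follows. The function, its name, and both timestamps are invented for illustration (the "last alive" time would have to come from the connector or the Cloud API, and the "last checked" time from the alert's execution history); this is not part of the PR:

```go
package alerts // sketch only

import "time"

// shouldSkipIdleCheck only skips when the OLAP service is idle now AND has not been
// alive since the previous alert check, so a scheduled alert cannot be starved forever.
func shouldSkipIdleCheck(scaledToZero bool, lastAliveOn, lastCheckedOn time.Time) bool {
	if !scaledToZero {
		return false // service is awake: run the check
	}
	if lastAliveOn.After(lastCheckedOn) {
		return false // it was awake after the last check, so there may be new data to evaluate
	}
	return true // idle and nothing new since the last check: safe to skip
}
```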
+
     // Enforce timeout
     timeout := alertCheckDefaultTimeout
     if a.Spec.TimeoutSeconds > 0 {

@@ -826,6 +840,33 @@ func (r *AlertReconciler) computeInheritedWatermark(ctx context.Context, refs []
     return t, !t.IsZero(), nil
 }

+func (r *AlertReconciler) validateOLAPState(ctx context.Context, self *runtimev1.Resource) error {
+    var mvSpec *runtimev1.MetricsViewSpec
+    for _, ref := range self.Meta.Refs {
+        if ref.Kind != runtime.ResourceKindMetricsView {
+            continue
+        }
+        mv, err := r.C.Get(ctx, ref, false)
+        if err != nil {
+            return err
+        }
+        mvSpec = mv.GetMetricsView().State.ValidSpec
+    }
+    if mvSpec == nil {
+        return nil
+    }
+
+    olap, release, err := r.C.AcquireOLAP(ctx, mvSpec.Connector)
+    if err != nil {
+        return err
+    }
+    defer release()
+    if olap.MayBeScaledToZero(ctx) {
+        return skipError{reason: "OLAP may be scaled to zero"}
+    }
+    return nil
+}
+
 // calculateAlertExecutionTimes calculates the execution times for an alert, taking into consideration the alert's intervals configuration and previous executions.
 // If the alert is not configured to run on intervals, it will return a slice containing only the current watermark.
 // If the alert should not be executed, it returns a skipError explaining why.
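For context, skipError is referenced but not defined in this diff. Based on how it is constructed above and matched with errors.As in executeAll, its definition elsewhere in the reconciler package is presumably something like the following (a guess, not copied from the repository):

```go
// skipError carries a human-readable reason why an alert check was intentionally
// skipped rather than failed. The value receiver matters: executeAll matches it
// with errors.As against a *skipError target.
type skipError struct {
	reason string
}

func (e skipError) Error() string {
	return "skipping alert check: " + e.reason
}
```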
Review comment: These fields should probably be prefixed with cloud_ to clarify that they are ClickHouse Cloud specific.
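Applied to the APIKeyID, APIKeySecret, OrganizationID, and ServiceID settings used in the connector above, the suggestion would look roughly like this; the struct name, tag style, and exact field set are assumptions for illustration, not the PR's actual code:

```go
// Hypothetical rename of the ClickHouse Cloud related settings with a cloud_ prefix,
// making clear they only apply when the service runs in ClickHouse Cloud.
type configProperties struct {
	CanScaleToZero bool `mapstructure:"can_scale_to_zero"`

	CloudAPIKeyID       string `mapstructure:"cloud_api_key_id"`
	CloudAPIKeySecret   string `mapstructure:"cloud_api_key_secret"`
	CloudOrganizationID string `mapstructure:"cloud_organization_id"`
	CloudServiceID      string `mapstructure:"cloud_service_id"`
}
```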