Skip to content

Commit

Permalink
feat(gcp+aws): Add last_scrape_time metric (#159)
Browse files Browse the repository at this point in the history
In order to better track freshness of data, this PR adds a few more operational metrics:
- `cloudcost_exporter_collector_last_scrape_time`
- `cloudcost_exporter_last_scrape_time`

The intent of these metrics is to export, as a Unix timestamp, the last time a scrape was performed. This can be used to alert in Prometheus when the last scrape occurred more than a chosen threshold ago (for example, 60 minutes).

This also implements in AWS the operational metrics that GCP implemented so that we have feature parity between the two. In the future it would make sense to generalize this to a common interface so that new providers do not need to implement the same metrics.

- refs #5 + #105
  • Loading branch information
Pokom authored May 2, 2024
1 parent 7dd8cee commit 4fba148
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 8 deletions.
74 changes: 72 additions & 2 deletions pkg/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,62 @@ type AWS struct {
}

var (
	// providerLastScrapeErrorDesc reports whether the most recent provider-level
	// scrape failed. 1 indicates an error, 0 success.
	providerLastScrapeErrorDesc = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_error"),
		"Was the last scrape an error. 1 indicates an error.",
		[]string{"provider"},
		nil,
	)
	// collectorSuccessDesc reports whether the last scrape of a given AWS
	// collector succeeded.
	collectorSuccessDesc = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, subsystem, "collector_success"),
		"Was the last scrape of the AWS metrics successful.",
		[]string{"collector"},
		nil,
	)
	// collectorLastScrapeErrorDesc reports whether the most recent scrape of a
	// single collector failed. 1 indicates an error, 0 success.
	collectorLastScrapeErrorDesc = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_error"),
		"Was the last scrape an error. 1 indicates an error.",
		[]string{"provider", "collector"},
		nil,
	)
	// collectorDurationDesc records how long the last scrape of a single
	// collector took, in seconds.
	collectorDurationDesc = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_duration_seconds"),
		"Duration of the last scrape in seconds.",
		[]string{"provider", "collector"},
		nil,
	)
	// collectorScrapesTotalCounter counts every scrape attempt per collector.
	collectorScrapesTotalCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "scrapes_total"),
			Help: "Total number of scrapes for a collector.",
		},
		[]string{"provider", "collector"},
	)
	// collectorLastScrapeTime exports the Unix timestamp of the last scrape of
	// a single collector, so staleness can be alerted on in Prometheus.
	collectorLastScrapeTime = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_time"),
		// Fixed: help text previously contained a stray trailing "W".
		"Time of the last scrape.",
		[]string{"provider", "collector"},
		nil,
	)
	// providerLastScrapeTime exports the Unix timestamp of the last
	// provider-level scrape.
	providerLastScrapeTime = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_time"),
		"Time of the last scrape.",
		[]string{"provider"},
		nil,
	)
	// providerLastScrapeDurationDesc records how long the last provider-level
	// scrape took, in seconds.
	providerLastScrapeDurationDesc = prometheus.NewDesc(
		prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_duration_seconds"),
		"Duration of the last scrape in seconds.",
		[]string{"provider"},
		nil,
	)
	// providerScrapesTotalCounter counts every provider-level scrape attempt.
	providerScrapesTotalCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "scrapes_total"),
			Help: "Total number of scrapes.",
		},
		[]string{"provider"},
	)
)

var services = []string{"S3"}
Expand Down Expand Up @@ -85,6 +135,9 @@ func New(config *Config) (*AWS, error) {

func (a *AWS) RegisterCollectors(registry provider.Registry) error {
log.Printf("Registering %d collectors for AWS", len(a.collectors))
registry.MustRegister(
collectorScrapesTotalCounter,
)
for _, c := range a.collectors {
if err := c.Register(registry); err != nil {
return err
Expand All @@ -94,6 +147,13 @@ func (a *AWS) RegisterCollectors(registry provider.Registry) error {
}

func (a *AWS) Describe(ch chan<- *prometheus.Desc) {
ch <- collectorLastScrapeErrorDesc
ch <- collectorDurationDesc
ch <- providerLastScrapeErrorDesc
ch <- providerLastScrapeDurationDesc
ch <- collectorLastScrapeTime
ch <- providerLastScrapeTime
ch <- collectorSuccessDesc
for _, c := range a.collectors {
if err := c.Describe(ch); err != nil {
log.Printf("Error describing collector %s: %s", c.Name(), err)
Expand All @@ -102,18 +162,28 @@ func (a *AWS) Describe(ch chan<- *prometheus.Desc) {
}

// Collect scrapes every registered collector concurrently, emitting per-collector
// and provider-level operational metrics (error flag, duration, last-scrape time,
// and scrape counters) alongside the collectors' own metrics.
//
// NOTE(review): this block appears to be a rendered diff with both the removed and
// the added version of some lines present (no +/- markers survived extraction).
// Each flagged pair below must be reduced to a single line in the real source.
func (a *AWS) Collect(ch chan<- prometheus.Metric) {
start := time.Now()
wg := &sync.WaitGroup{}
wg.Add(len(a.collectors))
for _, c := range a.collectors {
go func(c provider.Collector) {
now := time.Now()
defer wg.Done()
// NOTE(review): diff artifact — the next two lines are the pre-change and
// post-change versions of the same declaration; only one belongs here.
// The post-change semantics treat collectorSuccess as an ERROR flag
// (0 = success, 1 = error) despite its name.
collectorSuccess := 1.0
collectorSuccess := 0.0
if err := c.Collect(ch); err != nil {
// NOTE(review): diff artifact — pre-/post-change pair; keep only one.
collectorSuccess = 0.0
collectorSuccess = 1.0
log.Printf("Error collecting metrics from collector %s: %s", c.Name(), err)
}
ch <- prometheus.MustNewConstMetric(collectorLastScrapeErrorDesc, prometheus.GaugeValue, collectorSuccess, subsystem, c.Name())
ch <- prometheus.MustNewConstMetric(collectorDurationDesc, prometheus.GaugeValue, time.Since(now).Seconds(), subsystem, c.Name())
// Unix timestamp of this scrape; lets Prometheus alert on stale data.
ch <- prometheus.MustNewConstMetric(collectorLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem, c.Name())
// NOTE(review): with the post-change value semantics (1 = error), emitting the
// same variable under collectorSuccessDesc ("Was the last scrape ... successful")
// inverts that metric's documented meaning — confirm intended polarity.
ch <- prometheus.MustNewConstMetric(collectorSuccessDesc, prometheus.GaugeValue, collectorSuccess, c.Name())
collectorScrapesTotalCounter.WithLabelValues(subsystem, c.Name()).Inc()
}(c)
}
wg.Wait()
// Provider-level error flag is hard-coded to 0 here; per-collector failures are
// surfaced via the collector-level metrics above.
ch <- prometheus.MustNewConstMetric(providerLastScrapeErrorDesc, prometheus.GaugeValue, 0.0, subsystem)
ch <- prometheus.MustNewConstMetric(providerLastScrapeDurationDesc, prometheus.GaugeValue, time.Since(start).Seconds(), subsystem)
ch <- prometheus.MustNewConstMetric(providerLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem)
providerScrapesTotalCounter.WithLabelValues(subsystem).Inc()
}
1 change: 1 addition & 0 deletions pkg/aws/aws_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func Test_RegisterCollectors(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
ctrl := gomock.NewController(t)
r := mock_provider.NewMockRegistry(ctrl)
r.EXPECT().MustRegister(gomock.Any()).AnyTimes()
c := mock_provider.NewMockCollector(ctrl)
if tc.register != nil {
c.EXPECT().Register(r).DoAndReturn(tc.register).Times(tc.numCollectors)
Expand Down
16 changes: 16 additions & 0 deletions pkg/google/gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ var (
},
[]string{"provider", "collector"},
)
// collectorLastScrapeTime exports the Unix timestamp of the last scrape of a
// single GCP collector, so data staleness can be alerted on in Prometheus.
collectorLastScrapeTime = prometheus.NewDesc(
	prometheus.BuildFQName(cloudcost_exporter.ExporterName, "collector", "last_scrape_time"),
	// Fixed: help text previously contained a stray trailing "W".
	"Time of the last scrape.",
	[]string{"provider", "collector"},
	nil,
)
providerLastScrapeTime = prometheus.NewDesc(
prometheus.BuildFQName(cloudcost_exporter.ExporterName, "", "last_scrape_time"),
"Time of the last scrape.",
[]string{"provider"},
nil,
)
)

type GCP struct {
Expand Down Expand Up @@ -163,6 +175,8 @@ func (g *GCP) Describe(ch chan<- *prometheus.Desc) {
ch <- collectorDurationDesc
ch <- providerLastScrapeErrorDesc
ch <- providerLastScrapeDurationDesc
ch <- collectorLastScrapeTime
ch <- providerLastScrapeTime
for _, c := range g.collectors {
if err := c.Describe(ch); err != nil {
log.Printf("Error describing collector %s: %s", c.Name(), err)
Expand All @@ -187,12 +201,14 @@ func (g *GCP) Collect(ch chan<- prometheus.Metric) {
log.Printf("Collector(%s) collect respose=%.2f", c.Name(), collectorSuccess)
ch <- prometheus.MustNewConstMetric(collectorLastScrapeErrorDesc, prometheus.GaugeValue, collectorSuccess, subsystem, c.Name())
ch <- prometheus.MustNewConstMetric(collectorDurationDesc, prometheus.GaugeValue, time.Since(now).Seconds(), subsystem, c.Name())
ch <- prometheus.MustNewConstMetric(collectorLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem, c.Name())
collectorScrapesTotalCounter.WithLabelValues(subsystem, c.Name()).Inc()
}(c)
}
wg.Wait()
// When can the error actually happen? Potentially if all the collectors fail?
ch <- prometheus.MustNewConstMetric(providerLastScrapeErrorDesc, prometheus.GaugeValue, 0.0, subsystem)
ch <- prometheus.MustNewConstMetric(providerLastScrapeDurationDesc, prometheus.GaugeValue, time.Since(start).Seconds(), subsystem)
ch <- prometheus.MustNewConstMetric(providerLastScrapeTime, prometheus.GaugeValue, float64(time.Now().Unix()), subsystem)
providerScrapesTotalCounter.WithLabelValues(subsystem).Inc()
}
32 changes: 26 additions & 6 deletions pkg/google/gcp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ func TestGCP_CollectMetrics(t *testing.T) {
Value: 0,
MetricType: prometheus.GaugeValue,
},
{
FqName: "cloudcost_exporter_collector_last_scrape_time",
Labels: utils.LabelMap{"provider": "gcp", "collector": "test"},
Value: 0,
MetricType: prometheus.GaugeValue,
},
{
FqName: "cloudcost_exporter_last_scrape_error",
Labels: utils.LabelMap{"provider": "gcp"},
Expand Down Expand Up @@ -117,6 +123,12 @@ func TestGCP_CollectMetrics(t *testing.T) {
Value: 0,
MetricType: prometheus.GaugeValue,
}, {
FqName: "cloudcost_exporter_collector_last_scrape_time",
Labels: utils.LabelMap{"provider": "gcp", "collector": "test"},
Value: 0,
MetricType: prometheus.GaugeValue,
},
{
FqName: "cloudcost_exporter_collector_last_scrape_error",
Labels: utils.LabelMap{"provider": "gcp", "collector": "test"},
Value: 0,
Expand All @@ -128,14 +140,16 @@ func TestGCP_CollectMetrics(t *testing.T) {
Value: 0,
MetricType: prometheus.GaugeValue,
},

{
FqName: "cloudcost_exporter_last_scrape_error",
Labels: utils.LabelMap{"provider": "gcp"},
FqName: "cloudcost_exporter_collector_last_scrape_time",
Labels: utils.LabelMap{"provider": "gcp", "collector": "test"},
Value: 0,
MetricType: prometheus.GaugeValue,
},

{
FqName: "cloudcost_exporter_last_scrape_duration_seconds",
FqName: "cloudcost_exporter_last_scrape_error",
Labels: utils.LabelMap{"provider": "gcp"},
Value: 0,
MetricType: prometheus.GaugeValue,
Expand Down Expand Up @@ -176,13 +190,19 @@ func TestGCP_CollectMetrics(t *testing.T) {
wg.Done()

wg.Wait()
ignoredMetricSuffix := []string{"duration_seconds", "last_scrape_time"}
// I don't love using a named loop, but this allows the inner loop to properly continue if the condition has been met.
metricsLoop:
for _, expectedMetric := range tt.expectedMetrics {
metric := utils.ReadMetrics(<-ch)
// We don't care about the value for the scrape durations, just that it exists and is returned in the order we expect.
if strings.Contains(metric.FqName, "duration_seconds") {
require.Equal(t, expectedMetric.FqName, metric.FqName)
continue
for _, suffix := range ignoredMetricSuffix {
if strings.Contains(metric.FqName, suffix) {
require.Equal(t, expectedMetric.FqName, metric.FqName)
continue metricsLoop
}
}

require.Equal(t, expectedMetric, metric)
}

Expand Down

0 comments on commit 4fba148

Please sign in to comment.