Skip to content

Commit

Permalink
Query Path Fixes (#1079)
Browse files Browse the repository at this point in the history
* Fixed issue with increasing hedged requests

Signed-off-by: Joe Elliott <[email protected]>

* Count failed blocks on 404

Signed-off-by: Joe Elliott <[email protected]>

* changelog

Signed-off-by: Joe Elliott <[email protected]>

* fixed error messages

Signed-off-by: Joe Elliott <[email protected]>

* return metrics on not found

Signed-off-by: Joe Elliott <[email protected]>

* fix metric

Signed-off-by: Joe Elliott <[email protected]>

* protect against overflow

Signed-off-by: Joe Elliott <[email protected]>

* restore 404

Signed-off-by: Joe Elliott <[email protected]>
  • Loading branch information
joe-elliott authored Oct 27, 2021
1 parent 23584ed commit 5124375
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 50 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
* [BUGFIX] Set span's tag `span.kind` to `client` in query-frontend [#975](https://github.com/grafana/tempo/pull/975) (@mapno)
* [BUGFIX] Nil check overrides module in the `/status` handler [#994](https://github.com/grafana/tempo/pull/994) (@mapno)
* [BUGFIX] Several bug fixes for search contention and panics [#1033](https://github.com/grafana/tempo/pull/1033) (@mdisibio)
* [BUGFIX] Fixes `tempodb_backend_hedged_roundtrips_total` to correctly count hedged roundtrips. [#1079](https://github.com/grafana/tempo/pull/1079) (@joe-elliott)

## v1.1.0 / 2021-08-26
* [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1)
Expand Down
14 changes: 7 additions & 7 deletions modules/frontend/tracebyidsharding.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,8 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
return
}

// if not found bail
if resp.StatusCode == http.StatusNotFound {
return
}

// if the status code is anything but happy, save the error and pass it down the line
if resp.StatusCode != http.StatusOK {
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNotFound {
// todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast?
statusCode = resp.StatusCode
bytesMsg, err := io.ReadAll(resp.Body)
Expand All @@ -113,7 +108,7 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
return
}

// successful query, read the body
// read the body
buff, err := io.ReadAll(resp.Body)
if err != nil {
_ = level.Error(s.logger).Log("msg", "error reading response body status == ok", "url", innerR.RequestURI, "err", err)
Expand All @@ -140,6 +135,11 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
}
}

// if not found bail
if resp.StatusCode == http.StatusNotFound {
return
}

// happy path
statusCode = http.StatusOK
overallTrace, _, _, _ = model.CombineTraceProtos(overallTrace, traceResp.Trace)
Expand Down
115 changes: 79 additions & 36 deletions modules/frontend/tracebyidsharding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,18 @@ func TestShardingWareDoRequest(t *testing.T) {
}

tests := []struct {
name string
status1 int
status2 int
trace1 *tempopb.Trace
trace2 *tempopb.Trace
err1 error
err2 error
failedBlockQueries int
expectedStatus int
expectedTrace *tempopb.Trace
expectedError error
name string
status1 int
status2 int
trace1 *tempopb.Trace
trace2 *tempopb.Trace
err1 error
err2 error
failedBlockQueries1 int
failedBlockQueries2 int
expectedStatus int
expectedTrace *tempopb.Trace
expectedError error
}{
{
name: "empty returns",
Expand Down Expand Up @@ -202,7 +203,6 @@ func TestShardingWareDoRequest(t *testing.T) {
trace2: trace1,
expectedError: errors.New("booo"),
},

{
name: "500+err",
status1: 500,
Expand All @@ -211,23 +211,54 @@ func TestShardingWareDoRequest(t *testing.T) {
expectedError: errors.New("booo"),
},
{
name: "failed block queries <= max",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries: 1,
expectedStatus: 200,
expectedTrace: trace,
name: "failedBlocks under: 200+200",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries1: 1,
failedBlockQueries2: 1,
expectedStatus: 200,
expectedTrace: trace,
},
{
name: "failedBlocks over: 200+200",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries1: 0,
failedBlockQueries2: 5,
expectedError: errors.New("too many failed block queries 5 (max 2)"),
},
{
name: "failedBlocks under: 200+404",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 1,
failedBlockQueries2: 0,
expectedStatus: 200,
expectedTrace: trace1,
},
{
name: "failedBlocks under: 404+200",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 0,
failedBlockQueries2: 1,
expectedStatus: 200,
expectedTrace: trace1,
},
{
name: "too many failed block queries",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries: 10,
expectedError: errors.New("too many failed block queries 10 (max 2)"),
name: "failedBlocks over: 404+200",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 0,
failedBlockQueries2: 5,
expectedError: errors.New("too many failed block queries 5 (max 2)"),
},
}

Expand All @@ -239,29 +270,41 @@ func TestShardingWareDoRequest(t *testing.T) {
var trace *tempopb.Trace
var statusCode int
var err error
var failedBlockQueries int
if r.RequestURI == "/querier/api/traces/1234?mode=ingesters" {
trace = tc.trace1
statusCode = tc.status1
err = tc.err1
failedBlockQueries = tc.failedBlockQueries1
} else {
trace = tc.trace2
err = tc.err2
statusCode = tc.status2
failedBlockQueries = tc.failedBlockQueries2
}

if err != nil {
return nil, err
}

var resBytes []byte
if trace != nil {
resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
Trace: trace,
Metrics: &tempopb.TraceByIDMetrics{
FailedBlocks: uint32(tc.failedBlockQueries),
},
})
require.NoError(t, err)
resBytes := []byte("error occurred")
if statusCode != 500 {
if trace != nil {
resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
Trace: trace,
Metrics: &tempopb.TraceByIDMetrics{
FailedBlocks: uint32(failedBlockQueries),
},
})
require.NoError(t, err)
} else {
resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
Metrics: &tempopb.TraceByIDMetrics{
FailedBlocks: uint32(failedBlockQueries),
},
})
require.NoError(t, err)
}
}

return &http.Response{
Expand Down
6 changes: 3 additions & 3 deletions modules/querier/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package querier

import (
"context"
"encoding/hex"
"fmt"
"net/http"
"time"
Expand Down Expand Up @@ -67,9 +66,10 @@ func (q *Querier) TraceByIDHandler(w http.ResponseWriter, r *http.Request) {
return
}

// record not found here, but continue on so we can marshal metrics
// to the body
if resp.Trace == nil || len(resp.Trace.Batches) == 0 {
http.Error(w, fmt.Sprintf("Unable to find %s", hex.EncodeToString(byteID)), http.StatusNotFound)
return
w.WriteHeader(http.StatusNotFound)
}

if r.Header.Get(api.HeaderAccept) == api.HeaderAcceptProtobuf {
Expand Down
13 changes: 9 additions & 4 deletions tempodb/backend/instrumentation/hedged_requests.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ const (
)

var (
hedgedRequestsMetrics = promauto.NewCounter(
prometheus.CounterOpts{
hedgedRequestsMetrics = promauto.NewGauge(
prometheus.GaugeOpts{
Namespace: "tempodb",
Name: "backend_hedged_roundtrips_total",
Help: "Total number of hedged backend requests",
Help: "Total number of hedged backend requests. Registered as a gauge for code sanity. This is a counter.",
},
)
)
Expand All @@ -27,7 +27,12 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) {
ticker := time.NewTicker(hedgedMetricsPublishDuration)
go func() {
for range ticker.C {
hedgedRequestsMetrics.Add(float64(s.Snapshot().RequestedRoundTrips))
snap := s.Snapshot()
hedgedRequests := int64(snap.ActualRoundTrips) - int64(snap.RequestedRoundTrips)
if hedgedRequests < 0 {
hedgedRequests = 0
}
hedgedRequestsMetrics.Set(float64(hedgedRequests))
}
}()
}

0 comments on commit 5124375

Please sign in to comment.