Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions .github/resources/slo-report-thresholds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Default SLO thresholds configuration
# Users can override/extend by providing custom thresholds via:
# - thresholds_yaml input (inline YAML)
# - thresholds_yaml_path input (file path, default: .slo/thresholds.yaml)
#
# Example custom config:
# metrics:
# - name: read_latency_ms # Override specific metric
# warning_change_percent: 10.0
# - pattern: critical_* # Add new pattern
# critical_change_percent: 25.0

# Global threshold for considering change as "neutral" (stable)
neutral_change_percent: 5.0

# Default thresholds applied to all metrics unless overridden
default:
# Regression/improvement percentage thresholds
warning_change_percent: 20.0 # > 20% change triggers warning
critical_change_percent: 50.0 # > 50% change triggers failure

# Metric-specific thresholds
# Patterns support wildcards: *_availability, read_*, etc.
metrics:
# Availability metrics (should stay high)
- pattern: "*_availability"
direction: higher_is_better
warning_min: 99.0 # < 99% triggers warning
critical_min: 95.0 # < 95% triggers failure
warning_change_percent: 1.0 # > 1% drop is significant

# Latency metrics (should stay low)
- pattern: "*_latency_*"
direction: lower_is_better
warning_change_percent: 30.0 # > 30% increase triggers warning
critical_change_percent: 100.0 # > 100% (2x) triggers failure

- pattern: "*_duration_*"
direction: lower_is_better
warning_change_percent: 30.0
critical_change_percent: 100.0

# Throughput metrics (should stay stable or increase)
- pattern: "*_throughput"
direction: higher_is_better
warning_change_percent: 25.0 # > 25% drop triggers warning
critical_change_percent: 50.0 # > 50% drop triggers failure

- pattern: "*_qps"
direction: higher_is_better
warning_change_percent: 25.0
critical_change_percent: 50.0

- pattern: "*_rps"
direction: higher_is_better
warning_change_percent: 25.0
critical_change_percent: 50.0

# Error/failure metrics (should stay at zero)
- pattern: "*_error*"
direction: lower_is_better
warning_max: 0.1 # > 0.1% error rate triggers warning
critical_max: 1.0 # > 1% error rate triggers failure

- pattern: "*_failure*"
direction: lower_is_better
warning_max: 0.1
critical_max: 1.0
- pattern: "*_node_hints_misses_total*"
direction: lower_is_better
critical_max: 0 # any value above zero fails the node_hints workload
33 changes: 26 additions & 7 deletions .github/scripts/build-slo-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@ Usage:
--tag <docker-tag> \
--src-path <sdk-path> \
--job-name <job-name> \
--ref <git-ref>
--ref <git-ref> \
--fallback-image <docker-tag>

Options:
--context Docker build context directory (e.g. $GITHUB_WORKSPACE/current).
--tag Docker image tag to build (e.g. ydb-app-current).
--src-path Value for Docker build arg SRC_PATH (e.g. native/table).
--job-name Value for Docker build arg JOB_NAME (e.g. native-table).
--ref Value for Docker build arg REF (e.g. branch name / sha).
--context Docker build context directory (e.g. $GITHUB_WORKSPACE/current).
--tag Docker image tag to build (e.g. ydb-app-current).
--src-path Value for Docker build arg SRC_PATH (e.g. native/table).
--job-name Value for Docker build arg JOB_NAME (e.g. native-table).
--ref Value for Docker build arg REF (e.g. branch name / sha).
--fallback-image Image tag to return if initial Docker image build fails
EOF
}

Expand All @@ -32,6 +34,7 @@ tag=""
ref=""
src_path=""
job_name=""
fallback_image=""

while [[ $# -gt 0 ]]; do
case "$1" in
Expand All @@ -55,6 +58,10 @@ while [[ $# -gt 0 ]]; do
job_name="${2:-}"
shift 2
;;
--fallback-image)
fallback_image="${2:-}"
shift 2
;;
-h|--help)
usage
exit 0
Expand All @@ -67,7 +74,7 @@ done

if [[ -z "$context_dir" || -z "$tag" || -z "$src_path" || -z "$job_name" || -z "$ref" ]]; then
usage
exit 2
die "Incomplete argument set"
fi

[[ -d "$context_dir" ]] || die "--context does not exist: $context_dir"
Expand All @@ -82,10 +89,22 @@ echo " SRC_PATH: $src_path"
echo " JOB_NAME: $job_name"

(
set +e
cd "$context_dir"
docker build -t "$tag" \
--build-arg "SRC_PATH=$src_path" \
--build-arg "JOB_NAME=$job_name" \
--build-arg "REF=$ref" \
-f "$dockerfile" .
exit_code=$?
echo "Docker build exit code: $exit_code"
if [ $exit_code -ne 0 ]; then
if [[ -z "$fallback_image" ]]; then
die "Docker build failed and --fallback-image is not set" >&2
fi

echo "Baseline build failed, using fallback image: $fallback_image"
docker tag "$fallback_image" "$tag"
fi
set -e
)
1 change: 1 addition & 0 deletions .github/workflows/slo-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
github_run_id: ${{ github.event.workflow_run.id }}
thresholds_yaml_path: "${{ github.workspace }}/current/.github/resources/slo-report-thresholds.yaml"
remove-slo-label:
needs: test-ydb-slo-action
if: always() && github.event.workflow_run.event == 'pull_request'
Expand Down
46 changes: 33 additions & 13 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,26 +45,46 @@ jobs:
name: database-sql-table
path: ./database/sql/table
label: database/sql/table
run_extra_args: ''
create_extra_args: ''
- id: database_sql_query
name: database-sql-query
path: ./database/sql/query
label: database/sql/query
run_extra_args: ''
create_extra_args: ''
- id: native_query
name: native-query
path: ./native/query
label: native/query
run_extra_args: ''
create_extra_args: ''
- id: native_table
name: native-table
path: ./native/table
label: native/table
run_extra_args: ''
create_extra_args: ''
- id: native_table_over_query_service
name: native-table-over-query-service
path: ./native/table/over/query/service
label: native/table/over/query/service
run_extra_args: ''
create_extra_args: ''
- id: native_bulk_upsert
name: native-bulk-upsert
path: ./native/bulk-upsert
label: native/bulk-upsert
run_extra_args: '-batch-size=10'
create_extra_args: ''
- id: native_node_hints
name: native-node-hints
path: ./native/node_hints
label: native/node_hints
slo_workload_read_max_rps: 100
slo_workload_write_max_rps: 100
run_extra_args: '-batch-size=10'
create_extra_args: '-min-partitions-count 10'

concurrency:
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
Expand Down Expand Up @@ -156,39 +176,39 @@ jobs:
--tag "ydb-app-baseline" \
--ref "${{ steps.baseline.outputs.ref }}" \
--src-path "${{ matrix.sdk.label }}" \
--job-name "${{ matrix.sdk.name }}"
--job-name "${{ matrix.sdk.name }}" \
--fallback-image "ydb-app-current"

- name: Initialize YDB SLO
uses: ydb-platform/ydb-slo-action/init@main
id: ydb_slo
uses: ydb-platform/ydb-slo-action/init@c938a16458c1e0a0ef440c0a0cd6fbc3a7991963
with:
github_issue: ${{ github.event.inputs.github_issue }}
github_token: ${{ secrets.GITHUB_TOKEN }}
workload_name: ${{ matrix.sdk.name }}
workload_current_ref: ${{ github.head_ref || github.ref_name }}
workload_baseline_ref: ${{ steps.baseline.outputs.ref }}
disable_compose_profiles: "${{ matrix.sdk.id == 'native_node_hints' && 'chaos' || '' }}"

- name: Prepare SLO Database
run: |
echo "Preparing SLO database..."
CREATE_EXTRA_ARGS="${{ matrix.sdk.create_extra_args }}"
docker run --rm --network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
ydb-app-current create grpc://ydb:2136 /Root/testdb
ydb-app-current create grpc://ydb:2136 /Root/testdb $CREATE_EXTRA_ARGS

- name: Run SLO Tests (parallel)
timeout-minutes: 15
env:
DURATION: ${{ matrix.sdk.slo_workload_duration_seconds || 600 }}
READ_RPS: ${{ matrix.sdk.slo_workload_read_max_rps || 1000 }}
WRITE_RPS: ${{ matrix.sdk.slo_workload_write_max_rps || 1000 }}
RUN_EXTRA_ARGS: ${{ format('{0} -prometheus-endpoint {1}',matrix.sdk.run_extra_args,steps.ydb_slo.outputs.ydb-prometheus-url)}}
run: |
DURATION=${{ inputs.slo_workload_duration_seconds || 600 }}
READ_RPS=${{ inputs.slo_workload_read_max_rps || 1000 }}
WRITE_RPS=${{ inputs.slo_workload_write_max_rps || 1000 }}

EXTRA_ARGS=""
if [ "${{ matrix.sdk.id }}" = "native_bulk_upsert" ]; then
EXTRA_ARGS="--batch-size=10"
fi

ARGS="run grpc://ydb:2136 /Root/testdb \
-otlp-endpoint prometheus:9090 \
-report-period 250 \
Expand All @@ -197,7 +217,7 @@ jobs:
-write-rps $WRITE_RPS \
-read-timeout 100 \
-write-timeout 100 \
$EXTRA_ARGS"
$RUN_EXTRA_ARGS"

echo "Starting ydb-app-current..."
docker run -d \
Expand Down
1 change: 0 additions & 1 deletion tests/slo/database/sql/query/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ func main() {
go w.Metrics(ctx, &wg, metricsRL)

wg.Wait()
w.FailOnError()
default:
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
}
Expand Down
1 change: 0 additions & 1 deletion tests/slo/database/sql/table/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ func main() {
go w.Metrics(ctx, &wg, metricsRL)

wg.Wait()
w.FailOnError()
default:
panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
}
Expand Down
2 changes: 2 additions & 0 deletions tests/slo/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ go 1.24.3
toolchain go1.24.10

require (
github.com/prometheus/client_golang v1.3.0
github.com/prometheus/common v0.7.0
github.com/ydb-platform/gorm-driver v0.1.3
github.com/ydb-platform/ydb-go-sdk-auth-environ v0.3.0
github.com/ydb-platform/ydb-go-sdk/v3 v3.67.0
Expand Down
2 changes: 2 additions & 0 deletions tests/slo/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1905,6 +1905,7 @@ github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndr
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v0.9.3-0.20190127221311-3c4408c8b829/go.mod h1:p2iRAGwDERtqlqzRXnrOVns+ignqQo//hLXqYxZYVNs=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.3.0 h1:miYCvYqFXtl/J9FIy8eNpBfYthAEFg+Ys0XyUVEcDsc=
github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeDPbaTKGT+JTgUa3og=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
Expand All @@ -1918,6 +1919,7 @@ github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk
github.com/prometheus/client_model v0.6.0/go.mod h1:NTQHnmxFpouOD0DpvP4XujX3CdOAGQPoaGhyTchlyt8=
github.com/prometheus/common v0.2.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4=
github.com/prometheus/common v0.7.0 h1:L+1lyG48J1zAQXA3RBX/nG/B3gjlHq0zTt2tlbJLyCY=
github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA=
github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
Expand Down
4 changes: 3 additions & 1 deletion tests/slo/internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ type Config struct {
Time int
ShutdownTime int

BatchSize int
BatchSize int
PrometheusEndpoint string
}

func New() (*Config, error) {
Expand Down Expand Up @@ -95,6 +96,7 @@ func New() (*Config, error) {

fs.StringVar(&cfg.OTLPEndpoint, "otlp-endpoint", "", "OTLP HTTP endpoint for metrics")
fs.IntVar(&cfg.ReportPeriod, "report-period", 250, "metrics reporting period in milliseconds")
fs.StringVar(&cfg.PrometheusEndpoint, "prometheus-endpoint", "", "Prometheus endpoint")

fs.IntVar(&cfg.ReadRPS, "read-rps", 1000, "read RPS")
fs.IntVar(&cfg.WriteRPS, "write-rps", 100, "write RPS")
Expand Down
16 changes: 10 additions & 6 deletions tests/slo/internal/generator/generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,22 @@ const (
MaxLength = 40
)

type Generator struct {
type Generator interface {
Generate() (Row, error)
}

type Impl struct {
currentID RowID
mu sync.Mutex
}

func New(id RowID) *Generator {
return &Generator{
func New(id RowID) *Impl {
return &Impl{
currentID: id,
}
}

func (g *Generator) Generate() (Row, error) {
func (g *Impl) Generate() (Row, error) {
g.mu.Lock()
id := g.currentID
g.currentID++
Expand All @@ -37,15 +41,15 @@ func (g *Generator) Generate() (Row, error) {
}

var err error
e.PayloadStr, err = g.genPayloadString()
e.PayloadStr, err = genPayloadString()
if err != nil {
return Row{}, err
}

return e, nil
}

func (g *Generator) genPayloadString() (*string, error) {
func genPayloadString() (*string, error) {
l := MinLength + rand.Intn(MaxLength-MinLength+1) //nolint:gosec // speed more important

sl := make([]byte, l)
Expand Down
Loading
Loading