Skip to content

Commit ada8550

Browse files
authored
iserver-test: Add 9.2 test and some improvements (#17709)
* Add 9.2 to upgrade test
* Add inputs to workflow job to control which tests to run
* Add back 8.15
* Merge branch 'main' into iservertest-add-9.2-test
* Merge branch 'main' into iservertest-add-9.2-test
* Add log for re-apply policy
* Remove flush
* Add retry for generator in case of connection issues
* Update job conditional according to comment
1 parent c7a2c6f commit ada8550

File tree

2 files changed

+56
-37
lines changed

2 files changed

+56
-37
lines changed

.github/workflows/integration-server-test.yml

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,23 @@ name: integration-server-test
33
run-name: Integration Server Test
44

55
on:
6-
workflow_dispatch: ~
6+
workflow_dispatch:
7+
inputs:
8+
run-upgrade-tests:
9+
description: 'Run upgrade tests (SNAPSHOT)'
10+
required: false
11+
type: boolean
12+
default: true
13+
run-upgrade-bc-tests:
14+
description: 'Run upgrade tests (BC)'
15+
required: false
16+
type: boolean
17+
default: true
18+
run-standalone-tests:
19+
description: 'Run standalone-to-managed tests'
20+
required: false
21+
type: boolean
22+
default: true
723
schedule:
824
- cron: '0 2 * * 1-5'
925

@@ -31,18 +47,17 @@ jobs:
3147
uses: ./.github/workflows/generate-bc-upgrade-paths
3248

3349
run-upgrade:
34-
name: Upgrade tests (Snapshot)
50+
if: ${{ !contains(inputs.run-upgrade-tests, 'false') }}
51+
name: Upgrade tests (SNAPSHOT)
3552
runs-on: ubuntu-latest
3653
strategy:
3754
fail-fast: false
3855
matrix:
3956
upgrade-path:
40-
# Latest 8.15 cannot upgrade to latest 8.17, it can only go to 8.17.3.
41-
# With our current setup (only latest patch), we have to upgrade to intermediate latest 8.16 instead.
42-
# TODO: Maybe add support for upgrading to latest upgradable instead of absolute latest?
43-
- '8.15, 8.16, 8.17'
44-
- '8.17, 8.18, 9.0'
45-
- '8.17, 8.19, 9.1'
57+
- '8.15, 8.16, 8.17, 8.18'
58+
- '8.18, 8.19, 9.2'
59+
- '8.18, 9.0, 9.2'
60+
- '8.19, 9.1, 9.2'
4661
scenario:
4762
- 'Default'
4863
- 'Reroute'
@@ -68,6 +83,7 @@ jobs:
6883
SCENARIO="${{ matrix.scenario }}" UPGRADE_PATH="${{ matrix.upgrade-path }}" SNAPSHOT=true make integration-server-test/upgrade
6984
7085
run-upgrade-bc:
86+
if: ${{ !contains(inputs.run-upgrade-bc-tests, 'false') }}
7187
name: Upgrade tests (BC)
7288
runs-on: ubuntu-latest
7389
needs: prepare
@@ -100,6 +116,7 @@ jobs:
100116
SCENARIO="${{ matrix.scenario }}" UPGRADE_PATH="${{ matrix.upgrade-path }}" make integration-server-test/upgrade
101117
102118
run-standalone:
119+
if: ${{ !contains(inputs.run-standalone-tests, 'false') }}
103120
name: Standalone-to-managed tests
104121
runs-on: ubuntu-latest
105122
strategy:

integrationservertest/internal/gen/generator.go

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, in
7575
if !integrations {
7676
return fmt.Errorf("failed to wait for apm server: %w", err)
7777
}
78+
g.logger.Info("re-apply apm policy")
7879
if err = g.reapplyAPMPolicy(ctx, version); err != nil {
7980
return fmt.Errorf("failed to re-apply apm policy: %w", err)
8081
}
@@ -84,21 +85,12 @@ func (g *Generator) RunBlockingWait(ctx context.Context, version ech.Version, in
8485
}
8586

8687
g.logger.Info("ingest data")
87-
if err := g.runBlocking(ctx, version); err != nil {
88+
if err := g.retryRunBlocking(ctx, version, 2); err != nil {
8889
return fmt.Errorf("cannot run generator: %w", err)
8990
}
9091

91-
// With Fleet managed APM server, we can trigger metrics flush.
92-
if integrations {
93-
g.logger.Info("flush apm metrics")
94-
if err := g.flushAPMMetrics(ctx, version); err != nil {
95-
return fmt.Errorf("cannot flush apm metrics: %w", err)
96-
}
97-
return nil
98-
}
99-
100-
// With standalone, we don't have Fleet, so simply just wait for some arbitrary time.
101-
time.Sleep(180 * time.Second)
92+
// Simply wait for some arbitrary time, for the data to be flushed.
93+
time.Sleep(200 * time.Second)
10294
return nil
10395
}
10496

@@ -159,9 +151,35 @@ func (g *Generator) runBlocking(ctx context.Context, version ech.Version) error
159151
return gen.RunBlocking(ctx)
160152
}
161153

154+
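// retryRunBlocking executes runBlocking once and, if that attempt fails, retries it up to retryTimes more times, returning the error of the last retry. NOTE: with retryTimes <= 0 no retry runs and nil is returned even though the first attempt failed.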
155+
func (g *Generator) retryRunBlocking(ctx context.Context, version ech.Version, retryTimes int) error {
156+
// No error, don't need to retry.
157+
if err := g.runBlocking(ctx, version); err == nil {
158+
return nil
159+
}
160+
161+
// Otherwise, retry until an attempt succeeds or we run out of attempts.
162+
var finalErr error
163+
for i := 0; i < retryTimes; i++ {
164+
// Back off before retrying: no wait before the first retry, then 30s more for each subsequent retry.
165+
time.Sleep(time.Duration(i) * 30 * time.Second)
166+
167+
g.logger.Info(fmt.Sprintf("retrying ingest data attempt %d", i+1))
168+
err := g.runBlocking(ctx, version)
169+
// The retry succeeded, so simply return.
170+
if err == nil {
171+
return nil
172+
}
173+
174+
finalErr = err
175+
}
176+
177+
return finalErr
178+
}
179+
162180
func (g *Generator) reapplyAPMPolicy(ctx context.Context, version ech.Version) error {
163181
policyID := "elastic-cloud-apm"
164-
description := fmt.Sprintf("%s %s", version, rand.Text()[5:])
182+
description := fmt.Sprintf("%s %s", version, rand.Text()[:10])
165183

166184
if err := g.kbc.UpdatePackagePolicyDescriptionByID(ctx, policyID, version, description); err != nil {
167185
return fmt.Errorf(
@@ -173,22 +191,6 @@ func (g *Generator) reapplyAPMPolicy(ctx context.Context, version ech.Version) e
173191
return nil
174192
}
175193

176-
// flushAPMMetrics sends an update to the Fleet APM package policy in order
177-
// to trigger the flushing of in-flight APM metrics.
178-
func (g *Generator) flushAPMMetrics(ctx context.Context, version ech.Version) error {
179-
// Re-applying the Elastic APM policy is enough to trigger final aggregations
180-
// in APM Server and flush of in-flight metrics.
181-
if err := g.reapplyAPMPolicy(ctx, version); err != nil {
182-
return err
183-
}
184-
185-
// APM Server needs some time to flush all metrics, and we don't have any
186-
// visibility on when this completes.
187-
// NOTE: This value comes from empirical observations.
188-
time.Sleep(120 * time.Second)
189-
return nil
190-
}
191-
192194
type apmInfoResp struct {
193195
Version string `json:"version"`
194196
PublishReady bool `json:"publish_ready"`

0 commit comments

Comments
 (0)