Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/docker-tests-8.4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ jobs:
- 'VERSION=8.4 GODOG_FEATURE=host_management.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=maintenance.84.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=offline_mode.84.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=offline_mode_az_limit.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=priority.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=readonly_filesystem.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=recovery.feature make test'
Expand Down
38 changes: 26 additions & 12 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ type App struct {
switchHelper mysql.ISwitchHelper
lostQuorumTime time.Time

optSyncer OptimizationSyncer
optController OptimizationController
optSyncer OptimizationSyncer
optController OptimizationController
offlineModeFilter OfflineModeFilter

lagResetupper *resetup.LagResetupper
}
Expand Down Expand Up @@ -102,6 +103,7 @@ func NewApp(configFile, logLevel string, interactive bool) (*App, error) {
slaveReadPositions: make(map[string]string),
externalReplication: externalReplication,
switchHelper: switchHelper,
offlineModeFilter: NewOfflineModeFilter(config, logger),
}

// TODO: appDcs should be separate entity
Expand Down Expand Up @@ -1613,14 +1615,16 @@ func (app *App) getMasterHost(clusterState map[string]*nodestate.NodeState) (str

// repairOfflineMode reconciles offline mode on every cluster member that
// answered ping: the master via repairMasterOfflineMode, replicas via
// repairSlaveOfflineMode. pendingOfflineByAZ accumulates replicas set offline
// during this pass (keyed by availability zone) so the per-AZ offline limit
// can account for them before the next cluster state snapshot is taken.
func (app *App) repairOfflineMode(clusterState map[string]*nodestate.NodeState, master string) {
	masterNode := app.cluster.Get(master)

	// Replicas taken offline in the current iteration, per availability zone.
	pendingOfflineByAZ := make(map[string]int)
	for host, state := range clusterState {
		// Unreachable hosts cannot be repaired — skip them.
		if !state.PingOk {
			continue
		}
		if host == master {
			app.repairMasterOfflineMode(host, state)
		} else {
			app.repairSlaveOfflineMode(host, state, masterNode, clusterState[master], clusterState, pendingOfflineByAZ)
		}
	}
}
Expand All @@ -1641,7 +1645,7 @@ func (app *App) repairMasterOfflineMode(host string, state *nodestate.NodeState)
}

// nolint: gocyclo
func (app *App) repairSlaveOfflineMode(host string, state *nodestate.NodeState, masterNode *mysql.Node, masterState *nodestate.NodeState) {
func (app *App) repairSlaveOfflineMode(host string, state *nodestate.NodeState, masterNode *mysql.Node, masterState *nodestate.NodeState, clusterState map[string]*nodestate.NodeState, pendingOfflineByAZ map[string]int) {
if state.SlaveState == nil || state.SlaveState.ReplicationLag == nil {
return
}
Expand Down Expand Up @@ -1683,16 +1687,26 @@ func (app *App) repairSlaveOfflineMode(host string, state *nodestate.NodeState,
}
// online => offline, if lag has increased
if !state.IsOffline && !masterState.IsReadOnly && *state.SlaveState.ReplicationLag > app.config.OfflineModeEnableLag.Seconds() {
err := node.SetOffline()
if err != nil {
app.logger.Errorf("repair: failed to set slave %s offline: %s", host, err)
} else {
app.logger.Infof("repair: slave %s set offline, because ReplicationLag (%f s) >= OfflineModeEnableLag (%v)",
host, *state.SlaveState.ReplicationLag, app.config.OfflineModeEnableLag)
err = app.optController.Enable(node)
if app.offlineModeFilter.CanSetOffline(host, clusterState, pendingOfflineByAZ) {
err := node.SetOffline()
Comment thread
dodokek marked this conversation as resolved.
if err != nil {
app.logger.Errorf("repair: failed to set optimize replication settings on slave %s: %s", host, err)
app.logger.Errorf("repair: failed to set slave %s offline: %s", host, err)
} else {
app.logger.Infof("repair: slave %s set offline, because ReplicationLag (%f s) >= OfflineModeEnableLag (%v)",
host, *state.SlaveState.ReplicationLag, app.config.OfflineModeEnableLag)

// Track all replicas which were set offline on current step
az := getAvailabilityZone(host, app.config.OfflineModeAZSeparator)
pendingOfflineByAZ[az]++

err = app.optController.Enable(node)
if err != nil {
app.logger.Errorf("repair: failed to set optimize replication settings on slave %s: %s", host, err)
}
}
} else {
app.logger.Warnf("repair: skip setting slave %s offline: AZ offline limit reached (offline_mode_max_offline_pct=%d)",
host, app.config.OfflineModeMaxOfflinePct)
}
}

Expand Down
111 changes: 111 additions & 0 deletions internal/app/offline_mode_filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package app

import (
"math"
"strings"

nodestate "github.com/yandex/mysync/internal/app/node_state"
"github.com/yandex/mysync/internal/config"
"github.com/yandex/mysync/internal/log"
)

// OfflineModeFilter decides whether a replica may be switched to offline mode.
// Implementations receive the current cluster state snapshot plus
// pendingOfflineByAZ, a per-availability-zone count of replicas already set
// offline during the current repair iteration (the snapshot still shows those
// replicas as online, so they must be tracked separately).
type OfflineModeFilter interface {
	CanSetOffline(host string, clusterState map[string]*nodestate.NodeState, pendingOfflineByAZ map[string]int) bool
}

// NewOfflineModeFilter selects the filter implementation matching the
// configured offline_mode_max_offline_pct threshold: values <= 0 block every
// replica from going offline, values >= 100 allow all of them, and anything
// in between enforces a per-availability-zone percentage cap.
func NewOfflineModeFilter(cfg *config.Config, logger *log.Logger) OfflineModeFilter {
	pct := cfg.OfflineModeMaxOfflinePct
	switch {
	case pct <= 0:
		logger.Infof("using neverAllowOfflineFilter, no replicas allowed to go offline")
		return &neverAllowOfflineFilter{logger: logger}
	case pct >= 100:
		logger.Infof("using alwaysAllowOfflineFilter, all replicas allowed to go offline")
		return &alwaysAllowOfflineFilter{logger: logger}
	default:
		logger.Infof(
			"offline mode filter: using azLimitedOfflineFilter (offline_mode_max_offline_pct=%d%%, offline_mode_az_separator=%q)",
			pct, cfg.OfflineModeAZSeparator,
		)
		return &azLimitedOfflineFilter{
			maxOfflinePct: pct,
			azSeparator:   cfg.OfflineModeAZSeparator,
			logger:        logger,
		}
	}
}

// alwaysAllowOfflineFilter is selected when offline_mode_max_offline_pct is
// 100 (or more): every replica is allowed to go offline.
type alwaysAllowOfflineFilter struct {
	logger *log.Logger
}

// CanSetOffline always permits the transition; cluster state and pending
// counts are ignored.
func (f *alwaysAllowOfflineFilter) CanSetOffline(host string, _ map[string]*nodestate.NodeState, _ map[string]int) bool {
	return true
}

// neverAllowOfflineFilter is selected when offline_mode_max_offline_pct is
// 0 (or negative): no replica is ever allowed to go offline.
type neverAllowOfflineFilter struct {
	logger *log.Logger
}

// CanSetOffline always denies the transition; cluster state and pending
// counts are ignored.
func (f *neverAllowOfflineFilter) CanSetOffline(host string, _ map[string]*nodestate.NodeState, _ map[string]int) bool {
	return false
}

// azLimitedOfflineFilter handles 0 < offline_mode_max_offline_pct < 100 by
// capping the share of offline replicas within each availability zone.
type azLimitedOfflineFilter struct {
	maxOfflinePct int    // maximum allowed percentage of offline replicas per AZ
	azSeparator   string // separator used to extract the AZ prefix from hostnames
	logger        *log.Logger
}

// CanSetOffline reports whether host may be switched offline without pushing
// the share of offline replicas in its availability zone above maxOfflinePct.
// The master is excluded from the per-AZ population. pendingOfflineByAZ adds
// replicas set offline earlier in the same repair iteration, since the
// clusterState snapshot still shows them as online.
func (f *azLimitedOfflineFilter) CanSetOffline(host string, clusterState map[string]*nodestate.NodeState, pendingOfflineByAZ map[string]int) bool {
	az := getAvailabilityZone(host, f.azSeparator)

	var replicasInAZ, alreadyOffline int
	for candidate, candidateState := range clusterState {
		if candidateState.IsMaster {
			continue
		}
		if getAvailabilityZone(candidate, f.azSeparator) != az {
			continue
		}
		replicasInAZ++
		if candidateState.IsOffline {
			alreadyOffline++
		}
	}

	// Probably unreachable: the host under consideration should itself be
	// part of clusterState, making its AZ non-empty. Fail closed regardless.
	if replicasInAZ == 0 {
		return false
	}

	// Account for replicas set offline on the current iteration — they are
	// not yet reflected in the clusterState snapshot.
	pendingInAZ := pendingOfflineByAZ[az]
	totalOffline := alreadyOffline + pendingInAZ

	// Percentage of offline replicas in the AZ if this host also goes
	// offline; allowed when it stays within offline_mode_max_offline_pct.
	willBeOfflinePct := int(math.Floor(100 * float64(totalOffline+1) / float64(replicasInAZ)))

	canGoOffline := willBeOfflinePct <= f.maxOfflinePct
	f.logger.Debugf(
		"offline mode filter: host %s (az=%q): total=%d, already_offline=%d, pending=%d, will_be_offline_pct=%d%%, max_offline_pct=%d%% => can_go_offline=%v",
		host, az, replicasInAZ, alreadyOffline, pendingInAZ, willBeOfflinePct, f.maxOfflinePct, canGoOffline,
	)
	return canGoOffline
}

// getAvailabilityZone extracts the AZ name from a hostname prefix.
// The separator is configurable and is '-' by default:
// zone_123-mysql -> "zone_123".
// Hosts without the separator — and every host when the separator is empty —
// fall into the "" zone.
func getAvailabilityZone(fqdn, separator string) string {
	if separator == "" {
		return ""
	}
	az, _, found := strings.Cut(fqdn, separator)
	if !found {
		return ""
	}
	return az
}
191 changes: 191 additions & 0 deletions internal/app/offline_mode_filter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package app

import (
"testing"

"github.com/stretchr/testify/require"

nodestate "github.com/yandex/mysync/internal/app/node_state"
"github.com/yandex/mysync/internal/config"
)

// ns builds a minimal NodeState with only the IsOffline flag set — the single
// field the offline-mode filter tests inspect.
func ns(isOffline bool) *nodestate.NodeState {
	return &nodestate.NodeState{IsOffline: isOffline}
}

func TestGetAvailabilityZone(t *testing.T) {
cases := []struct {
fqdn string
separator string
want string
}{
{"vla-mydb-1.db.yandex.net", "-", "vla"},
{"rc1a-mydb-2.db.yandex.net", "-", "rc1a"},
{"mydb-1", "-", "mydb"},
// no separator in hostname means same zone as all others
{"mysql1", "-", ""},
{"standalone", "-", ""},
// empty separator means all hosts in one zone
{"vla-host-1", "", ""},
{"", "-", ""},
}
for _, tc := range cases {
require.Equal(t, tc.want, getAvailabilityZone(tc.fqdn, tc.separator), tc.fqdn)
}
}

// TestNewOfflineModeFilter checks that the factory picks the filter
// implementation matching offline_mode_max_offline_pct boundaries.
func TestNewOfflineModeFilter(t *testing.T) {
	logger := getLogger()

	cases := []struct {
		pct  int
		want any
	}{
		{pct: 0, want: &neverAllowOfflineFilter{}},
		{pct: -1, want: &neverAllowOfflineFilter{}},
		{pct: 100, want: &alwaysAllowOfflineFilter{}},
		{pct: 110, want: &alwaysAllowOfflineFilter{}},
		{pct: 50, want: &azLimitedOfflineFilter{}},
	}
	for _, tc := range cases {
		got := NewOfflineModeFilter(&config.Config{OfflineModeMaxOfflinePct: tc.pct}, logger)
		require.IsType(t, tc.want, got)
	}
}

// TestAlwaysAllowOfflineFilter verifies the filter permits going offline
// regardless of cluster state — even with nil inputs or offline peers.
func TestAlwaysAllowOfflineFilter(t *testing.T) {
	f := &alwaysAllowOfflineFilter{}

	require.True(t, f.CanSetOffline("any", nil, nil))

	clusterState := map[string]*nodestate.NodeState{
		"vla-host-1": ns(false),
		"vla-host-2": ns(true),
	}
	require.True(t, f.CanSetOffline("any", clusterState, nil))
}

// TestAzLimitedOfflineFilter_CanSetOffline exercises the per-AZ offline cap
// across threshold boundaries, zone isolation, and hostnames without an AZ
// prefix (which all share the "" zone).
func TestAzLimitedOfflineFilter_CanSetOffline(t *testing.T) {
	const sep = "-"

	cases := []struct {
		name  string
		pct   int
		host  string
		state map[string]*nodestate.NodeState
		want  bool
	}{
		{
			// The filter returns false for an empty AZ population, so this is
			// fail-closed, not fail-open: the previous name was misleading.
			name:  "empty state fail-closed",
			pct:   50,
			host:  "vla-host-1",
			state: map[string]*nodestate.NodeState{},
			want:  false,
		},
		{
			name: "no-dash hostnames same zone pct=50 first allowed",
			pct:  50,
			host: "mysql2",
			state: map[string]*nodestate.NodeState{
				"mysql2": ns(false),
				"mysql3": ns(false),
			},
			want: true,
		},
		{
			name: "no-dash hostnames same zone pct=50 second blocked",
			pct:  50,
			host: "mysql3",
			state: map[string]*nodestate.NodeState{
				"mysql2": ns(true),
				"mysql3": ns(false),
			},
			want: false,
		},
		{
			name: "pct=50 two hosts none offline first allowed",
			pct:  50,
			host: "vla-host-1",
			state: map[string]*nodestate.NodeState{
				"vla-host-1": ns(false),
				"vla-host-2": ns(false),
			},
			want: true,
		},
		{
			name: "pct=50 two hosts one offline second blocked",
			pct:  50,
			host: "vla-host-2",
			state: map[string]*nodestate.NodeState{
				"vla-host-1": ns(true),
				"vla-host-2": ns(false),
			},
			want: false,
		},
		{
			name: "pct=32 three online hosts no one allowed to go offline",
			pct:  32,
			host: "vla-host-1",
			state: map[string]*nodestate.NodeState{
				"vla-host-1": ns(false),
				"vla-host-2": ns(false),
				"vla-host-3": ns(false),
			},
			want: false,
		},
		{
			name: "pct=33 three online hosts 1 allowed to go offline",
			pct:  33,
			host: "vla-host-1",
			state: map[string]*nodestate.NodeState{
				"vla-host-1": ns(false),
				"vla-host-2": ns(false),
				"vla-host-3": ns(false),
			},
			want: true,
		},
		{
			name: "other AZ offline does not affect own AZ",
			pct:  50,
			host: "vla-host-1",
			state: map[string]*nodestate.NodeState{
				"sas-host-1": ns(true),
				"sas-host-2": ns(true),
				"vla-host-1": ns(false),
				"vla-host-2": ns(false),
			},
			want: true,
		},
	}

	logger := getLogger()

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			f := &azLimitedOfflineFilter{maxOfflinePct: tc.pct, azSeparator: sep, logger: logger}
			require.Equal(t, tc.want, f.CanSetOffline(tc.host, tc.state, map[string]int{}))
		})
	}
}

func TestAzLimitedOfflineFilter_EmptySeparator_AllHostsSameZone(t *testing.T) {
logger := getLogger()

f := &azLimitedOfflineFilter{maxOfflinePct: 50, azSeparator: "", logger: logger}
state := map[string]*nodestate.NodeState{
"vla-host-1": ns(false),
"sas-host-1": ns(false),
}
require.True(t, f.CanSetOffline("vla-host-1", state, map[string]int{}))

state["vla-host-1"] = ns(true)
require.False(t, f.CanSetOffline("sas-host-1", state, map[string]int{}))
}

// TestAzLimitedOfflineFilter_PendingOfflineByAZ verifies that replicas set
// offline earlier in the same iteration (tracked via pendingOfflineByAZ,
// not yet visible in the cluster state snapshot) count toward the AZ cap,
// and that pending counts in a different AZ have no effect.
func TestAzLimitedOfflineFilter_PendingOfflineByAZ(t *testing.T) {
	logger := getLogger()
	const sep = "-"

	f := &azLimitedOfflineFilter{maxOfflinePct: 50, azSeparator: sep, logger: logger}

	// Two online replicas in the "vla" zone.
	state := map[string]*nodestate.NodeState{
		"vla-host-1": ns(false),
		"vla-host-2": ns(false),
	}

	// No pending offline replicas: the first host may go offline (50%).
	require.True(t, f.CanSetOffline("vla-host-1", state, map[string]int{}))

	// One pending offline in "vla": a second offline would exceed the cap.
	pending := map[string]int{"vla": 1}
	require.False(t, f.CanSetOffline("vla-host-2", state, pending))

	// Pending offline in another AZ does not restrict the "vla" zone.
	pendingOtherAZ := map[string]int{"sas": 1}
	require.True(t, f.CanSetOffline("vla-host-1", state, pendingOtherAZ))
}
Loading
Loading