Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/docker-tests-8.4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ jobs:
- 'VERSION=8.4 GODOG_FEATURE=zk_maintenance.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=manager_switchover.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=optimization.feature make test'
- 'VERSION=8.4 GODOG_FEATURE=light_maintenance_mode.feature make test'
fail-fast: false

steps:
Expand Down
21 changes: 19 additions & 2 deletions cmd/mysync/maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,45 @@ import (

var maintWait time.Duration
var maintReason string
var mode app.MaintenanceMode
var maintLight bool

var maintCmd = &cobra.Command{
Use: "maintenance",
Aliases: []string{"maint", "mnt"},
Short: "Enables or disables maintenance mode",
Long: ("When maintenance is enabled, MySync manager will not perform any actions.\n" +
"Light maintenance mode keeps MySync running, but suppresses automatic failover and switchover.\n" +
"When maintenance is disabled, MySync will analyze cluster state and remember it as correct."),
}

var maintOnCmd = &cobra.Command{
Use: "on",
Aliases: []string{"enable"},
Short: "Enable maintenance mode",
Long: "Enable maintenance mode.\n\n" +
"By default this enables normal maintenance mode, which pauses MySync manager actions.\n" +
"Use --light to enable light maintenance mode, which keeps MySync running but blocks automatic failover and switchover.",
Run: func(cmd *cobra.Command, args []string) {
if maintLight {
mode = app.LightMode
} else {
mode = app.FullMode
}
app, err := app.NewApp(configFile, logLevel, true)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
os.Exit(app.CliEnableMaintenance(maintWait, maintReason))
os.Exit(app.CliEnableMaintenance(maintWait, maintReason, mode))
},
}

var maintOffCmd = &cobra.Command{
Use: "off",
Aliases: []string{"disable"},
Short: "Disable maintenance mode",
Long: "Disable maintenance mode and wait until MySync resumes normal cluster management.",
Run: func(cmd *cobra.Command, args []string) {
app, err := app.NewApp(configFile, logLevel, true)
if err != nil {
Expand All @@ -48,7 +62,9 @@ var maintOffCmd = &cobra.Command{
}

var maintGetCmd = &cobra.Command{
Use: "get",
Use: "get",
Short: "Show whether maintenance mode is enabled",
Long: "Print current maintenance mode state from DCS.",
Run: func(cmd *cobra.Command, args []string) {
app, err := app.NewApp(configFile, logLevel, true)
if err != nil {
Expand All @@ -67,4 +83,5 @@ func init() {
maintCmd.PersistentFlags().DurationVarP(&maintWait, "wait", "w", 30*time.Second, "how long to wait for maintenance activation, 0s to return immediately")

maintOnCmd.Flags().StringVarP(&maintReason, "reason", "r", "", "reason for maintenance (e.g. ticket number)")
maintOnCmd.Flags().BoolVar(&maintLight, "light", false, "enable light maintenance mode: keeps mysync running but suppresses automatic failover and switchover")
}
145 changes: 87 additions & 58 deletions internal/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,22 @@ func (app *App) stateLost() appState {
return stateLost
}

// tryLeaveMaintenance attempts to finish maintenance mode.
// The node that holds the manager lock performs the actual cleanup in DCS
// and proceeds as manager; any other node only removes its local
// maintenance file and falls back to the candidate state, relying on the
// lock holder to clean up the shared state.
func (app *App) tryLeaveMaintenance() appState {
	if !app.AcquireLock(pathManagerLock) {
		// Another node owns the manager lock and will leave maintenance in DCS.
		app.removeMaintenanceFile()
		return stateCandidate
	}
	app.logger.Info("leaving maintenance")
	if err := app.leaveMaintenance(); err != nil {
		// Keep the maintenance file: we stay in maintenance and will retry.
		app.logger.Errorf("maintenance: failed to leave: %v", err)
		return stateMaintenance
	}
	app.removeMaintenanceFile()
	return stateManager
}

func (app *App) stateMaintenance() appState {
if !app.doesMaintenanceFileExist() {
app.writeMaintenanceFile()
Expand All @@ -506,19 +522,7 @@ func (app *App) stateMaintenance() appState {
return stateMaintenance
}
if err == dcs.ErrNotFound || maintenance.ShouldLeave {
if app.AcquireLock(pathManagerLock) {
app.logger.Info("leaving maintenance")
err := app.leaveMaintenance()
if err != nil {
app.logger.Errorf("maintenance: failed to leave: %v", err)
return stateMaintenance
}
app.removeMaintenanceFile()
return stateManager
} else {
app.removeMaintenanceFile()
return stateCandidate
}
return app.tryLeaveMaintenance()
}
return stateMaintenance
}
Expand Down Expand Up @@ -585,7 +589,17 @@ func (app *App) stateManager() appState {
app.logger.Errorf("failed to get maintenance from zk %v", err)
return stateManager
}
if maintenance != nil {

// Handle maintenance light mode
lightMaintenance := maintenance != nil && maintenance.IsLightMode()
if lightMaintenance {
app.logger.Info("entering light maintenance mode")
if maintenance.ShouldLeave {
return app.tryLeaveMaintenance()
}
}

if maintenance != nil && !lightMaintenance {
if !maintenance.MySyncPaused {
app.logger.Info("entering maintenance")
err := app.enterMaintenance(maintenance, master)
Expand All @@ -600,64 +614,75 @@ func (app *App) stateManager() appState {
// check if switchover required or in progress
switchover := new(Switchover)
if err := app.dcs.Get(pathCurrentSwitch, switchover); err == nil {
err = app.approveSwitchover(switchover, activeNodes, clusterState)
if err != nil {
app.logger.Errorf("cannot perform switchover: %s", err)
err = app.FinishSwitchover(switchover, err)
if err != nil {
app.logger.Errorf("failed to reject switchover: %s", err)
}
return stateManager
}

err = app.StartSwitchover(switchover)
if err != nil {
app.logger.Errorf("failed to start switchover: %s", err)
return stateManager
}
err = app.performSwitchover(clusterState, activeNodes, switchover, master)
if app.dcs.Get(pathCurrentSwitch, new(Switchover)) == dcs.ErrNotFound {
app.logger.Errorf("switchover was aborted")
if lightMaintenance {
app.logger.Debugf("cannot perform switchover: blocked by light maintenance mode, skipping iteration")
} else {
err = app.approveSwitchover(switchover, activeNodes, clusterState)
if err != nil {
err = app.FailSwitchover(switchover, err)
app.logger.Errorf("cannot perform switchover: %s", err)
err = app.FinishSwitchover(switchover, err)
if err != nil {
app.logger.Errorf("failed to report switchover failure: %s", err)
app.logger.Errorf("failed to reject switchover: %s", err)
}
return stateManager
}

err = app.StartSwitchover(switchover)
if err != nil {
app.logger.Errorf("failed to start switchover: %s", err)
return stateManager
}
err = app.performSwitchover(clusterState, activeNodes, switchover, master)
if app.dcs.Get(pathCurrentSwitch, new(Switchover)) == dcs.ErrNotFound {
app.logger.Errorf("switchover was aborted")
} else {
err = app.FinishSwitchover(switchover, nil)
if err != nil {
// we failed to update status in DCS, it's highly possible
// that current process lost DCS connection
// and another process will take managerLock
app.logger.Errorf("failed to report switchover finish: %s", err)
err = app.FailSwitchover(switchover, err)
if err != nil {
app.logger.Errorf("failed to report switchover failure: %s", err)
}
} else {
err = app.FinishSwitchover(switchover, nil)
if err != nil {
// we failed to update status in DCS, it's highly possible
// that current process lost DCS connection
// and another process will take managerLock
app.logger.Errorf("failed to report switchover finish: %s", err)
}
}
}
return stateManager
}
return stateManager
} else if err != dcs.ErrNotFound {
app.logger.Error(err.Error())
return stateManager
}

// perform failover if needed

if !clusterStateDcs[master].PingOk || clusterStateDcs[master].IsFileSystemReadonly {
app.logger.Errorf("MASTER FAILURE")
app.t.SetIfZero(NodeFailedAt, master, time.Now())
err = app.approveFailover(clusterState, clusterStateDcs, activeNodes, master)
if err == nil {
app.logger.Infof("failover approved")
err = app.IssueFailover(master)
if err != nil {
app.logger.Error(err.Error())
}
if lightMaintenance {
app.logger.Infof("failover suppressed by light maintenance mode")
} else {
app.logger.Errorf("failover was not approved: %v", err)
err = app.approveFailover(clusterState, clusterStateDcs, activeNodes, master)
if err == nil {
app.logger.Infof("failover approved")
err = app.IssueFailover(master)
if err != nil {
app.logger.Error(err.Error())
}
} else {
app.logger.Errorf("failover was not approved: %v", err)
}

return stateManager
}
return stateManager
} else {
app.t.Clean(NodeFailedAt, master)
}

if !clusterState[master].PingOk {
app.logger.Errorf("MASTER SUSPICIOUS, do not perform any kind of repair")
return stateManager
Expand All @@ -672,16 +697,20 @@ func (app *App) stateManager() appState {
// perform after-crash failover if needed
if app.config.ResetupCrashedHosts && countHANodes(clusterState) > 1 && clusterStateDcs[master].DaemonState != nil && clusterStateDcs[master].DaemonState.CrashRecovery {
app.logger.Errorf("MASTER FAILURE (CRASH RECOVERY)")
err = app.approveFailover(clusterState, clusterStateDcs, activeNodes, master)
if err == nil {
app.logger.Infof("failover approved")
err = app.IssueFailover(master)
if err != nil {
app.logger.Error(err.Error())
}
return stateManager
if lightMaintenance {
app.logger.Infof("after-crash failover suppressed by light maintenance mode")
} else {
app.logger.Errorf("failover was not approved: %v", err)
err = app.approveFailover(clusterState, clusterStateDcs, activeNodes, master)
if err == nil {
app.logger.Infof("failover approved")
err = app.IssueFailover(master)
if err != nil {
app.logger.Error(err.Error())
}
return stateManager
} else {
app.logger.Errorf("failover was not approved: %v", err)
}
}
}

Expand Down
9 changes: 5 additions & 4 deletions internal/app/cli_maintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
)

// CliEnableMaintenance enables maintenance mode
func (app *App) CliEnableMaintenance(waitTimeout time.Duration, reason string) int {
func (app *App) CliEnableMaintenance(waitTimeout time.Duration, reason string, mode MaintenanceMode) int {
ctx := app.baseContext()
err := app.connectDCS()
if err != nil {
Expand All @@ -24,13 +24,14 @@ func (app *App) CliEnableMaintenance(waitTimeout time.Duration, reason string) i
InitiatedBy: util.GuessWhoRunning() + "@" + app.config.Hostname,
InitiatedAt: time.Now(),
Reason: reason,
Mode: mode,
}
err = app.dcs.Create(pathMaintenance, maintenance)
if err != nil && err != dcs.ErrExists {
app.logger.Error(err.Error())
return 1
}
// wait for mysync to pause
// wait for mysync to pause or for maintenance with light mode to appear in DCS
if waitTimeout > 0 {
waitCtx, cancel := context.WithTimeout(ctx, waitTimeout)
defer cancel()
Expand All @@ -43,14 +44,14 @@ func (app *App) CliEnableMaintenance(waitTimeout time.Duration, reason string) i
if err != nil {
app.logger.Error(err.Error())
}
if maintenance.MySyncPaused {
if maintenance.MaintAcquired() {
break Out
}
case <-waitCtx.Done():
break Out
}
}
if !maintenance.MySyncPaused {
if !maintenance.MaintAcquired() {
app.logger.Error("could not wait for mysync to enter maintenance")
return 1
}
Expand Down
30 changes: 24 additions & 6 deletions internal/app/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,29 @@ type SwitchoverResult struct {
}

// Maintenance struct presence means that cluster under manual control
// Light mode allows everything except failover and switchover
// MaintenanceMode selects how much of MySync's activity is suspended
// while maintenance is enabled.
type MaintenanceMode string

const (
	// LightMode keeps MySync running but suppresses automatic
	// failover and switchover.
	LightMode MaintenanceMode = "light"
	// FullMode pauses all MySync manager actions.
	FullMode MaintenanceMode = "full"
)

type Maintenance struct {
InitiatedBy string `json:"initiated_by"`
InitiatedAt time.Time `json:"initiated_at"`
MySyncPaused bool `json:"mysync_paused"`
ShouldLeave bool `json:"should_leave"`
Reason string `json:"reason,omitempty"`
InitiatedBy string `json:"initiated_by"`
InitiatedAt time.Time `json:"initiated_at"`
MySyncPaused bool `json:"mysync_paused"`
ShouldLeave bool `json:"should_leave"`
Reason string `json:"reason,omitempty"`
Mode MaintenanceMode `json:"mode"`
}

// MaintAcquired reports whether the requested maintenance has taken
// effect: light maintenance is in effect as soon as the record exists in
// DCS, while full maintenance requires MySync to have actually paused.
// Safe to call on a nil receiver (returns false), matching IsLightMode.
func (m *Maintenance) MaintAcquired() bool {
	if m == nil {
		// Without the nil guard, m.MySyncPaused below would panic when
		// IsLightMode returned false for a nil receiver.
		return false
	}
	return m.IsLightMode() || m.MySyncPaused
}

// IsLightMode reports whether this maintenance record requests light
// mode. It is safe to call on a nil receiver, in which case it reports
// false.
func (m *Maintenance) IsLightMode() bool {
	if m == nil {
		return false
	}
	return m.Mode == LightMode
}

func (m *Maintenance) String() string {
Expand All @@ -155,5 +172,6 @@ func (m *Maintenance) String() string {
if m.Reason != "" {
reasonSuffix = fmt.Sprintf(" (%s)", m.Reason)
}
return fmt.Sprintf("<%s by %s at %s%s>", ms, m.InitiatedBy, m.InitiatedAt, reasonSuffix)

return fmt.Sprintf("<%s by %s at %s%s. mode: %s>", ms, m.InitiatedBy, m.InitiatedAt, reasonSuffix, m.Mode)
}
29 changes: 29 additions & 0 deletions tests/features/CLI.feature
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,32 @@ Feature: CLI
"""
Then command return code should be "0"
Then zookeeper node "/test/cascade_nodes/mysql2" should not exist within "5" seconds

Scenario: CLI maintenance light mode writes to DCS and exits correctly
Given cluster is up and running
When I run command on host "mysql1"
"""
mysync maint on --light
"""
Then command return code should be "0"
And command output should match regexp
"""
maintenance enabled
"""
And zookeeper node "/test/maintenance" should match json within "30" seconds
"""
{
"initiated_by": "REGEXP:.*@mysql1",
"mode": "light"
}
"""
When I run command on host "mysql1"
"""
mysync maint off
"""
Then command return code should be "0"
And command output should match regexp
"""
maintenance disabled
"""
And zookeeper node "/test/maintenance" should not exist within "30" seconds
Loading
Loading