Skip to content

Commit

Permalink
reinforce cluster UUID when joining and starting up
Browse files Browse the repository at this point in the history
* when node joins existing cluster
* when cluster restarts

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Mar 27, 2021
1 parent 023938c commit 8321c91
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 27 deletions.
19 changes: 8 additions & 11 deletions ais/clustermap.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,25 +355,22 @@ func (m *smapX) handleDuplicateNode(nsi *cluster.Snode, del bool) (err error) {
return
}

func (m *smapX) validateUUID(newSmap *smapX, si, nsi *cluster.Snode, caller string) (err error) {
if newSmap == nil || newSmap.Version == 0 {
func (m *smapX) validateUUID(si *cluster.Snode, newSmap *smapX, caller string, cieNum int) (err error) {
if m == nil || newSmap == nil || newSmap.Version == 0 {
return
}
if m.UUID == "" || newSmap.UUID == "" {
if !cos.IsValidUUID(m.UUID) || !cos.IsValidUUID(newSmap.UUID) {
return
}
if m.UUID == newSmap.UUID {
return
}
nsiname := caller
if nsi != nil {
nsiname = nsi.Name()
} else if nsiname == "" {
nsiname = "???"
// cluster integrity error (cie)
if caller == "" {
caller = "???"
}
// FATAL: cluster integrity error (cie)
s := fmt.Sprintf("%s: Smaps have different uuids: [%s: %s] vs [%s: %s]",
ciError(50), si, m.StringEx(), nsiname, newSmap.StringEx())
s := fmt.Sprintf("%s: Smaps have different UUIDs: local [%s, %s] vs from [%s, %s]",
ciError(cieNum), si, m.StringEx(), caller, newSmap.StringEx())
err = &errSmapUUIDDiffer{s}
return
}
Expand Down
8 changes: 7 additions & 1 deletion ais/earlystart.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ func (p *proxyrunner) bootstrap() {
smap, reliable := p.tryLoadSmap()
if !reliable {
smap = nil
} else {
glog.Infof("%s: loaded %s", p.si, smap.StringEx())
}

// 2. make the preliminary/primary decision
pid, primary = p.determineRole(smap)

Expand Down Expand Up @@ -194,6 +197,9 @@ func (p *proxyrunner) primaryStartup(loadedSmap *smapX, config *cmn.Config, ntar
si := p.si.Clone()
smap.Primary = si
smap.addProxy(si)
if loadedSmap != nil {
smap.UUID = loadedSmap.UUID
}
p.owner.smap.put(smap)
p.owner.smap.Unlock()

Expand Down Expand Up @@ -790,7 +796,7 @@ func (p *proxyrunner) discoverClusterUUID() (uuid, created string) {
uuids += id + "(cnt-" + strconv.Itoa(cnt) + ") vs "
}
uuids = strings.TrimRight(uuids, "vs ")
glog.Errorf("%s: smap UUIDs don't match %s", ciError(10), uuids)
glog.Errorf("%s: Smap UUIDs do not match %s", ciError(10), uuids)
}

if (maxCnt > 0 && len(counter) == 1) || maxCnt > minPidConfirmations {
Expand Down
12 changes: 4 additions & 8 deletions ais/httpcommon.go
Original file line number Diff line number Diff line change
Expand Up @@ -1711,14 +1711,10 @@ func (h *httprunner) extractSmap(payload msPayload, caller string) (newSmap *sma
err = fmt.Errorf("%s: not finding ourselves in %s", h.si, newSmap)
return
}
if err = smap.validateUUID(newSmap, h.si, nil, caller); err != nil {
if h.si.IsProxy() {
cos.Assert(!smap.isPrimary(h.si))
// cluster integrity error: making exception for non-primary proxies
glog.Errorf("%s (non-primary): %v - proceeding to override Smap", h.si, err)
return
}
cos.ExitLogf("%v", err) // otherwise, FATAL

// FATAL: cluster integrity error
if err = smap.validateUUID(h.si, newSmap, caller, 50 /* ciError */); err != nil {
return
}

glog.Infof(
Expand Down
11 changes: 4 additions & 7 deletions ais/prxclu.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,20 +288,17 @@ func (p *proxyrunner) httpclupost(w http.ResponseWriter, r *http.Request) {
if cmn.ReadJSON(w, r, &regReq.SI) != nil {
return
}
tag = "user-register"
userRegister = true
tag, userRegister = "user-register", true
case cmn.Keepalive:
if cmn.ReadJSON(w, r, &regReq) != nil {
return
}
tag = "keepalive"
keepalive = true
tag, keepalive = "keepalive", true
case cmn.AutoRegister: // node self-register
if cmn.ReadJSON(w, r, &regReq) != nil {
return
}
tag = "join"
selfRegister = true
tag, selfRegister = "join", true
default:
p.writeErrURL(w, r)
return
Expand Down Expand Up @@ -472,7 +469,7 @@ func (p *proxyrunner) handleJoinKalive(nsi *cluster.Snode, regSmap *smapX, tag s
}
}
// check for cluster integrity errors (cie)
if err = smap.validateUUID(regSmap, p.si, nsi, ""); err != nil {
if err = smap.validateUUID(p.si, regSmap, nsi.String(), 80 /* ciError */); err != nil {
return
}
// no further join checks when the cluster is starting up
Expand Down
1 change: 1 addition & 0 deletions docs/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ In many cases, the entirety of a troubleshooting step boils down to cleaning up
| `cie#50` | Non-primary proxy or storage target: when receiving an updated cluster map that conflicts with the local copy. Primary proxy: when a joining node's Smap does not pass the validation. | In both cases, the node is not permitted to join (or is removed from) the cluster. |
| `cie#60` | When a primary proxy (gateway) is starting up, it uses its own local Smap to query other nodes for cluster-wide metadata. | The error is specific to bucket metadata and is triggered when there are two or more versions that are mutually incompatible. |
| `cie#70` | Same as above. | Same as above, except that there's a simple majority of nodes that have one of the BMD versions. |
| `cie#80` | Joining an existing cluster. | When a node tries to join a cluster, we compare the node's local copy of the cluster map with the existing one. The error effectively indicates that, according to the node's own cluster map, it must be a member of a different cluster. |

## Storage Integrity Error

Expand Down

0 comments on commit 8321c91

Please sign in to comment.