Skip to content

Commit

Permalink
gpssh: Retry with TERM env variable set during failures
Browse files Browse the repository at this point in the history
Currently gpssh always clears out the TERM env variable before performing SSH connections. This will cause problems when users want only to use a specific terminal (like tmux) when performing SSH connections since the empty TERM will cause some terminals to fail.

To fix this issue, a retry operation is added when trying to login to a host. The first login attempt is made in the current manner where it clears out the `TERM` env variable. If it fails, then the operation is retried by restoring the `TERM` env variable.

`gpssh` will now also print the exception in case of errors so that we get to know the actual reason for SSH failures aiding us in debugging.

**Output During Failure**
Before:
```
[gpadmin@cdw ~]$ gpssh -h cdw
[ERROR] unable to login to cdw
Could not acquire connection.
```

After:
```
[gpadmin@cdw ~]$ gpssh -h cdw
[ERROR] unable to login to cdw
Could not acquire connection.
End Of File (EOF). Exception style platform.
<gpssh_modules.gppxssh_wrapper.PxsshWrapper object at 0x7ffa144b7860>
version: 3.3
command: /usr/bin/ssh
args: ['/usr/bin/ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'BatchMode=yes', '-q', '-l', 'gpadmin', 'cdw']
searcher: <pexpect.searcher_re object at 0x7ffa144b7c50>
buffer (last 100 chars): b''
before (last 100 chars): b'  9 07:34:06 2024 from 10.0.34.166\r\r\nopen terminal failed: missing or unsuitable terminal: unknown\r\n'
after: <class 'pexpect.EOF'>
match: None
match_index: None
exitstatus: None
flag_eof: True
pid: 52832
child_fd: 3
closed: False
timeout: 30
delimiter: <class 'pexpect.EOF'>
logfile: None
logfile_read: None
logfile_send: None
maxread: 2000
ignorecase: False
searchwindowsize: None
delaybeforesend: 0.05
delayafterclose: 0.1
delayafterterminate: 0.1
```
  • Loading branch information
jnihal authored and avamingli committed Jan 27, 2025
1 parent 5f53e44 commit e3fc633
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 26 deletions.
65 changes: 40 additions & 25 deletions gpMgmt/bin/gppylib/util/ssh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,35 +192,50 @@ def login(self, hostList=None, userName=None, delaybeforesend=0.05, sync_multipl
good_list = []
print_lock = threading.Lock()

def connect_host(hostname, p):
self.hostList.append(hostname)
try:
# The sync_multiplier value is passed onto pexpect.pxssh which is used to determine timeout
# values for prompt verification after an ssh connection is established.
p.login(hostname, self.userName, sync_multiplier=sync_multiplier)
p.x_peer = hostname
p.x_pid = p.pid
good_list.append(p)
if self.verbose:
def connect_host(hostname):
retry_login = True

while True:
# create a new PxsshWrapper object for each retry to avoid using the
# same object which can cause unexpected behaviours
p = gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend,
sync_retries=sync_retries,
options={"StrictHostKeyChecking": "no",
"BatchMode": "yes"})

try:
# The sync_multiplier value is passed onto pexpect.pxssh which is used to determine timeout
# values for prompt verification after an ssh connection is established.
p.login(hostname, self.userName, sync_multiplier=sync_multiplier)
p.x_peer = hostname
p.x_pid = p.pid
good_list.append(p)
if self.verbose:
with print_lock:
print('[INFO] login %s' % hostname)
except Exception as e:
# some logins fail due to the clearing of the TERM env variable
# retry by restoring the TERM variable to see if it succeeds or else error out
if origTERM and retry_login:
retry_login = False
os.putenv('TERM', origTERM)
continue

with print_lock:
print('[INFO] login %s' % hostname)
except Exception as e:
with print_lock:
print('[ERROR] unable to login to %s' % hostname)
if type(e) is pxssh.ExceptionPxssh:
print(e)
elif type(e) is pxssh.EOF:
print('Could not acquire connection.')
else:
print('hint: use gpssh-exkeys to setup public-key authentication between hosts')
print('[ERROR] unable to login to %s' % hostname)
if type(e) is pxssh.ExceptionPxssh:
print(e)
elif type(e) is pxssh.EOF:
print('Could not acquire connection.')
print(e)
else:
print('hint: use gpssh-exkeys to setup public-key authentication between hosts')

break

thread_list = []
for host in hostList:
p = gppxssh_wrapper.PxsshWrapper(delaybeforesend=delaybeforesend,
sync_retries=sync_retries,
options={"StrictHostKeyChecking": "no",
"BatchMode": "yes"})
t = threading.Thread(target=connect_host, args=(host, p))
t = threading.Thread(target=connect_host, args=[host])
t.start()
thread_list.append(t)

Expand Down
33 changes: 32 additions & 1 deletion gpMgmt/bin/gppylib/util/test/unit/test_cluster_ssh_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys, os, pwd
import unittest
from io import StringIO
from mock import patch
from mock import patch, call

try:
gphome = os.environ.get('GPHOME')
Expand Down Expand Up @@ -91,7 +91,38 @@ def test04_exceptions(self, mock_stdout):
session2 = Session()
session2.login(['localhost'], 'gpadmin', 0.05, 1.0)
self.assertIn('[ERROR] unable to login to localhost\nhint: use gpssh-exkeys to setup public-key authentication between hosts\n', mock_stdout.getvalue())

@patch('os.getenv', return_value="term")
@patch('os.putenv')
@patch('sys.stdout', new_callable=StringIO)
def test05_login_retry_when_term_variable_is_set(self, mock_stdout, mock_putenv, mock_getenv):
'''
Test pxssh.login() retry when there is an exception and TERM env variable is set
'''

with mock.patch.object(pxssh.pxssh, 'login', side_effect=pxssh.EOF('foo')) as mock_login:
session = Session()
session.login(['localhost'], 'gpadmin', 0.05, 1.0)
self.assertIn('[ERROR] unable to login to localhost\nCould not acquire connection.\n', mock_stdout.getvalue())
mock_stdout.truncate(0)
assert mock_putenv.call_count == 3
mock_putenv.assert_has_calls([call('TERM', ''), call('TERM', 'term'), call('TERM', 'term')])

@patch('os.getenv', return_value=None)
@patch('os.putenv')
@patch('sys.stdout', new_callable=StringIO)
def test06_login_does_not_retry_when_term_variable_is_not_set(self, mock_stdout, mock_putenv, mock_getenv):
'''
Test pxssh.login() does not retry when there is an exception and TERM env variable is not set
'''

with mock.patch.object(pxssh.pxssh, 'login', side_effect=pxssh.EOF('foo')) as mock_login:
session = Session()
session.login(['localhost'], 'gpadmin', 0.05, 1.0)
self.assertIn('[ERROR] unable to login to localhost\nCould not acquire connection.\n', mock_stdout.getvalue())
self.assertNotIn('Retrying by restoring the TERM env variable.\n', mock_stdout.getvalue())
mock_stdout.truncate(0)
mock_putenv.assert_called_once_with('TERM', '')

if __name__ == "__main__":
unittest.main()

0 comments on commit e3fc633

Please sign in to comment.