Commit 463bac9c authored by Stefano Alberto Russo's avatar Stefano Alberto Russo
Browse files

Moved to computing managers. minor fixes.

parent 37f9580d
Loading
Loading
Loading
Loading
+129 −120
Original line number Diff line number Diff line
@@ -10,15 +10,71 @@ logger = logging.getLogger(__name__)
TASK_DATA_DIR = "/data"


def start_task(task):
class ComputingManager(object):
    
    # Handle proper config
    if task.computing.type == 'local':
    def start_task(self, task, **kwargs):
        """Start ``task`` by delegating to the ``_start_task`` hook of this manager.

        Any extra keyword arguments are forwarded unchanged to the hook.
        Nothing is returned.

        Raises:
            NotImplementedError: if this manager has no ``_start_task`` attribute.
        """
        # Look up the concrete start logic; managers lacking it cannot start tasks
        try:
            start_hook = self._start_task
        except AttributeError:
            raise NotImplementedError('Not implemented')

        # Hand the task over to the concrete start logic
        start_hook(task, **kwargs)


    def stop_task(self, task, **kwargs):
        """Stop ``task``, mark it as stopped and kill its tunnel process, if any.

        The actual stop logic is delegated to the ``_stop_task`` hook of this
        manager, which receives ``task`` and any extra keyword arguments.
        Afterwards the task status is set to ``'stopped'`` and saved, and every
        process whose ``ps -ef`` line contains ``:<tunnel_port>:<ip>:<port>``
        is killed with ``kill -9``.

        Raises:
            NotImplementedError: if this manager has no ``_stop_task`` attribute.
            Exception: if killing the tunnel fails for a reason other than the
                process being already gone (the message is the stderr of ``kill``).
        """
        # Check for stop task logic implementation
        try:
            stop_hook = self._stop_task
        except AttributeError:
            raise NotImplementedError('Not implemented')

        # Call actual stop task logic
        stop_hook(task, **kwargs)

        # Ok, save status as stopped
        task.status = 'stopped'
        task.save()

        # Check if the tunnel is active and if so kill it
        logger.debug('Checking if task "{}" has a running tunnel'.format(task.tid))
        check_command = 'ps -ef | grep ":'+str(task.tunnel_port)+':'+str(task.ip)+':'+str(task.port)+'" | grep -v grep | awk \'{print $2}\''
        logger.debug(check_command)
        out = os_shell(check_command, capture=True)
        logger.debug(out)
        if out.exit_code != 0:
            return

        # The pipeline ends with awk, so it exits with 0 even when nothing matched:
        # only a non-empty output means that a tunnel is actually running. More than
        # one matching process yields one PID per line, hence the split.
        tunnel_pids = (out.stdout or '').split()
        if not tunnel_pids:
            return

        logger.debug('Task "{}" has a running tunnel, killing it'.format(task.tid))

        # Kill Tunnel command
        kill_tunnel_command = 'kill -9 {}'.format(' '.join(tunnel_pids))

        # Log
        logger.debug('Killing tunnel with command: {}'.format(kill_tunnel_command))

        # Execute, checking the outcome of the kill itself (not of the ps above)
        kill_out = os_shell(kill_tunnel_command, capture=True)
        if kill_out.exit_code != 0:
            # A tunnel which exited in the meantime is not an error
            if 'No such process' not in (kill_out.stderr or ''):
                raise Exception(kill_out.stderr)


    def get_task_log(self, task, **kwargs):
        """Return the log of ``task`` as produced by the ``_get_task_log`` hook.

        Any extra keyword arguments are forwarded unchanged to the hook.

        Raises:
            NotImplementedError: if this manager has no ``_get_task_log`` attribute.
        """
        # Look up the concrete log retrieval logic; fail if the manager lacks it
        try:
            log_hook = self._get_task_log
        except AttributeError:
            raise NotImplementedError('Not implemented')

        # Hand back whatever the concrete log retrieval logic returns
        return log_hook(task, **kwargs)


class LocalComputingManager(ComputingManager):
    
    def _start_task(self, task):

        # Init run command #--cap-add=NET_ADMIN --cap-add=NET_RAW
        run_command  = 'sudo docker run  --network=rosetta_default --name rosetta-task-{}'.format( task.id)
@@ -39,8 +95,10 @@ def start_task(task):
        # Host name, image entry command
        run_command += ' -h task-{} -d -t {}{}'.format(task.id, registry_string, task.container.image)

        # Run the task Debug
        # Debug
        logger.debug('Running new task with command="{}"'.format(run_command))
        
        # Run the task 
        out = os_shell(run_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)
@@ -48,7 +106,6 @@ def start_task(task):
            task_tid = out.stdout
            logger.debug('Created task with id: "{}"'.format(task_tid))


            # Get task IP address
            out = os_shell('sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' ' + task_tid + ' | tail -n1', capture=True)
            if out.exit_code != 0:
@@ -65,9 +122,34 @@ def start_task(task):
            task.save()


    def _stop_task(self, task, **kwargs):
        """Stop the Docker container of ``task`` and, as standby is unsupported, remove it.

        ``**kwargs`` is accepted (and ignored) so that keyword arguments forwarded
        by ``ComputingManager.stop_task`` do not cause a ``TypeError`` here.

        Raises:
            Exception: if the Docker command exits with a non-zero code (the
                message is its stderr).
        """
        # Delete the Docker container. With standby support the container would
        # only be stopped, but standby is currently hard-wired to unsupported.
        standby_supported = False
        if standby_supported:
            stop_command = 'sudo docker stop {}'.format(task.tid)
        else:
            stop_command = 'sudo docker stop {0} && sudo docker rm {0}'.format(task.tid)

        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)
    
    def _get_task_log(self, task, **kwargs):
        """Return the stdout of ``sudo docker logs`` for the container of ``task``.

        Raises:
            Exception: if the command exits with a non-zero code (the message
                is its stderr).
        """
        # View the Docker container log
        log_command = 'sudo docker logs {}'.format(task.tid)
        logger.debug(log_command)
        result = os_shell(log_command, capture=True)

        # Success: hand back the captured log
        if result.exit_code == 0:
            return result.stdout
        raise Exception(result.stderr)


class RemoteComputingManager(ComputingManager):
    
    elif task.computing.type == 'remote':
    def _start_task(self, task, **kwargs):
        logger.debug('Starting a remote task "{}"'.format(task.computing))

        # Get computing host
@@ -143,11 +225,8 @@ def start_task(task):
        # Save
        task.save()

    elif task.computing.type == 'remoteOLD':
        logger.debug('Starting a remote task "{}"'.format(task.computing))

        # Get computing host
        host = task.computing.get_conf_param('host')
    def _stop_task(self, task, **kwargs):

        # Get user keys
        if task.computing.require_user_keys:
@@ -155,64 +234,43 @@ def start_task(task):
        else:
            raise NotImplementedError('Remote tasks not requiring keys are not yet supported')

        # 1) Run the container on the host (non blocking)
 
        if task.container.type == 'singularity':

            # Set pass if any
            if task.auth_pass:
                authstring = ' export SINGULARITYENV_AUTH_PASS={} && '.format(task.auth_pass)
            else:
                authstring = ''
        # Get computing host
        host = task.computing.get_conf_param('host')

            run_command  = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} '.format(user_keys.private_key_file, host)
            run_command += '"export SINGULARITY_NOHTTPS=true && {} '.format(authstring)
            run_command += 'exec nohup singularity run --pid --writable-tmpfs --containall --cleanenv '
        # Stop the task remotely
        stop_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} "kill -9 {}"'.format(user_keys.private_key_file, host, task.pid)
        logger.debug(stop_command)
        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            if not 'No such process' in out.stderr:
                raise Exception(out.stderr)

            # Set registry
            if task.container.registry == 'docker_local':
                registry = 'docker://dregistry:5000/'
            elif task.container.registry == 'docker_hub':
                registry = 'docker://'
            else:
                raise NotImplementedError('Registry {} not supported'.format(task.container.registry))

            run_command+='{}{} &> /tmp/{}.log & echo \$!"'.format(registry, task.container.image, task.uuid)
    def _get_task_log(self, task, **kwargs):
        # Get computing host
        host = task.computing.get_conf_param('host')

        # Get id_rsa
        if task.computing.require_user_keys:
            user_keys = Keys.objects.get(user=task.user, default=True)
            id_rsa_file = user_keys.private_key_file
        else:
            raise NotImplementedError('Container {} not supported'.format(task.container.type))
            raise NotImplementedError('temote with no keys not yet')

        out = os_shell(run_command, capture=True)
        # View the Singularity container log
        view_log_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {}  "cat /tmp/{}.log"'.format(id_rsa_file, host, task.uuid)
        logger.debug(view_log_command)
        out = os_shell(view_log_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)
        else:
            return out.stdout

        # Save pid echoed by the command above
        task_pid = out.stdout

        # 2) Simulate the agent (i.e. report container IP and port port)
 
        # Get task IP address
        out = os_shell('sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' '+host+' | tail -n1', capture=True)
        if out.exit_code != 0:
            raise Exception('Error: ' + out.stderr)
        task_ip = out.stdout
 
        # Set fields
        task.tid    = task.uuid
        task.status = TaskStatuses.running
        task.ip     = task_ip
        task.pid    = task_pid
        task.port   = int(task.container.service_ports.split(',')[0])
 
        # Save
        task.save()


    #==============================
    #  Slurm
    #==============================
    elif task.computing.type == 'slurm':
class SlurmComputingManager(ComputingManager):
    
    def _start_task(self, task, **kwargs):
        logger.debug('Starting a remote task "{}"'.format(task.computing))

        # Get computing host #Key Error ATM
@@ -271,67 +329,18 @@ def start_task(task):
            raise Exception(out.stderr)


    def _stop_task(self, task, **kwargs):
        raise NotImplementedError('Not implemented')

    else:
        raise Exception('Consistency exception: invalid computing resource "{}'.format(task.computing))

    def _get_task_log(self, task, **kwargs):
        raise NotImplementedError('Not implemented')

def stop_task(task):
    """Stop ``task`` on its computing resource, mark it as stopped and kill its tunnel.

    Tasks on a ``'local'`` computing resource are stopped (and removed) through
    Docker; tasks on a ``'remote'`` one are killed over SSH using the default
    keys of the task user. Afterwards the task status is set to ``'stopped'``
    and saved, and every process whose ``ps -ef`` line contains
    ``:<tunnel_port>:<ip>:<port>`` is killed with ``kill -9``.

    Raises:
        NotImplementedError: for a remote computing resource not requiring user keys.
        Exception: for an unsupported computing type, or if a stop/kill command
            fails (the message is the stderr of the failed command). A target
            process which is already gone is not treated as a failure.
    """

    if task.computing.type == 'local':

        # Delete the Docker container. With standby support the container would
        # only be stopped, but standby is currently hard-wired to unsupported.
        standby_supported = False
        if standby_supported:
            stop_command = 'sudo docker stop {}'.format(task.tid)
        else:
            stop_command = 'sudo docker stop {} && sudo docker rm {}'.format(task.tid,task.tid)

        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)

    elif task.computing.type == 'remote':

        # Get user keys
        if task.computing.require_user_keys:
            user_keys = Keys.objects.get(user=task.user, default=True)
        else:
            raise NotImplementedError('Remote tasks not requiring keys are not yet supported')

        # Get computing host
        host = task.computing.get_conf_param('host')

        # Stop the task remotely
        stop_command = 'ssh -i {} -4 -o StrictHostKeyChecking=no {} "kill -9 {}"'.format(user_keys.private_key_file, host, task.pid)
        logger.debug(stop_command)
        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            # A process which is already gone counts as stopped
            if not 'No such process' in out.stderr:
                raise Exception(out.stderr)
    else:
        raise Exception('Don\'t know how to stop tasks on "{}" computing resource.'.format(task.computing))

    # Ok, save status as stopped
    task.status = 'stopped'
    task.save()

    # Check if the tunnel is active and if so kill it
    logger.debug('Checking if task "{}" has a running tunnel'.format(task.tid))
    check_command = 'ps -ef | grep ":'+str(task.tunnel_port)+':'+str(task.ip)+':'+str(task.port)+'" | grep -v grep | awk \'{print $2}\''
    logger.debug(check_command)
    out = os_shell(check_command, capture=True)
    logger.debug(out)
    if out.exit_code != 0:
        return

    # The pipeline ends with awk, so it exits with 0 even when nothing matched:
    # only a non-empty output means that a tunnel is actually running. More than
    # one matching process yields one PID per line, hence the split.
    tunnel_pids = (out.stdout or '').split()
    if not tunnel_pids:
        return

    logger.debug('Task "{}" has a running tunnel, killing it'.format(task.tid))

    # Kill Tunnel command
    kill_tunnel_command = 'kill -9 {}'.format(' '.join(tunnel_pids))

    # Log
    logger.debug('Killing tunnel with command: {}'.format(kill_tunnel_command))

    # Execute, checking the outcome of the kill itself (not of the ps above)
    kill_out = os_shell(kill_tunnel_command, capture=True)
    if kill_out.exit_code != 0:
        # A tunnel which exited in the meantime is not an error
        if 'No such process' not in (kill_out.stderr or ''):
            raise Exception(kill_out.stderr)
+2 −15
Original line number Diff line number Diff line
@@ -110,21 +110,8 @@ class Command(BaseCommand):
            #==============================
            # Demo remote computing 
            #==============================    
            demo_remote_computing = Computing.objects.create(user = None,
                                                             name = 'Demo remote',
                                                             type = 'remote',
                                                             require_sys_conf  = True,
                                                             require_user_conf = False,
                                                             require_user_keys = False)    
            ComputingSysConf.objects.create(computing = demo_remote_computing,
                                            data      = {'host': 'slurmclusterworker-one'})


            #==============================
            # Demo remote (auth) computing 
            #==============================    
            demo_remote_auth_computing = Computing.objects.create(user = None,
                                                             name = 'Demo remote (auth)',
                                                             name = 'Demo remote',
                                                             type = 'remote',
                                                             require_sys_conf  = True,
                                                             require_user_conf = True,
+6 −0
Original line number Diff line number Diff line
@@ -200,6 +200,12 @@ class Computing(models.Model):
            param_value = self.user_conf_data[param]
        return param_value

    @property
    def manager(self):
        """Return a new computing manager instance matching the ``type`` of this computing.

        The class is looked up by name in the sibling ``computing_managers``
        module as ``<Type>ComputingManager``, where ``<Type>`` is
        ``self.type.title()`` (e.g. ``'local'`` gives ``LocalComputingManager``).
        A new instance is created on every access.

        Raises:
            AttributeError: if ``computing_managers`` has no class with that name.
        """
        # NOTE(review): imported here rather than at module level, presumably to
        # avoid a circular import - confirm.
        from . import computing_managers

        manager_class_name = '{}ComputingManager'.format(self.type.title())
        manager_class = getattr(computing_managers, manager_class_name)
        return manager_class()


class ComputingSysConf(models.Model):
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
+14 −0
Original line number Diff line number Diff line
@@ -83,6 +83,20 @@
        {% endif %}
        </td>
       </tr>
      </table>

      <br />
       <h3>Keys</h3>
       <table class="dashboard">
       
       <tr>
        <td valign="top">
        <b>Default public key</b>
        </td>
        <td>
            <pre style="max-width:300px;">{{ data.default_public_key }}</pre>
        </td>
       </tr>

      </table>

+0 −4
Original line number Diff line number Diff line
@@ -71,11 +71,7 @@
            <font color="#c0c0c0">Stop</font> | 
            
            {% endif %}
            {% if task.status == "exited" or task.status == "stopped" %}
            <a href="?uuid={{task.uuid}}&action=delete&details=False">Delete</a>
            {% else %}
            <font color="#c0c0c0">Delete</font>
            {% endif %}
            {% if task.status == "running" %}
             | <a href="?uuid={{task.uuid}}&action=connect">Connect</a>
             | <a href="/task_log/?uuid={{task.uuid}}&action=viewlog">View Log</a>
Loading