Commit 074a2e7b authored by Stefano Alberto Russo's avatar Stefano Alberto Russo
Browse files

Moved to dynamically obtain container IDs when stopping tasks. Added Podman...

Moved to dynamically obtain container IDs when stopping tasks. Added Podman and Docker container names. Improved how task Process IDs and Job IDs are stored.
parent 5badfd1c
Loading
Loading
Loading
Loading
+23 −28
Original line number Diff line number Diff line
@@ -132,7 +132,7 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
        #run_command += ' -v {}/user-{}:/data'.format(settings.LOCAL_USER_DATA_DIR, task.user.id)

        # Host name, image entry command
        run_command += ' -h task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
        run_command += ' -h task-{} --name task-{} -d -t {}/{}:{}'.format(task.short_uuid, task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)

        # Debug
        logger.debug('Running new task with command="{}"'.format(run_command))
@@ -140,19 +140,17 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
        # Run the task 
        out = os_shell(run_command, capture=True)
        if out.exit_code != 0:
            logger.error('Got error in starting task: {}'.format(out))
            raise Exception(out.stderr)
        else:
            tid = out.stdout
            logger.debug('Created task with id: "{}"'.format(tid))
            
            # Get task IP address
            out = os_shell('sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' ' + tid + ' | tail -n1', capture=True)
            out = os_shell('export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker inspect --format \'{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}\' $CONTAINER_ID | tail -n1', capture=True)
            if out.exit_code != 0:
                raise Exception('Error: ' + out.stderr)
            task_ip = out.stdout

            # Set fields
            task.id = tid
            task.status = TaskStatuses.running
            task.interface_ip = task_ip
            task.interface_port = task_port
@@ -160,17 +158,10 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
            # Save
            task.save()

        # Wait 10 seconds to see if the task is still up...


    def _stop_task(self, task):

        # Delete the Docker container
        standby_supported = False
        if standby_supported:
            stop_command = 'sudo docker stop {}'.format(task.id)
        else:
            stop_command = 'sudo docker stop {} && sudo docker rm {}'.format(task.id,task.id)
        stop_command = 'export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker stop $CONTAINER_ID && sudo docker rm $CONTAINER_ID'
    
        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
@@ -187,7 +178,7 @@ class InternalStandaloneComputingManager(StandaloneComputingManager):
    def _get_task_log(self, task, **kwargs):

        # View the Docker container log (attach)
        view_log_command = 'sudo docker logs {}'.format(task.id,)
        view_log_command = 'export CONTAINER_ID=$(sudo docker ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) && sudo docker logs $CONTAINER_ID'
        logger.debug(view_log_command)
        out = os_shell(view_log_command, capture=True)
        if out.exit_code != 0:
@@ -341,13 +332,14 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
                run_command += '--network=private --uts=private --userns=keep-id '
            #run_command += '-d -t {}/{}:{}'.format(task.container.registry, task.container.image_name, task.container.image_tag)
            run_command += '-h task-{} --name task-{}  -t {}/{}:{}'.format(task.short_uuid, task.short_uuid, task.container.registry, task.container.image_name, task.container.image_tag)
            run_command += '&>> /tmp/{}_data/task.log & echo $({} ps -a --filter name=task-{} --format="{{.ID}}")"\''.format(task.uuid, container_engine, task.short_uuid)
            run_command += '&>> /tmp/{}_data/task.log &"\''.format(task.uuid)
            
        else:
            raise NotImplementedError('Container engine {} not supported'.format(container_engine))

        out = os_shell(run_command, capture=True)
        if out.exit_code != 0:
            logger.error('Got error in starting task: {}'.format(out))
            raise Exception(out.stderr)
        
        # Log        
@@ -357,8 +349,9 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
        task_uuid = task.uuid
        task = Task.objects.get(uuid=task_uuid)

        # Save pid echoed by the command above
        task.id = out.stdout
        # Save the task (container) id for Singularity, which is the PID echoed by the command above
        if container_engine == 'singularity':
            task.process_id = out.stdout

        # Save
        task.save()
@@ -377,20 +370,23 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
            container_engine = task.computing.default_container_engine

        if container_engine=='singularity':
            internal_stop_command = 'kill -9 {}'.format(task.id)            
            internal_stop_command = 'kill -9 {}'.format(task.process_id)            
        elif container_engine in ['docker', 'podman']:
            # TODO: remove this hardcoding
            prefix = 'sudo' if (computing_host == 'slurmclusterworker' and container_engine=='docker') else ''
            internal_stop_command = '{} {} stop {} && {} {} rm {}'.format(prefix,container_engine,task.id,prefix,container_engine,task.id)
            internal_stop_command = 'export CONTAINER_ID=$('+prefix+' '+container_engine+' ps -a --filter name=task-'+task.short_uuid+' --format {{.ID}}) &&'
            internal_stop_command += 'if [ "x$CONTAINER_ID" != "x" ]; then {} {} stop $CONTAINER_ID && {} {} rm $CONTAINER_ID; fi'.format(prefix,container_engine,prefix,container_engine)
        else:
            raise NotImplementedError('Container engine {} not supported'.format(container_engine))

        stop_command = 'ssh -p {} -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "{}"\''.format(computing_port, computing_keys.private_key_file, computing_user, computing_host, internal_stop_command)
        out = os_shell(stop_command, capture=True)
        
        if out.exit_code != 0:
            if ('No such process' in out.stderr) or ('No such container' in out.stderr) or ('no container' in out.stderr) or ('missing' in out.stderr):
                pass
            else:
                logger.critical('Got error in stopping task: {}'.format(out))
                raise Exception(out.stderr)

        # Set task as stopped
@@ -413,9 +409,7 @@ class SSHStandaloneComputingManager(StandaloneComputingManager, SSHComputingMana
        if container_engine=='singularity':
            internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)            
        elif container_engine in ['docker','podman']:
            # TODO: remove this hardcoding
            #prefix = 'sudo' if (computing_host == 'slurmclusterworker' and container_engine=='docker') else ''
            #internal_view_log_command = '{} {} logs {}'.format(prefix,container_engine,task.id)
            # TODO: consider podman/docker logs?
            internal_view_log_command = 'cat /tmp/{}_data/task.log'.format(task.uuid)
        else:
            raise NotImplementedError('Container engine {} not supported'.format(container_engine))
@@ -538,6 +532,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag

        out = os_shell(run_command, capture=True)
        if out.exit_code != 0:
            logger.error('Got error in starting task: {}'.format(out))
            raise Exception(out.stderr)

        # Log        
@@ -554,8 +549,8 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
        task_uuid = task.uuid
        task = Task.objects.get(uuid=task_uuid)

        # Save job id as task id
        task.id = job_id
        # Save job id
        task.job_id = job_id
        
        # Set status (only fi we get here before the agent which sets the status as running via the API)
        if task.status != TaskStatuses.running:
@@ -571,7 +566,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
        computing_user, computing_host, computing_port, computing_keys = get_ssh_access_mode_credentials(self.computing, task.user)

        # Stop the task remotely
        stop_command = 'ssh -o -p {} LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(computing_port, computing_keys.private_key_file, computing_user, computing_host, task.id)
        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(computing_keys.private_key_file, computing_user, computing_host, task.job_id)
        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)
+27 −0
Original line number Diff line number Diff line
# Generated by Django 2.2.1 on 2023-10-07 12:49

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core_app', '0034_auto_20231007_1052'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='task',
            name='id',
        ),
        migrations.AddField(
            model_name='task',
            name='job_id',
            field=models.CharField(blank=True, max_length=64, null=True, verbose_name='Job ID'),
        ),
        migrations.AddField(
            model_name='task',
            name='process_id',
            field=models.CharField(blank=True, max_length=64, null=True, verbose_name='Process ID'),
        ),
    ]
+4 −3
Original line number Diff line number Diff line
@@ -274,9 +274,10 @@ class Task(models.Model):
    name  = models.CharField('Name', max_length=36, blank=False, null=False)

    # Task management
    id        = models.CharField('ID', max_length=64, blank=True, null=True) # i.e. Slurm job id, singularity PID, docker hash
    status     = models.CharField('Status', max_length=36, blank=True, null=True)
    created    = models.DateTimeField('Created on', default=timezone.now)
    process_id = models.CharField('Process ID', max_length=64, blank=True, null=True) # i.e. Singularity PID 
    job_id = models.CharField('Job ID', max_length=64, blank=True, null=True) # i.e. Slurm job id 

    # How to reach the task interface. The IP has to be intended either as the container IP if this is directly
    # reachable (i.e. using a Docker or Kubernetes network) or as the host IP address, depending on the
+2 −2
Original line number Diff line number Diff line
@@ -112,8 +112,8 @@
           </tr> -->

           <!-- <tr>
            <td><b>ID</b></td>
            <td>{{ task.id }}</td>
            <td><b>UUID</b></td>
            <td>{{ task.uuid }}</td>
           </tr> -->

           <tr>
+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@
      <hr>

  
      <b>ID:</b> {{ data.task.id }} &nbsp; &nbsp; 
      <b>UUID:</b> {{ data.task.uuid }} &nbsp; &nbsp; 
      <b>Status:</b> {{ data.task.status }} &nbsp; &nbsp; 
      <b>Auto refresh:{{data.refresh}}</b>&nbsp;
      {% if not data.refresh %} OFF {% else %} <a href="?uuid={{data.task.uuid}}">OFF</a> {% endif %} |