Commit 15bdf157 authored by Stefano Alberto Russo's avatar Stefano Alberto Russo
Browse files

Fixes and imporvements

parent 0a74f0e4
Loading
Loading
Loading
Loading
+11 −11
Original line number Diff line number Diff line
@@ -305,25 +305,25 @@ print(port)

        elif action=='set_ip_port':
            
            task_ip   = request.GET.get('ip', None)
            if not task_ip:
                return HttpResponse('IP not valid (got "{}")'.format(task_ip))
            task_interface_ip   = request.GET.get('ip', None)
            if not task_interface_ip:
                return HttpResponse('IP not valid (got "{}")'.format(task_interface_ip))
            
            task_port = request.GET.get('port', None)
            if not task_port:
                return HttpResponse('Port not valid (got "{}")'.format(task_port))
            task_interface_port = request.GET.get('port', None)
            if not task_interface_port:
                return HttpResponse('Port not valid (got "{}")'.format(task_interface_port))
            
            try:
                int(task_port)
                int(task_interface_port)
            except (TypeError, ValueError):
                return HttpResponse('Port not valid (got "{}")'.format(task_port))
                return HttpResponse('Port not valid (got "{}")'.format(task_interface_port))
              
            # Set fields
            logger.info('Setting task "{}" to ip "{}" and port "{}"'.format(task.uuid, task_ip, task_port))
            logger.info('Setting task "{}" to ip "{}" and port "{}"'.format(task.uuid, task_interface_ip, task_interface_port))
            task.status = TaskStatuses.running
            task.ip     = task_ip
            task.interface_ip = task_interface_ip
            if task.container.supports_custom_interface_port:
                task.port = int(task_port)
                task.interface_port = int(task_interface_port)
            task.save()
                    
            # Notify the user that the task called back home
+16 −30
Original line number Diff line number Diff line
@@ -102,13 +102,13 @@ class InternalSingleNodeComputingManager(SingleNodeComputingManager):
        run_command += ' -v {}/user-{}:/data'.format(settings.LOCAL_USER_DATA_DIR, task.user.id)

        # Set registry string
        if task.container.registry == 'local':
            registry_string = 'localhost:5000/'
        else:
            registry_string  = 'docker.io/'
        #if task.container.registry == 'local':
        #    registry_string = 'localhost:5000/'
        #else:
        #    registry_string  = 'docker.io/'

        # Host name, image entry command
        run_command += ' -h task-{} -d -t {}{}'.format(task.uuid, registry_string, task.container.image)
        run_command += ' -h task-{} -d -t {}/{}:{}'.format(task.uuid, task.container.registry, task.container.image, task.container.tag)

        # Debug
        logger.debug('Running new task with command="{}"'.format(run_command))
@@ -348,16 +348,15 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
        sbatch_args += ' --output=\$HOME/{}.log --error=\$HOME/{}.log '.format(task.uuid, task.uuid)

        # Submit the job
        if task.container.type == 'singularity':
        if task.computing.default_container_runtime == 'singularity':

            #if not task.container.supports_custom_interface_port:
            #     raise Exception('This task does not support dynamic port allocation and is therefore not supported using singularity on Slurm')

            # Set pass if any
            if task.auth_pass:
                authstring = ' export SINGULARITYENV_AUTH_PASS={} && '.format(task.auth_pass)
            else:
            authstring = ''
            if not task.requires_proxy_auth and task.password:
                authstring = ' export SINGULARITYENV_AUTH_PASS={} && '.format(task.password)
                
            # Set binds, only from sys config if the resource is not owned by the user
            if self.computing.user != task.user:
@@ -382,24 +381,11 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
            run_command += 'rm -rf /tmp/{}_data && mkdir -p /tmp/{}_data/tmp &>> \$HOME/{}.log && mkdir -p /tmp/{}_data/home &>> \$HOME/{}.log && chmod 700 /tmp/{}_data && '.format(task.uuid, task.uuid, task.uuid, task.uuid, task.uuid, task.uuid)
            run_command += 'exec nohup singularity run {} --pid --writable-tmpfs --no-home --home=/home/metauser --workdir /tmp/{}_data/tmp -B/tmp/{}_data/home:/home --containall --cleanenv '.format(binds, task.uuid, task.uuid)
            
            # Double to escape for Pythom, six for shell (double times three as \\\ escapes a single slash in shell)

            # Set registry
            if task.container.registry == 'docker_local':
                # Get local Docker registry conn string
                from.utils import get_local_docker_registry_conn_string
                local_docker_registry_conn_string = get_local_docker_registry_conn_string()
                registry = 'docker://{}/'.format(local_docker_registry_conn_string)
            elif task.container.registry == 'docker_hub':
                registry = 'docker://'
            else:
                raise NotImplementedError('Registry {} not supported'.format(task.container.registry))
    
            run_command+='{}{} &> \$HOME/{}.log\\" > \$HOME/{}.sh && sbatch {} \$HOME/{}.sh"\''.format(registry, task.container.image, task.uuid, task.uuid, sbatch_args, task.uuid)

            # Double to escape for Python, six for shell (double times three as \\\ escapes a single slash in shell)
            run_command+='docker://{}/{}:{} &> \$HOME/{}.log\\" > \$HOME/{}.sh && sbatch {} \$HOME/{}.sh"\''.format(task.container.registry, task.container.image, task.container.tag, task.uuid, task.uuid, sbatch_args, task.uuid)

        else:
            raise NotImplementedError('Container {} not supported'.format(task.container.type))
            raise NotImplementedError('Default container runtime "{}" not supported'.format(task.computing.default_container_runtime))

        out = os_shell(run_command, capture=True)
        if out.exit_code != 0:
@@ -419,8 +405,8 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
        task_uuid = task.uuid
        task = Task.objects.get(uuid=task_uuid)

        # Save job id as task pid
        task.pid = job_id
        # Save job id as task id
        task.id = job_id
        
        # Set status (only fi we get here before the agent which sets the status as running via the API)
        if task.status != TaskStatuses.running:
@@ -443,7 +429,7 @@ class SlurmSSHClusterComputingManager(ClusterComputingManager, SSHComputingManag
        user = self.computing.conf.get('user')

        # Stop the task remotely
        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(user_keys.private_key_file, user, host, task.pid)
        stop_command = 'ssh -o LogLevel=ERROR -i {} -4 -o StrictHostKeyChecking=no {}@{} \'/bin/bash -c "scancel {}"\''.format(user_keys.private_key_file, user, host, task.id)
        out = os_shell(stop_command, capture=True)
        if out.exit_code != 0:
            raise Exception(out.stderr)
+169 −0
Original line number Diff line number Diff line
# Generated by Django 2.2.1 on 2021-11-03 22:56

import django.contrib.postgres.fields.jsonb
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core_app', '0008_auto_20211103_1232'),
    ]

    operations = [
        migrations.AlterField(
            model_name='computing',
            name='access_mode',
            field=models.CharField(max_length=36, verbose_name='Access (control) mode'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='auth_mode',
            field=models.CharField(max_length=36, verbose_name='Auth mode'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='container_runtimes',
            field=models.CharField(max_length=256, verbose_name='Container runtimes'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='description',
            field=models.TextField(blank=True, null=True, verbose_name='Description'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='name',
            field=models.CharField(max_length=255, verbose_name='Name'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='type',
            field=models.CharField(max_length=255, verbose_name='Type'),
        ),
        migrations.AlterField(
            model_name='computing',
            name='wms',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Workload management system'),
        ),
        migrations.AlterField(
            model_name='container',
            name='arch',
            field=models.CharField(default='x86_64', max_length=36, verbose_name='Architecture'),
        ),
        migrations.AlterField(
            model_name='container',
            name='description',
            field=models.TextField(blank=True, null=True, verbose_name='Description'),
        ),
        migrations.AlterField(
            model_name='container',
            name='image',
            field=models.CharField(max_length=255, verbose_name='Image'),
        ),
        migrations.AlterField(
            model_name='container',
            name='interface_port',
            field=models.IntegerField(blank=True, null=True, verbose_name='Interface port'),
        ),
        migrations.AlterField(
            model_name='container',
            name='interface_protocol',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Interface protocol'),
        ),
        migrations.AlterField(
            model_name='container',
            name='interface_transport',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Interface transport'),
        ),
        migrations.AlterField(
            model_name='container',
            name='name',
            field=models.CharField(max_length=255, verbose_name='Name'),
        ),
        migrations.AlterField(
            model_name='container',
            name='os',
            field=models.CharField(default='linux', max_length=36, verbose_name='Operating system'),
        ),
        migrations.AlterField(
            model_name='container',
            name='registry',
            field=models.CharField(max_length=255, verbose_name='Registry'),
        ),
        migrations.AlterField(
            model_name='container',
            name='supports_custom_interface_port',
            field=models.BooleanField(default=False, verbose_name='Supports custom interface port'),
        ),
        migrations.AlterField(
            model_name='container',
            name='supports_interface_auth',
            field=models.BooleanField(default=False, verbose_name='Supports interface auth'),
        ),
        migrations.AlterField(
            model_name='container',
            name='tag',
            field=models.CharField(default='latest', max_length=255, verbose_name='Tag'),
        ),
        migrations.AlterField(
            model_name='task',
            name='auth_token',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Auth token'),
        ),
        migrations.AlterField(
            model_name='task',
            name='computing_options',
            field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True, verbose_name='Computing options'),
        ),
        migrations.AlterField(
            model_name='task',
            name='extra_binds',
            field=models.CharField(blank=True, max_length=4096, null=True, verbose_name='Extra binds'),
        ),
        migrations.AlterField(
            model_name='task',
            name='id',
            field=models.CharField(blank=True, max_length=64, null=True, verbose_name='ID'),
        ),
        migrations.AlterField(
            model_name='task',
            name='interface_ip',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Interface IP address'),
        ),
        migrations.AlterField(
            model_name='task',
            name='interface_port',
            field=models.IntegerField(blank=True, null=True, verbose_name='Interface port'),
        ),
        migrations.AlterField(
            model_name='task',
            name='name',
            field=models.CharField(max_length=36, verbose_name='Name'),
        ),
        migrations.AlterField(
            model_name='task',
            name='requires_proxy',
            field=models.BooleanField(verbose_name='Requires proxy'),
        ),
        migrations.AlterField(
            model_name='task',
            name='requires_proxy_auth',
            field=models.BooleanField(verbose_name='Requires proxy auth'),
        ),
        migrations.AlterField(
            model_name='task',
            name='requires_tcp_tunnel',
            field=models.BooleanField(verbose_name='Requires a TCP tunnel'),
        ),
        migrations.AlterField(
            model_name='task',
            name='status',
            field=models.CharField(blank=True, max_length=36, null=True, verbose_name='Status'),
        ),
        migrations.AlterField(
            model_name='task',
            name='tcp_tunnel_port',
            field=models.IntegerField(blank=True, null=True, verbose_name='TCP tunnel port'),
        ),
    ]
+41 −34
Original line number Diff line number Diff line
@@ -80,29 +80,29 @@ class Container(models.Model):
    # If a container has no user, it will be available to anyone. Can be created, edited and deleted only by admins.

    # Generic attributes
    name        = models.CharField('Container Name', max_length=255, blank=False, null=False)
    description = models.TextField('Container description', blank=True, null=True)
    name        = models.CharField('Name', max_length=255, blank=False, null=False)
    description = models.TextField('Description', blank=True, null=True)
    
    # Registry-related attributes
    registry = models.CharField('Container registry', max_length=255, blank=False, null=False)
    image    = models.CharField('Container image', max_length=255, blank=False, null=False)
    tag      = models.CharField('Container image', max_length=255, blank=False, null=False, default='latest')
    registry = models.CharField('Registry', max_length=255, blank=False, null=False)
    image    = models.CharField('Image', max_length=255, blank=False, null=False)
    tag      = models.CharField('Tag', max_length=255, blank=False, null=False, default='latest')

    # Platform-related
    arch = models.CharField('Container architecture', max_length=36, blank=False, null=False, default='x86_64')
    os   = models.CharField('Container operating system', max_length=36, blank=False, null=False, default='linux')
    arch = models.CharField('Architecture', max_length=36, blank=False, null=False, default='x86_64')
    os   = models.CharField('Operating system', max_length=36, blank=False, null=False, default='linux')
    
    # TODO: do we want more control with respect to kernel, CPUs, instruction sets? 
    # requires = i.e. kernel > 3, intel, AVX2
    
    # Port, protocol and transport for the container interface
    interface_port = models.IntegerField('Container interface port', blank=True, null=True) 
    interface_protocol = models.CharField('Container interface protocol', max_length=36, blank=True, null=True)
    interface_transport = models.CharField('Container interface protocol', max_length=36, blank=True, null=True)
    interface_port = models.IntegerField('Interface port', blank=True, null=True) 
    interface_protocol = models.CharField('Interface protocol', max_length=36, blank=True, null=True)
    interface_transport = models.CharField('Interface transport', max_length=36, blank=True, null=True)

    # Capabilities
    supports_custom_interface_port = models.BooleanField('Does the container support setting a custom interface port?', default=False) # BASE_PORT
    supports_interface_auth = models.BooleanField('Does the container interface support authentication?', default=False) # AUTH_USER / AUTH_PASS
    supports_custom_interface_port = models.BooleanField('Supports custom interface port', default=False) # BASE_PORT
    supports_interface_auth = models.BooleanField('Supports interface auth', default=False) # AUTH_USER / AUTH_PASS

    class Meta:
        ordering = ['name']
@@ -131,23 +131,23 @@ class Computing(models.Model):
    user = models.ForeignKey(User, related_name='+', on_delete=models.CASCADE, blank=True, null=True)
    # If a compute resource has no user, it will be available to anyone. Can be created, edited and deleted only by admins.
    
    name        = models.CharField('Computing Name', max_length=255, blank=False, null=False)
    description = models.TextField('Container description', blank=True, null=True)
    name        = models.CharField('Name', max_length=255, blank=False, null=False)
    description = models.TextField('Description', blank=True, null=True)

    # Standalone / sluster
    type = models.CharField('Computing Type', max_length=255, blank=False, null=False)
    type = models.CharField('Type', max_length=255, blank=False, null=False)

    requires_sys_conf  = models.BooleanField(default=False)
    requires_user_conf = models.BooleanField(default=False)
    requires_user_keys = models.BooleanField(default=False)

    # Interfce and interaction definition
    access_mode = models.CharField('Computing resource access (control) mode', max_length=36, blank=False, null=False)
    auth_mode   = models.CharField('Computing resource authentication mode', max_length=36, blank=False, null=False)
    wms         = models.CharField('Computing resource WMS', max_length=36, blank=True, null=True)
    access_mode = models.CharField('Access (control) mode', max_length=36, blank=False, null=False)
    auth_mode   = models.CharField('Auth mode', max_length=36, blank=False, null=False)
    wms         = models.CharField('Workload management system', max_length=36, blank=True, null=True)
    
    # Supported container runtimes
    container_runtimes = models.CharField('Computing resource container runtimes', max_length=256, blank=False, null=False) 
    container_runtimes = models.CharField('Container runtimes', max_length=256, blank=False, null=False) 

    class Meta:
        ordering = ['name']
@@ -168,6 +168,10 @@ class Computing(models.Model):
        color_map_index = string_int_hash % len(color_map)
        return color_map[color_map_index]

    @property
    def default_container_runtime(self):
        return str(self.container_runtimes).split(',')[0]


    #=======================
    # Computing manager
@@ -181,11 +185,11 @@ class Computing(models.Model):
        try:
            return self._manager
        except AttributeError:
            if self.type == 'cluster' and self.access_mode == 'ssh+cli' and self.access_mode == 'user_keys' and self.wms == 'slurm':
            if self.type == 'cluster' and self.access_mode == 'ssh+cli' and self.auth_mode == 'user_keys' and self.wms == 'slurm':
                self._manager = computing_managers.SlurmSSHClusterComputingManager(self)
            elif self.type == 'standalone' and self.access_mode == 'ssh+cli' and self.access_mode == 'user_keys' and self.wms is None:
            elif self.type == 'standalone' and self.access_mode == 'ssh+cli' and self.auth_mode == 'user_keys' and self.wms is None:
                self._manager = computing_managers.SSHSingleNodeComputingManager(self)
            elif self.type == 'standalone' and self.access_mode == 'internal' and self.access_mode == 'internal' and self.wms is None:
            elif self.type == 'standalone' and self.access_mode == 'internal' and self.auth_mode == 'internal' and self.wms is None:
                self._manager = computing_managers.InternalSingleNodeComputingManager(self)
            else:
                raise ConsistencyException('Don\'t know how to instantiate a computing manager for computing resource of type "{}", access mode "{}" and WMS "{}"'.format(self.type, self.access_mode, self.wms))
@@ -286,34 +290,37 @@ class Task(models.Model):

    uuid  = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    user  = models.ForeignKey(User, related_name='+', on_delete=models.CASCADE)
    name  = models.CharField('Task name', max_length=36, blank=False, null=False)
    name  = models.CharField('Name', max_length=36, blank=False, null=False)

    # Task management
    id        = models.CharField('Task ID', max_length=64, blank=True, null=True) # i.e. Slurm job id, singularity PID, docker hash
    status    = models.CharField('Task status', max_length=36, blank=True, null=True)
    id        = models.CharField('ID', max_length=64, blank=True, null=True) # i.e. Slurm job id, singularity PID, docker hash
    status    = models.CharField('Status', max_length=36, blank=True, null=True)
    created   = models.DateTimeField('Created on', default=timezone.now)

    # How to reach the task interface. The IP has to be intended either as the container IP if this is directly
    # reachable (i.e. using a Docker or Kubernetes network) or as the host IP address, depending on the
    # computing resource and its computing manager/WMS/container runtime. The port is to be intended
    # as the port where the task interface is exposed on its IP address.
    interface_ip   = models.CharField('Task interface ip address', max_length=36, blank=True, null=True)
    interface_port = models.IntegerField('Task interface port', blank=True, null=True) 
    interface_ip   = models.CharField('Interface IP address', max_length=36, blank=True, null=True)
    interface_port = models.IntegerField('Interface port', blank=True, null=True) 
    
    # Task access
    requires_tcp_tunnel = models.BooleanField('Does the task require a tunnel to be opened for accessing its interface?')
    tcp_tunnel_port     = models.IntegerField('Task tunnel port', blank=True, null=True)
    requires_proxy      = models.BooleanField('Does the task require a proxy for accessing its interface?')
    requires_proxy_auth = models.BooleanField('Does the task require interface authentication to be enforced at proxy-level?')
    auth_token          = models.CharField('A one-time token for proxy or interface authentication', max_length=36, blank=True, null=True)
    requires_tcp_tunnel = models.BooleanField('Requires a TCP tunnel')
    tcp_tunnel_port     = models.IntegerField('TCP tunnel port', blank=True, null=True)
    requires_proxy      = models.BooleanField('Requires proxy')
    requires_proxy_auth = models.BooleanField('Requires proxy auth')
    auth_token          = models.CharField('Auth token', max_length=36, blank=True, null=True) # A one-time token for proxy or interface authentication

    # Links
    computing = models.ForeignKey(Computing, related_name='+', on_delete=models.CASCADE)
    container = models.ForeignKey('Container', on_delete=models.CASCADE, related_name='+')

    # Extra 
    extra_binds = models.CharField('Task container extra binds', max_length=4096, blank=True, null=True)
    computing_options = JSONField('Task computing options', blank=True, null=True) # i.e. CPUs, RAM, cluster partition etc. TODO: why here?
    extra_binds = models.CharField('Extra binds', max_length=4096, blank=True, null=True)
    computing_options = JSONField('Computing options', blank=True, null=True) # i.e. CPUs, RAM, cluster partition etc. TODO: why here?
    
    # TODO: add the option for selecting the runtime as advanced option when creating the task
    #container_runtime 

    class Meta:
        ordering = ['-created']
+1 −1
Original line number Diff line number Diff line
@@ -78,7 +78,7 @@
           </table>

           
           <a href="javascript:void(0);" id="show_button" onclick="toggle_visibility('advanced_div')">Show advanced...</a>
           <a href="javascript:void(0);" id="show_button" onclick="toggle_visibility('advanced_div')">Advanced...</a>
           

           <div id="advanced_div" style="display:none; width:360px;">
Loading