In this demo, we’re going to understand what a container is and build one from scratch by simulating the docker command with something like go run main.go

docker         run image <cmd> <params>
go run main.go run image <cmd> <params>

Since the Golang library used for this demo supports only Linux, I used an AWS EC2 instance running Ubuntu 22.04.1 LTS (Jammy Jellyfish).

First, we need to install Go on the host

root@ip-172-31-21-165:~# curl https://dl.google.com/go/go1.19.4.linux-amd64.tar.gz -O
root@ip-172-31-21-165:~# rm -rf /usr/local/go && tar -C /usr/local -xzf go1.19.4.linux-amd64.tar.gz
root@ip-172-31-21-165:~# export PATH=$PATH:/usr/local/go/bin
root@ip-172-31-21-165:~# go version
go version go1.19.4 linux/amd64

Reference: Golang installation

1. A basic container Link to heading

We just need a basic main.go file that takes the command specified after go run main.go run and runs it as a cloned process inside a new namespace

package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"
)

// docker         run image <cmd> <params>
// go run main.go run image <cmd> <params>

// main dispatches on the first command-line argument, mirroring
// `docker run <cmd> <params>`.
func main() {
	// A missing subcommand gets the same clear error as an unknown one,
	// instead of an index-out-of-range runtime panic on os.Args[1].
	if len(os.Args) < 2 {
		panic("Invalid command!")
	}

	switch os.Args[1] {
	case "run":
		run()
	default:
		panic("Invalid command!")
	}
}

// run executes the command given after "run" (os.Args[2], with os.Args[3:]
// as its parameters) in a new UTS namespace, wired to this process's
// stdin, stdout and stderr.
func run() {
	// os.Args[2] is indexed below, so fail with a clear message when no
	// command was given.
	if len(os.Args) < 3 {
		panic("Missing command!")
	}

	fmt.Printf("Running main process %v as %d\n", os.Args[2:], os.Getpid())

	cmd := exec.Command(os.Args[2], os.Args[3:]...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// Cloneflags is only available in Linux
	// CLONE_NEWUTS: create the process in a new UTS (hostname) namespace
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: syscall.CLONE_NEWUTS,
	}

	// Report failures (for example, lacking the privileges to create a
	// namespace) instead of exiting silently.
	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "run: %v\n", err)
		os.Exit(1)
	}
}

References: Learn more about namespaces in Go

Run a container

root@ip-172-31-21-165:~# go run main.go run /bin/bash
Running main process [/bin/bash] as 1557
root@ip-172-31-21-165:~#

It’s really hard to tell whether we’re in the container or not, so we want to change the namespace’s hostname to distinguish it from the host’s hostname by using syscall.Sethostname([]byte("container")). However:

  • If we put it after the cmd.Run(), it will never be triggered because the cmd.Run() line doesn’t complete until we exit the command /bin/bash we’re running.
  • If we put it before cmd.Run(), the cloned process has not been started yet, so there is no new namespace in which to set the hostname. So we can put it neither before nor after the cmd.Run() line.

We can make the program re-execute itself by using /proc/self/exe, and set the hostname from that second process, which is already running inside the new namespace.

// main dispatches on the first command-line argument: "run" is the
// user-facing entry point, "child" is the re-invocation of this binary
// inside the new namespace.
func main() {
	// A missing subcommand gets the same clear error as an unknown one,
	// instead of an index-out-of-range runtime panic on os.Args[1].
	if len(os.Args) < 2 {
		panic("Invalid command!")
	}

	switch os.Args[1] {
	case "run":
		run()
	case "child":
		child()
	default:
		panic("Invalid command!")
	}
}

// run re-executes this binary as "child <cmd> <params>" in a new UTS
// namespace, wired to this process's stdin, stdout and stderr.
func run() {
	fmt.Printf("Running main process %v as %d\n", os.Args[2:], os.Getpid())

	// Reinvoke the process inside the new namespace with a child process
	cmd := exec.Command("/proc/self/exe", append([]string{"child"}, os.Args[2:]...)...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// Cloneflags is only available in Linux
	// CLONE_NEWUTS: create the process in a new UTS (hostname) namespace
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: syscall.CLONE_NEWUTS,
	}

	// Report failures (for example, lacking the privileges to create a
	// namespace) instead of exiting silently.
	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "run: %v\n", err)
		os.Exit(1)
	}
}

// child sets the hostname to "container" and then runs the requested
// command (os.Args[2], with os.Args[3:] as its parameters), wired to this
// process's stdin, stdout and stderr.
func child() {
	// os.Args[2] is indexed below, so fail with a clear message when no
	// command was given.
	if len(os.Args) < 3 {
		panic("Missing command!")
	}

	fmt.Printf("Running child process %v as %d\n", os.Args[2:], os.Getpid())

	cmd := exec.Command(os.Args[2], os.Args[3:]...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	if err := syscall.Sethostname([]byte("container")); err != nil {
		panic(fmt.Sprintf("sethostname: %v", err))
	}

	// Surface a failing command instead of discarding the error.
	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "child: %v\n", err)
		os.Exit(1)
	}
}

Run a container and the container hostname change to container

root@ip-172-31-21-165:~# go run main.go run /bin/bash
Running main process [/bin/bash] as 1557
Running child process [/bin/bash] as 1
root@container:~# ps -f
UID          PID    PPID  C STIME TTY          TIME CMD
root        1427    1426  0 17:26 pts/1    00:00:00 sudo su -
root        1428    1427  0 17:26 pts/1    00:00:00 su -
root        1429    1428  0 17:26 pts/1    00:00:00 -bash
root        1530    1429  0 17:34 pts/1    00:00:00 go run main.go run /bin/bash
root        1557    1530  0 17:34 pts/1    00:00:00 /tmp/go-build894692501/b001/exe/main run /bin/bash
root        1561    1557  0 17:34 pts/1    00:00:00 /proc/self/exe child /bin/bash
root        1565    1561  0 17:34 pts/1    00:00:00 /bin/bash
root        1582    1565  0 17:35 pts/1    00:00:00 ps -f

Although the child process reports PID 1, the output of ps -f still shows it as 1561, because ps reads process information from the host’s /proc

2. Isolate processes Link to heading

Now we want to isolate the processes of the container from the host. We need to create a file system for the container first. We can export the file system of a Docker container to the ubuntu-fs directory to be used by our container

root@ip-172-31-21-165:~# docker run -d --rm --name ubuntu-fs ubuntu:22.04 sleep 1000
root@ip-172-31-21-165:~# docker export ubuntu-fs -o ubuntu-fs.tar
root@ip-172-31-21-165:~# docker stop ubuntu-fs
root@ip-172-31-21-165:~# mkdir -p ubuntu-fs
root@ip-172-31-21-165:~# tar -xvf ubuntu-fs.tar -C ubuntu-fs/
root@ip-172-31-21-165:~# touch ubuntu-fs/ROOT_OF_CONTAINER

We also create a dummy ROOT_OF_CONTAINER file inside ubuntu-fs directory so we can make sure that the container will use the correct file system.

Now we need to set the root of the container to ubuntu-fs directory by using syscall.Chroot("./ubuntu-fs") and change directory to the root by using syscall.Chdir.

// run re-executes this binary as "child <cmd> <params>" in new UTS, PID
// and mount namespaces, wired to this process's stdin, stdout and stderr.
func run() {
	fmt.Printf("Running main process %v as %d\n", os.Args[2:], os.Getpid())

	// Reinvoke the process inside the new namespace with a child process
	cmd := exec.Command("/proc/self/exe", append([]string{"child"}, os.Args[2:]...)...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// Cloneflags is only available in Linux
	// CLONE_NEWUTS: create the process in a new UTS (hostname) namespace
	// CLONE_NEWPID: isolates processes
	// CLONE_NEWNS: isolates mounts
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: syscall.CLONE_NEWUTS | syscall.CLONE_NEWPID | syscall.CLONE_NEWNS,
	}

	// Report failures (for example, lacking the privileges to create the
	// namespaces) instead of exiting silently.
	if err := cmd.Run(); err != nil {
		fmt.Fprintf(os.Stderr, "run: %v\n", err)
		os.Exit(1)
	}
}

// child prepares the container environment (hostname, root file system,
// /proc) and then runs the requested command (os.Args[2], with os.Args[3:]
// as its parameters), wired to this process's stdin, stdout and stderr.
func child() {
	// os.Args[2] is indexed below, so fail with a clear message when no
	// command was given.
	if len(os.Args) < 3 {
		panic("Missing command!")
	}

	fmt.Printf("Running child process %v as %d\n", os.Args[2:], os.Getpid())

	cmd := exec.Command(os.Args[2], os.Args[3:]...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// Abort on any setup failure: carrying on after a failed chroot would
	// chdir to the host's root and mount proc there instead of in ubuntu-fs.
	if err := syscall.Sethostname([]byte("container")); err != nil {
		panic(fmt.Sprintf("sethostname: %v", err))
	}
	// Change the root of the container
	if err := syscall.Chroot("./ubuntu-fs"); err != nil {
		panic(fmt.Sprintf("chroot ./ubuntu-fs: %v", err))
	}
	// Change directory to root after chroot
	if err := syscall.Chdir("/"); err != nil {
		panic(fmt.Sprintf("chdir /: %v", err))
	}
	// Mount /proc inside container so that `ps` command works
	if err := syscall.Mount("proc", "proc", "proc", 0, ""); err != nil {
		panic(fmt.Sprintf("mount proc: %v", err))
	}

	runErr := cmd.Run()

	// Unmount /proc when the process finishes, whether or not it succeeded
	if err := syscall.Unmount("/proc", 0); err != nil {
		fmt.Fprintf(os.Stderr, "child: unmount /proc: %v\n", err)
	}

	// Surface a failing command instead of discarding the error.
	if runErr != nil {
		fmt.Fprintf(os.Stderr, "child: %v\n", runErr)
		os.Exit(1)
	}
}

Run a container

root@ip-172-31-21-165:~# go run main.go run /bin/bash
Running main process [/bin/bash] as 1908
Running child process [/bin/bash] as 1
root@container:/# cd ..
root@container:/# ls
bin  boot  dev  etc  home  lib  lib32  lib64  libx32  media  mnt  opt  proc  root  ROOT_OF_CONTAINER  run  sbin  srv  sys  tmp  usr  var

root@container:/# sleep 1000

We can see that we cannot go higher than the root of the container by using cd .. and the dummy ROOT_OF_CONTAINER file is at the root of the container.

Open another terminal to the host, find the sleep process

root@ip-172-31-21-165:~# ps -C sleep
    PID TTY          TIME CMD
   1923 pts/1    00:00:00 sleep

Now we know 1923 is the process ID of the sleep command inside the container, and we can find all information about the process under /proc/1923. If we look at the root entry inside the /proc/1923 directory, we see that the process’s root directory is the file system we set up at /root/ubuntu-fs

root@ip-172-31-21-165:~# ls -l /proc/1923/root
lrwxrwxrwx 1 root root 0 Dec 24 18:31 /proc/1923/root -> /root/ubuntu-fs

Note: The same thing happens with a Docker image. When we use an image, Docker copies the file system packed in that image, unpacks it somewhere on the host machine, and changes the root directory so the container sees the new file system

Exit the old container and run a new container. Now all processes inside the container are isolated from the host and the output of ps command is showing PID 1 for the child process.

root@ip-172-31-21-165:~# go run main.go run /bin/bash
Running main process [/bin/bash] as 2061
Running child process [/bin/bash] as 1
root@container:/# ps -f
UID          PID    PPID  C STIME TTY          TIME CMD
root           1       0  0 18:41 ?        00:00:00 /proc/self/exe child /bin/bash
root           5       1  0 18:41 ?        00:00:00 /bin/bash
root           8       5  0 18:41 ?        00:00:00 ps -f

root@container:/# mount
proc on /proc type proc (rw,relatime)

We can see the mounted proc from the host

root@ip-172-31-21-165:~# mount | grep proc
proc on /root/ubuntu-fs/proc type proc (rw,relatime)

We can unshare the mounted proc with Unshareflags

cmd.SysProcAttr = &syscall.SysProcAttr{
    Cloneflags: syscall.CLONE_NEWUTS | syscall.CLONE_NEWPID | syscall.CLONE_NEWNS,
    // By default, mounts made in the new namespace are still shared with the host.
    // Use Unshareflags to stop sharing them.
    Unshareflags: syscall.CLONE_NEWNS,
}

Exit the old container and run a new container, we will not see the mounted proc from the host

root@ip-172-31-21-165:~# mount | grep proc

3. Resource constraints Link to heading

Let’s understand how resources are managed in a Docker container. In Linux, all resources are managed in /sys/fs/cgroup/. For example, the memory.max file inside the system.slice directory specifies the maximum amount of memory that may be used.

root@ip-172-31-21-165:/sys/fs/cgroup/system.slice# cat memory.max
max

Run 2 Docker containers, one with memory limited to 10M. Two new directories are created in /sys/fs/cgroup/system.slice, which are used to manage the resources of each Docker container.

root@ip-172-31-21-165:/sys/fs/cgroup# docker run --rm -it -d ubuntu:22.04 /bin/bash
d46774325ca573a3c515731d016b2ab359ed1b9369224a038d00abda9bae1a7a

root@ip-172-31-21-165:/sys/fs/cgroup# docker run --rm -it -d --memory=10M  ubuntu:22.04 /bin/bash
86ec6b6890adb5fd66bae17c3933364796f98e6e9aedaf87a16151b1dbe11d5d

root@ip-172-31-21-165:/sys/fs/cgroup/system.slice# cat docker-d46774325ca573a3c515731d016b2ab359ed1b9369224a038d00abda9bae1a7a.scope/memory.max
max

root@ip-172-31-21-165:/sys/fs/cgroup/system.slice# cat docker-86ec6b6890adb5fd66bae17c3933364796f98e6e9aedaf87a16151b1dbe11d5d.scope/memory.max
10485760

We can limit the maximum processes inside a container by setting the pids.max

// child joins the container cgroup, prepares the container environment
// (hostname, root file system, /proc) and then runs the requested command
// (os.Args[2], with os.Args[3:] as its parameters), wired to this process's
// stdin, stdout and stderr.
func child() {
	// os.Args[2] is indexed below, so fail with a clear message when no
	// command was given.
	if len(os.Args) < 3 {
		panic("Missing command!")
	}

	fmt.Printf("Running child process %v as %d\n", os.Args[2:], os.Getpid())

	configCgroups()

	cmd := exec.Command(os.Args[2], os.Args[3:]...)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// Abort on any setup failure: carrying on after a failed chroot would
	// chdir to the host's root and mount proc there instead of in ubuntu-fs.
	if err := syscall.Sethostname([]byte("container")); err != nil {
		panic(fmt.Sprintf("sethostname: %v", err))
	}
	// Change the root of the container
	if err := syscall.Chroot("./ubuntu-fs"); err != nil {
		panic(fmt.Sprintf("chroot ./ubuntu-fs: %v", err))
	}
	// Change directory to root after chroot
	if err := syscall.Chdir("/"); err != nil {
		panic(fmt.Sprintf("chdir /: %v", err))
	}
	// Mount /proc inside container so that `ps` command works
	if err := syscall.Mount("proc", "proc", "proc", 0, ""); err != nil {
		panic(fmt.Sprintf("mount proc: %v", err))
	}

	runErr := cmd.Run()

	// Unmount /proc when the process finishes, whether or not it succeeded
	if err := syscall.Unmount("/proc", 0); err != nil {
		fmt.Fprintf(os.Stderr, "child: unmount /proc: %v\n", err)
	}

	// Surface a failing command instead of discarding the error.
	if runErr != nil {
		fmt.Fprintf(os.Stderr, "child: %v\n", runErr)
		os.Exit(1)
	}
}

// configCgroups creates the /sys/fs/cgroup/container control group, sets
// its pids.max to 10 and writes the current process's PID into its
// cgroup.procs file. It panics if any step fails, because continuing would
// start the container without the intended process limit.
func configCgroups() {
	// These paths live on the Linux-only cgroup file system, so they are
	// spelled out with "/" directly.
	const (
		container = "/sys/fs/cgroup/container"
		pidsMax   = container + "/pids.max"
		procs     = container + "/cgroup.procs"
	)

	// Nothing here removes the cgroup afterwards, so an already existing
	// directory is expected on later runs and is not an error.
	if err := os.Mkdir(container, 0755); err != nil && !os.IsExist(err) {
		panic(fmt.Sprintf("creating cgroup %s: %v", container, err))
	}
	if err := os.WriteFile(pidsMax, []byte("10"), 0700); err != nil {
		panic(fmt.Sprintf("setting pids.max: %v", err))
	}
	pid := fmt.Sprint(os.Getpid())
	if err := os.WriteFile(procs, []byte(pid), 0700); err != nil {
		panic(fmt.Sprintf("joining cgroup: %v", err))
	}
}

Run a new container and launch a fork bomb, which tries to spawn as many processes as possible

root@container:/# :() { : | : & }; :

We can see that the number of processes inside the container is limited

root@ip-172-31-21-165:~# ps -fax
...
    693 ?        Ss     0:00 sshd: /usr/sbin/sshd -D -o AuthorizedKeysCommand /usr/share/ec2-instance-connect/eic_run_authorized_keys %u %f -o AuthorizedKeysCommandUser ec2-instance-connect
    745 ?        Ss     0:00  \_ sshd: ubuntu [priv]
   1231 ?        S      0:00  |   \_ sshd: ubuntu@pts/0
   1239 pts/0    Ss     0:00  |       \_ -bash
   1594 pts/0    S+     0:00  |           \_ sudo su -
   1595 pts/1    Ss     0:00  |               \_ sudo su -
   1596 pts/1    S      0:00  |                   \_ su -
   1597 pts/1    S      0:00  |                       \_ -bash
   9535 pts/1    Sl     0:00  |                           \_ go run main.go run /bin/bash
   9562 pts/1    Sl     0:00  |                               \_ /tmp/go-build1049092112/b001/exe/main run /bin/bash
   9566 pts/1    Sl     0:00  |                                   \_ /proc/self/exe child /bin/bash
   9570 pts/1    S+     0:00  |                                       \_ /bin/bash
   9576 pts/1    Z      0:00  |                                       \_ [bash] <defunct>
   9577 pts/1    Z      0:00  |                                       \_ [bash] <defunct>
   9578 pts/1    Z      0:00  |                                       \_ [bash] <defunct>
   9579 pts/1    Z      0:00  |                                       \_ [bash] <defunct>
   9580 pts/1    Z      0:00  |                                       \_ [bash] <defunct>
...
Info
Learn more about this session from GOTO Amsterdam 2018 here